From cbeb131d891e654d2a653dbf01e63b1e0be6b820 Mon Sep 17 00:00:00 2001 From: Mateusz Mikolajczyk Date: Thu, 24 Oct 2024 06:56:10 +0200 Subject: [PATCH 001/233] [Transformations] Add SliceScatter-15 decomposition transformation (#27136) ### Details: - *Add SliceScatter-15 decomposition transformation for unsupported plugins* - *...* ### Tickets: - *CVS-151158* --------- Co-authored-by: Michal Lukaszewski --- .../op_conversions/convert_slicescatter.hpp | 22 +++ .../common_optimizations.cpp | 2 + .../op_conversions/convert_slicescatter.cpp | 77 ++++++++++ ...onvert_slicescatter_decomposition_test.cpp | 145 ++++++++++++++++++ .../transformation_pipeline.cpp | 2 + .../backend/ops/scatter_nd_update.cpp | 12 +- 6 files changed, 254 insertions(+), 6 deletions(-) create mode 100644 src/common/transformations/include/transformations/op_conversions/convert_slicescatter.hpp create mode 100644 src/common/transformations/src/transformations/op_conversions/convert_slicescatter.cpp create mode 100644 src/common/transformations/tests/op_conversions/convert_slicescatter_decomposition_test.cpp diff --git a/src/common/transformations/include/transformations/op_conversions/convert_slicescatter.hpp b/src/common/transformations/include/transformations/op_conversions/convert_slicescatter.hpp new file mode 100644 index 00000000000000..020b4e236fcac5 --- /dev/null +++ b/src/common/transformations/include/transformations/op_conversions/convert_slicescatter.hpp @@ -0,0 +1,22 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/matcher_pass.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { + +class TRANSFORMATIONS_API ConvertSliceScatter; + +} // namespace pass +} // namespace ov + +class ov::pass::ConvertSliceScatter : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ConvertSliceScatter", "0"); + ConvertSliceScatter(); +}; diff --git a/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp b/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp index 500d003bd4642e..9d46b583a828f2 100644 --- a/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp @@ -94,6 +94,7 @@ #include "transformations/op_conversions/convert_scatter_elements_update12_downgrade.hpp" #include "transformations/op_conversions/convert_scatter_nd_update15_downgrade.hpp" #include "transformations/op_conversions/convert_slice_to_strided_slice.hpp" +#include "transformations/op_conversions/convert_slicescatter.hpp" #include "transformations/op_conversions/convert_softmax_downgrade.hpp" #include "transformations/op_conversions/convert_softmax_upgrade.hpp" #include "transformations/op_conversions/convert_space_to_depth.hpp" @@ -233,6 +234,7 @@ bool ov::pass::CommonOptimizations::run_on_model(const std::shared_ptr(); ADD_MATCHER(fq_fusions, FakeQuantizeMulFusion) diff --git a/src/common/transformations/src/transformations/op_conversions/convert_slicescatter.cpp b/src/common/transformations/src/transformations/op_conversions/convert_slicescatter.cpp new file mode 100644 index 00000000000000..eedde963461a6b --- /dev/null +++ b/src/common/transformations/src/transformations/op_conversions/convert_slicescatter.cpp @@ -0,0 +1,77 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + 
+#include "transformations/op_conversions/convert_slicescatter.hpp" + +#include +#include + +#include "itt.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/range.hpp" +#include "openvino/op/reduce_prod.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/scatter_nd_update.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/op/slice.hpp" +#include "openvino/op/slice_scatter.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" + +ov::pass::ConvertSliceScatter::ConvertSliceScatter() { + MATCHER_SCOPE(ConvertSliceScatter); + + const auto& slicescatter = pattern::wrap_type(); + + const matcher_pass_callback callback = [this](pattern::Matcher& m) { + const auto& slice_node = ov::as_type_ptr(m.get_match_root()); + if (!slice_node || transformation_callback(slice_node)) { + return false; + } + NodeRegistry node_registry; + const auto& const_0 = node_registry.make(ov::element::i64, Shape{}, 0); + const auto& const_1 = node_registry.make(ov::element::i64, Shape{}, 1); + const auto& const_1d_neg_1 = + node_registry.make(ov::element::i64, Shape{1}, std::vector{-1}); + const auto& const_scatter_indices_shape = + node_registry.make(ov::element::i64, Shape{2}, std::vector{-1, 1}); + const auto& data_shape = node_registry.make(slice_node->input_value(0), ov::element::i64); + const auto& num_elements_data = node_registry.make(data_shape, const_0, false); + const auto& data_indices_flatten = + node_registry.make(const_0, num_elements_data, const_1, ov::element::i64); + const auto& full_data_indices = + node_registry.make(data_indices_flatten, data_shape, false); + std::shared_ptr slice_indices; + if (slice_node->get_input_size() == 5) { + slice_indices = node_registry.make(full_data_indices, + slice_node->input_value(2), + slice_node->input_value(3), + slice_node->input_value(4)); + } else { + slice_indices = node_registry.make(full_data_indices, + slice_node->input_value(2), + slice_node->input_value(3), + slice_node->input_value(4), + slice_node->input_value(5)); + } + const auto& slice_indices_flatten = + node_registry.make(slice_indices, const_scatter_indices_shape, false); + const auto& updates_flatten = + node_registry.make(slice_node->input_value(1), const_1d_neg_1, false); + const auto& data_flatten = + node_registry.make(slice_node->input_value(0), const_1d_neg_1, false); + const auto& output_flatten = + node_registry.make(data_flatten, slice_indices_flatten, updates_flatten); + const auto& output = node_registry.make(output_flatten, data_shape, false); + + output->set_friendly_name(slice_node->get_friendly_name()); + copy_runtime_info(slice_node, node_registry.get()); + replace_node(slice_node, output); + + return true; + }; + + const auto& m = std::make_shared(slicescatter, matcher_name); + this->register_matcher(m, callback); +} diff --git a/src/common/transformations/tests/op_conversions/convert_slicescatter_decomposition_test.cpp b/src/common/transformations/tests/op_conversions/convert_slicescatter_decomposition_test.cpp new file mode 100644 index 00000000000000..c3548128403624 --- /dev/null +++ b/src/common/transformations/tests/op_conversions/convert_slicescatter_decomposition_test.cpp @@ -0,0 +1,145 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include + +#include "common_test_utils/ov_test_utils.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/opsets/opset15.hpp" +#include "openvino/opsets/opset8.hpp" +#include 
"openvino/pass/manager.hpp" +#include "transformations/op_conversions/convert_slicescatter.hpp" +#include "transformations/utils/utils.hpp" +namespace { +class ConvertSliceScatterTest : public TransformationTestsF, public testing::WithParamInterface { +private: + void SetUp() override { + TransformationTestsF::SetUp(); + const auto& inputs = GetParam(); + manager.register_pass(); + model = create_v15_model(inputs); + model_ref = create_decomposed_model(inputs); + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); + comparator.enable(FunctionsComparator::CmpValues::NAMES); + } + +protected: + std::shared_ptr create_v15_model(ov::NodeVector inputs) { + const auto& data = inputs.at(0); + const auto& updates = inputs.at(1); + const auto& start = inputs.at(2); + const auto& stop = inputs.at(3); + const auto& step = inputs.at(4); + ov::ParameterVector params{}; + for (const auto& inp : inputs) { + const auto& param = ov::as_type_ptr(inp); + if (param) { + params.push_back(param); + } + } + std::shared_ptr slicescatter; + if (inputs.size() == 5) { + slicescatter = std::make_shared(data, updates, start, stop, step); + } else { + slicescatter = std::make_shared(data, updates, start, stop, step, inputs.at(5)); + } + slicescatter->set_friendly_name("slicescatter15"); + return std::make_shared(slicescatter->outputs(), params); + } + + std::shared_ptr create_decomposed_model(ov::NodeVector inputs) { + const auto& data = inputs.at(0); + const auto& updates = inputs.at(1); + const auto& start = inputs.at(2); + const auto& stop = inputs.at(3); + const auto& step = inputs.at(4); + ov::ParameterVector params{}; + for (const auto& inp : inputs) { + const auto& param = ov::as_type_ptr(inp); + if (param) { + params.push_back(param); + } + } + const auto& const_0 = ov::op::v0::Constant::create(ov::element::i64, {}, {0}); + const auto& const_1 = ov::op::v0::Constant::create(ov::element::i64, {}, {1}); + const auto& const_1d_neg_1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); + const auto& const_scatter_indices_shape = ov::op::v0::Constant::create(ov::element::i64, {2}, {-1, 1}); + const auto& data_shape = std::make_shared(data, ov::element::i64); + const auto& num_elements_data = std::make_shared(data_shape, const_0, false); + const auto& data_indices_flatten = + std::make_shared(const_0, num_elements_data, const_1, ov::element::i64); + const auto& full_data_indices = std::make_shared(data_indices_flatten, data_shape, false); + std::shared_ptr slice_indices; + if (inputs.size() == 5) { + slice_indices = std::make_shared(full_data_indices, start, stop, step); + } else { + slice_indices = std::make_shared(full_data_indices, start, stop, step, inputs.at(5)); + } + const auto& slice_indices_flatten = + std::make_shared(slice_indices, const_scatter_indices_shape, false); + const auto& updates_flatten = std::make_shared(updates, const_1d_neg_1, false); + const auto& data_flatten = std::make_shared(data, const_1d_neg_1, false); + const auto& output_flatten = + std::make_shared(data_flatten, slice_indices_flatten, updates_flatten); + const auto& slicescatter = std::make_shared(output_flatten, data_shape, false); + slicescatter->set_friendly_name("slicescatter15"); + return std::make_shared(slicescatter->outputs(), params); + } +}; + +INSTANTIATE_TEST_SUITE_P( + ConvertSliceScatterDecomposition, + ConvertSliceScatterTest, + testing::Values( + ov::NodeVector{ + std::make_shared(ov::element::f32, ov::Shape{256, 10, 15}), + 
std::make_shared(ov::element::f32, ov::Shape{4, 7, 2}), + ov::op::v0::Constant::create(ov::element::i32, {3}, {2, -15, 25}), + ov::op::v0::Constant::create(ov::element::i32, {3}, {9, 7, -3}), + ov::op::v0::Constant::create(ov::element::i32, {3}, {2, 1, -1}), + ov::op::v0::Constant::create(ov::element::i32, {3}, {0, 1, -1}), + }, + ov::NodeVector{ + std::make_shared(ov::element::f32, ov::Shape{256, 10, 15}), + std::make_shared(ov::element::f32, ov::Shape{4, 7, 2}), + ov::op::v0::Constant::create(ov::element::i32, {3}, {2, -15, 25}), + ov::op::v0::Constant::create(ov::element::i32, {3}, {9, 7, -3}), + ov::op::v0::Constant::create(ov::element::i32, {3}, {2, 1, -1}), + }, + ov::NodeVector{ + std::make_shared(ov::element::i32, ov::PartialShape::dynamic()), + std::make_shared(ov::element::i32, ov::PartialShape::dynamic()), + ov::op::v0::Constant::create(ov::element::i32, {3}, {2, -15, 25}), + ov::op::v0::Constant::create(ov::element::i32, {3}, {9, 7, -3}), + ov::op::v0::Constant::create(ov::element::i32, {3}, {2, 1, -1}), + ov::op::v0::Constant::create(ov::element::i32, {3}, {0, 1, -1}), + }, + ov::NodeVector{ + std::make_shared(ov::element::i32, ov::PartialShape::dynamic()), + std::make_shared(ov::element::i32, ov::PartialShape::dynamic()), + ov::op::v0::Constant::create(ov::element::i32, {3}, {2, -15, 25}), + ov::op::v0::Constant::create(ov::element::i32, {3}, {9, 7, -3}), + ov::op::v0::Constant::create(ov::element::i32, {3}, {2, 1, -1}), + }, + ov::NodeVector{ + std::make_shared(ov::element::i32, ov::PartialShape::dynamic()), + std::make_shared(ov::element::i32, ov::PartialShape::dynamic()), + std::make_shared(ov::element::i32, ov::PartialShape::dynamic()), + std::make_shared(ov::element::i32, ov::PartialShape::dynamic()), + std::make_shared(ov::element::i32, ov::PartialShape::dynamic()), + std::make_shared(ov::element::i32, ov::PartialShape::dynamic()), + }, + ov::NodeVector{ + std::make_shared(ov::element::i32, ov::PartialShape::dynamic()), + std::make_shared(ov::element::i32, ov::PartialShape::dynamic()), + std::make_shared(ov::element::i32, ov::PartialShape::dynamic()), + std::make_shared(ov::element::i32, ov::PartialShape::dynamic()), + std::make_shared(ov::element::i32, ov::PartialShape::dynamic()), + })); +TEST_P(ConvertSliceScatterTest, CompareFunctions) {} + +} // namespace diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index e98045bd32dbbf..c12782831ef5c3 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -63,6 +63,7 @@ #include "transformations/op_conversions/convert_scatter_nd_update15_downgrade.hpp" #include "transformations/op_conversions/convert_sequences_to_tensor_iterator.hpp" #include "transformations/op_conversions/convert_shuffle_channels3.hpp" +#include "transformations/op_conversions/convert_slicescatter.hpp" #include "transformations/op_conversions/convert_slice_to_strided_slice.hpp" #include "transformations/op_conversions/convert_space_to_batch.hpp" #include "transformations/op_conversions/convert_space_to_depth.hpp" @@ -656,6 +657,7 @@ void Transformations::PreLpt(const std::vector& defaultPrecis CPU_DISABLE_PASS_COMMON(manager, ov::pass::HSwishDecomposition); CPU_DISABLE_PASS_COMMON(manager, ov::pass::MatMulConstTransposesExtraction); CPU_DISABLE_PASS_COMMON(manager, ov::pass::ConvertScatterNDUpdate15ToScatterNDUpdate3); + 
CPU_DISABLE_PASS_COMMON(manager, ov::pass::ConvertSliceScatter); CPU_DISABLE_PASS_X64(manager, ov::pass::HSigmoidDecomposition); CPU_DISABLE_PASS_X64(manager, ov::pass::ReduceL1Decomposition); diff --git a/src/plugins/template/backend/ops/scatter_nd_update.cpp b/src/plugins/template/backend/ops/scatter_nd_update.cpp index ed7646a8cb888e..8081651410a332 100644 --- a/src/plugins/template/backend/ops/scatter_nd_update.cpp +++ b/src/plugins/template/backend/ops/scatter_nd_update.cpp @@ -17,17 +17,17 @@ bool evaluate(const std::shared_ptr& op, inputs[1].data(), inputs[2].data(), outputs[0].data(), - op->get_input_shape(0), - op->get_input_shape(1), - op->get_input_shape(2)); + inputs[0].get_shape(), + inputs[1].get_shape(), + inputs[2].get_shape()); } else if (idxType == ov::element::i64) { ov::reference::scatterNdUpdate(inputs[0].data(), inputs[1].data(), inputs[2].data(), outputs[0].data(), - op->get_input_shape(0), - op->get_input_shape(1), - op->get_input_shape(2)); + inputs[0].get_shape(), + inputs[1].get_shape(), + inputs[2].get_shape()); } else { OPENVINO_THROW("ScatterNDUpdate layer support only i32 and i64 'indices' input precision!"); } From 60a8efd0c1e9cf9733f7c26059c542df11f04727 Mon Sep 17 00:00:00 2001 From: Nikolay Shchegolev Date: Thu, 24 Oct 2024 08:59:53 +0400 Subject: [PATCH 002/233] [CPU][OMP] Fix for Interpolate node (#27184) ### Details: - *Fix threads number* - *...* ### Tickets: - *152606* --- src/core/include/openvino/core/parallel.hpp | 24 +++++++++---- .../intel_cpu/src/nodes/interpolate.cpp | 36 +++++++++++-------- src/plugins/intel_cpu/src/nodes/interpolate.h | 1 + 3 files changed, 40 insertions(+), 21 deletions(-) diff --git a/src/core/include/openvino/core/parallel.hpp b/src/core/include/openvino/core/parallel.hpp index 6d3a243c95e7dc..a231c6833f9d84 100644 --- a/src/core/include/openvino/core/parallel.hpp +++ b/src/core/include/openvino/core/parallel.hpp @@ -461,8 +461,10 @@ void parallel_for(const T0& D0, const F& func) { for_1d(ithr, nthr, D0, func); }); #elif OV_THREAD == OV_THREAD_OMP +// Please note that this function does not guarantee execution on the same number of threads from call to call. +// Use the parallel_nt* functions if the procedure depends on a certain number of threads. # pragma omp parallel - for_1d(parallel_get_thread_num(), parallel_get_num_threads(), D0, func); + { for_1d(parallel_get_thread_num(), parallel_get_num_threads(), D0, func); } #elif OV_THREAD == OV_THREAD_SEQ for_1d(0, 1, D0, func); #endif @@ -509,8 +511,10 @@ void parallel_for2d(const T0& D0, const T1& D1, const F& func) { for_2d(ithr, nthr, D0, D1, func); }); #elif OV_THREAD == OV_THREAD_OMP +// Please note that this function does not guarantee execution on the same number of threads from call to call. +// Use the parallel_nt* functions if the procedure depends on a certain number of threads. # pragma omp parallel - for_2d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, func); + { for_2d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, func); } #elif OV_THREAD == OV_THREAD_SEQ for_2d(0, 1, D0, D1, func); #endif @@ -575,8 +579,10 @@ void parallel_for3d(const T0& D0, const T1& D1, const T2& D2, const F& func) { for_3d(ithr, nthr, D0, D1, D2, func); }); #elif OV_THREAD == OV_THREAD_OMP +// Please note that this function does not guarantee execution on the same number of threads from call to call. +// Use the parallel_nt* functions if the procedure depends on a certain number of threads. 
# pragma omp parallel - for_3d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, func); + { for_3d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, func); } #elif OV_THREAD == OV_THREAD_SEQ for_3d(0, 1, D0, D1, D2, func); #endif @@ -645,8 +651,10 @@ void parallel_for4d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, cons for_4d(ithr, nthr, D0, D1, D2, D3, func); }); #elif OV_THREAD == OV_THREAD_OMP +// Please note that this function does not guarantee execution on the same number of threads from call to call. +// Use the parallel_nt* functions if the procedure depends on a certain number of threads. # pragma omp parallel - for_4d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, func); + { for_4d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, func); } #elif OV_THREAD == OV_THREAD_SEQ for_4d(0, 1, D0, D1, D2, D3, func); #endif @@ -703,8 +711,10 @@ void parallel_for5d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, cons for_5d(ithr, nthr, D0, D1, D2, D3, D4, func); }); #elif OV_THREAD == OV_THREAD_OMP +// Please note that this function does not guarantee execution on the same number of threads from call to call. +// Use the parallel_nt* functions if the procedure depends on a certain number of threads. # pragma omp parallel - for_5d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, func); + { for_5d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, func); } #elif OV_THREAD == OV_THREAD_SEQ for_5d(0, 1, D0, D1, D2, D3, D4, func); #endif @@ -763,8 +773,10 @@ void parallel_for6d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, cons for_6d(ithr, nthr, D0, D1, D2, D3, D4, D5, func); }); #elif OV_THREAD == OV_THREAD_OMP +// Please note that this function does not guarantee execution on the same number of threads from call to call. +// Use the parallel_nt* functions if the procedure depends on a certain number of threads. 
# pragma omp parallel - for_6d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, D5, func); + { for_6d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, D5, func); } #elif OV_THREAD == OV_THREAD_SEQ for_6d(0, 1, D0, D1, D2, D3, D4, D5, func); #endif diff --git a/src/plugins/intel_cpu/src/nodes/interpolate.cpp b/src/plugins/intel_cpu/src/nodes/interpolate.cpp index ee6afa33827861..7eed5c1df9789b 100644 --- a/src/plugins/intel_cpu/src/nodes/interpolate.cpp +++ b/src/plugins/intel_cpu/src/nodes/interpolate.cpp @@ -2828,24 +2828,27 @@ void Interpolate::InterpolateJitExecutor::pillowCGathered(const uint8_t *in_ptr_ bool xPass = IW != OW; bool yPass = IH != OH; - parallel_for(B, [&](size_t b) { + auto b_loop = [&](size_t b) { auto arg = jit_interpolate_call_args(); arg.src_ptr[0] = in_ptr_ + (IW * IH * C * b) * srcDataSize; if (xPass && yPass) { - size_t threadsNum = parallel_get_num_threads(); - size_t parallelNum = B; + size_t parallel_num = B; // IH * OW * C buf needed size_t buffer_size = static_cast(OW * IH * C); - if (parallelNum < threadsNum) { + if (parallel_num < m_threads_num) { arg.src_ptr[1] = static_cast(&pillow_working_buf[b * buffer_size * srcDataSize]); } else { - size_t threadsIdx = parallel_get_thread_num(); - arg.src_ptr[1] = static_cast(&pillow_working_buf[threadsIdx * buffer_size * srcDataSize]); + size_t threads_idx = parallel_get_thread_num(); + arg.src_ptr[1] = static_cast(&pillow_working_buf[threads_idx * buffer_size * srcDataSize]); } } arg.dst = out_ptr_ + (OW * OH * C * b) * dstDataSize; arg.weight_ptr[0] = reinterpret_cast(&auxTable[2]); (*interpolateKernel)(&arg); + }; + + parallel_nt_static(m_threads_num, [&](const int ithr, const int nthr) { + for_1d(ithr, nthr, B, b_loop); }); } @@ -3706,16 +3709,15 @@ void Interpolate::InterpolateRefExecutor::pillowRef(const uint8_t *in_ptr_, uint // | | // | | // ---- - parallel_for2d(B, C, [&](size_t b, size_t c) { + auto bc_loop = [&](size_t b, size_t c) { const uint8_t *in_ptr_nc = in_ptr_ + (IW * IH * C * b + IW * IH * c) * srcDataSize; uint8_t *out_ptr_nc = out_ptr_ + (OW * OH * C * b + OW * OH * c) * dstDataSize; uint8_t *xpass_out_ptr_nc = nullptr; const uint8_t *ypass_in_ptr_nc = nullptr; if (xPass && yPass) { - size_t threadsNum = parallel_get_num_threads(); - size_t parallelNum = B * C; + size_t parallel_num = B * C; // IH * OW buf needed - if (parallelNum < threadsNum) { + if (parallel_num < m_threads_num) { xpass_out_ptr_nc = static_cast(&pillow_working_buf[(OW * IH * C * b + OW * IH * c) * srcDataSize]); ypass_in_ptr_nc = static_cast(&pillow_working_buf[(OW * IH * C * b + OW * IH * c) * srcDataSize]); } else { @@ -3770,6 +3772,10 @@ void Interpolate::InterpolateRefExecutor::pillowRef(const uint8_t *in_ptr_, uint } } } + }; + + parallel_nt_static(m_threads_num, [&](const int ithr, const int nthr) { + for_2d(ithr, nthr, B, C, bc_loop); }); } @@ -3777,16 +3783,16 @@ void Interpolate::InterpolateExecutorBase::create_pillow_working_buf(Interpolate if (srcDimPad5d[3] == dstDim5d[3] || srcDimPad5d[4] == dstDim5d[4]) return; size_t bufSize = srcDimPad5d[3] * dstDim5d[4] * srcDataSize; // IH * OW - size_t threadsNum = parallel_get_max_threads(); + m_threads_num = parallel_get_max_threads(); if (layout == InterpolateLayoutType::planar) { // B and C execute in parallel, need separate buf - size_t parallelNum = srcDimPad5d[0] * srcDimPad5d[1]; - bufSize *= std::min(threadsNum, parallelNum); + size_t parallel_num = srcDimPad5d[0] * srcDimPad5d[1]; + bufSize *= 
std::min(m_threads_num, parallel_num); } else { bufSize *= srcDimPad5d[1]; // *C // B execute in parallel, need separate buf - size_t parallelNum = srcDimPad5d[0]; - bufSize *= std::min(threadsNum, parallelNum); + size_t parallel_num = srcDimPad5d[0]; + bufSize *= std::min(m_threads_num, parallel_num); } pillow_working_buf.resize(bufSize); } diff --git a/src/plugins/intel_cpu/src/nodes/interpolate.h b/src/plugins/intel_cpu/src/nodes/interpolate.h index 11f0e3104e5085..a43b354aa0306a 100644 --- a/src/plugins/intel_cpu/src/nodes/interpolate.h +++ b/src/plugins/intel_cpu/src/nodes/interpolate.h @@ -148,6 +148,7 @@ class Interpolate : public Node { size_t dataRank; std::vector auxTable; std::vector pillow_working_buf; + size_t m_threads_num = 0lu; }; std::shared_ptr execPtr = nullptr; From f9406ad17c8ae5f6985d8baefd5e726a5c913898 Mon Sep 17 00:00:00 2001 From: Xiuchuan Zhai Date: Thu, 24 Oct 2024 13:25:38 +0800 Subject: [PATCH 003/233] [CPU] fix strided_slice with new axis (#27103) ### Details: - *fix new axis for stridedslice node* - *...* ### Tickets: - *152198* --- .../shape_inference/custom/strided_slice.cpp | 49 +++++++++++-------- .../custom_shape_infer/strided_slice.cpp | 45 ++++++++++++----- 2 files changed, 60 insertions(+), 34 deletions(-) diff --git a/src/plugins/intel_cpu/src/shape_inference/custom/strided_slice.cpp b/src/plugins/intel_cpu/src/shape_inference/custom/strided_slice.cpp index 46ae7e2b959560..bb280a4356074e 100644 --- a/src/plugins/intel_cpu/src/shape_inference/custom/strided_slice.cpp +++ b/src/plugins/intel_cpu/src/shape_inference/custom/strided_slice.cpp @@ -38,29 +38,36 @@ Result StridedSliceShapeInfer::infer( auto endPtr = data_dependency.at(END_ID)->getDataAs(); auto stridePtr = data_dependency.at(STRIDE_ID)->getDataAs(); - for (size_t i = 0, new_idx = 0; i < shapeIn.size(); ++i) { - if (m_new_axis_mask_set.count(i)) { - // deal with new_axis_mask - m_outputShape[new_idx] = 1; - m_outputShape[new_idx+1] = shapeIn[i]; - new_idx+=2; - } else if (!m_shrink_axis_mask_set.count(i)) { - // deal with begin_mask and end_mask - if ((i >= shapeBegin[0]) || (shapeIn[i] == 0)) { - m_outputShape[new_idx] = shapeIn[i]; + const auto begin_size = shapeBegin[0]; + + auto gen_new_sliced_value = [&](size_t cur_idx, size_t in_idx) -> size_t { + if ((cur_idx >= begin_size) || (shapeIn[in_idx] == 0)) { + return shapeIn[in_idx]; + } else { + int32_t begin = 0, end = 0; + if (stridePtr[cur_idx] < 0) { + begin = m_begin_mask_set.count(cur_idx) ? shapeIn[in_idx] : beginPtr[cur_idx]; + end = m_end_mask_set.count(cur_idx) ? (-1 - shapeIn[in_idx]) : endPtr[cur_idx]; } else { - int32_t begin = 0; - int32_t end = 0; - if (stridePtr[i] < 0) { - begin = m_begin_mask_set.count(i) ? shapeIn[i] : beginPtr[i]; - end = m_end_mask_set.count(i) ? (-1 - shapeIn[i]) : endPtr[i]; - } else { - begin = m_begin_mask_set.count(i) ? 0 : beginPtr[i]; - end = m_end_mask_set.count(i) ? shapeIn[i] : endPtr[i]; - } - m_outputShape[new_idx] = ov::op::slice::get_sliced_value(shapeIn[i], begin, end, stridePtr[i]); + begin = m_begin_mask_set.count(cur_idx) ? 0 : beginPtr[cur_idx]; + end = m_end_mask_set.count(cur_idx) ? 
shapeIn[in_idx] : endPtr[cur_idx]; } - new_idx += 1; + return ov::op::slice::get_sliced_value(shapeIn[in_idx], begin, end, stridePtr[cur_idx]); + } + }; + + for (size_t in_idx = 0, out_idx = 0; in_idx < shapeIn.size(); ++in_idx) { + if (m_new_axis_mask_set.count(in_idx)) { + // deal with new_axis_mask + m_outputShape[out_idx] = 1; + out_idx++; + // deal with current axis + m_outputShape[out_idx] = gen_new_sliced_value(out_idx, in_idx); + out_idx++; + } else if (!m_shrink_axis_mask_set.count(in_idx)) { + // deal with begin_mask and end_mask + m_outputShape[out_idx] = gen_new_sliced_value(in_idx, in_idx); + out_idx++; } } return {{m_outputShape}, ShapeInferStatus::success}; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/custom_shape_infer/strided_slice.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/custom_shape_infer/strided_slice.cpp index 48fb31b9adfbb6..8b5ea927927e94 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/custom_shape_infer/strided_slice.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/custom_shape_infer/strided_slice.cpp @@ -19,6 +19,8 @@ using StridedSliceParams = std::tuple>, // data{begin,end,stride} std::vector, // begin_mask std::vector, // end_mask + std::vector, // new_axis_mask + std::vector, // shrink_axis_mask StaticShape // Expected shape >; @@ -35,8 +37,10 @@ class StridedSliceCpuShapeInferenceTest : public unit_test::OpCpuShapeInference std::vector> tmp_data; std::vector tmp_begin_mask; std::vector tmp_end_mask; + std::vector tmp_new_axis_mask; + std::vector tmp_shrink_axis_mask; StaticShape tmp_exp_shape; - std::tie(tmp_input_shapes, tmp_data, tmp_begin_mask, tmp_end_mask, tmp_exp_shape) = obj.param; + std::tie(tmp_input_shapes, tmp_data, tmp_begin_mask, tmp_end_mask, tmp_new_axis_mask, tmp_shrink_axis_mask, tmp_exp_shape) = obj.param; std::ostringstream result; result << "IS" << ov::test::utils::vec2str(tmp_input_shapes) << "_"; result << "begin" << ov::test::utils::vec2str(tmp_data[BEGIN]) << "_"; @@ -44,13 +48,15 @@ class StridedSliceCpuShapeInferenceTest : public unit_test::OpCpuShapeInference result << "stride" << ov::test::utils::vec2str(tmp_data[STRIDE]) << "_"; result << "begin_mask" << ov::test::utils::vec2str(tmp_begin_mask) << "_"; result << "end_mask" << ov::test::utils::vec2str(tmp_end_mask) << "_"; + result << "new_axis_mask" << ov::test::utils::vec2str(tmp_new_axis_mask) << "_"; + result << "shrink_axis_mask" << ov::test::utils::vec2str(tmp_shrink_axis_mask) << "_"; result << "exp_shape(" << tmp_exp_shape << ")"; return result.str(); } protected: void SetUp() override { - std::tie(input_shapes, data, begin_mask, end_mask, exp_shape) = GetParam(); + std::tie(input_shapes, data, begin_mask, end_mask, new_axis_mask, shrink_axis_mask, exp_shape) = GetParam(); output_shapes = unit_test::ShapeVector(0); output_shapes.push_back(exp_shape); ASSERT_EQ(input_shapes.size(), 4); @@ -59,6 +65,8 @@ class StridedSliceCpuShapeInferenceTest : public unit_test::OpCpuShapeInference std::vector> data; std::vector begin_mask; std::vector end_mask; + std::vector new_axis_mask; + std::vector shrink_axis_mask; std::shared_ptr arg; }; @@ -66,9 +74,10 @@ TEST_P(StridedSliceCpuShapeInferenceTest , shape_inference_empty_const_map) { const auto begin = op::v0::Constant::create(element::i32, input_shapes[1].get_shape(), data[BEGIN]); const auto end = op::v0::Constant::create(element::i32, input_shapes[2].get_shape(), data[END]); const auto stride = op::v0::Constant::create(element::i32, 
input_shapes[3].get_shape(), data[STRIDE]); - const auto op = make_op(arg, begin, end, stride, begin_mask, end_mask); + const auto op = make_op(arg, begin, end, stride, begin_mask, end_mask, new_axis_mask, shrink_axis_mask); // implementation depends on some output information of the op - op->set_output_type(0, element::i32, {-1, -1, -1}); + auto output_axis = output_shapes[0].to_shape().size(); + op->set_output_type(0, element::i32, std::vector(output_axis, -1)); unit_test::cpu_test_shape_infer(op.get(), input_shapes, output_shapes); } @@ -76,14 +85,15 @@ TEST_P(StridedSliceCpuShapeInferenceTest , shape_inference_in_const_map) { const auto begin = std::make_shared(element::i32, input_shapes[1].get_shape()); const auto end = std::make_shared(element::i32, input_shapes[2].get_shape()); const auto stride = std::make_shared(element::i32, input_shapes[3].get_shape()); - const auto op = make_op(arg, begin, end, stride, begin_mask, end_mask); + const auto op = make_op(arg, begin, end, stride, begin_mask, end_mask, new_axis_mask, shrink_axis_mask); const auto begin_tensor = ov::Tensor(element::i32, input_shapes[1].get_shape(), data[BEGIN].data()); const auto end_tensor = ov::Tensor(element::i32, input_shapes[2].get_shape(), data[END].data()); const auto stride_tensor = ov::Tensor(element::i32, input_shapes[3].get_shape(), data[STRIDE].data()); const std::unordered_map constant_data = {{1, begin_tensor}, {2, end_tensor}, {3, stride_tensor}}; // implementation depends on some output information of the op - op->set_output_type(0, element::i32, {-1, -1, -1}); + auto output_axis = output_shapes[0].to_shape().size(); + op->set_output_type(0, element::i32, std::vector(output_axis, -1)); unit_test::cpu_test_shape_infer(op.get(), input_shapes, output_shapes, constant_data); } @@ -91,19 +101,28 @@ INSTANTIATE_TEST_SUITE_P( CpuShapeInfer, StridedSliceCpuShapeInferenceTest, Values(make_tuple(unit_test::ShapeVector{{3, 4, 5}, {3}, {3}, {3}}, std::vector>{{100, 100, 100}, {-100, -100, -100}, {-1, -1, -1}}, - std::vector(4, 0), std::vector(4, 0), StaticShape({3, 4, 5})), + std::vector(4, 0), std::vector(4, 0), std::vector(4, 0), std::vector(4, 0), StaticShape({3, 4, 5})), make_tuple(unit_test::ShapeVector{{3, 2, 3}, {3}, {3}, {3}}, std::vector>{{1, 0, 0}, {2, 1, 3}, {1, 1, 1}}, - std::vector(4, 0), std::vector(4, 0), StaticShape({1, 1, 3})), + std::vector(4, 0), std::vector(4, 0), std::vector(4, 0), std::vector(4, 0), StaticShape({1, 1, 3})), make_tuple(unit_test::ShapeVector{{3, 2, 3}, {3}, {3}, {3}}, std::vector>{{1, 0, 0}, {2, 2, 3}, {1, 1, 1}}, - std::vector(4, 0), std::vector(4, 0), StaticShape({1, 2, 3})), + std::vector(4, 0), std::vector(4, 0), std::vector(4, 0), std::vector(4, 0), StaticShape({1, 2, 3})), make_tuple(unit_test::ShapeVector{{3, 2, 3}, {3}, {3}, {3}}, std::vector>{{2, 0, 0}, {3, 2, 3}, {1, 1, 2}}, - std::vector(4, 0), std::vector(4, 0), StaticShape({1, 2, 2})), + std::vector(4, 0), std::vector(4, 0), std::vector(4, 0), std::vector(4, 0), StaticShape({1, 2, 2})), make_tuple(unit_test::ShapeVector{{3, 2, 3}, {3}, {3}, {3}}, std::vector>{{1, 0, 0}, {0, 0, 0}, {1, 1, 1}}, - std::vector{0, 1, 1}, std::vector(3, 1), StaticShape({2, 2, 3})), + std::vector{0, 1, 1}, std::vector(3, 1), std::vector(3, 0), std::vector(3, 0), + StaticShape({2, 2, 3})), make_tuple(unit_test::ShapeVector{{3, 2, 3}, {3}, {3}, {3}}, std::vector>{{0, 1, 0}, {2, 0, 0}, {1, 1, 2}}, - std::vector{1, 0, 1}, std::vector{0, 1, 1}, StaticShape({2, 1, 2})), + std::vector{1, 0, 1}, std::vector{0, 1, 1}, std::vector(3, 0), 
std::vector(3, 0), + StaticShape({2, 1, 2})), make_tuple(unit_test::ShapeVector{{3, 2, 3}, {3}, {3}, {3}}, std::vector>{{0, 0, 0}, {1, 0, 0}, {1, 1, -1}}, - std::vector{0, 1, 1}, std::vector{0, 1, 1}, StaticShape({1, 2, 3}))), + std::vector{0, 1, 1}, std::vector{0, 1, 1}, std::vector(3, 0), std::vector(3, 0), + StaticShape({1, 2, 3})), + make_tuple(unit_test::ShapeVector{{2000, 128}, {3}, {3}, {3}}, std::vector>{{0, 0, 0}, {0, 128, 0}, {1, 1, 1}}, + std::vector{0, 1, 1}, std::vector{0, 0, 1}, std::vector{1, 0, 0}, std::vector(3, 0), + StaticShape({1, 128, 128})), + make_tuple(unit_test::ShapeVector{{1, 2, 3}, {2}, {2}, {2}}, std::vector>{{0, 0}, {0, 0}, {1, 1}}, + std::vector{1, 0}, std::vector{1, 0}, std::vector{0, 1}, std::vector{0, 0}, + StaticShape({1, 1, 2, 3}))), StridedSliceCpuShapeInferenceTest::getTestCaseName); } // namespace cpu_shape_infer From c202882011202b4731b0bb7254c18cececc071ba Mon Sep 17 00:00:00 2001 From: Roman Lyamin Date: Thu, 24 Oct 2024 10:49:35 +0400 Subject: [PATCH 004/233] [GPU] Enable fusing dynamic eltwise with gemm in onednn case (#27193) --- .../src/graph/graph_optimizer/prepare_primitive_fusing.cpp | 3 ++- src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index c38fa70e86ccef..39cbc1aa89b4e2 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -1071,7 +1071,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { auto eltw_in_size = peer_node->get_output_layout(); if (eltw_in_size.is_dynamic() // this whitelist condition is temporarily and to be relaxed soon. - && !fused_node->is_type()) + && !fused_node->is_type() + && !fused_node->is_type()) return; } if (parent1.first->is_type() && !conv_supports_fusings(parent1.first->as())) diff --git a/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp index 631b6879acdf2c..baed5400181130 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp @@ -419,9 +419,6 @@ class gemm_2in_dynamic_add : public gemm_2in_add {}; TEST_P(gemm_2in_dynamic_add, add) { auto p = GetParam(); - if (engine.get_device_info().supports_immad) - p.expected_fused_primitives++; - cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(true)); cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(true)); From dd55426734e73369a890fbce0ac67a3d94e26c97 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 24 Oct 2024 07:01:17 +0000 Subject: [PATCH 005/233] Bump actions/checkout from 4.1.7 to 4.2.2 (#27214) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [actions/checkout](https://github.com/actions/checkout) from 4.1.7 to 4.2.2.
Release notes (sourced from actions/checkout's releases):
- v4.2.2: https://github.com/actions/checkout/compare/v4.2.1...v4.2.2
- v4.2.1: https://github.com/actions/checkout/compare/v4.2.0...v4.2.1
- v4.2.0: https://github.com/actions/checkout/compare/v4.1.7...v4.2.0
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/android_arm64.yml | 8 ++++---- .github/workflows/android_x64.yml | 8 ++++---- .github/workflows/build_doc.yml | 2 +- .github/workflows/check_pr_commits.yml | 2 +- .github/workflows/cleanup_caches.yml | 4 ++-- .github/workflows/code_snippets.yml | 2 +- .github/workflows/code_style.yml | 6 +++--- .github/workflows/coverage.yml | 2 +- .github/workflows/coverity.yml | 4 ++-- .github/workflows/debian_10_arm.yml | 4 ++-- .github/workflows/dependency_review.yml | 2 +- .github/workflows/dev_cpu_linux_snippets_libxsmm.yml | 8 ++++---- .github/workflows/fedora_29.yml | 4 ++-- .github/workflows/files_size.yml | 2 +- .github/workflows/job_build_linux.yml | 4 ++-- .github/workflows/job_build_windows.yml | 4 ++-- .github/workflows/job_cpu_functional_tests.yml | 2 +- .github/workflows/job_jax_models_tests.yml | 2 +- .github/workflows/job_onnx_runtime.yml | 4 ++-- .github/workflows/job_openvino_js.yml | 2 +- .github/workflows/job_python_unit_tests.yml | 4 ++-- .github/workflows/job_pytorch_layer_tests.yml | 2 +- .github/workflows/job_pytorch_models_tests.yml | 2 +- .github/workflows/job_samples_tests.yml | 2 +- .github/workflows/job_tensorflow_layer_tests.yml | 2 +- .github/workflows/job_tensorflow_models_tests.yml | 2 +- .github/workflows/job_tokenizers.yml | 4 ++-- .github/workflows/labeler.yml | 2 +- .github/workflows/linux_arm64.yml | 4 ++-- .github/workflows/linux_conditional_compilation.yml | 12 ++++++------ .github/workflows/linux_riscv.yml | 6 +++--- .github/workflows/linux_sanitizers.yml | 10 +++++----- .github/workflows/mac.yml | 6 +++--- .github/workflows/mac_arm64.yml | 6 +++--- .github/workflows/mo.yml | 2 +- .github/workflows/ovc.yml | 2 +- .github/workflows/py_checks.yml | 2 +- .../workflows/send_workflows_to_opentelemetry.yml | 2 +- .github/workflows/ubuntu_20.yml | 4 ++-- .github/workflows/ubuntu_22.yml | 8 ++++---- .github/workflows/ubuntu_22_dpcpp.yml | 4 ++-- .github/workflows/ubuntu_24.yml | 4 ++-- .github/workflows/webassembly.yml | 6 +++--- .../workflows/windows_conditional_compilation.yml | 12 ++++++------ .github/workflows/windows_vs2019_debug.yml | 2 +- .github/workflows/windows_vs2019_release.yml | 10 +++++----- .github/workflows/workflow_rerunner.yml | 4 ++-- 47 files changed, 101 insertions(+), 101 deletions(-) diff --git a/.github/workflows/android_arm64.yml b/.github/workflows/android_arm64.yml index 15094a84ee8f5f..fca16f2848f7bb 100644 --- a/.github/workflows/android_arm64.yml +++ b/.github/workflows/android_arm64.yml @@ -24,7 +24,7 @@ jobs: skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -53,7 +53,7 @@ jobs: images: "${{ steps.handle_docker.outputs.images }}" steps: - name: Checkout - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: ./.github/actions/handle_docker id: handle_docker @@ -98,7 +98,7 @@ jobs: SCCACHE_AZURE_KEY_PREFIX: android_arm64 steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: 'openvino' @@ -116,7 +116,7 @@ jobs: popd - name: Clone 
vcpkg - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: 'microsoft/vcpkg' ref: ${{ env.VCPKG_VERSION }} diff --git a/.github/workflows/android_x64.yml b/.github/workflows/android_x64.yml index cebaa9177b69b9..a667a07da5bd3e 100644 --- a/.github/workflows/android_x64.yml +++ b/.github/workflows/android_x64.yml @@ -27,7 +27,7 @@ jobs: skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -56,7 +56,7 @@ jobs: images: "${{ steps.handle_docker.outputs.images }}" steps: - name: Checkout - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: ./.github/actions/handle_docker id: handle_docker @@ -97,13 +97,13 @@ jobs: SCCACHE_AZURE_KEY_PREFIX: android_x64 steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: 'openvino' submodules: 'true' - name: Clone OpenVINO GenAI - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: 'openvinotoolkit/openvino.genai' path: ${{ env.OPENVINO_GENAI_REPO }} diff --git a/.github/workflows/build_doc.yml b/.github/workflows/build_doc.yml index 535be1e4e70457..8bf61839116bbd 100644 --- a/.github/workflows/build_doc.yml +++ b/.github/workflows/build_doc.yml @@ -18,7 +18,7 @@ jobs: if: ${{ github.repository_owner == 'openvinotoolkit' }} steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: 'true' lfs: 'true' diff --git a/.github/workflows/check_pr_commits.yml b/.github/workflows/check_pr_commits.yml index 5710736f322652..f7f66be299876c 100644 --- a/.github/workflows/check_pr_commits.yml +++ b/.github/workflows/check_pr_commits.yml @@ -9,7 +9,7 @@ jobs: if: ${{ github.repository_owner == 'openvinotoolkit' }} steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Install dependencies run: python3 -m pip install -r ./.github/github_org_control/requirements.txt diff --git a/.github/workflows/cleanup_caches.yml b/.github/workflows/cleanup_caches.yml index 53a426bfa32b42..3fc69b21374093 100644 --- a/.github/workflows/cleanup_caches.yml +++ b/.github/workflows/cleanup_caches.yml @@ -48,7 +48,7 @@ jobs: steps: - name: Checkout cach action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/cache @@ -70,7 +70,7 @@ jobs: steps: - name: Checkout cach action - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/cache diff --git a/.github/workflows/code_snippets.yml b/.github/workflows/code_snippets.yml index 5384ece83fb207..9337fdff4b2905 100644 --- 
a/.github/workflows/code_snippets.yml +++ b/.github/workflows/code_snippets.yml @@ -28,7 +28,7 @@ jobs: if: ${{ github.repository_owner == 'openvinotoolkit' }} steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: 'true' diff --git a/.github/workflows/code_style.yml b/.github/workflows/code_style.yml index 777ba7694d0be5..a70d2641cb57f3 100644 --- a/.github/workflows/code_style.yml +++ b/.github/workflows/code_style.yml @@ -14,7 +14,7 @@ jobs: permissions: pull-requests: write steps: - - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: 'true' @@ -44,7 +44,7 @@ jobs: permissions: pull-requests: write steps: - - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: 'true' @@ -76,7 +76,7 @@ jobs: runs-on: ubuntu-22.04 if: ${{ github.repository_owner == 'openvinotoolkit' }} steps: - - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: 'true' diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 3aecbe2367da05..fdb41226f4efb8 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -32,7 +32,7 @@ jobs: max-size: 50G - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: 'true' diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index 8a2338554faae3..d87de4257e0270 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -51,14 +51,14 @@ jobs: apt-get install --assume-yes --no-install-recommends git ca-certificates - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: ${{ env.OPENVINO_REPO }} submodules: 'true' ref: ${{ inputs.openvinoRef }} - name: Clone OpenVINO Contrib - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: 'openvinotoolkit/openvino_contrib' path: ${{ env.OPENVINO_CONTRIB_REPO }} diff --git a/.github/workflows/debian_10_arm.yml b/.github/workflows/debian_10_arm.yml index f4db0a83a5a39f..73426222253adb 100644 --- a/.github/workflows/debian_10_arm.yml +++ b/.github/workflows/debian_10_arm.yml @@ -24,7 +24,7 @@ jobs: skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -58,7 +58,7 @@ jobs: images: "${{ steps.handle_docker.outputs.images }}" steps: - name: Checkout - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: ./.github/actions/handle_docker id: handle_docker diff --git a/.github/workflows/dependency_review.yml b/.github/workflows/dependency_review.yml index 
3dcd9a367b018c..59a1eaa6e1c26f 100644 --- a/.github/workflows/dependency_review.yml +++ b/.github/workflows/dependency_review.yml @@ -9,7 +9,7 @@ jobs: if: ${{ github.repository_owner == 'openvinotoolkit' }} steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Dependency Review uses: actions/dependency-review-action@72eb03d02c7872a771aacd928f3123ac62ad6d3a # v4.3.3 diff --git a/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml b/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml index 26e8400c22a04f..2f6d646f8e271d 100644 --- a/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml +++ b/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml @@ -32,7 +32,7 @@ jobs: skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -65,7 +65,7 @@ jobs: images: "${{ steps.handle_docker.outputs.images }}" steps: - name: Checkout - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: ./.github/actions/handle_docker id: handle_docker @@ -109,7 +109,7 @@ jobs: steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: ${{ env.OPENVINO_REPO }} submodules: 'true' @@ -295,7 +295,7 @@ jobs: popd - name: Fetch setup_python action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/fedora_29.yml b/.github/workflows/fedora_29.yml index 0ec0c409d12e0b..3f685502747a19 100644 --- a/.github/workflows/fedora_29.yml +++ b/.github/workflows/fedora_29.yml @@ -24,7 +24,7 @@ jobs: skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -58,7 +58,7 @@ jobs: images: "${{ steps.handle_docker.outputs.images }}" steps: - name: Checkout - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: ./.github/actions/handle_docker id: handle_docker diff --git a/.github/workflows/files_size.yml b/.github/workflows/files_size.yml index 3733ad48ca49d2..2768e731b6578b 100644 --- a/.github/workflows/files_size.yml +++ b/.github/workflows/files_size.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-22.04 if: ${{ github.repository_owner == 'openvinotoolkit' }} steps: - - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: git ls-tree run: git ls-tree -r -t -l --full-name HEAD | sort -n -r -k 4 diff --git a/.github/workflows/job_build_linux.yml b/.github/workflows/job_build_linux.yml index 86545b6e9e7a43..3e5fa88f867ec8 100644 --- a/.github/workflows/job_build_linux.yml +++ b/.github/workflows/job_build_linux.yml @@ -91,7 
+91,7 @@ jobs: PRODUCT_TYPE: public_linux_${{ inputs.os }}_${{ inputs.arch }}_release steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: ${{ env.OPENVINO_REPO }} submodules: 'true' @@ -106,7 +106,7 @@ jobs: git rev-parse HEAD - name: Clone OpenVINO Contrib - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: 'openvinotoolkit/openvino_contrib' path: ${{ env.OPENVINO_CONTRIB_REPO }} diff --git a/.github/workflows/job_build_windows.yml b/.github/workflows/job_build_windows.yml index df2544d9d9e60a..de7f662bed1dc5 100644 --- a/.github/workflows/job_build_windows.yml +++ b/.github/workflows/job_build_windows.yml @@ -51,13 +51,13 @@ jobs: PRODUCT_TYPE: 'public_windows_vs2019_${{ inputs.build-type }}' steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: 'openvino' submodules: 'true' - name: Clone OpenVINO Contrib - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: 'openvinotoolkit/openvino_contrib' path: 'openvino_contrib' diff --git a/.github/workflows/job_cpu_functional_tests.yml b/.github/workflows/job_cpu_functional_tests.yml index e197d581f290a4..9e8dde29f7701d 100644 --- a/.github/workflows/job_cpu_functional_tests.yml +++ b/.github/workflows/job_cpu_functional_tests.yml @@ -71,7 +71,7 @@ jobs: popd - name: Fetch setup_python action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/job_jax_models_tests.yml b/.github/workflows/job_jax_models_tests.yml index ea2669071386dd..43fa8f2a7f1740 100644 --- a/.github/workflows/job_jax_models_tests.yml +++ b/.github/workflows/job_jax_models_tests.yml @@ -64,7 +64,7 @@ jobs: working-directory: ${{ env.INSTALL_DIR }} - name: Fetch setup_python action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/job_onnx_runtime.yml b/.github/workflows/job_onnx_runtime.yml index 0ceb080d82184d..966d258a2fc609 100644 --- a/.github/workflows/job_onnx_runtime.yml +++ b/.github/workflows/job_onnx_runtime.yml @@ -63,7 +63,7 @@ jobs: popd - name: Fetch ONNX runtime version and skip tests list - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | src/frontends/onnx/tests/ci_utils/onnxruntime @@ -77,7 +77,7 @@ jobs: working-directory: ${{ env.ONNX_RUNTIME_UTILS }} - name: Clone ONNX Runtime - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: 'microsoft/onnxruntime' path: ${{ env.ONNX_RUNTIME_REPO }} diff --git a/.github/workflows/job_openvino_js.yml b/.github/workflows/job_openvino_js.yml index e722af78832c12..6097d3e6f18bc4 100644 --- 
a/.github/workflows/job_openvino_js.yml +++ b/.github/workflows/job_openvino_js.yml @@ -32,7 +32,7 @@ jobs: DISPLAY: ':99' steps: - name: Fetch OpenVINO JS sources - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | src/bindings/js diff --git a/.github/workflows/job_python_unit_tests.yml b/.github/workflows/job_python_unit_tests.yml index 1fafafd7623545..1dd6ebbfc204d8 100644 --- a/.github/workflows/job_python_unit_tests.yml +++ b/.github/workflows/job_python_unit_tests.yml @@ -76,7 +76,7 @@ jobs: working-directory: ${{ env.INSTALL_DIR }} - name: Fetch setup_python action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml @@ -248,7 +248,7 @@ jobs: - name: Clone API snippets if: runner.os != 'macOS' - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: docs/articles_en/assets/snippets path: ${{ env.OPENVINO_REPO }} diff --git a/.github/workflows/job_pytorch_layer_tests.yml b/.github/workflows/job_pytorch_layer_tests.yml index c6cd97422f2b95..4ea6d33336fc87 100644 --- a/.github/workflows/job_pytorch_layer_tests.yml +++ b/.github/workflows/job_pytorch_layer_tests.yml @@ -83,7 +83,7 @@ jobs: working-directory: ${{ env.INSTALL_DIR }} - name: Fetch setup_python action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/job_pytorch_models_tests.yml b/.github/workflows/job_pytorch_models_tests.yml index 22a09dffba779f..dd15ae183d692c 100644 --- a/.github/workflows/job_pytorch_models_tests.yml +++ b/.github/workflows/job_pytorch_models_tests.yml @@ -77,7 +77,7 @@ jobs: working-directory: ${{ env.INSTALL_DIR }} - name: Fetch setup_python action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/job_samples_tests.yml b/.github/workflows/job_samples_tests.yml index 7cde4e6fd18eae..e144aa0cfb95aa 100644 --- a/.github/workflows/job_samples_tests.yml +++ b/.github/workflows/job_samples_tests.yml @@ -68,7 +68,7 @@ jobs: - name: Fetch setup_python action # Python is already installed on Ubuntu within Dockerfile if: runner.os != 'Linux' - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/job_tensorflow_layer_tests.yml b/.github/workflows/job_tensorflow_layer_tests.yml index 977b2e4f96af73..26730f9b55df7a 100644 --- a/.github/workflows/job_tensorflow_layer_tests.yml +++ b/.github/workflows/job_tensorflow_layer_tests.yml @@ -83,7 +83,7 @@ jobs: working-directory: ${{ env.INSTALL_DIR }} - name: Fetch setup_python action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml diff 
--git a/.github/workflows/job_tensorflow_models_tests.yml b/.github/workflows/job_tensorflow_models_tests.yml index 0990eae3de6e7e..5321beb8703de1 100644 --- a/.github/workflows/job_tensorflow_models_tests.yml +++ b/.github/workflows/job_tensorflow_models_tests.yml @@ -69,7 +69,7 @@ jobs: working-directory: ${{ env.INSTALL_DIR }} - name: Fetch setup_python action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/job_tokenizers.yml b/.github/workflows/job_tokenizers.yml index f7388eb98a2f3c..14243bda13531a 100644 --- a/.github/workflows/job_tokenizers.yml +++ b/.github/workflows/job_tokenizers.yml @@ -52,7 +52,7 @@ jobs: echo "EXTENSION_BUILD_DIR=$GITHUB_WORKSPACE/build" >> "$GITHUB_ENV" - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python @@ -72,7 +72,7 @@ jobs: self-hosted-runner: ${{ runner.os == 'Linux' }} - name: Clone OpenVINO Tokenizers - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: 'openvinotoolkit/openvino_tokenizers' path: ${{ env.OPENVINO_TOKENIZERS_REPO }} diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 4c631b673f8cc2..00f3a321e0dd1f 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -26,7 +26,7 @@ jobs: if: ${{ github.repository_owner == 'openvinotoolkit' }} steps: - name: Checkout Labeller Script - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: '.github' diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml index 0345d5259e8182..67cd4d0d1a5d84 100644 --- a/.github/workflows/linux_arm64.yml +++ b/.github/workflows/linux_arm64.yml @@ -28,7 +28,7 @@ jobs: skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -62,7 +62,7 @@ jobs: images: "${{ steps.handle_docker.outputs.images }}" steps: - name: Checkout - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: ./.github/actions/handle_docker id: handle_docker diff --git a/.github/workflows/linux_conditional_compilation.yml b/.github/workflows/linux_conditional_compilation.yml index 42d7810b9f1663..acb7e1271d7a34 100644 --- a/.github/workflows/linux_conditional_compilation.yml +++ b/.github/workflows/linux_conditional_compilation.yml @@ -29,7 +29,7 @@ jobs: skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -63,7 +63,7 @@ jobs: images: "${{ steps.handle_docker.outputs.images }}" steps: - name: Checkout - uses: 
actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: ./.github/actions/handle_docker id: handle_docker @@ -109,13 +109,13 @@ jobs: steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: ${{ env.OPENVINO_REPO }} submodules: 'true' - name: Clone test models - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: 'openvinotoolkit/testdata' path: ${{ env.MODELS_PATH }} @@ -282,13 +282,13 @@ jobs: steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: ${{ env.OPENVINO_REPO }} submodules: 'true' - name: Clone test models - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: 'openvinotoolkit/testdata' path: ${{ env.MODELS_PATH }} diff --git a/.github/workflows/linux_riscv.yml b/.github/workflows/linux_riscv.yml index f67b011b0caed5..c450a5d30768e4 100644 --- a/.github/workflows/linux_riscv.yml +++ b/.github/workflows/linux_riscv.yml @@ -28,7 +28,7 @@ jobs: target_branch: ${{ steps.set_target_branch.outputs.target_branch }} steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -63,7 +63,7 @@ jobs: images: "${{ steps.handle_docker.outputs.images }}" steps: - name: Checkout - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: ./.github/actions/handle_docker id: handle_docker @@ -102,7 +102,7 @@ jobs: if: ${{ github.event_name != 'merge_group' }} steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: 'openvino' diff --git a/.github/workflows/linux_sanitizers.yml b/.github/workflows/linux_sanitizers.yml index e098b637150834..4bb597d83fadc8 100644 --- a/.github/workflows/linux_sanitizers.yml +++ b/.github/workflows/linux_sanitizers.yml @@ -24,7 +24,7 @@ jobs: skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -52,7 +52,7 @@ jobs: images: "${{ steps.handle_docker.outputs.images }}" steps: - name: Checkout - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: ./.github/actions/handle_docker id: handle_docker @@ -107,13 +107,13 @@ jobs: steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: ${{ env.OPENVINO_REPO }} submodules: 'true' - name: Clone OpenVINO Contrib - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # 
v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: 'openvinotoolkit/openvino_contrib' path: ${{ env.OPENVINO_CONTRIB_REPO }} @@ -280,7 +280,7 @@ jobs: popd - name: Fetch Sanitizer Suppression Lists - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | tests/sanitizers/lsan/suppressions.txt diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index bddbaab134fa9c..7c47c1c635c2f8 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -42,7 +42,7 @@ jobs: skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -82,13 +82,13 @@ jobs: if: "!needs.smart_ci.outputs.skip_workflow" steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: 'openvino' submodules: 'true' - name: Clone OpenVINO Contrib - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: 'openvinotoolkit/openvino_contrib' path: 'openvino_contrib' diff --git a/.github/workflows/mac_arm64.yml b/.github/workflows/mac_arm64.yml index 576eefde8c9b4a..81cd229d1dd9f6 100644 --- a/.github/workflows/mac_arm64.yml +++ b/.github/workflows/mac_arm64.yml @@ -42,7 +42,7 @@ jobs: skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -82,13 +82,13 @@ jobs: if: "!needs.smart_ci.outputs.skip_workflow" steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: 'openvino' submodules: 'true' - name: Clone OpenVINO Contrib - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: 'openvinotoolkit/openvino_contrib' path: 'openvino_contrib' diff --git a/.github/workflows/mo.yml b/.github/workflows/mo.yml index 151227f111c9e0..0f2fb5fd57752a 100644 --- a/.github/workflows/mo.yml +++ b/.github/workflows/mo.yml @@ -24,7 +24,7 @@ jobs: if: ${{ github.repository_owner == 'openvinotoolkit' }} steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Setup Python uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 diff --git a/.github/workflows/ovc.yml b/.github/workflows/ovc.yml index ee5f3e58e363e6..2745b2f232406c 100644 --- a/.github/workflows/ovc.yml +++ b/.github/workflows/ovc.yml @@ -19,7 +19,7 @@ jobs: if: ${{ github.repository_owner == 'openvinotoolkit' }} steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Setup Python 
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 diff --git a/.github/workflows/py_checks.yml b/.github/workflows/py_checks.yml index 2b0d3f2272787f..13ddbaaa1ec41c 100644 --- a/.github/workflows/py_checks.yml +++ b/.github/workflows/py_checks.yml @@ -28,7 +28,7 @@ jobs: if: ${{ github.repository_owner == 'openvinotoolkit' }} steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Setup Python uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 diff --git a/.github/workflows/send_workflows_to_opentelemetry.yml b/.github/workflows/send_workflows_to_opentelemetry.yml index 687b79d2606bec..ba38d6a9f90fed 100644 --- a/.github/workflows/send_workflows_to_opentelemetry.yml +++ b/.github/workflows/send_workflows_to_opentelemetry.yml @@ -41,7 +41,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: '.github' diff --git a/.github/workflows/ubuntu_20.yml b/.github/workflows/ubuntu_20.yml index 6056c20945c801..63a1fab87d566f 100644 --- a/.github/workflows/ubuntu_20.yml +++ b/.github/workflows/ubuntu_20.yml @@ -30,7 +30,7 @@ jobs: skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -64,7 +64,7 @@ jobs: images: "${{ steps.handle_docker.outputs.images }}" steps: - name: Checkout - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: ./.github/actions/handle_docker id: handle_docker diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml index 5e5ac3c3482624..753708d9b3ba51 100644 --- a/.github/workflows/ubuntu_22.yml +++ b/.github/workflows/ubuntu_22.yml @@ -32,7 +32,7 @@ jobs: skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -66,7 +66,7 @@ jobs: images: "${{ steps.handle_docker.outputs.images }}" steps: - name: Checkout - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: ./.github/actions/handle_docker id: handle_docker @@ -183,7 +183,7 @@ jobs: popd - name: Fetch setup_python action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml @@ -459,7 +459,7 @@ jobs: popd - name: Clone OpenVINO Contrib - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: 'openvinotoolkit/openvino_contrib' path: ${{ env.OPENVINO_CONTRIB_REPO }} diff --git a/.github/workflows/ubuntu_22_dpcpp.yml b/.github/workflows/ubuntu_22_dpcpp.yml index 9ca27262a5dcde..48230155f7e903 100644 --- 
a/.github/workflows/ubuntu_22_dpcpp.yml +++ b/.github/workflows/ubuntu_22_dpcpp.yml @@ -20,7 +20,7 @@ jobs: skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -54,7 +54,7 @@ jobs: images: "${{ steps.handle_docker.outputs.images }}" steps: - name: Checkout - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: ./.github/actions/handle_docker id: handle_docker diff --git a/.github/workflows/ubuntu_24.yml b/.github/workflows/ubuntu_24.yml index bb147450438160..9d9aba6739f22f 100644 --- a/.github/workflows/ubuntu_24.yml +++ b/.github/workflows/ubuntu_24.yml @@ -27,7 +27,7 @@ jobs: skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -61,7 +61,7 @@ jobs: images: "${{ steps.handle_docker.outputs.images }}" steps: - name: Checkout - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: ./.github/actions/handle_docker id: handle_docker diff --git a/.github/workflows/webassembly.yml b/.github/workflows/webassembly.yml index c4d835637352ad..45d6c9ce98317a 100644 --- a/.github/workflows/webassembly.yml +++ b/.github/workflows/webassembly.yml @@ -24,7 +24,7 @@ jobs: skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -58,7 +58,7 @@ jobs: images: "${{ steps.handle_docker.outputs.images }}" steps: - name: Checkout - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: ./.github/actions/handle_docker id: handle_docker @@ -91,7 +91,7 @@ jobs: SCCACHE_AZURE_KEY_PREFIX: webassembly_Release steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: 'openvino' submodules: 'true' diff --git a/.github/workflows/windows_conditional_compilation.yml b/.github/workflows/windows_conditional_compilation.yml index 30b2ce2f20df38..fb53404a736558 100644 --- a/.github/workflows/windows_conditional_compilation.yml +++ b/.github/workflows/windows_conditional_compilation.yml @@ -30,7 +30,7 @@ jobs: skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -73,13 +73,13 @@ jobs: steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: 'openvino' submodules: 'true' - name: Clone test models - uses: 
actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: 'openvinotoolkit/testdata' path: 'testdata' @@ -282,13 +282,13 @@ jobs: steps: - name: Clone OpenVINO - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: 'openvino' submodules: 'true' - name: Clone test models - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: 'openvinotoolkit/testdata' path: 'testdata' @@ -369,7 +369,7 @@ jobs: run: Expand-Archive ${{ env.INSTALL_TEST_DIR }}/openvino_tests.zip -DestinationPath "${{ env.INSTALL_TEST_DIR }}" - name: Fetch setup_python action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/windows_vs2019_debug.yml b/.github/workflows/windows_vs2019_debug.yml index f36c3f33031f7d..125308cf4182c0 100644 --- a/.github/workflows/windows_vs2019_debug.yml +++ b/.github/workflows/windows_vs2019_debug.yml @@ -25,7 +25,7 @@ jobs: target_branch: ${{ steps.set_target_branch.outputs.target_branch }} steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci diff --git a/.github/workflows/windows_vs2019_release.yml b/.github/workflows/windows_vs2019_release.yml index bce90165408815..22a91f5dc69b49 100644 --- a/.github/workflows/windows_vs2019_release.yml +++ b/.github/workflows/windows_vs2019_release.yml @@ -28,7 +28,7 @@ jobs: target_branch: ${{ steps.set_target_branch.outputs.target_branch }} steps: - name: checkout action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: .github/actions/smart-ci @@ -98,7 +98,7 @@ jobs: working-directory: ${{ env.INSTALL_DIR }} - name: Fetch setup_python action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml @@ -165,7 +165,7 @@ jobs: steps: - name: Fetch OpenVINO JS sources - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | src/bindings/js @@ -263,7 +263,7 @@ jobs: working-directory: ${{ env.INSTALL_DIR }} - name: Fetch setup_python action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml @@ -463,7 +463,7 @@ jobs: popd - name: Fetch setup_python action - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/workflow_rerunner.yml b/.github/workflows/workflow_rerunner.yml index 89c39669e67720..55ecc2500635b1 100644 --- 
a/.github/workflows/workflow_rerunner.yml +++ b/.github/workflows/workflow_rerunner.yml @@ -37,7 +37,7 @@ jobs: checks: read steps: - name: Checkout - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: '.github/scripts/workflow_rerun' @@ -72,7 +72,7 @@ jobs: runs-on: aks-linux-2-cores-8gb steps: - name: Checkout - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: '.github/scripts/workflow_rerun' lfs: true From 27780a6efe84b1982d1622c2faf254148708867f Mon Sep 17 00:00:00 2001 From: Roman Lyamin Date: Thu, 24 Oct 2024 11:32:52 +0400 Subject: [PATCH 006/233] Revert "[GPU] Increasing conditions for using fake_aligned shapes (#26450)" (#27211) This reverts commit 03aaa413702e6c107a83faae453b6a1ff1af93fe. ### Tickets: - *[155861](https://jira.devtools.intel.com/browse/CVS-155861)* --- src/plugins/intel_gpu/src/graph/fully_connected.cpp | 5 ----- .../unit/fake_alignment/fc_fake_alignment_test.cpp | 10 +++++----- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/fully_connected.cpp index 78ae0386c7b115..bc1e3e2e82b3ca 100644 --- a/src/plugins/intel_gpu/src/graph/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/graph/fully_connected.cpp @@ -244,11 +244,6 @@ kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_par return std::move(orig_impl_param); } - if (orig_impl_param.dev_type == cldnn::device_type::integrated_gpu && - batch_size <= 91 && input_shape.back() >= 512) { - return std::move(orig_impl_param); - } - size_t fake_align_base = 8; if (orig_impl_param.dev_type == cldnn::device_type::integrated_gpu) { auto weights_layout_dt = orig_impl_param.weights_layout.value().data_type; diff --git a/src/plugins/intel_gpu/tests/unit/fake_alignment/fc_fake_alignment_test.cpp b/src/plugins/intel_gpu/tests/unit/fake_alignment/fc_fake_alignment_test.cpp index 63023fc53184a2..6e3472bc48a80d 100644 --- a/src/plugins/intel_gpu/tests/unit/fake_alignment/fc_fake_alignment_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/fake_alignment/fc_fake_alignment_test.cpp @@ -83,13 +83,13 @@ INSTANTIATE_TEST_SUITE_P(smoke, fully_connected_fake_align_test, layout{ov::PartialShape{0, 1000}, data_types::f16, format::bfyx} // fake_aligned output layout_dgpu }, { - layout{ov::PartialShape{92, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout + layout{ov::PartialShape{11, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx}, // weight layout data_types::f16, - layout{ov::PartialShape{96, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout_igpu - layout{ov::PartialShape{96, 1000}, data_types::f16, format::bfyx}, // fake_aligned output layout_igpu - layout{ov::PartialShape{96, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout_dgpu - layout{ov::PartialShape{96, 1000}, data_types::f16, format::bfyx} // fake_aligned output layout_dgpu + layout{ov::PartialShape{16, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout_igpu + layout{ov::PartialShape{16, 1000}, data_types::f16, format::bfyx}, // fake_aligned output layout_igpu + 
layout{ov::PartialShape{16, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout_dgpu + layout{ov::PartialShape{16, 1000}, data_types::f16, format::bfyx} // fake_aligned output layout_dgpu }, { From 2a9c69d4ac4b44bbf5dab3ef7c7e956bee592a6e Mon Sep 17 00:00:00 2001 From: Zhiyuan Tan <66934674+BHbean@users.noreply.github.com> Date: Thu, 24 Oct 2024 15:57:41 +0800 Subject: [PATCH 007/233] [RISCV64] add nhwc layout support for eltwise executor (#26531) ### Details: - *Add `nhwc` layout support for shl eltwise executor* - *Enable some tests with `nhwc` layout* ### Tickets: - *N/A* --- src/plugins/intel_cpu/src/nodes/eltwise.cpp | 2 + .../src/nodes/executors/shl/shl_eltwise.cpp | 39 +++++++++++-------- .../src/common/concat_conv_sum_inplace.cpp | 2 - .../utils/riscv64/filter_cpu_info.cpp | 9 +++-- 4 files changed, 31 insertions(+), 21 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/eltwise.cpp index ed4d936fa49ae6..5c3a358dff9d38 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/eltwise.cpp @@ -2583,6 +2583,8 @@ void Eltwise::initSupportedPrimitiveDescriptors() { supportedPrimitiveDescriptors.emplace_back(nodeDesc); }; + if (isChannelsFirstApplicable) + addDesc(supportedPrimitiveDescriptors, ChannelsFirst); addDesc(supportedPrimitiveDescriptors, Planar); canUseEltwiseExecPtr = !supportedPrimitiveDescriptors.empty(); diff --git a/src/plugins/intel_cpu/src/nodes/executors/shl/shl_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/shl/shl_eltwise.cpp index 9506fa74505636..54f00ba20538b3 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/shl/shl_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/shl/shl_eltwise.cpp @@ -6,21 +6,11 @@ #include "shl_utils.hpp" #include "csinn/csi_nn.h" #include "utils/debug_capabilities.h" +#include "memory_desc/cpu_blocked_memory_desc.h" namespace ov { namespace intel_cpu { -inline void log_unsupported_prec(const std::vector& srcDescs, - const std::vector& dstDescs, - const Algorithm eltwiseAlgorithm) { - std::string srcPrec; - for (size_t i = 0; i < srcDescs.size(); i++) { - srcPrec += srcDescs[i]->getPrecision().to_string() + " "; - } - DEBUG_LOG(algToString(eltwiseAlgorithm), ": provided combination of src precisions: [", srcPrec, - "] and dst precision: ", dstDescs[0]->getPrecision().to_string(), " is not supported"); -} - bool ShlEltwiseExecutor::isEltwiseAlgorithmSupported(Algorithm algorithm) { if (one_of(algorithm, Algorithm::EltwiseAdd, Algorithm::EltwiseSubtract, @@ -53,6 +43,26 @@ bool ShlEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs, return false; } + // check whether input and output layouts are equal + if(srcDescs.front()->hasLayoutType(LayoutType::nCsp16c) || srcDescs.front()->hasLayoutType(LayoutType::nCsp8c)) { + DEBUG_LOG("ShlEltwise does not support 'nCsp16c' or 'nCsp8c' layouts"); + return false; + } + const auto unifiedLayout = srcDescs.front()->hasLayoutType(LayoutType::ncsp) ? 
LayoutType::ncsp : LayoutType::nspc; + const auto unifiedRank = srcDescs.front()->as()->getBlockDims().size(); + auto has_unified_layout = [unifiedLayout, unifiedRank](const MemoryDescPtr& desc) { + if (desc->hasLayoutType(LayoutType::nspc)) { // ensure the same rank + if (desc->as()->getBlockDims().size() != unifiedRank) + return false; + } + return desc->hasLayoutType(unifiedLayout); + }; + if (!(std::all_of(srcDescs.cbegin(), srcDescs.cend(), has_unified_layout) && + std::all_of(dstDescs.cbegin(), dstDescs.cend(), has_unified_layout))) { + DEBUG_LOG("ShlEltwise needs to ensure all inputs and outputs are in the same 'ncsp' or 'nspc' layouts"); + return false; + } + for (const auto& srcDesc : srcDescs) { csinn_layout_enum supportedLayout = getShlDataLayoutByMemoryDesc(srcDesc); switch (eltwiseAttrs.algorithm) { @@ -93,14 +103,11 @@ bool ShlEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, srcTensors = std::vector(srcDescs.size()); dstTensors = std::vector(dstDescs.size()); - // Allocate Shl session - sess = ShlSession(); - for (size_t i = 0; i < srcDescs.size(); i++) { - srcTensors[i] = ShlTensor(sess, precisionToShlDataType(srcDescs[i]->getPrecision()), getShlDataLayoutByMemoryDesc(srcDescs[i]), srcDescs[i]->getShape().getStaticDims()); + srcTensors[i] = ShlTensor(sess, precisionToShlDataType(srcDescs[i]->getPrecision()), getShlDataLayoutByMemoryDesc(srcDescs[i]), srcDescs[i]->as()->getBlockDims()); } for (size_t i = 0; i < dstDescs.size(); i++) { - dstTensors[i] = ShlTensor(sess, precisionToShlDataType(dstDescs[i]->getPrecision()), getShlDataLayoutByMemoryDesc(dstDescs[i]), dstDescs[i]->getShape().getStaticDims()); + dstTensors[i] = ShlTensor(sess, precisionToShlDataType(dstDescs[i]->getPrecision()), getShlDataLayoutByMemoryDesc(dstDescs[i]), dstDescs[i]->as()->getBlockDims()); } std::function initFunc = nullptr; diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_conv_sum_inplace.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_conv_sum_inplace.cpp index 7000812e6f672e..ffd87f159cc38e 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_conv_sum_inplace.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_conv_sum_inplace.cpp @@ -48,8 +48,6 @@ class ReLuConcatConvSumInPlaceTest : virtual public SubgraphBaseStaticTest { const size_t convOutChannels = 64; #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) const auto targetFormat = with_cpu_x86_avx512_core() ? 
nChw16c : nChw8c; -#elif defined(OV_CPU_WITH_SHL) - const auto targetFormat = nchw; #else const auto targetFormat = nhwc; #endif diff --git a/src/plugins/intel_cpu/tests/functional/utils/riscv64/filter_cpu_info.cpp b/src/plugins/intel_cpu/tests/functional/utils/riscv64/filter_cpu_info.cpp index 72a3d07f2640f4..71360dca2c92e0 100644 --- a/src/plugins/intel_cpu/tests/functional/utils/riscv64/filter_cpu_info.cpp +++ b/src/plugins/intel_cpu/tests/functional/utils/riscv64/filter_cpu_info.cpp @@ -64,9 +64,12 @@ std::vector filterCPUInfoForDeviceWithFP16(const std::vector< } std::vector filterCPUSpecificParams(const std::vector &paramsVector) { - static const std::vector supported_f = {CPUTestUtils::cpu_memory_format_t::ncw, - CPUTestUtils::cpu_memory_format_t::nchw, - CPUTestUtils::cpu_memory_format_t::ncdhw}; + static const std::vector supported_f = {CPUTestUtils::cpu_memory_format_t::nwc, + CPUTestUtils::cpu_memory_format_t::ncw, + CPUTestUtils::cpu_memory_format_t::nchw, + CPUTestUtils::cpu_memory_format_t::nhwc, + CPUTestUtils::cpu_memory_format_t::ndhwc, + CPUTestUtils::cpu_memory_format_t::ncdhw}; std::vector filteredParamsVector = paramsVector; filteredParamsVector.erase(std::remove_if(filteredParamsVector.begin(), filteredParamsVector.end(), From 17f1601760578807d2b03155984c230961620eee Mon Sep 17 00:00:00 2001 From: Tomasz Krupa Date: Thu, 24 Oct 2024 08:38:13 +0000 Subject: [PATCH 008/233] [GPU][TESTS] Fix weightless caching tests (#27163) Currently the tests have weightless caching enabled, but the models tested contain no constants eligible for weightless caching. This fixes the situation. To elaborate - all the models have f32 dtype constants by default. In these particular models all of the constants which would otherwise be eligible for weightless caching undergo some precision transformation which invalidates this eligibility. The PR changes the dtype of the constants to prevent that and deletes the models which contain no constants eligible for weightless caching even after the dtype change.
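For illustration only (the helper name, shape, and values below are invented for this sketch and are not part of the change): a test model whose constant is created directly in f16 avoids the precision transformation described above, so the constant can remain eligible for weightless caching.

```cpp
// Hypothetical sketch: build a tiny model whose constant already has the target
// precision (f16), so no precision conversion is applied to it at compile time
// and it stays eligible for weightless caching.
#include "openvino/core/model.hpp"
#include "openvino/op/concat.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/parameter.hpp"

std::shared_ptr<ov::Model> make_concat_with_f16_constant() {
    auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::Shape{1, 1, 2, 4});
    // The constant is f16 from the start, instead of f32 followed by a conversion.
    auto constant = ov::op::v0::Constant::create(ov::element::f16,
                                                 ov::Shape{1, 1, 2, 4},
                                                 std::vector<float>(8, 1.0f));
    auto concat = std::make_shared<ov::op::v0::Concat>(ov::NodeVector{param, constant}, 1);
    return std::make_shared<ov::Model>(concat->outputs(), ov::ParameterVector{param});
}
```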
--- .../tests/functional/behavior/model_cache.cpp | 101 +----------------- 1 file changed, 3 insertions(+), 98 deletions(-) diff --git a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp index 573d275da84e51..bcb6be2fe307e7 100644 --- a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp +++ b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp @@ -97,113 +97,18 @@ void CheckWeightlessCacheAccuracy::run() { } } -TEST_F(CheckWeightlessCacheAccuracy, 2InputSubtract) { - model = ov::test::utils::make_2_input_subtract(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, ConcatWithParams) { - model = ov::test::utils::make_concat_with_params(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, ConvBias) { - model = ov::test::utils::make_conv_bias(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, ConvPoolRelu) { - model = ov::test::utils::make_conv_pool_relu(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, ConvPoolReluNoReshapes) { - model = ov::test::utils::make_conv_pool_relu_no_reshapes(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, ConvPoolReluNonZero) { - model = ov::test::utils::make_conv_pool_relu_non_zero(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, ConvertTranspose) { - model = ov::test::utils::make_convert_transpose(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, DetectionOutput) { - model = ov::test::utils::make_detection_output(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, KsoFunction) { - model = ov::test::utils::make_kso_function(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, MatmulBias) { - model = ov::test::utils::make_matmul_bias(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, MultiSingleConv) { - model = ov::test::utils::make_multi_single_conv(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, MultipleInputOutputDoubleConcat) { - model = ov::test::utils::make_multiple_input_output_double_concat(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, NestedBranchConvConcat) { - model = ov::test::utils::make_nested_branch_conv_concat(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, NestedSplitConvConcat) { - model = ov::test::utils::make_nested_split_conv_concat(); - run(); -} - TEST_F(CheckWeightlessCacheAccuracy, ReadConcatSplitAssign) { - model = ov::test::utils::make_read_concat_split_assign(); + model = ov::test::utils::make_read_concat_split_assign({1, 1, 2, 4}, ov::element::f16); run(); } TEST_F(CheckWeightlessCacheAccuracy, SingleConcatWithConstant) { - model = ov::test::utils::make_single_concat_with_constant(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, SingleConv) { - model = ov::test::utils::make_single_conv(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, SingleSplit) { - model = ov::test::utils::make_single_split(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, SplitConcat) { - model = ov::test::utils::make_split_concat(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, SplitConvConcat) { - model = ov::test::utils::make_split_conv_concat(); - run(); -} - -TEST_F(CheckWeightlessCacheAccuracy, SplitMultiConvConcat) { - model = ov::test::utils::make_split_multi_conv_concat(); + model = ov::test::utils::make_single_concat_with_constant({1, 1, 2, 4}, ov::element::f16); run(); } TEST_F(CheckWeightlessCacheAccuracy, TiWithLstmCell) { - model = ov::test::utils::make_ti_with_lstm_cell(); + model = ov::test::utils::make_ti_with_lstm_cell(ov::element::f16); run(); } From 
65a8f393ba01db7ae40606d29045518fa9d55c74 Mon Sep 17 00:00:00 2001 From: Attila Csok Date: Thu, 24 Oct 2024 12:52:40 +0300 Subject: [PATCH 009/233] [intel-npu] Implementing LUID property in npu plugin (#27091) ### Details: - Implements ov::device::LUID property in npu_plugin (works on windows only) ### Tickets: - EISW-140889 --- .../src/backend/include/zero_backend.hpp | 1 + .../src/backend/include/zero_device.hpp | 3 +++ .../intel_npu/src/backend/include/zero_init.hpp | 11 +++++++++++ .../intel_npu/src/backend/src/zero_backend.cpp | 8 ++++++-- .../intel_npu/src/backend/src/zero_device.cpp | 16 ++++++++++++++++ .../intel_npu/src/backend/src/zero_init.cpp | 6 ++++++ .../src/common/include/intel_npu/common/npu.hpp | 3 +++ src/plugins/intel_npu/src/common/src/npu.cpp | 4 ++++ .../intel_npu/src/plugin/include/backends.hpp | 1 + .../intel_npu/src/plugin/include/metrics.hpp | 1 + .../intel_npu/src/plugin/src/backends.cpp | 8 ++++++++ src/plugins/intel_npu/src/plugin/src/metrics.cpp | 11 +++++++++++ src/plugins/intel_npu/src/plugin/src/plugin.cpp | 7 +++++++ 13 files changed, 78 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_npu/src/backend/include/zero_backend.hpp b/src/plugins/intel_npu/src/backend/include/zero_backend.hpp index ba187da8faf689..68e4f9434418a6 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_backend.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_backend.hpp @@ -27,6 +27,7 @@ class ZeroEngineBackend final : public IEngineBackend { bool isBatchingSupported() const override; bool isCommandQueueExtSupported() const override; + bool isLUIDExtSupported() const override; void* getContext() const override; void* getDriverHandle() const; diff --git a/src/plugins/intel_npu/src/backend/include/zero_device.hpp b/src/plugins/intel_npu/src/backend/include/zero_device.hpp index 1bf5d391d23533..e87a602613a92a 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_device.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_device.hpp @@ -26,6 +26,7 @@ class ZeroDevice : public IDevice { std::string getName() const override; std::string getFullDeviceName() const override; Uuid getUuid() const override; + ov::device::LUID getLUID() const override; uint32_t getSubDevId() const override; uint32_t getMaxNumSlices() const override; uint64_t getAllocMemSize() const override; @@ -67,6 +68,8 @@ class ZeroDevice : public IDevice { ze_pci_ext_properties_t pci_properties = {}; + ze_device_luid_ext_properties_t device_luid = {}; + std::map device_gops = {{ov::element::f32, 0.f}, {ov::element::f16, 0.f}, {ov::element::bf16, 0.f}, diff --git a/src/plugins/intel_npu/src/backend/include/zero_init.hpp b/src/plugins/intel_npu/src/backend/include/zero_init.hpp index 7404417a17f649..28dea52ae067f7 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_init.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_init.hpp @@ -52,6 +52,16 @@ class ZeroInitStructsHolder final { inline uint32_t getMutableCommandListVersion() const { return mutable_command_list_version; } + // Helper function to check if extension with exists and its newer than + inline bool isExtensionSupported(std::string ext_name, uint32_t version) const { + auto iter = driver_extension_properties.find(ext_name); + if (iter == driver_extension_properties.end()) { + return false; + } else if (iter->second >= version) { + return true; + } + return false; + } private: static const ze_driver_uuid_t uuid; @@ -61,6 +71,7 @@ class ZeroInitStructsHolder final { ze_driver_handle_t driver_handle = nullptr; 
ze_device_handle_t device_handle = nullptr; + std::map driver_extension_properties; std::unique_ptr graph_dditable_ext_decorator; std::unique_ptr command_queue_npu_dditable_ext_decorator; std::unique_ptr graph_profiling_npu_dditable_ext_decorator; diff --git a/src/plugins/intel_npu/src/backend/src/zero_backend.cpp b/src/plugins/intel_npu/src/backend/src/zero_backend.cpp index 01e425aba61132..d74692400b0d90 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_backend.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_backend.cpp @@ -30,11 +30,15 @@ uint32_t ZeroEngineBackend::getGraphExtVersion() const { } bool ZeroEngineBackend::isBatchingSupported() const { - return _instance->getGraphDdiTable().version() >= ZE_GRAPH_EXT_VERSION_1_6; + return _instance->isExtensionSupported(std::string(ZE_GRAPH_EXT_NAME_1_6), ZE_MAKE_VERSION(1, 6)); } bool ZeroEngineBackend::isCommandQueueExtSupported() const { - return _instance->getCommandQueueDdiTable().version() >= ZE_COMMAND_QUEUE_NPU_EXT_VERSION_1_0; + return _instance->isExtensionSupported(std::string(ZE_COMMAND_QUEUE_NPU_EXT_NAME), ZE_MAKE_VERSION(1, 0)); +} + +bool ZeroEngineBackend::isLUIDExtSupported() const { + return _instance->isExtensionSupported(std::string(ZE_DEVICE_LUID_EXT_NAME), ZE_MAKE_VERSION(1, 0)); } ZeroEngineBackend::~ZeroEngineBackend() = default; diff --git a/src/plugins/intel_npu/src/backend/src/zero_device.cpp b/src/plugins/intel_npu/src/backend/src/zero_device.cpp index 4a849462a8a355..ac60e4741947bd 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_device.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_device.cpp @@ -20,6 +20,12 @@ ZeroDevice::ZeroDevice(const std::shared_ptr& initStructs log("ZeroDevice", Logger::global().level()) { log.debug("ZeroDevice::ZeroDevice init"); device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + + // Get LUID info, if supported + if (_initStructs->isExtensionSupported(std::string(ZE_DEVICE_LUID_EXT_NAME), ZE_MAKE_VERSION(1, 0))) { + device_luid.stype = ZE_STRUCTURE_TYPE_DEVICE_LUID_EXT_PROPERTIES; + device_properties.pNext = &device_luid; + } THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties", zeDeviceGetProperties(_initStructs->getDevice(), &device_properties)); @@ -128,6 +134,16 @@ IDevice::Uuid ZeroDevice::getUuid() const { return uuid; } +ov::device::LUID ZeroDevice::getLUID() const { + ov::device::LUID luidstruct; + // incompatibility check + static_assert(ZE_MAX_DEVICE_LUID_SIZE_EXT == ov::device::LUID::MAX_LUID_SIZE, "LUID size mismatch"); + for (int i = 0; i < ZE_MAX_DEVICE_LUID_SIZE_EXT; i++) { + luidstruct.luid[i] = device_luid.luid.id[i]; + } + return luidstruct; +} + uint32_t ZeroDevice::getSubDevId() const { return device_properties.subdeviceId; } diff --git a/src/plugins/intel_npu/src/backend/src/zero_init.cpp b/src/plugins/intel_npu/src/backend/src/zero_init.cpp index 44a48714dda404..e418fcc5f58cc2 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_init.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_init.cpp @@ -105,6 +105,12 @@ ZeroInitStructsHolder::ZeroInitStructsHolder() : log("NPUZeroInitStructsHolder", THROW_ON_FAIL_FOR_LEVELZERO("zeDriverGetExtensionProperties", zeDriverGetExtensionProperties(driver_handle, &count, extProps.data())); + // save the list of extension properties for later searches + for (auto it = extProps.begin(); it != extProps.end(); ++it) { + ze_driver_extension_properties_t p = *it; + driver_extension_properties.emplace(std::string(p.name), p.version); + } + // Query our graph extension version std::string 
graph_ext_name; uint32_t graph_ext_version = 0; diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp index 3259a51b25bc97..8c1eb57fe34fc3 100644 --- a/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp +++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp @@ -39,6 +39,8 @@ class IEngineBackend : public std::enable_shared_from_this { virtual bool isBatchingSupported() const = 0; /** @brief Backend has support for workload type */ virtual bool isCommandQueueExtSupported() const = 0; + /** @brief Backend has support for LUID info */ + virtual bool isLUIDExtSupported() const = 0; /** @brief Register backend-specific options */ virtual void registerOptions(OptionsDesc& options) const; /** @brief Get Level Zero context*/ @@ -72,6 +74,7 @@ class IDevice : public std::enable_shared_from_this { virtual std::string getName() const = 0; virtual std::string getFullDeviceName() const = 0; virtual Uuid getUuid() const; + virtual ov::device::LUID getLUID() const; virtual uint32_t getSubDevId() const; virtual uint32_t getMaxNumSlices() const; virtual uint64_t getAllocMemSize() const; diff --git a/src/plugins/intel_npu/src/common/src/npu.cpp b/src/plugins/intel_npu/src/common/src/npu.cpp index 9f1fc7be003730..0969b200ea09a5 100644 --- a/src/plugins/intel_npu/src/common/src/npu.cpp +++ b/src/plugins/intel_npu/src/common/src/npu.cpp @@ -43,6 +43,10 @@ IDevice::Uuid IDevice::getUuid() const { OPENVINO_THROW("Get UUID not supported"); } +ov::device::LUID IDevice::getLUID() const { + OPENVINO_THROW("Get LUID not supported"); +} + uint32_t IDevice::getSubDevId() const { OPENVINO_THROW("Get SubDevId is not supported"); } diff --git a/src/plugins/intel_npu/src/plugin/include/backends.hpp b/src/plugins/intel_npu/src/plugin/include/backends.hpp index f8568f676947cd..133be9786c26c0 100644 --- a/src/plugins/intel_npu/src/plugin/include/backends.hpp +++ b/src/plugins/intel_npu/src/plugin/include/backends.hpp @@ -34,6 +34,7 @@ class NPUBackends final { uint32_t getGraphExtVersion() const; bool isBatchingSupported() const; bool isCommandQueueExtSupported() const; + bool isLUIDExtSupported() const; void registerOptions(OptionsDesc& options) const; void* getContext() const; std::string getCompilationPlatform(const std::string_view platform, const std::string& deviceId) const; diff --git a/src/plugins/intel_npu/src/plugin/include/metrics.hpp b/src/plugins/intel_npu/src/plugin/include/metrics.hpp index 541bc39a6cd47f..d291595963803f 100644 --- a/src/plugins/intel_npu/src/plugin/include/metrics.hpp +++ b/src/plugins/intel_npu/src/plugin/include/metrics.hpp @@ -25,6 +25,7 @@ class Metrics final { const std::vector& SupportedMetrics() const; std::string GetFullDeviceName(const std::string& specifiedDeviceName) const; IDevice::Uuid GetDeviceUuid(const std::string& specifiedDeviceName) const; + ov::device::LUID GetDeviceLUID(const std::string& specifiedDeviceName) const; const std::vector& GetSupportedConfigKeys() const; const std::vector GetOptimizationCapabilities() const; const std::tuple& GetRangeForAsyncInferRequest() const; diff --git a/src/plugins/intel_npu/src/plugin/src/backends.cpp b/src/plugins/intel_npu/src/plugin/src/backends.cpp index 9212506e7ec61f..b528102a2b62f7 100644 --- a/src/plugins/intel_npu/src/plugin/src/backends.cpp +++ b/src/plugins/intel_npu/src/plugin/src/backends.cpp @@ -176,6 +176,14 @@ bool NPUBackends::isCommandQueueExtSupported() const { return false; } +bool 
NPUBackends::isLUIDExtSupported() const { + if (_backend != nullptr) { + return _backend->isLUIDExtSupported(); + } + + return false; +} + std::shared_ptr NPUBackends::getDevice(const std::string& specificName) const { _logger.debug("Searching for device %s to use started...", specificName.c_str()); // TODO iterate over all available backends diff --git a/src/plugins/intel_npu/src/plugin/src/metrics.cpp b/src/plugins/intel_npu/src/plugin/src/metrics.cpp index b6853bc5a9de9f..fe47c8b204908f 100644 --- a/src/plugins/intel_npu/src/plugin/src/metrics.cpp +++ b/src/plugins/intel_npu/src/plugin/src/metrics.cpp @@ -91,6 +91,17 @@ IDevice::Uuid Metrics::GetDeviceUuid(const std::string& specifiedDeviceName) con return IDevice::Uuid{}; } +ov::device::LUID Metrics::GetDeviceLUID(const std::string& specifiedDeviceName) const { + const auto devName = getDeviceName(specifiedDeviceName); + auto device = _backends->getDevice(devName); + if (device) { + return device->getLUID(); + } + return ov::device::LUID{{ + 0, + }}; +} + std::vector Metrics::GetCachingProperties() const { return _cachingProperties; } diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp index f63bcc8bf3b96e..7054f20809fc7a 100644 --- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp +++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp @@ -355,6 +355,13 @@ Plugin::Plugin() auto devUuid = _metrics->GetDeviceUuid(specifiedDeviceName); return decltype(ov::device::uuid)::value_type{devUuid}; }}}, + {ov::device::luid.name(), + {_backends->isLUIDExtSupported(), + ov::PropertyMutability::RO, + [&](const Config& config) { + const auto specifiedDeviceName = get_specified_device_name(config); + return _metrics->GetDeviceLUID(specifiedDeviceName); + }}}, // Add FULL_DEVICE_NAME and DEVICE_ARCHITECTURE in supported // properties list only in case of non-empty device list (#1424144d) {ov::device::architecture.name(), From de66f0eaea534528305bc4ee74b3412f8c5618c0 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Thu, 24 Oct 2024 13:54:58 +0400 Subject: [PATCH 010/233] [GPU] Handle runtime scale value for PagedAttention (#27204) ### Details: - Add support for non-constant scale input, as the current Paged Attention specification does not require this value to be strictly constant --- .../src/graph/impls/ocl/paged_attention.cpp | 42 +++++++++++++++++-- .../kernel_selector/cl_kernels/pa_sdpa_opt.cl | 9 +++- .../kernel_selector/cl_kernels/sdpa_opt.cl | 14 +++++-- .../kernels/sdpa/pa_sdpa_kernel_opt.cpp | 12 +++++- .../kernels/sdpa/sdpa_kernel_base.h | 2 +- .../kernels/sdpa/sdpa_kernel_opt.cpp | 4 +- .../src/plugin/ops/paged_attention.cpp | 9 ++-- 7 files changed, 78 insertions(+), 14 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp index cfc1e17c87ac6e..9cf1a252564934 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp @@ -122,6 +122,10 @@ struct paged_attention_impl : multi_stage_primitive { instance.value_memory_ptr(), instance.subsequence_begins_memory_ptr() }; + if (!desc->scale_val.has_value()) { + args.inputs.push_back(instance.input_memory_ptr(9)); + } + if (desc->has_alibi) { args.inputs.push_back(instance.alibi_memory_ptr()); } @@ -144,6 +148,10 @@ struct paged_attention_impl : multi_stage_primitive { args.inputs.push_back(instance.subsequence_begins_memory_ptr()); } + if 
(!desc->scale_val.has_value()) { + args.inputs.push_back(instance.input_memory_ptr(9)); + } + if (desc->has_alibi) { args.inputs.push_back(instance.alibi_memory_ptr()); } @@ -343,8 +351,10 @@ struct paged_attention_impl : multi_stage_primitive { config.paged_attention_block_size = static_cast(paged_attention::block_size); if (desc->scale_val.has_value()) { - config.has_scale_val = true; + config.has_const_scale_val = true; config.scale_val = desc->scale_val.value(); + } else { + config.has_const_scale_val = false; } if (desc->heads_num != desc->kv_heads_num) { @@ -409,16 +419,22 @@ struct paged_attention_impl : multi_stage_primitive { } static sdpa_kernel_params_t get_sdpa_kernel_params(const kernel_impl_params& impl_param, const PagedAttentionStage& stage, bool is_dynamic = false) { + const auto desc = impl_param.typed_desc(); auto params = get_default_params(impl_param, is_dynamic); const auto& query_layout = impl_param.get_input_layout(0); const auto& key_layout = impl_param.get_input_layout(1); const auto& value_layout = impl_param.get_input_layout(2); const auto& subsequence_begins_layout = impl_param.get_input_layout(6); + const auto& scale_layout = impl_param.get_input_layout(9); const auto& alibi_layout = impl_param.get_input_layout(11); const auto has_alibi = alibi_layout.count() > 0; + const auto has_scale_input = !desc->scale_val.has_value(); auto inputs_number = 4; + if (has_scale_input) + inputs_number++; + if (has_alibi) inputs_number++; @@ -429,6 +445,9 @@ struct paged_attention_impl : multi_stage_primitive { params.inputs[input_idx++] = convert_data_tensor(value_layout); params.inputs[input_idx++] = convert_data_tensor(subsequence_begins_layout); + if (has_scale_input) + params.inputs[input_idx++] = convert_data_tensor(scale_layout); + if (has_alibi) params.inputs[input_idx++] = convert_data_tensor(alibi_layout); @@ -446,8 +465,12 @@ struct paged_attention_impl : multi_stage_primitive { {0, out_offsets_map.at(0)}, }; + input_idx = 4; + if (has_scale_input) + in_tensor_to_offset_map.insert({input_idx++, in_offsets_map.at(9)}); + if (has_alibi) - in_tensor_to_offset_map.insert({4, in_offsets_map.at(11)}); + in_tensor_to_offset_map.insert({input_idx++, in_offsets_map.at(11)}); if ((stage == PagedAttentionStage::PREFILL || stage == PagedAttentionStage::MIXED) && !is_dynamic) params.conf.paged_attention_aligned_seq_len = get_aligned_seq_len(impl_param, stage); @@ -458,6 +481,7 @@ struct paged_attention_impl : multi_stage_primitive { } static pa_sdpa_kernel_params_t get_pa_sdpa_params(const kernel_impl_params& impl_param, const PagedAttentionStage& stage, bool is_dynamic = false) { + const auto desc = impl_param.typed_desc(); auto params = get_default_params(impl_param, is_dynamic); const auto& query_layout = impl_param.get_input_layout(0); @@ -467,10 +491,15 @@ struct paged_attention_impl : multi_stage_primitive { const auto& block_indices_layout = impl_param.get_input_layout(7); const auto& block_indices_begins_layout = impl_param.get_input_layout(8); const auto& subsequence_begins_layout = impl_param.get_input_layout(6); + const auto& scale_layout = impl_param.get_input_layout(9); const auto& alibi_layout = impl_param.get_input_layout(11); const auto has_alibi = alibi_layout.count() > 0; + const auto has_scale_input = !desc->scale_val.has_value(); auto inputs_number = 7; + if (has_scale_input) + inputs_number++; + if (has_alibi) inputs_number++; @@ -485,6 +514,9 @@ struct paged_attention_impl : multi_stage_primitive { params.inputs[input_idx++] = 
convert_data_tensor(subsequence_begins_layout); params.conf = get_sdpa_configuration(impl_param); + if (has_scale_input) + params.inputs[input_idx++] = convert_data_tensor(scale_layout); + if (has_alibi) params.inputs[input_idx++] = convert_data_tensor(alibi_layout); @@ -513,8 +545,12 @@ struct paged_attention_impl : multi_stage_primitive { {0, out_offsets_map.at(0)}, }; + input_idx = 7; + if (has_scale_input) + in_tensor_to_offset_map.insert({input_idx++, in_offsets_map.at(9)}); + if (has_alibi) - in_tensor_to_offset_map.insert({7, in_offsets_map.at(11)}); + in_tensor_to_offset_map.insert({input_idx++, in_offsets_map.at(11)}); params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl index 22b561e3d78661..a3bdd7e12dcd49 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl @@ -37,8 +37,11 @@ KERNEL(pa_sdpa_opt)( #if MULTI_TOKENS_PROCESSING const __global INPUT6_TYPE* subsequence_begins, #endif +#if HAS_SCALE_INPUT + const __global SCALE_INPUT_TYPE* scale, +#endif #if HAS_ALIBI - const __global INPUT7_TYPE* alibi_slopes, + const __global ALIBI_INPUT_TYPE* alibi_slopes, #endif __global OUTPUT_TYPE* output, __global SOFTMAX_ACCUMULATOR_TYPE* exp_sums, @@ -117,6 +120,8 @@ KERNEL(pa_sdpa_opt)( // Apply scale value directly to the query input to improve accuracy in case of a high range of input data #ifdef SCALE_VAL q_val = TO_INPUT0_TYPE(SCALE_VAL) * q_val; +#else + q_val = *scale * q_val; #endif slm_query[query_idx_local] = q_val; @@ -133,6 +138,8 @@ KERNEL(pa_sdpa_opt)( // Apply scale value directly to the query input to improve accuracy in case of a high range of input data #ifdef SCALE_VAL q_val[i] = TO_INPUT0_TYPE(SCALE_VAL) * q_val[i]; +#else + q_val[i] = *scale * q_val[i]; #endif } #endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index 948bd3c0f1a305..748f79115262e0 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -656,6 +656,14 @@ inline MASK_VECTOR_TYPE FUNC(load_attn_mask)(OPTIONAL_SHAPE_INFO_ARG return mask_vec; } +#if IS_PAGED_ATTENTION && HAS_ALIBI +#if HAS_SCALE_INPUT +#define ALIBI_TYPE INPUT5_TYPE +#else +#define ALIBI_TYPE INPUT4_TYPE +#endif +#endif + REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE) KERNEL(sdpa_opt)( OPTIONAL_SHAPE_INFO_ARG @@ -664,15 +672,15 @@ KERNEL(sdpa_opt)( const __global INPUT2_TYPE* value_input, #if IS_PAGED_ATTENTION const __global INPUT3_TYPE* subsequence_begins, -#if HAS_ALIBI - const __global INPUT4_TYPE* alibi_slopes, -#endif #endif #if HAS_ATTN_MASK_INPUT const __global INPUT3_TYPE* attn_mask, #endif #if HAS_SCALE_INPUT const __global INPUT4_TYPE* scale, +#endif +#if IS_PAGED_ATTENTION && HAS_ALIBI + const __global ALIBI_TYPE* alibi_slopes, #endif __global OUTPUT_TYPE* output, #ifdef BEAM_TABLE_TYPE diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp index 161c37ab3d3bf7..63c5e74160f652 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp @@ 
-176,11 +176,19 @@ JitConstants PagedAttentionSDPAKernelOpt::GetJitConstants(const pa_sdpa_params& auto sdpa_stage = kernel_idx == KernelsTypes::FINALIZATION || kernel_idx == KernelsTypes::FINALIZATION_MULTI_TOKENS ? 1 : 0; jit.AddConstant(MakeJitConstant("SDPA_STAGE_" + std::to_string(sdpa_stage), 1)); - if (config.has_scale_val) + if (config.has_const_scale_val) { jit.AddConstant(MakeJitConstant("SCALE_VAL", config.scale_val)); + } else { + const size_t scale_input_idx = 7; + jit.AddConstant(MakeJitConstant("HAS_SCALE_INPUT", 1)); + jit.Merge(MakeTypeJitConstants(params.inputs[scale_input_idx].GetDType(), "SCALE_INPUT")); + } - if (params.conf.has_alibi_input) + if (params.conf.has_alibi_input) { + const size_t alibi_input_idx = config.has_const_scale_val ? 7 : 8; jit.AddConstant(MakeJitConstant("HAS_ALIBI", 1)); + jit.Merge(MakeTypeJitConstants(params.inputs[alibi_input_idx].GetDType(), "ALIBI_INPUT")); + } if (kernel_idx == KernelsTypes::MULTI_TOKENS || kernel_idx == KernelsTypes::FINALIZATION_MULTI_TOKENS) jit.AddConstant(MakeJitConstant("MULTI_TOKENS_PROCESSING", 1)); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h index 6ea8d85527d19d..492e86ebcce5cc 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h @@ -93,7 +93,7 @@ struct sdpa_configuration { bool is_paged_attention = false; int64_t paged_attention_aligned_seq_len = -1; int64_t paged_attention_block_size = 0; - bool has_scale_val = false; + bool has_const_scale_val = false; float scale_val = 0.f; }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp index 2f0174d0a45912..6942e5f8ea4357 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp @@ -180,9 +180,11 @@ JitConstants SDPAKernelOpt::GetJitConstants(const sdpa_params& params, size_t ke jit.AddConstant(MakeJitConstant("HAS_ALIBI", 1)); } - if (params.conf.has_scale_val) { + if (params.conf.has_const_scale_val) { jit.AddConstant(MakeJitConstant("STATIC_SCALE_VALUE_INV", 1.0f / params.conf.scale_val)); jit.AddConstant(MakeJitConstant("STATIC_SCALE_VALUE", params.conf.scale_val)); + } else { + jit.AddConstant(MakeJitConstant("HAS_SCALE_INPUT", 1)); } } else if (params.inputs.size() <= 4) { jit.AddConstant(MakeJitConstant("STATIC_SCALE_VALUE_INV", std::sqrt(static_cast(params.conf.head_size)))); diff --git a/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp b/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp index e4e7dcb77e03fb..7425b096b6d324 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp @@ -50,9 +50,12 @@ static void CreatePagedAttentionExtensionOp(ProgramBuilder& p, const std::shared const size_t alibi_idx = 11; std::shared_ptr scale_const = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(scale_idx)); - OPENVINO_ASSERT(scale_const != nullptr); - OPENVINO_ASSERT(ov::shape_size(scale_const->get_output_shape(0)) == 1); - prim.scale_val = scale_const->cast_vector()[0]; + if (scale_const) { + OPENVINO_ASSERT(ov::shape_size(scale_const->get_output_shape(0)) == 1); + prim.scale_val = scale_const->cast_vector()[0]; + } else { + 
prim.scale_val = cldnn::optional_value(); + } std::shared_ptr alibi_const = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(alibi_idx)); OPENVINO_ASSERT(alibi_const != nullptr);
From d96bd7d9525e0acf40e6af7cd158d79aec7b52d5 Mon Sep 17 00:00:00 2001
From: Pawel Raasz
Date: Thu, 24 Oct 2024 12:24:47 +0200
Subject: [PATCH 011/233] [Transformations] Fix vector subscript out of range in transformations (#27180)
### Details:
- Fix issue reported by MSVC `Assertion failed: vector subscript out of range` by skipping access to non-existing parameters in `RemoveMultiSubGraphOpDanglingParamsResults` transformation.
### Tickets:
- CVS-155258
--- ...emove_multi_subgraph_op_dangling_params.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-)
diff --git a/src/common/transformations/src/transformations/common_optimizations/remove_multi_subgraph_op_dangling_params.cpp b/src/common/transformations/src/transformations/common_optimizations/remove_multi_subgraph_op_dangling_params.cpp index f0de55f4028c94..d86b4b71f102c7 100644
--- a/src/common/transformations/src/transformations/common_optimizations/remove_multi_subgraph_op_dangling_params.cpp
+++ b/src/common/transformations/src/transformations/common_optimizations/remove_multi_subgraph_op_dangling_params.cpp
@@ -178,19 +178,21 @@ bool ov::pass::RemoveMultiSubGraphOpDanglingParamsResults::run_on_model(const st if (std::count(std::begin(to_remove_descriptors_indexes[body_idx]), std::end(to_remove_descriptors_indexes[body_idx]), desc_idx) > 0) { - auto& body_param = body_params[body_in_descriptors[desc_idx]->m_body_parameter_index]; - body_func->remove_parameter(body_param); - // Move all body indexes which are after these indicated by to_remove_descriptors_indexes - update_body_param_desc(body_in_descriptors, - body_in_descriptors[desc_idx]->m_body_parameter_index); + if (body_in_descriptors[desc_idx]->m_body_parameter_index < body_params.size()) { + auto& body_param = body_params[body_in_descriptors[desc_idx]->m_body_parameter_index]; + body_func->remove_parameter(body_param); + // Move all body indexes which are after these indicated by to_remove_descriptors_indexes + update_body_param_desc(body_in_descriptors, + body_in_descriptors[desc_idx]->m_body_parameter_index); + } // remove dangling input of MultiSubGraphOp which was not removed earlier auto current_input_idx = body_in_descriptors[desc_idx]->m_input_index; - auto& current_input = op_inputs[current_input_idx]; // the same input tensor can go to different input ports - if (std::count(std::begin(required_inputs_indices), + if (current_input_idx < op_inputs.size() && + std::count(std::begin(required_inputs_indices), std::end(required_inputs_indices), current_input_idx) == 0 && - std::count(std::begin(op_inputs), std::end(op_inputs), current_input) > 0) { + std::count(std::begin(op_inputs), std::end(op_inputs), op_inputs[current_input_idx]) > 0) { op_inputs.erase(std::next(op_inputs.begin(), current_input_idx)); // Move all input indexes (in all bodies) which are after these indicated by // to_remove_descriptors_indexes and are not used in any body
From ea042128cb9f02a21603bdb79357dc3b7f9b864a Mon Sep 17 00:00:00 2001
From: Tomasz Jankowski
Date: Thu, 24 Oct 2024 13:45:43 +0200
Subject: [PATCH 012/233] [Core/Ref] Multiclass NMS: Fix vector initialization (#27157)
### Details:
- `std::vector` initialized with actual size needed. 
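  (Illustrative note, not part of the original commit message.) The pitfall this fixes: `std::vector::reserve()` only raises capacity while `size()` stays zero, so writing through `data()` afterwards touches elements that do not exist yet, which is undefined behaviour; constructing the vector with its final element count gives well-defined storage. A minimal, hypothetical sketch of the before/after pattern, not the actual multiclass_nms code:

```cpp
#include <cstddef>
#include <vector>

// Hypothetical example illustrating reserve() vs. sized construction.
std::vector<float> make_buffer(std::size_t n) {
    std::vector<float> reserved;
    reserved.reserve(n);            // capacity >= n, but size() is still 0
    // reserved.data()[0] = 1.0f;   // would write past size(): undefined behaviour

    std::vector<float> sized(n);    // size() == n, elements value-initialized
    sized.data()[0] = 1.0f;         // well-defined
    return sized;
}
```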
### Tickets: - CVS-155037 --- src/core/reference/src/op/multiclass_nms.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/core/reference/src/op/multiclass_nms.cpp b/src/core/reference/src/op/multiclass_nms.cpp index 3f0b6c8634ea9d..88d8d405c618af 100644 --- a/src/core/reference/src/op/multiclass_nms.cpp +++ b/src/core/reference/src/op/multiclass_nms.cpp @@ -48,7 +48,6 @@ static float intersectionOverUnion(const Rectangle& boxI, const Rectangle& boxJ, // start: start index along axis "M" template std::vector slice_image(const T* data, const Shape& data_shape, const int64_t start, const int64_t item_num) { - std::vector slice_data; const auto class_num = data_shape[0]; const auto item_size = (data_shape.size() == 3) ? data_shape[2] : 1; @@ -57,7 +56,7 @@ std::vector slice_image(const T* data, const Shape& data_shape, const int64_t "Invaid inputs as it is trying to slice data out of range."); const auto row_num = item_num * item_size; - slice_data.reserve(class_num * row_num); + std::vector slice_data(static_cast(class_num * row_num)); T* item_data = slice_data.data(); T* src = const_cast(data + start * item_size); for (size_t i = 0; i < class_num; i++) { From 6a8a69c621d8b5b4c163ebd395b10cfe946f851f Mon Sep 17 00:00:00 2001 From: Piotr Kowalczyk Date: Thu, 24 Oct 2024 15:14:25 +0200 Subject: [PATCH 013/233] [Pytorch fronted]: Added support for Search Sorted op (#26976) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Details: - Added support for SearchSorted op with unittest. ### Tickets: - *[CVS-154559](https://jira.devtools.intel.com/browse/CVS-154559)* Depends on: - https://github.com/openvinotoolkit/openvino/pull/26958 - https://github.com/openvinotoolkit/openvino/pull/27036 --------- Signed-off-by: Kazantsev, Roman Signed-off-by: dependabot[bot] Co-authored-by: Michal Lukaszewski Co-authored-by: Pawel Raasz Co-authored-by: Andrey Babushkin Co-authored-by: Alicja Miloszewska Co-authored-by: Bogdan Pereanu Co-authored-by: Karol Blaszczak Co-authored-by: Tatiana Savina Co-authored-by: Anastasiya(Asya) Pronina Co-authored-by: Dmitry Matveev Co-authored-by: Andrei Beleiu Co-authored-by: Andrew Kwangwoong Park Co-authored-by: Roman Kazantsev Co-authored-by: Pavel Durandin Co-authored-by: Alexey Smirnov Co-authored-by: Hubert Błaszczyk <56601011+hub-bla@users.noreply.github.com> Co-authored-by: Vladimir Paramuzov Co-authored-by: Sergey Shlyapnikov Co-authored-by: Ivan Tikhonov Co-authored-by: Andrzej Kopytko Co-authored-by: Sebastian Golebiewski Co-authored-by: Alina Kladieva Co-authored-by: Ilya Lavrenov Co-authored-by: Maxim Vafin Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Mateusz Mikolajczyk --- .../pytorch/src/op/search_sorted.cpp | 34 ++++++++++++++ src/frontends/pytorch/src/op_table.cpp | 2 + .../pytorch_tests/test_search_sorted.py | 47 +++++++++++++++++++ 3 files changed, 83 insertions(+) create mode 100644 src/frontends/pytorch/src/op/search_sorted.cpp create mode 100644 tests/layer_tests/pytorch_tests/test_search_sorted.py diff --git a/src/frontends/pytorch/src/op/search_sorted.cpp b/src/frontends/pytorch/src/op/search_sorted.cpp new file mode 100644 index 00000000000000..ca9f6b49ff7bf9 --- /dev/null +++ b/src/frontends/pytorch/src/op/search_sorted.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/op/search_sorted.hpp" + +#include "openvino/frontend/pytorch/node_context.hpp" 
+#include "utils.hpp" + +namespace ov { +namespace frontend { +namespace pytorch { +namespace op { + +using namespace ov::op; + +OutputVector translate_search_sorted(const NodeContext& context) { + num_inputs_check(context, 2, 5); + Output sorted; + Output values; + std::tie(sorted, values) = get_inputs_with_promoted_types(context, 0, 1); + const bool out_int32 = context.const_input(2); + PYTORCH_OP_CONVERSION_CHECK(out_int32 == false, "aten::searchsorted(out_int32=true) unsupported"); + const bool right_mode = context.const_input(3); + PYTORCH_OP_CONVERSION_CHECK(context.input_is_none(4), "aten::searchsorted(side) unsupported"); + PYTORCH_OP_CONVERSION_CHECK(context.input_is_none(5), "aten::searchsorted(out) unsupported"); + PYTORCH_OP_CONVERSION_CHECK(context.input_is_none(6), "aten::searchsorted(sorter) unsupported"); + auto op = context.mark_node(std::make_shared(sorted, values, right_mode)); + return {op}; +}; +} // namespace op +} // namespace pytorch +} // namespace frontend +} // namespace ov \ No newline at end of file diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index 195977432e40e5..66c76e33032ef6 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -300,6 +300,7 @@ OP_CONVERTER(translate_reshape_fx); OP_CONVERTER(translate_rsub_fx); OP_CONVERTER(translate_scalar_tensor_fx); OP_CONVERTER(translate_scaled_dot_product_attention_fx); +OP_CONVERTER(translate_search_sorted); OP_CONVERTER(translate_select_scatter_fx); OP_CONVERTER(translate_slice_fx); OP_CONVERTER(translate_slice_scatter_fx); @@ -617,6 +618,7 @@ const std::unordered_map get_supported_ops_ts() { {"aten::rsqrt", op::optional_out}, {"aten::rsqrt_", op::inplace_op}, {"aten::rsub", op::translate_rsub}, + {"aten::searchsorted", op::translate_search_sorted}, {"aten::ScalarImplicit", op::skip_node}, {"aten::scaled_dot_product_attention", op::translate_scaled_dot_product_attention}, {"aten::scatter", op::translate_scatter}, diff --git a/tests/layer_tests/pytorch_tests/test_search_sorted.py b/tests/layer_tests/pytorch_tests/test_search_sorted.py new file mode 100644 index 00000000000000..645033e2ee260b --- /dev/null +++ b/tests/layer_tests/pytorch_tests/test_search_sorted.py @@ -0,0 +1,47 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from pytorch_layer_test_class import PytorchLayerTest +import numpy as np + + +class TestSearchSorted(PytorchLayerTest): + def _prepare_input(self): + return (np.array(self.sorted).astype(self.sorted_type),np.array(self.values).astype(self.values_type)) + + def create_model(self, right_mode): + import torch + + class aten_searchsorted(torch.nn.Module): + def __init__(self, right_mode): + super(aten_searchsorted, self).__init__() + self.right_mode = right_mode + + def forward(self, sorted, values): + return torch.searchsorted(sorted, values, right=self.right_mode) + + ref_net = None + + return aten_searchsorted(right_mode), ref_net, "aten::searchsorted" + + @pytest.mark.nightly + @pytest.mark.precommit + @pytest.mark.parametrize(("sorted", "values"), [ + ([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], [[3, 6, 9], [3, 6, 9]]), + ([1, 3, 5, 7, 9], [[3, 6, 9],[0, 5, 20]]), + ([4091, 4092], [[4091, 4092]]), # fp16 cannot exactly represent 4091 number + ([1.23, 2.99], [[1.355, 2.9991]]) + ]) + @pytest.mark.parametrize("right_mode", [False, True]) + @pytest.mark.parametrize("sorted_type", [np.float32, np.float16, np.int8]) + 
@pytest.mark.parametrize("values_type", [np.float16, np.int32, np.int64]) + def test_searchsorted(self, sorted, values, right_mode, sorted_type, values_type, ie_device, precision, ir_version): + self.sorted = sorted + self.values = values + self.sorted_type = sorted_type + self.values_type = values_type + if ie_device == "CPU" and sorted_type == np.float16 and sorted == [4091, 4092]: + pytest.skip(reason="CPU plugin on defult converts fp16 to fp32, if that happens the test will fail for those malicious values") + self._test(*self.create_model(right_mode), ie_device, precision, ir_version) From 3c0005ea669c77173eef491b1d10b25ef2c9bdcc Mon Sep 17 00:00:00 2001 From: Sebastian Golebiewski Date: Thu, 24 Oct 2024 15:58:00 +0200 Subject: [PATCH 014/233] [DOCS] Adding GenAI Use Cases (#27062) Creating an article with use case scenarios for using OpenVINO GenAI. This PR addresses the following JIRA ticket: CVS-153319 --------- Co-authored-by: Karol Blaszczak --- .../llm_inference_guide/genai-guide.rst | 54 +-- .../genai-guide/genai-use-cases.rst | 433 ++++++++++++++++++ 2 files changed, 434 insertions(+), 53 deletions(-) create mode 100644 docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst index f1fd002b48072e..ebd4667d544616 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst @@ -9,6 +9,7 @@ Run LLM Inference on OpenVINO with the GenAI Flavor :hidden: NPU inference of LLMs + genai-guide/genai-use-cases This guide will show you how to integrate the OpenVINO GenAI flavor into your application, covering @@ -174,59 +175,6 @@ You can also create your custom streamer for more sophisticated processing: pipe.generate("The Sun is yellow because", ov::genai::streamer(custom_streamer), ov::genai::max_new_tokens(100)); } -Using GenAI in Chat Scenario -################################ - -For chat scenarios where inputs and outputs represent a conversation, maintaining KVCache across inputs -may prove beneficial. The chat-specific methods **start_chat** and **finish_chat** are used to -mark a conversation session, as you can see in these simple examples: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: python - - import openvino_genai as ov_genai - pipe = ov_genai.LLMPipeline(model_path) - - pipe.set_generation_config({'max_new_tokens': 100) - - pipe.start_chat() - while True: - print('question:') - prompt = input() - if prompt == 'Stop!': - break - print(pipe.generate(prompt)) - pipe.finish_chat() - - - .. tab-item:: C++ - :sync: cpp - - .. 
code-block:: cpp - - int main(int argc, char* argv[]) { - std::string prompt; - - std::string model_path = argv[1]; - ov::genai::LLMPipeline pipe(model_path, "CPU"); - - ov::genai::GenerationConfig config = pipe.get_generation_config(); - config.max_new_tokens = 100; - pipe.set_generation_config(config) - - pipe.start_chat(); - for (size_t i = 0; i < questions.size(); i++) { - std::cout << "question:\n"; - std::getline(std::cin, prompt); - - std::cout << pipe.generate(prompt) << std::endl; - } - pipe.finish_chat(); - } Optimizing Generation with Grouped Beam Search ####################################################### diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst new file mode 100644 index 00000000000000..953784c03fdef0 --- /dev/null +++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst @@ -0,0 +1,433 @@ +GenAI Use Cases +===================== + +This article provides several use case scenarios for Generative AI model +inference. The applications presented in the code samples below +only require minimal configuration, like setting an inference device. Feel free +to explore and modify the source code as you need. + + +Using GenAI for Text-to-Image Generation +######################################## + +Examples below demonstrate inference on text-to-image models, like Stable Diffusion +1.5, 2.1, and LCM, with a text prompt as input. The :ref:`main.cpp ` +sample shows basic usage of the ``Text2ImagePipeline`` pipeline. +:ref:`lora.cpp ` shows how to apply LoRA adapters to the pipeline. + + +.. tab-set:: + + .. tab-item:: Python + :sync: python + + .. tab-set:: + + .. tab-item:: main.py + :name: mainpy + + .. code-block:: python + + import openvino_genai + from PIL import Image + import numpy as np + + class Generator(openvino_genai.Generator): + def __init__(self, seed, mu=0.0, sigma=1.0): + openvino_genai.Generator.__init__(self) + np.random.seed(seed) + self.mu = mu + self.sigma = sigma + + def next(self): + return np.random.normal(self.mu, self.sigma) + + + def infer(model_dir: str, prompt: str): + device = 'CPU' # GPU can be used as well + random_generator = Generator(42) + pipe = openvino_genai.Text2ImagePipeline(model_dir, device) + image_tensor = pipe.generate( + prompt, + width=512, + height=512, + num_inference_steps=20, + num_images_per_prompt=1, + random_generator=random_generator + ) + + image = Image.fromarray(image_tensor.data[0]) + image.save("image.bmp") + + .. tab-item:: LoRA.py + :name: lorapy + + .. 
code-block:: python + + import openvino as ov + import openvino_genai + import numpy as np + import sys + + + class Generator(openvino_genai.Generator): + def __init__(self, seed, mu=0.0, sigma=1.0): + openvino_genai.Generator.__init__(self) + np.random.seed(seed) + self.mu = mu + self.sigma = sigma + + def next(self): + return np.random.normal(self.mu, self.sigma) + + + def image_write(path: str, image_tensor: ov.Tensor): + from PIL import Image + image = Image.fromarray(image_tensor.data[0]) + image.save(path) + + + def infer(models_path: str, prompt: str): + prompt = "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + + device = "CPU" # GPU, NPU can be used as well + adapter_config = openvino_genai.AdapterConfig() + + for i in range(int(len(adapters) / 2)): + adapter = openvino_genai.Adapter(adapters[2 * i]) + alpha = float(adapters[2 * i + 1]) + adapter_config.add(adapter, alpha) + + pipe = openvino_genai.Text2ImagePipeline(models_path, device, adapters=adapter_config) + print("Generating image with LoRA adapters applied, resulting image will be in lora.bmp") + image = pipe.generate(prompt, + random_generator=Generator(42), + width=512, + height=896, + num_inference_steps=20) + + image_write("lora.bmp", image) + print("Generating image without LoRA adapters applied, resulting image will be in baseline.bmp") + image = pipe.generate(prompt, + adapters=openvino_genai.AdapterConfig(), + random_generator=Generator(42), + width=512, + height=896, + num_inference_steps=20 + ) + image_write("baseline.bmp", image) + + For more information, refer to the + `Python sample `__ + + .. tab-item:: C++ + :sync: cpp + + .. tab-set:: + + .. tab-item:: main.cpp + :name: maincpp + + .. code-block:: cpp + + #include "openvino/genai/text2image/pipeline.hpp" + + #include "imwrite.hpp" + + int32_t main(int32_t argc, char* argv[]) try { + OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " ''"); + + const std::string models_path = argv[1], prompt = argv[2]; + const std::string device = "CPU"; // GPU, NPU can be used as well + + ov::genai::Text2ImagePipeline pipe(models_path, device); + ov::Tensor image = pipe.generate(prompt, + ov::genai::width(512), + ov::genai::height(512), + ov::genai::num_inference_steps(20), + ov::genai::num_images_per_prompt(1)); + + imwrite("image_%d.bmp", image, true); + + return EXIT_SUCCESS; + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } + + .. tab-item:: LoRA.cpp + :name: loracpp + + .. 
code-block:: cpp + + #include "openvino/genai/text2image/pipeline.hpp" + + #include "imwrite.hpp" + + int32_t main(int32_t argc, char* argv[]) try { + OPENVINO_ASSERT(argc >= 3 && (argc - 3) % 2 == 0, "Usage: ", argv[0], " '' [ ...]]"); + + const std::string models_path = argv[1], prompt = argv[2]; + const std::string device = "CPU"; // GPU, NPU can be used as well + + ov::genai::AdapterConfig adapter_config; + for(size_t i = 0; i < (argc - 3)/2; ++i) { + ov::genai::Adapter adapter(argv[3 + 2*i]); + float alpha = std::atof(argv[3 + 2*i + 1]); + adapter_config.add(adapter, alpha); + } + + ov::genai::Text2ImagePipeline pipe(models_path, device, ov::genai::adapters(adapter_config)); + + std::cout << "Generating image with LoRA adapters applied, resulting image will be in lora.bmp\n"; + ov::Tensor image = pipe.generate(prompt, + ov::genai::random_generator(std::make_shared(42)), + ov::genai::width(512), + ov::genai::height(896), + ov::genai::num_inference_steps(20)); + imwrite("lora.bmp", image, true); + + std::cout << "Generating image without LoRA adapters applied, resulting image will be in baseline.bmp\n"; + image = pipe.generate(prompt, + ov::genai::adapters(), + ov::genai::random_generator(std::make_shared(42)), + ov::genai::width(512), + ov::genai::height(896), + ov::genai::num_inference_steps(20)); + imwrite("baseline.bmp", image, true); + + return EXIT_SUCCESS; + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } + + + For more information, refer to the + `C++ sample `__ + + + + + +Using GenAI in Speech Recognition +################################# + + +The application, shown in code samples below, performs inference on speech +recognition Whisper Models. The samples include the ``WhisperPipeline`` class +and use audio files in WAV format at a sampling rate of 16 kHz as input. + +.. tab-set:: + + .. tab-item:: Python + :sync: cpp + + .. code-block:: python + + import openvino_genai + import librosa + + + def read_wav(filepath): + raw_speech, samplerate = librosa.load(filepath, sr=16000) + return raw_speech.tolist() + + + def infer(model_dir: str, wav_file_path: str): + raw_speech = read_wav(wav_file_path) + pipe = openvino_genai.WhisperPipeline(model_dir) + + def streamer(word: str) -> bool: + print(word, end="") + return False + + result = pipe.generate( + raw_speech, + max_new_tokens=100, + language="<|en|>", + task="transcribe", + return_timestamps=True, + streamer=streamer, + ) + + print() + for chunk in result.chunks: + print(f"timestamps: [{chunk.start_ts}, {chunk.end_ts}] text: {chunk.text}") + + + For more information, refer to the + `Python sample `__. + + .. tab-item:: C++ + :sync: cpp + + .. 
code-block:: cpp + + #include "audio_utils.hpp" + #include "openvino/genai/whisper_pipeline.hpp" + + int main(int argc, char* argv[]) try { + if (3 > argc) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\""); + } + + std::filesystem::path models_path = argv[1]; + std::string wav_file_path = argv[2]; + std::string device = "CPU"; // GPU can be used as well + + ov::genai::WhisperPipeline pipeline(models_path, device); + + ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path); + + ov::genai::WhisperGenerationConfig config(models_path / "generation_config.json"); + config.max_new_tokens = 100; + config.language = "<|en|>"; + config.task = "transcribe"; + config.return_timestamps = true; + + auto streamer = [](std::string word) { + std::cout << word; + return false; + }; + + auto result = pipeline.generate(raw_speech, config, streamer); + + std::cout << "\n"; + + for (auto& chunk : *result.chunks) { + std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n"; + } + + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) { + } + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) { + } + return EXIT_FAILURE; + } + + + For more information, refer to the + `C++ sample `__. + + +Using GenAI in Chat Scenario +############################ + +For chat scenarios where inputs and outputs represent a conversation, maintaining KVCache across inputs +may prove beneficial. The ``start_chat`` and ``finish_chat`` chat-specific methods are used to +mark a conversation session, as shown in the samples below: + +.. tab-set:: + + .. tab-item:: Python + :sync: py + + .. code-block:: python + + import openvino_genai + + + def streamer(subword): + print(subword, end='', flush=True) + return False + + + def infer(model_dir: str): + device = 'CPU' # GPU can be used as well. + pipe = openvino_genai.LLMPipeline(model_dir, device) + + config = openvino_genai.GenerationConfig() + config.max_new_tokens = 100 + + pipe.start_chat() + while True: + try: + prompt = input('question:\n') + except EOFError: + break + pipe.generate(prompt, config, streamer) + print('\n----------') + pipe.finish_chat() + + + + For more information, refer to the + `Python sample `__. + + .. tab-item:: C++ + :sync: cpp + + .. code-block:: cpp + + #include "openvino/genai/llm_pipeline.hpp" + + int main(int argc, char* argv[]) try { + if (2 != argc) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " "); + } + std::string prompt; + std::string models_path = argv[1]; + + std::string device = "CPU"; // GPU, NPU can be used as well + ov::genai::LLMPipeline pipe(models_path, device); + + ov::genai::GenerationConfig config; + config.max_new_tokens = 100; + std::function streamer = [](std::string word) { + std::cout << word << std::flush; + return false; + }; + + pipe.start_chat(); + std::cout << "question:\n"; + while (std::getline(std::cin, prompt)) { + pipe.generate(prompt, config, streamer); + std::cout << "\n----------\n" + "question:\n"; + } + pipe.finish_chat(); + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) 
{ + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } + + + For more information, refer to the + `C++ sample `__ + +Additional Resources +##################### + +* :doc:`Install OpenVINO GenAI <../../../get-started/install-openvino/install-openvino-genai>` +* `OpenVINO GenAI Repo `__ +* `OpenVINO GenAI Samples `__ +* `OpenVINO Tokenizers `__ From 8faeed130b851334163bdfd29546fce81f387986 Mon Sep 17 00:00:00 2001 From: Mateusz Mikolajczyk Date: Thu, 24 Oct 2024 17:38:35 +0200 Subject: [PATCH 015/233] [PyOV] Extend Python API with SearchSorted-15 (#27230) ### Details: - *[PyOV] Extend Python API with SearchSorted-15* - *...* ### Tickets: - *CVS-155961* --- .../src/openvino/runtime/opset15/__init__.py | 1 + .../src/openvino/runtime/opset15/ops.py | 21 ++++++++++++++ .../python/tests/test_graph/test_create_op.py | 28 +++++++++++++++++++ 3 files changed, 50 insertions(+) diff --git a/src/bindings/python/src/openvino/runtime/opset15/__init__.py b/src/bindings/python/src/openvino/runtime/opset15/__init__.py index 58fd90e7fd1051..a12225f719a55c 100644 --- a/src/bindings/python/src/openvino/runtime/opset15/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset15/__init__.py @@ -17,3 +17,4 @@ from openvino.runtime.opset15.ops import bitwise_right_shift from openvino.runtime.opset15.ops import slice_scatter from openvino.runtime.opset15.ops import stft +from openvino.runtime.opset15.ops import search_sorted diff --git a/src/bindings/python/src/openvino/runtime/opset15/ops.py b/src/bindings/python/src/openvino/runtime/opset15/ops.py index c278120dab7432..45b01a11bc3588 100644 --- a/src/bindings/python/src/openvino/runtime/opset15/ops.py +++ b/src/bindings/python/src/openvino/runtime/opset15/ops.py @@ -327,3 +327,24 @@ def stft( """ inputs = as_nodes(data, window, frame_size, frame_step, name=name) return _get_node_factory_opset15().create("STFT", inputs) + + +@nameable_op +def search_sorted( + sorted_sequence: NodeInput, + values: NodeInput, + right_mode: bool = False, + name: Optional[str] = None, +) -> Node: + """Return a node which generates SearchSorted operation. + + :param sorted_sequence: The node providing sorted sequence to search in. + :param values: The node providing searched values. + :param right_mode: If set to False, return the first suitable index that is found for given value. + If set to True, return the last such index. Defaults to False. + :param name: The optional name for the created output node. + :return: The new node performing SearchSorted operation. 
+ """ + inputs = as_nodes(sorted_sequence, values, name=name) + attributes = {"right_mode": right_mode} + return _get_node_factory_opset15().create("SearchSorted", inputs, attributes) diff --git a/src/bindings/python/tests/test_graph/test_create_op.py b/src/bindings/python/tests/test_graph/test_create_op.py index 940f8244f427b8..87787e1e29bc32 100644 --- a/src/bindings/python/tests/test_graph/test_create_op.py +++ b/src/bindings/python/tests/test_graph/test_create_op.py @@ -2502,6 +2502,34 @@ def test_stft(): assert op.get_output_shape(0) == [4, 13, 6, 2] +def test_search_sorted(): + sorted_sequence = ov.parameter([7, 256, 200, 200], name="sorted", dtype=np.float32) + values = ov.parameter([7, 256, 200, 10], name="values", dtype=np.float32) + op = ov_opset15.search_sorted(sorted_sequence=sorted_sequence, values=values, name="default") + assert op.get_type_name() == "SearchSorted" + assert op.get_output_size() == 1 + assert op.get_output_element_type(0) == Type.i64 + assert op.get_output_shape(0) == [7, 256, 200, 10] + assert op.get_attributes()["right_mode"] is False + assert op.get_friendly_name() == "default" + + op = ov_opset15.search_sorted(sorted_sequence, values, right_mode=True, name="right") + assert op.get_type_name() == "SearchSorted" + assert op.get_output_size() == 1 + assert op.get_output_element_type(0) == Type.i64 + assert op.get_output_shape(0) == [7, 256, 200, 10] + assert op.get_attributes()["right_mode"] is True + assert op.get_friendly_name() == "right" + + op = ov_opset15.search_sorted(sorted_sequence, values, False, name="left") + assert op.get_type_name() == "SearchSorted" + assert op.get_output_size() == 1 + assert op.get_output_element_type(0) == Type.i64 + assert op.get_output_shape(0) == [7, 256, 200, 10] + assert op.get_attributes()["right_mode"] is False + assert op.get_friendly_name() == "left" + + def test_parameter_get_attributes(): parameter = ov.parameter([2, 2], dtype=np.float32, name="InputData") parameter_attributes = parameter.get_attributes() From 104fe03ccb0ee800e79aa49de17b19858f8aa800 Mon Sep 17 00:00:00 2001 From: Maksim Doronin Date: Thu, 24 Oct 2024 18:23:19 +0100 Subject: [PATCH 016/233] Add opencv config (#27164) ### Details: - OpenCV is NPU dependency only for single-image-test and protopipe. 
Since both NPU tools has moved to OpenVINO, for easier synchronization, we need to version OpenCV in OpenVINO NPU project ### Tickets: - E-143100 --- src/plugins/intel_npu/tools/opencv_version.json | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 src/plugins/intel_npu/tools/opencv_version.json diff --git a/src/plugins/intel_npu/tools/opencv_version.json b/src/plugins/intel_npu/tools/opencv_version.json new file mode 100644 index 00000000000000..3f51608bf4281b --- /dev/null +++ b/src/plugins/intel_npu/tools/opencv_version.json @@ -0,0 +1,3 @@ +{ + "opencv" : "3919f33e21fd0783f67901ad3429101f9b39c798" +} From 9b377f4f857fe7aa401bb7cce3bcdc589d3cd4c7 Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Thu, 24 Oct 2024 21:39:41 +0200 Subject: [PATCH 017/233] [TESTS] Remove unused requirements.txt (#27235) ### Details: - *requirements.txt from pytorch hub tests are unused after #26856 * ### Tickets: - *CVS-155878* --- .../model_hub_tests/pytorch/requirements.txt | 42 ------------------- .../pytorch/requirements_secondary.txt | 3 -- 2 files changed, 45 deletions(-) delete mode 100644 tests/model_hub_tests/pytorch/requirements.txt delete mode 100644 tests/model_hub_tests/pytorch/requirements_secondary.txt diff --git a/tests/model_hub_tests/pytorch/requirements.txt b/tests/model_hub_tests/pytorch/requirements.txt deleted file mode 100644 index 18f95791368d48..00000000000000 --- a/tests/model_hub_tests/pytorch/requirements.txt +++ /dev/null @@ -1,42 +0,0 @@ --c ../../constraints.txt -auto-gptq>=0.5.1 -av -basicsr -datasets -easyocr -facexlib -numpy -librosa -optimum -packaging -pandas -protobuf -pyctcdecode -pytest -pytest-html -sacremoses -sentencepiece -soundfile -super-image -timm -torch -torchaudio -torchvision -transformers -wheel -PyYAML -kornia - -# use latest released version once it's available -git+https://github.com/huggingface/optimum-intel.git@main -# set 'export HF_HUB_ENABLE_HF_TRANSFER=1' to benefits from hf_transfer -hf_transfer - -# requirements for specific models -# - hf-tiny-model-private/tiny-random-RoFormerForCausalLM -rjieba - -# - katuni4ka/tiny-random-qwen -# - katuni4ka/tiny-random-internlm2 -transformers_stream_generator -einops diff --git a/tests/model_hub_tests/pytorch/requirements_secondary.txt b/tests/model_hub_tests/pytorch/requirements_secondary.txt deleted file mode 100644 index 634f481dfc9269..00000000000000 --- a/tests/model_hub_tests/pytorch/requirements_secondary.txt +++ /dev/null @@ -1,3 +0,0 @@ --c ../../constraints.txt -# This file contains requirements dependednt from modules in requirements.txt -natten -f https://shi-labs.com/natten/wheels/cpu/torch2.0.0/index.html From 433e44e37d3df3ee2db1ebbf2cbf0651ad86f6f6 Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Thu, 24 Oct 2024 20:56:32 +0100 Subject: [PATCH 018/233] [NPUW] Add Slice before last MatMul (#27229) Based on https://github.com/openvinotoolkit/openvino.genai/pull/814 --- .../src/al/include/intel_npu/config/npuw.hpp | 1 + .../intel_npu/npuw_private_properties.hpp | 8 + .../intel_npu/src/al/src/config/npuw.cpp | 1 + .../src/plugin/npuw/compiled_model.cpp | 11 ++ .../plugin/npuw/partitioning/patterns/opt.cpp | 158 ++++++++++++++++-- .../plugin/npuw/partitioning/patterns/opt.hpp | 21 +++ 6 files changed, 188 insertions(+), 12 deletions(-) diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp index 3eb7d3df218b41..7b0dab3d16da3c 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp +++ 
b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp @@ -43,6 +43,7 @@ DEFINE_OPT(NPUW_FOLD, bool, false, npuw::partitioning::fold, CompileTime); DEFINE_OPT(NPUW_CWAI, bool, false, npuw::partitioning::cwai, CompileTime); DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, CompileTime); DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims, CompileTime); +DEFINE_OPT(NPUW_SLICE_OUT, bool, false, npuw::partitioning::slice_out, CompileTime); DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, CompileTime); DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, CompileTime); DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 128, npuw::partitioning::spatial_nway, CompileTime); diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp index a3eb4ecfa8cb63..5d6c6da22eb994 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp @@ -194,6 +194,14 @@ static constexpr ov::Property dyn_quant{"NPUW_DQ"}; */ static constexpr ov::Property par_matmul_merge_dims{"NPUW_PMM"}; +/** + * @brief + * Type: bool. + * Add Slice before the last MatMul reducing output's dimention. + * Default value: false. + */ +static constexpr ov::Property slice_out{"NPUW_SLICE_OUT"}; + /** * @brief * Type: boolean. diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp index 3b108c2068b70d..6a519a0f754a32 100644 --- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp +++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp @@ -28,6 +28,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); desc.add(); desc.add(); desc.add(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index c6ef93ff1044be..69d68e020b887b 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -146,6 +146,16 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, rewr.run_on_model(model); } + if (m_cfg.get<::intel_npu::NPUW_SLICE_OUT>()) { + // Add Slice before last MatMul for the prefill model + ov::pass::GraphRewrite rewr; + rewr.add_matcher(); + rewr.add_matcher(); + rewr.add_matcher(); + rewr.add_matcher(); + rewr.run_on_model(model); + } + auto partitioning = getPartitioning(model, m_cfg); m_total_stat.gflops = partitioning.total_gflops; m_total_stat.ops = partitioning.total_ops; @@ -906,6 +916,7 @@ void ov::npuw::CompiledModel::implement_properties() { BIND(npuw::partitioning::cwai, NPUW_CWAI), BIND(npuw::partitioning::dyn_quant, NPUW_DQ), BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM), + BIND(npuw::partitioning::slice_out, NPUW_SLICE_OUT), BIND(npuw::partitioning::spatial, NPUW_SPATIAL), BIND(npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY), BIND(npuw::partitioning::spatial_dyn, NPUW_SPATIAL_DYN), diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp index ddf1449adb9d59..6040e1e112e894 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp @@ -6,18 +6,7 @@ #include 
"../../logging.hpp" #include "../../util.hpp" -#include "openvino/op/add.hpp" -#include "openvino/op/broadcast.hpp" -#include "openvino/op/concat.hpp" -#include "openvino/op/convert.hpp" -#include "openvino/op/gather.hpp" -#include "openvino/op/matmul.hpp" -#include "openvino/op/multiply.hpp" -#include "openvino/op/reduce_sum.hpp" -#include "openvino/op/reshape.hpp" -#include "openvino/op/slice.hpp" -#include "openvino/op/split.hpp" -#include "openvino/op/subtract.hpp" +#include "openvino/op/ops.hpp" #include "openvino/op/util/op_types.hpp" #include "openvino/pass/pattern/op/label.hpp" // any_input #include "openvino/pass/pattern/op/optional.hpp" @@ -1296,6 +1285,151 @@ CompressDictMatMulf32::CompressDictMatMulf32(Context::Ref ctx) { register_matcher(std::make_shared(res, "OptCompressDictMatMulf32"), std::move(callback)); } +SliceLastMatmul::SliceLastMatmul() { + auto matmul = opp::wrap_type({opp::any_input(), opp::any_input()}); + auto res = opp::wrap_type({matmul}); + + // Note: Use [=] to make sure the above objects stay alive in the callback + auto callback = [=](ov::pass::pattern::Matcher& m) { + auto& node_to_output = m.get_pattern_value_map(); + + auto matched_out_matmul = node_to_output.at(matmul); + + auto shape = matched_out_matmul.get_node()->input(0).get_shape(); + + if (shape.size() == 3 && shape[1] > 1) { + auto start = std::make_shared(ov::element::i32, + ov::Shape{3}, + std::vector{0, int32_t(shape[1] - 1), 0}); + auto stop = + std::make_shared(ov::element::i32, + ov::Shape{3}, + std::vector{1, int32_t(shape[1]), int32_t(shape[2])}); + auto step = + std::make_shared(ov::element::i32, ov::Shape{3}, std::vector{1, 1, 1}); + + auto slice = + std::make_shared(matched_out_matmul.get_node()->input_value(0), start, stop, step); + + matched_out_matmul.get_node()->input(0).replace_source_output(slice); + + return true; // root was changed + } + return false; // root hasn't changed + }; + register_matcher(std::make_shared(res, "SliceLastMatmul"), std::move(callback)); +} + +SliceLastMatmulAdd::SliceLastMatmulAdd() { + auto matmul = opp::wrap_type({opp::any_input(), opp::any_input()}); + auto add = opp::wrap_type({matmul, opp::any_input()}); + auto res = opp::wrap_type({add}); + + // Note: Use [=] to make sure the above objects stay alive in the callback + auto callback = [=](ov::pass::pattern::Matcher& m) { + auto& node_to_output = m.get_pattern_value_map(); + + auto matched_out_matmul = node_to_output.at(matmul); + + auto shape = matched_out_matmul.get_node()->input(0).get_shape(); + + if (shape.size() == 3 && shape[1] > 1) { + auto start = std::make_shared(ov::element::i32, + ov::Shape{3}, + std::vector{0, int32_t(shape[1] - 1), 0}); + auto stop = + std::make_shared(ov::element::i32, + ov::Shape{3}, + std::vector{1, int32_t(shape[1]), int32_t(shape[2])}); + auto step = + std::make_shared(ov::element::i32, ov::Shape{3}, std::vector{1, 1, 1}); + + auto slice = + std::make_shared(matched_out_matmul.get_node()->input_value(0), start, stop, step); + + matched_out_matmul.get_node()->input(0).replace_source_output(slice); + + return true; // root was changed + } + return false; // root hasn't changed + }; + register_matcher(std::make_shared(res, "SliceLastMatmulAdd"), std::move(callback)); +} + +SliceLastMatmulTranspose::SliceLastMatmulTranspose() { + auto matmul = opp::wrap_type({opp::any_input(), opp::any_input()}); + auto add = opp::wrap_type({matmul, opp::any_input()}); + auto res = opp::wrap_type({matmul}); + + // Note: Use [=] to make sure the above objects stay alive in the 
callback + auto callback = [=](ov::pass::pattern::Matcher& m) { + auto& node_to_output = m.get_pattern_value_map(); + + auto matched_out_matmul = node_to_output.at(matmul); + + auto shape = matched_out_matmul.get_node()->input(0).get_shape(); + + if (shape.size() == 3 && shape[1] > 1) { + auto start = std::make_shared(ov::element::i32, + ov::Shape{3}, + std::vector{0, int32_t(shape[1] - 1), 0}); + auto stop = + std::make_shared(ov::element::i32, + ov::Shape{3}, + std::vector{1, int32_t(shape[1]), int32_t(shape[2])}); + auto step = + std::make_shared(ov::element::i32, ov::Shape{3}, std::vector{1, 1, 1}); + + auto slice = + std::make_shared(matched_out_matmul.get_node()->input_value(0), start, stop, step); + + matched_out_matmul.get_node()->input(0).replace_source_output(slice); + + return true; // root was changed + } + return false; // root hasn't changed + }; + register_matcher(std::make_shared(res, "SliceLastMatmulTranspose"), std::move(callback)); +} + +SliceLastMatmulMultiply::SliceLastMatmulMultiply() { + auto matmul = opp::wrap_type({opp::any_input(), opp::any_input()}); + auto div = opp::wrap_type({matmul, opp::any_input()}); + auto tanh = opp::wrap_type({div}); + auto multiply = opp::wrap_type({tanh, opp::any_input()}); + auto res = opp::wrap_type({multiply}); + + // Note: Use [=] to make sure the above objects stay alive in the callback + auto callback = [=](ov::pass::pattern::Matcher& m) { + auto& node_to_output = m.get_pattern_value_map(); + + auto matched_out_matmul = node_to_output.at(matmul); + + auto shape = matched_out_matmul.get_node()->input(0).get_shape(); + + if (shape.size() == 3 && shape[1] > 1) { + auto start = std::make_shared(ov::element::i32, + ov::Shape{3}, + std::vector{0, int32_t(shape[1] - 1), 0}); + auto stop = + std::make_shared(ov::element::i32, + ov::Shape{3}, + std::vector{1, int32_t(shape[1]), int32_t(shape[2])}); + auto step = + std::make_shared(ov::element::i32, ov::Shape{3}, std::vector{1, 1, 1}); + + auto slice = + std::make_shared(matched_out_matmul.get_node()->input_value(0), start, stop, step); + + matched_out_matmul.get_node()->input(0).replace_source_output(slice); + + return true; // root was changed + } + return false; // root hasn't changed + }; + register_matcher(std::make_shared(res, "SliceLastMatmulMultiply"), std::move(callback)); +} + } // namespace opt } // namespace patterns } // namespace npuw diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp index b649f6a136c2e7..a66012d4a85fb8 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp @@ -149,6 +149,27 @@ class CompressDictMatMulf32 : public ov::pass::MatcherPass { CompressDictMatMulf32(Context::Ref ctx); }; +// Slice last Matmul +class SliceLastMatmul : public ov::pass::MatcherPass { +public: + SliceLastMatmul(); +}; + +class SliceLastMatmulAdd : public ov::pass::MatcherPass { +public: + SliceLastMatmulAdd(); +}; + +class SliceLastMatmulTranspose : public ov::pass::MatcherPass { +public: + SliceLastMatmulTranspose(); +}; + +class SliceLastMatmulMultiply : public ov::pass::MatcherPass { +public: + SliceLastMatmulMultiply(); +}; + } // namespace opt } // namespace patterns } // namespace npuw From 79b0baddd05652b88092fccd9cb1c14846ea77a3 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 25 Oct 2024 03:30:54 +0400 Subject: [PATCH 019/233] Fixed vcpkg Android ARM64 build with ONNX 
1.16.2 (#27217) Replacement for https://github.com/openvinotoolkit/openvino/pull/27171 JIRA CVS-155558 --- conan.lock | 2 +- conanfile.txt | 2 +- src/frontends/onnx/frontend/src/core/tensor.cpp | 14 -------------- src/frontends/onnx/frontend/src/core/tensor.hpp | 17 ----------------- src/frontends/onnx/frontend/src/frontend.cpp | 8 ++++++-- .../onnx/frontend/src/utils/common.cpp | 10 ---------- src/frontends/onnx/onnx_common/CMakeLists.txt | 15 --------------- src/frontends/onnx/onnx_common/src/utils.cpp | 8 -------- src/frontends/onnx/tests/CMakeLists.txt | 15 --------------- src/frontends/onnx/tests/onnx_import.in.cpp | 2 -- src/frontends/paddle/src/frontend.cpp | 8 ++++++-- thirdparty/dependencies.cmake | 14 ++++++++++++-- vcpkg.json | 4 ++-- 13 files changed, 28 insertions(+), 91 deletions(-) diff --git a/conan.lock b/conan.lock index a21a2a8d7b52f8..f0cf4c64529cfc 100644 --- a/conan.lock +++ b/conan.lock @@ -10,7 +10,7 @@ "opencl-icd-loader/2023.04.17#5f73dd9f0c023d416a7f162e320b9c77%1692732261.088", "opencl-headers/2023.04.17#3d98f2d12a67c2400de6f11d5335b5a6%1683936272.16", "opencl-clhpp-headers/2023.04.17#7c62fcc7ac2559d4839150d2ebaac5c8%1685450803.672", - "onnx/1.16.0#4d2d4f24d6f73b8a7551e001839631f0%1712404811.278", + "onnx/1.16.2#b5e8d35b10d454b26751762922465eb8%1712404811.278", "onetbb/2021.10.0#cbb2fc43088070b48f6e4339bc8fa0e1%1693812561.235", "ittapi/3.24.0#9246125f13e7686dee2b0c992b71db94%1682969872.743", "hwloc/2.9.2#1c63e2eccac57048ae226e6c946ebf0e%1688677682.002", diff --git a/conanfile.txt b/conanfile.txt index f124179d52bf12..b35a8bda22543a 100644 --- a/conanfile.txt +++ b/conanfile.txt @@ -7,7 +7,7 @@ opencl-icd-loader/[>=2023.04.17] rapidjson/[>=1.1.0] xbyak/[>=6.62] snappy/[>=1.1.7] -onnx/1.16.0 +onnx/1.16.2 pybind11/[>=2.12.0] flatbuffers/[>=22.9.24] diff --git a/src/frontends/onnx/frontend/src/core/tensor.cpp b/src/frontends/onnx/frontend/src/core/tensor.cpp index 4f8f54e2d83690..b23f6c55253ac1 100644 --- a/src/frontends/onnx/frontend/src/core/tensor.cpp +++ b/src/frontends/onnx/frontend/src/core/tensor.cpp @@ -82,18 +82,11 @@ std::vector Tensor::get_data() const { if (m_tensor_proto->has_raw_data()) { return detail::__get_raw_data(m_tensor_proto->raw_data(), m_tensor_proto->data_type()); } -#ifdef ONNX_VERSION_116 if (m_tensor_proto->data_type() == TensorProto_DataType::TensorProto_DataType_INT8 || m_tensor_proto->data_type() == TensorProto_DataType::TensorProto_DataType_INT4) { return detail::__get_data(m_tensor_proto->int32_data()); } ONNX_INVALID_DATA_TYPE(m_tensor_proto->data_type(), "INT4, INT8, raw data"); -#else - if (m_tensor_proto->data_type() == TensorProto_DataType::TensorProto_DataType_INT8) { - return detail::__get_data(m_tensor_proto->int32_data()); - } - ONNX_INVALID_DATA_TYPE(m_tensor_proto->data_type(), "INT8, raw data"); -#endif } template <> @@ -146,18 +139,11 @@ std::vector Tensor::get_data() const { if (m_tensor_proto->has_raw_data()) { return detail::__get_raw_data(m_tensor_proto->raw_data(), m_tensor_proto->data_type()); } -#ifdef ONNX_VERSION_116 if (m_tensor_proto->data_type() == TensorProto_DataType::TensorProto_DataType_UINT8 || m_tensor_proto->data_type() == TensorProto_DataType::TensorProto_DataType_UINT4) { return detail::__get_data(m_tensor_proto->int32_data()); } ONNX_INVALID_DATA_TYPE(m_tensor_proto->data_type(), "UINT4, UINT8, raw data"); -#else - if (m_tensor_proto->data_type() == TensorProto_DataType::TensorProto_DataType_UINT8) { - return detail::__get_data(m_tensor_proto->int32_data()); - } - 
ONNX_INVALID_DATA_TYPE(m_tensor_proto->data_type(), "UINT8, raw data"); -#endif } template <> diff --git a/src/frontends/onnx/frontend/src/core/tensor.hpp b/src/frontends/onnx/frontend/src/core/tensor.hpp index ae6fe28754b4e5..af4d299f9d45e7 100644 --- a/src/frontends/onnx/frontend/src/core/tensor.hpp +++ b/src/frontends/onnx/frontend/src/core/tensor.hpp @@ -65,10 +65,8 @@ class Tensor { enum class Type { undefined = TensorProto_DataType::TensorProto_DataType_UNDEFINED, float32 = TensorProto_DataType::TensorProto_DataType_FLOAT, -#ifdef ONNX_VERSION_116 uint4 = TensorProto_DataType::TensorProto_DataType_UINT4, int4 = TensorProto_DataType::TensorProto_DataType_INT4, -#endif uint8 = TensorProto_DataType::TensorProto_DataType_UINT8, int8 = TensorProto_DataType::TensorProto_DataType_INT8, uint16 = TensorProto_DataType::TensorProto_DataType_UINT16, @@ -146,10 +144,8 @@ class Tensor { return ov::element::f16; case TensorProto_DataType::TensorProto_DataType_DOUBLE: return ov::element::f64; -#ifdef ONNX_VERSION_116 case TensorProto_DataType::TensorProto_DataType_INT4: return ov::element::i4; -#endif case TensorProto_DataType::TensorProto_DataType_INT8: return ov::element::i8; case TensorProto_DataType::TensorProto_DataType_INT16: @@ -158,10 +154,8 @@ class Tensor { return ov::element::i32; case TensorProto_DataType::TensorProto_DataType_INT64: return ov::element::i64; -#ifdef ONNX_VERSION_116 case TensorProto_DataType::TensorProto_DataType_UINT4: return ov::element::u4; -#endif case TensorProto_DataType::TensorProto_DataType_UINT8: return ov::element::u8; case TensorProto_DataType::TensorProto_DataType_UINT16: @@ -205,10 +199,8 @@ class Tensor { return make_ov_constant(ov::element::f16); case TensorProto_DataType::TensorProto_DataType_DOUBLE: return make_ov_constant(ov::element::f64); -#ifdef ONNX_VERSION_116 case TensorProto_DataType::TensorProto_DataType_INT4: return make_ov_constant(ov::element::i4); -#endif case TensorProto_DataType::TensorProto_DataType_INT8: return make_ov_constant(ov::element::i8); case TensorProto_DataType::TensorProto_DataType_INT16: @@ -217,10 +209,8 @@ class Tensor { return make_ov_constant(ov::element::i32); case TensorProto_DataType::TensorProto_DataType_INT64: return make_ov_constant(ov::element::i64); -#ifdef ONNX_VERSION_116 case TensorProto_DataType::TensorProto_DataType_UINT4: return make_ov_constant(ov::element::u4); -#endif case TensorProto_DataType::TensorProto_DataType_UINT8: return make_ov_constant(ov::element::u8); case TensorProto_DataType::TensorProto_DataType_UINT16: @@ -238,17 +228,10 @@ class Tensor { case TensorProto_DataType::TensorProto_DataType_STRING: return make_ov_constant(ov::element::string); default: -#ifdef ONNX_VERSION_116 ONNX_UNSUPPORTED_DATA_TYPE( m_tensor_proto->data_type(), "BOOL, BFLOAT16, FLOAT8E4M3FN, FLOAT8E5M2, FLOAT, FLOAT16, DOUBLE, INT4, INT8, INT16, INT32, INT64, " "UINT4, UINT8, UINT16, UINT32, UINT64, STRING"); -#else - ONNX_UNSUPPORTED_DATA_TYPE( - m_tensor_proto->data_type(), - "BOOL, BFLOAT16, FLOAT8E4M3FN, FLOAT8E5M2, FLOAT, FLOAT16, DOUBLE, INT8, INT16, INT32, INT64, " - "UINT8, UINT16, UINT32, UINT64, STRING"); -#endif } } diff --git a/src/frontends/onnx/frontend/src/frontend.cpp b/src/frontends/onnx/frontend/src/frontend.cpp index 8afc9b661ec28d..5ad28be3654422 100644 --- a/src/frontends/onnx/frontend/src/frontend.cpp +++ b/src/frontends/onnx/frontend/src/frontend.cpp @@ -8,7 +8,9 @@ #endif #include -#ifndef OV_PROTOBUF_ABSL_IS_USED +#ifdef OV_PROTOBUF_ABSL_IS_USED +# include +#else # include #endif @@ -47,7 +49,9 @@ 
ONNX_FRONTEND_C_API void* get_front_end_data() { }; #ifndef OPENVINO_DEBUG_ENABLE // disable protobuf logging -# ifndef OV_PROTOBUF_ABSL_IS_USED +# ifdef OV_PROTOBUF_ABSL_IS_USED + absl::SetGlobalVLogLevel(0); +# else google::protobuf::SetLogHandler(nullptr); # endif #endif diff --git a/src/frontends/onnx/frontend/src/utils/common.cpp b/src/frontends/onnx/frontend/src/utils/common.cpp index 66fdcf1c7830c7..46c7be75bbdd66 100644 --- a/src/frontends/onnx/frontend/src/utils/common.cpp +++ b/src/frontends/onnx/frontend/src/utils/common.cpp @@ -42,10 +42,8 @@ const ov::element::Type& get_ov_element_type(int64_t onnx_type) { return ov::element::f16; case TensorProto_DataType::TensorProto_DataType_FLOAT: return ov::element::f32; -#ifdef ONNX_VERSION_116 case TensorProto_DataType::TensorProto_DataType_INT4: return ov::element::i4; -#endif case TensorProto_DataType::TensorProto_DataType_INT8: return ov::element::i8; case TensorProto_DataType::TensorProto_DataType_INT16: @@ -54,10 +52,8 @@ const ov::element::Type& get_ov_element_type(int64_t onnx_type) { return ov::element::i32; case TensorProto_DataType::TensorProto_DataType_INT64: return ov::element::i64; -#ifdef ONNX_VERSION_116 case TensorProto_DataType::TensorProto_DataType_UINT4: return ov::element::u4; -#endif case TensorProto_DataType::TensorProto_DataType_UINT8: return ov::element::u8; case TensorProto_DataType::TensorProto_DataType_UINT16: @@ -77,15 +73,9 @@ const ov::element::Type& get_ov_element_type(int64_t onnx_type) { case TensorProto_DataType::TensorProto_DataType_STRING: return ov::element::string; } -#ifdef ONNX_VERSION_116 ONNX_UNSUPPORTED_DATA_TYPE(onnx_type, "BOOL, BFLOAT16, FLOAT8E4M3FN, FLOAT8E5M2, FLOAT, FLOAT16, DOUBLE, INT4, INT8, INT16, " "INT32, INT64, UINT4, UINT8, UINT16, UINT32, UINT64, STRING, UNDEFINED"); -#else - ONNX_UNSUPPORTED_DATA_TYPE(onnx_type, - "BOOL, BFLOAT16, FLOAT8E4M3FN, FLOAT8E5M2, FLOAT, FLOAT16, DOUBLE, INT8, INT16, " - "INT32, INT64, UINT8, UINT16, UINT32, UINT64, STRING, UNDEFINED"); -#endif } void default_op_checks(const Node& node, size_t min_inputs_size) { diff --git a/src/frontends/onnx/onnx_common/CMakeLists.txt b/src/frontends/onnx/onnx_common/CMakeLists.txt index a743c5ac40a0dd..d63bce4083087c 100644 --- a/src/frontends/onnx/onnx_common/CMakeLists.txt +++ b/src/frontends/onnx/onnx_common/CMakeLists.txt @@ -35,18 +35,3 @@ ov_link_system_libraries(${TARGET_NAME} PUBLIC onnx_proto onnx) ov_add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME}) ov_install_static_lib(${TARGET_NAME} ${OV_CPACK_COMP_CORE}) - -# Temporary solution until vcpkg doesn't have fresh ONNX, -# trying determine used version of ONNX to enable modern functionality -find_package(ONNX 1.16.0 QUIET COMPONENTS onnx onnx_proto NO_MODULE) -if(ONNX_FOUND) - target_compile_definitions(${TARGET_NAME} PUBLIC ONNX_VERSION_116) -else() - if(EXISTS "${CMAKE_SOURCE_DIR}/thirdparty/onnx/onnx/VERSION_NUMBER") - file(READ "${CMAKE_SOURCE_DIR}/thirdparty/onnx/onnx/VERSION_NUMBER" ONNX_VERSION) - string(STRIP "${ONNX_VERSION}" ONNX_VERSION) - if((ONNX_VERSION GREATER "1.16.0") OR (ONNX_VERSION EQUAL "1.16.0")) - target_compile_definitions(${TARGET_NAME} PUBLIC ONNX_VERSION_116) - endif() - endif() -endif() diff --git a/src/frontends/onnx/onnx_common/src/utils.cpp b/src/frontends/onnx/onnx_common/src/utils.cpp index 6ec409c5671458..b83dea1b4cfd99 100644 --- a/src/frontends/onnx/onnx_common/src/utils.cpp +++ b/src/frontends/onnx/onnx_common/src/utils.cpp @@ -30,10 +30,8 @@ size_t get_onnx_data_size(int32_t onnx_type) { return 
sizeof(ov::float8_e4m3); case TensorProto_DataType_FLOAT8E5M2: return sizeof(ov::float8_e5m2); -#ifdef ONNX_VERSION_116 case TensorProto_DataType_INT4: return sizeof(int8_t); -#endif case TensorProto_DataType_INT8: return sizeof(int8_t); case TensorProto_DataType_INT16: @@ -42,10 +40,8 @@ size_t get_onnx_data_size(int32_t onnx_type) { return sizeof(int32_t); case TensorProto_DataType_INT64: return sizeof(int64_t); -#ifdef ONNX_VERSION_116 case TensorProto_DataType_UINT4: return sizeof(uint8_t); -#endif case TensorProto_DataType_UINT8: return sizeof(uint8_t); case TensorProto_DataType_UINT16: @@ -66,16 +62,12 @@ const std::map OV_2_ONNX_TYPES = { {ov::element::Type_t::f16, TensorProto_DataType::TensorProto_DataType_FLOAT16}, {ov::element::Type_t::f32, TensorProto_DataType::TensorProto_DataType_FLOAT}, {ov::element::Type_t::f64, TensorProto_DataType::TensorProto_DataType_DOUBLE}, -#ifdef ONNX_VERSION_116 {ov::element::Type_t::i4, TensorProto_DataType::TensorProto_DataType_INT4}, -#endif {ov::element::Type_t::i8, TensorProto_DataType::TensorProto_DataType_INT8}, {ov::element::Type_t::i16, TensorProto_DataType::TensorProto_DataType_INT16}, {ov::element::Type_t::i32, TensorProto_DataType::TensorProto_DataType_INT32}, {ov::element::Type_t::i64, TensorProto_DataType::TensorProto_DataType_INT64}, -#ifdef ONNX_VERSION_116 {ov::element::Type_t::u4, TensorProto_DataType::TensorProto_DataType_UINT4}, -#endif {ov::element::Type_t::u8, TensorProto_DataType::TensorProto_DataType_UINT8}, {ov::element::Type_t::u16, TensorProto_DataType::TensorProto_DataType_UINT16}, {ov::element::Type_t::u32, TensorProto_DataType::TensorProto_DataType_UINT32}, diff --git a/src/frontends/onnx/tests/CMakeLists.txt b/src/frontends/onnx/tests/CMakeLists.txt index 9b928773b7d65a..f508fdb4c1a903 100644 --- a/src/frontends/onnx/tests/CMakeLists.txt +++ b/src/frontends/onnx/tests/CMakeLists.txt @@ -134,21 +134,6 @@ target_compile_definitions(ov_onnx_frontend_tests set(ONNX_OPSET_VERSION 17 CACHE INTERNAL "Supported version of ONNX operator set") target_compile_definitions(ov_onnx_frontend_tests PRIVATE ONNX_OPSET_VERSION=${ONNX_OPSET_VERSION}) -# Temporary solution until vcpkg doesn't have fresh ONNX, -# trying determine used version of ONNX to enable modern functionality -find_package(ONNX 1.16.0 QUIET COMPONENTS onnx onnx_proto NO_MODULE) -if(ONNX_FOUND) - target_compile_definitions(ov_onnx_frontend_tests PRIVATE ONNX_VERSION_116) -else() - if(EXISTS "${CMAKE_SOURCE_DIR}/thirdparty/onnx/onnx/VERSION_NUMBER") - file(READ "${CMAKE_SOURCE_DIR}/thirdparty/onnx/onnx/VERSION_NUMBER" ONNX_VERSION) - string(STRIP "${ONNX_VERSION}" ONNX_VERSION) - if((ONNX_VERSION GREATER "1.16.0") OR (ONNX_VERSION EQUAL "1.16.0")) - target_compile_definitions(ov_onnx_frontend_tests PRIVATE ONNX_VERSION_116) - endif() - endif() -endif() - if(ONNX_TESTS_DEPENDENCIES) add_dependencies(ov_onnx_frontend_tests ${ONNX_TESTS_DEPENDENCIES}) endif() diff --git a/src/frontends/onnx/tests/onnx_import.in.cpp b/src/frontends/onnx/tests/onnx_import.in.cpp index bc27a759d415a0..c57cb2babc569b 100644 --- a/src/frontends/onnx/tests/onnx_import.in.cpp +++ b/src/frontends/onnx/tests/onnx_import.in.cpp @@ -159,7 +159,6 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_bool_init_raw) { test_case.run(); } -#ifdef ONNX_VERSION_116 OPENVINO_TEST(${BACKEND_NAME}, onnx_int4_const) { auto model = convert_model("int4_const.onnx"); @@ -195,7 +194,6 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_uint4_input) { test_case.run(); } -#endif OPENVINO_TEST(${BACKEND_NAME}, onnx_model_add_abc_initializers) { 
auto model = convert_model("add_abc_initializers.onnx"); diff --git a/src/frontends/paddle/src/frontend.cpp b/src/frontends/paddle/src/frontend.cpp index c6febe08437b5d..163b4d894cb766 100644 --- a/src/frontends/paddle/src/frontend.cpp +++ b/src/frontends/paddle/src/frontend.cpp @@ -10,7 +10,9 @@ #endif #include -#ifndef OV_PROTOBUF_ABSL_IS_USED +#ifdef OV_PROTOBUF_ABSL_IS_USED +# include +#else # include #endif @@ -594,7 +596,9 @@ PADDLE_C_API void* get_front_end_data() { #ifndef OPENVINO_DEBUG_ENABLE // disable protobuf logging -# ifndef OV_PROTOBUF_ABSL_IS_USED +# ifdef OV_PROTOBUF_ABSL_IS_USED + absl::SetGlobalVLogLevel(0); +# else google::protobuf::SetLogHandler(nullptr); # endif #endif diff --git a/thirdparty/dependencies.cmake b/thirdparty/dependencies.cmake index 0e8536a1714a35..c22b06bcf5863c 100644 --- a/thirdparty/dependencies.cmake +++ b/thirdparty/dependencies.cmake @@ -335,7 +335,10 @@ if(ENABLE_OV_PADDLE_FRONTEND OR ENABLE_OV_ONNX_FRONTEND OR ENABLE_OV_TF_FRONTEND # try to find newer version first (major is changed) # see https://protobuf.dev/support/version-support/ and # https://github.com/protocolbuffers/protobuf/commit/d61f75ff6db36b4f9c0765f131f8edc2f86310fa - find_package(Protobuf 4.22.0 QUIET CONFIG) + find_package(Protobuf 5.26.0 QUIET CONFIG) + if(NOT Protobuf_FOUND) + find_package(Protobuf 4.22.0 QUIET CONFIG) + endif() if(Protobuf_FOUND) # protobuf was found via CONFIG mode, let's save it for later usage in OpenVINOConfig.cmake static build set(protobuf_config CONFIG) @@ -500,10 +503,17 @@ endif() # if(ENABLE_OV_ONNX_FRONTEND) - find_package(ONNX 1.15.0 QUIET COMPONENTS onnx onnx_proto NO_MODULE) + find_package(ONNX 1.16.2 QUIET COMPONENTS onnx onnx_proto NO_MODULE) if(ONNX_FOUND) # conan and vcpkg create imported targets 'onnx' and 'onnx_proto' + # newer versions of ONNX in vcpkg has ONNX:: prefix, let's create aliases + if(TARGET ONNX::onnx) + add_library(onnx ALIAS ONNX::onnx) + endif() + if(TARGET ONNX::onnx_proto) + add_library(onnx_proto ALIAS ONNX::onnx_proto) + endif() else() add_subdirectory(thirdparty/onnx) endif() diff --git a/vcpkg.json b/vcpkg.json index 7214195df49506..f867c79f6a0790 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -1,7 +1,7 @@ { "$schema": "https://raw.githubusercontent.com/microsoft/vcpkg-tool/main/docs/vcpkg.schema.json", "name": "openvino", - "version": "2024.0.0", + "version": "2024.5.0", "maintainers": "OpenVINO Developers ", "summary": "This is a port for Open Visual Inference And Optimization toolkit for AI inference", "description": [ @@ -14,7 +14,7 @@ "homepage": "https://github.com/openvinotoolkit/openvino", "documentation": "https://docs.openvino.ai/latest/index.html", "license": "Apache-2.0", - "builtin-baseline": "7ba0ba7334c3346e7eee1e049ba85da193a8d821", + "builtin-baseline": "88a0bf87b5efd6270502dfe4dde75dd155bd992b", "dependencies": [ { "name": "pkgconf", From 70987649c16bd6f8eeb28876521ba54e53bf40c7 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Fri, 25 Oct 2024 09:42:23 +0400 Subject: [PATCH 020/233] [GPU] Use impls cache in case when shape changed (#27226) ### Details: - Allow checking impls cache & running async compile when primitive shape is changed to improve the performance ### Tickets: - *CVS-155869* --- src/plugins/intel_gpu/src/graph/primitive_inst.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index f90d4e34b08cc2..da56141cee4840 100644 --- 
a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -2505,7 +2505,7 @@ std::shared_ptr ImplementationsFactory::get_primitive_impl_for_p } // 1. If we have static impl in the cache - use it - if (use_async_compilation && inst.get_impl() && inst.get_impl()->is_dynamic()) { + if (use_async_compilation && ((inst.get_impl() && inst.get_impl()->is_dynamic()) || inst.shape_changed())) { auto cached_impl = m_static_impls_cache.get(updated_params); if (cached_impl) { return cached_impl->clone(); From a9dee49482b111cb8f753a110a2f95a997e8c402 Mon Sep 17 00:00:00 2001 From: Tomasz Krupa Date: Fri, 25 Oct 2024 05:46:57 +0000 Subject: [PATCH 021/233] [GPU] Weightless caching - propagate weights_path from IR FE to the plugin (#27162) This PR is an implementation of an idea requested in this discussion: https://github.com/openvinotoolkit/openvino/pull/25731#discussion_r1802669119. --- src/core/src/pass/serialize.cpp | 2 +- src/frontends/ir/src/frontend.cpp | 16 ++++++++---- src/frontends/ir/src/input_model.cpp | 25 +++++++++++++------ src/frontends/ir/src/input_model.hpp | 6 +++-- .../include/intel_gpu/plugin/plugin.hpp | 1 + src/plugins/intel_gpu/src/plugin/plugin.cpp | 20 +++++++++++++++ .../tests/functional/behavior/model_cache.cpp | 5 ++-- 7 files changed, 57 insertions(+), 18 deletions(-) diff --git a/src/core/src/pass/serialize.cpp b/src/core/src/pass/serialize.cpp index 3af6d2c4b5313f..f179630b155d22 100644 --- a/src/core/src/pass/serialize.cpp +++ b/src/core/src/pass/serialize.cpp @@ -1173,7 +1173,7 @@ void ngfunction_2_ir(pugi::xml_node& netXml, pugi::xml_node rt_info_node = netXml.append_child("rt_info"); for (const auto& it : model.get_rt_info()) { // Skip IR version - if (it.first == "version") + if (it.first == "version" || it.first == "__weights_path") continue; serialize_rt_info(rt_info_node, it.first, it.second); } diff --git a/src/frontends/ir/src/frontend.cpp b/src/frontends/ir/src/frontend.cpp index db979a35d932af..c5e137e1decc89 100644 --- a/src/frontends/ir/src/frontend.cpp +++ b/src/frontends/ir/src/frontend.cpp @@ -168,15 +168,21 @@ InputModel::Ptr FrontEnd::load_impl(const std::vector& variants) const return exts; }; - auto create_input_model = [&]() -> std::shared_ptr { + auto create_input_model = [&](std::string weights_path) -> std::shared_ptr { if (provided_model_stream) { - return std::make_shared(*provided_model_stream, weights, create_extensions_map()); + return std::make_shared(*provided_model_stream, + weights, + create_extensions_map(), + std::move(weights_path)); } else if (local_model_stream.is_open()) { - auto input_model = std::make_shared(local_model_stream, weights, create_extensions_map()); + auto input_model = std::make_shared(local_model_stream, + weights, + create_extensions_map(), + std::move(weights_path)); local_model_stream.close(); return input_model; } else if (model_buf) { - return std::make_shared(model_buf, weights, create_extensions_map()); + return std::make_shared(model_buf, weights, create_extensions_map(), std::move(weights_path)); } return nullptr; }; @@ -278,7 +284,7 @@ InputModel::Ptr FrontEnd::load_impl(const std::vector& variants) const } } - return create_input_model(); + return create_input_model(ov::util::path_to_string(weights_path)); } std::shared_ptr FrontEnd::convert(const InputModel::Ptr& model) const { diff --git a/src/frontends/ir/src/input_model.cpp b/src/frontends/ir/src/input_model.cpp index 6c59617c69a48d..b4d9ef164e994c 100644 --- a/src/frontends/ir/src/input_model.cpp 
+++ b/src/frontends/ir/src/input_model.cpp @@ -205,13 +205,16 @@ class InputModel::InputModelIRImpl { std::unordered_map m_opsets; pugi::xml_node m_root; pugi::xml_document m_xml_doc; + std::string m_weights_path; public: InputModelIRImpl(std::istream& model, const std::shared_ptr& weights, - const std::unordered_map& extensions) + const std::unordered_map& extensions, + std::string weights_path) : m_weights(weights), - m_extensions(extensions) { + m_extensions(extensions), + m_weights_path(std::move(weights_path)) { pugi::xml_parse_result res = m_xml_doc.load(model); OPENVINO_ASSERT(res.status == pugi::status_ok, res.description(), " at offset ", res.offset); init_opset(); @@ -219,9 +222,11 @@ class InputModel::InputModelIRImpl { InputModelIRImpl(const std::shared_ptr& model, const std::shared_ptr& weights, - const std::unordered_map& extensions) + const std::unordered_map& extensions, + std::string weights_path) : m_weights(weights), - m_extensions(extensions) { + m_extensions(extensions), + m_weights_path(std::move(weights_path)) { auto res = m_xml_doc.load_buffer(model->get_ptr(), model->size(), pugi::parse_default, pugi::encoding_utf8); OPENVINO_ASSERT(res.status == pugi::status_ok, res.description(), " at offset ", res.offset); init_opset(); @@ -240,14 +245,16 @@ class InputModel::InputModelIRImpl { InputModel::InputModel(std::istream& model, const std::shared_ptr& weights, - const std::unordered_map& extensions) { - _impl = std::make_shared(model, weights, extensions); + const std::unordered_map& extensions, + std::string weights_path) { + _impl = std::make_shared(model, weights, extensions, std::move(weights_path)); } InputModel::InputModel(const std::shared_ptr& model, const std::shared_ptr& weights, - const std::unordered_map& extensions) { - _impl = std::make_shared(model, weights, extensions); + const std::unordered_map& extensions, + std::string weights_path) { + _impl = std::make_shared(model, weights, extensions, std::move(weights_path)); } std::shared_ptr InputModel::convert() { @@ -263,6 +270,8 @@ std::shared_ptr InputModel::InputModelIRImpl::convert() { std::shared_ptr model; visitor.on_attribute("net", model); model->get_rt_info()["version"] = int64_t(version); + if (!m_weights_path.empty()) + model->get_rt_info()["__weights_path"] = m_weights_path; parse_pre_process(m_root, m_weights, model); return model; diff --git a/src/frontends/ir/src/input_model.hpp b/src/frontends/ir/src/input_model.hpp index 331092749bbeb9..a9ad1224c6ca3a 100644 --- a/src/frontends/ir/src/input_model.hpp +++ b/src/frontends/ir/src/input_model.hpp @@ -22,11 +22,13 @@ class InputModel : public ov::frontend::InputModel { public: InputModel(std::istream& stream, const std::shared_ptr& weights, - const std::unordered_map& extensions); + const std::unordered_map& extensions, + std::string weights_path = {}); InputModel(const std::shared_ptr& model_buf, const std::shared_ptr& weights, - const std::unordered_map& extensions); + const std::unordered_map& extensions, + std::string weights_path = {}); std::shared_ptr convert(); }; diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp index 28a20fa737da76..49a45ec9ffa11a 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp @@ -44,6 +44,7 @@ class Plugin : public ov::IPlugin { bool is_metric(const std::string& name) const; ov::Any get_metric(const std::string& name, const ov::AnyMap& arguments) const; 
+ void set_cache_info(const std::shared_ptr& model, ExecutionConfig& properties) const; public: Plugin(); diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 2d29601ef0b69d..9aba7ee1a117eb 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -162,6 +162,22 @@ Plugin::Plugin() { m_compiled_model_runtime_properties["OV_VERSION"] = ov_version.buildNumber; } +void Plugin::set_cache_info(const std::shared_ptr& model, ExecutionConfig& config) const { + // WEIGHTS_PATH is used for the weightless cache mechanism which is used only with + // ov::CacheMode::OPTIMIZE_SIZE setting. Not setting WEIGHTS_PATH will result in not + // using that mechanism. + if (config.get_property(ov::cache_mode) != ov::CacheMode::OPTIMIZE_SIZE) { + return; + } + + const auto& rt_info = model->get_rt_info(); + auto weights_path = rt_info.find("__weights_path"); + if (weights_path != rt_info.end()) { + ov::AnyMap weights_path_property{{"WEIGHTS_PATH", weights_path->second}}; + config.set_property(weights_path_property); + } +} + std::shared_ptr Plugin::compile_model(const std::shared_ptr& model, const ov::AnyMap& orig_config) const { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Plugin::compile_model"); std::string device_id = get_device_id(orig_config); @@ -174,6 +190,8 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< config.set_user_property(orig_config); config.apply_user_properties(context->get_engine().get_device_info()); + set_cache_info(model, config); + auto transformed_model = clone_and_transform_model(model, config, context); { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Plugin::compile_model::CreateCompiledModel"); @@ -193,6 +211,8 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< config.set_user_property(orig_config); config.apply_user_properties(context_impl->get_engine().get_device_info()); + set_cache_info(model, config); + auto transformed_model = clone_and_transform_model(model, config, context_impl); return std::make_shared(transformed_model, shared_from_this(), context_impl, config); } diff --git a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp index bcb6be2fe307e7..880868d8666560 100644 --- a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp +++ b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp @@ -60,7 +60,8 @@ void CheckWeightlessCacheAccuracy::TearDown() { } void CheckWeightlessCacheAccuracy::run() { - ov::AnyMap config = { ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE), ov::weights_path(bin_path) }; + ov::AnyMap config = { ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE) }; + ov::AnyMap config_with_weights_path = { ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE), ov::weights_path(bin_path) }; auto core = ov::test::utils::PluginCache::get().core(); ov::pass::Serialize(xml_path, bin_path).run_on_model(model); @@ -73,7 +74,7 @@ void CheckWeightlessCacheAccuracy::run() { auto ifstr = std::ifstream(cache_path, std::ifstream::binary); ov::CompiledModel imported_model; - OV_ASSERT_NO_THROW(imported_model = core->import_model(ifstr, ov::test::utils::DEVICE_GPU, config)); + OV_ASSERT_NO_THROW(imported_model = core->import_model(ifstr, ov::test::utils::DEVICE_GPU, config_with_weights_path)); ifstr.close(); auto orig_req = compiled_model.create_infer_request(); From c21f572cc45193232d76aa21e821e92445b18725 Mon Sep 17 00:00:00 2001 From: Vladimir 
Paramuzov Date: Fri, 25 Oct 2024 10:11:19 +0400 Subject: [PATCH 022/233] [GPU] Fixes for hybrid quantization (#27127) ### Details: - set LPT callbacks to handle compression and avoid constant folding for it (taken from https://github.com/openvinotoolkit/openvino/pull/20973) - Allow u8/i8 output data type for compressed onednn FC - Disable Dequantize propagation through Transpose if it's a dependency of SDPA to keep Transpose+SDPA fusion --- .../impls/onednn/fully_connected_onednn.hpp | 2 +- .../src/plugin/transformations_pipeline.cpp | 59 +++++++++++++++---- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp index f4495fb5dd1645..39423980521042 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp @@ -50,7 +50,7 @@ struct FullyConnectedImplementationManager : public ImplementationManager { bool compressed_case = fc_prim->compressed_weights && one_of(in0_dt, {data_types::f16, data_types::f32, data_types::i8}) && one_of(wei_dt, {data_types::u8, data_types::i8, data_types::u4, data_types::i4}) && - one_of(out_dt, {data_types::f16, data_types::f32}); + one_of(out_dt, {data_types::f16, data_types::f32, data_types::u8, data_types::i8}); if (!f16f16_case && !f32f32_case && !u8s8_case && !compressed_case) return false; diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index f97b7fae126b47..4b72385663bf9d 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -15,8 +15,11 @@ #include "intel_gpu/plugin/transformations_pipeline.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" #include "intel_gpu/runtime/itt.hpp" +#include "low_precision/add.hpp" #include "low_precision/convolution.hpp" #include "low_precision/convolution_backprop_data.hpp" +#include "low_precision/fold_convert.hpp" +#include "low_precision/fuse_convert.hpp" #include "low_precision/group_convolution.hpp" #include "low_precision/low_precision.hpp" #include "low_precision/mat_mul.hpp" @@ -25,7 +28,9 @@ #include "low_precision/pull_reshape_through_dequantization.hpp" #include "low_precision/pull_transpose_through_dequantization.hpp" #include "low_precision/recurrent_cell.hpp" +#include "low_precision/rt_info/bias_attribute.hpp" #include "low_precision/strided_slice.hpp" +#include "low_precision/transpose.hpp" #include "openvino/core/deprecated.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/core/validation_util.hpp" @@ -46,6 +51,7 @@ #include "openvino/op/reshape.hpp" #include "openvino/op/rnn_cell.hpp" #include "openvino/op/rnn_sequence.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" #include "openvino/op/squeeze.hpp" #include "openvino/op/unsqueeze.hpp" #include "openvino/op/util/sub_graph_base.hpp" @@ -312,13 +318,9 @@ void TransformationsPipeline::apply(std::shared_ptr func) { // it expects to have the same data type for weights and zero points (apply it only for u8 data type, since other compression // types are not supported by oneDNN) manager.register_pass(supported_woq_types, !device_info.supports_immad); - - // Need to check if transformations work correctly for mixed models with both compression and quantization at the same time. 
- if (!is_model_quantized) { - pass_config->set_callback([&](const std::shared_ptr node) { - return !is_decompression_multiply(node); - }); - } + pass_config->set_callback([&](const std::shared_ptr node) { + return !is_decompression_multiply(node); + }); const bool keep_precision_sensitive_in_fp32_1 = true; const bool convert_input_output_precision = false; @@ -687,12 +689,6 @@ void TransformationsPipeline::apply(std::shared_ptr func) { auto lptPassConfig = lptManager.get_pass_config(); // quantized LSTMSequence / GPUSequence are not supported yet. Avoid extra transformation lptPassConfig->disable(); - lptPassConfig->set_callback([](const_node_ptr& node) -> bool { - if (const auto mulitply = std::dynamic_pointer_cast(node)) { - return !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(mulitply); - } - return false; - }); lptPassConfig->set_callback([func, defaultPrecisions](const_node_ptr& node) -> bool { auto fillStaticChannel = [func](const ov::PartialShape& shape, size_t& channel) -> bool { const auto rank = shape.rank(); @@ -729,6 +725,43 @@ void TransformationsPipeline::apply(std::shared_ptr func) { || WeightableLayerTransformation::isAsymmetricOnWeights(node, defaultPrecisions); }); + lptPassConfig->set_callback([&](const_node_ptr& node) -> bool { + for (auto& user : node->get_users()) { + if (ov::is_type(user)) + return true; + } + + return false; + }); + + lptPassConfig->set_callback([](const_node_ptr& node) -> bool { + return ov::is_type(node) && !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(node); + }); + + lptPassConfig->set_callback([](const_node_ptr& node) -> bool { + return ov::marked_as_bias(node); + }); + lptPassConfig->set_callback([](const_node_ptr& node) -> bool { + const auto& consumers = node->get_output_target_inputs(0); + if (consumers.size() == 1) { + const auto consumer = consumers.begin()->get_node()->shared_from_this(); + return ov::is_type(consumer) && is_decompression_multiply(consumer); + } + return false; + }); + lptPassConfig->set_callback([](const_node_ptr& node) -> bool { + if (ov::is_type(node)) { + return ov::is_type(node) && is_decompression_multiply(node); + } else if (ov::is_type(node)) { + const auto& consumers = node->get_output_target_inputs(0); + if (consumers.size() == 1) { + const auto consumer = consumers.begin()->get_node()->shared_from_this(); + return ov::is_type(consumer) && is_decompression_multiply(consumer); + } + } + return false; + }); + lptPassConfig->set_callback([&](const_node_ptr& node) -> bool { // disable MultiplyToGroupConvolution if Multiply with Constant can be fused From e19724e2c4815a06003937662da8508cf806d8a0 Mon Sep 17 00:00:00 2001 From: Nashez Zubair <35090095+nashez@users.noreply.github.com> Date: Fri, 25 Oct 2024 12:04:52 +0530 Subject: [PATCH 023/233] [CPU][ARM64] Add JIT emitter for Eltwise Greater operation (#27145) ### Details: - Added a jit_greater_emitter derived class in aarch64/jit_eltwise_emitters - Created entry Algorithm::EltwiseGreater in the get_supported_precisions in nodes/kernels/aarch64 - Add the EltwiseGreater entry in the aarch64 executors supported algorithms ### Tickets: - Closes: #24421 Signed-off-by: Nashez Zubair --- .../plugin/aarch64/jit_eltwise_emitters.cpp | 52 +++++++++++++++++++ .../plugin/aarch64/jit_eltwise_emitters.hpp | 28 ++++++++++ .../nodes/executors/aarch64/jit_eltwise.cpp | 1 + .../aarch64/jit_uni_eltwise_generic.cpp | 2 + 4 files changed, 83 insertions(+) diff --git 
a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp index d6208e0a43bbe1..17ce08f7159379 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp @@ -736,6 +736,58 @@ std::set> jit_gelu_tanh_emitter::get_supported_precis return {{element::f32}}; } +/// GREATER /// +jit_greater_emitter::jit_greater_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + prepare_table(); +} + +jit_greater_emitter::jit_greater_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { + prepare_table(); +} + +size_t jit_greater_emitter::get_inputs_count() const { return 2; } + +size_t jit_greater_emitter::get_aux_vecs_count() const { return 1; } + +size_t jit_greater_emitter::get_aux_gprs_count() const { return 1; } + +void jit_greater_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel"); + } +} + +template +void jit_greater_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); + + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + const TReg src1 = TReg(in_vec_idxs[0]); + const TReg src2 = TReg(in_vec_idxs[1]); + const TReg dst = TReg(out_vec_idxs[0]); + const TReg aux = TReg(aux_vec_idxs[0]); + + h->fcmgt(dst.s, src1.s, src2.s); + h->ld1r(aux.s, table_val2("one")); + h->and_(dst.b16, dst.b16, aux.b16); +} + +void jit_greater_emitter::register_table_entries() { + push_arg_entry_of("one", 0x3f800000, true); +} + +std::set> jit_greater_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32}}; +} + /// GREATER_EQUAL /// jit_greater_equal_emitter::jit_greater_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp index afecd3029f58db..e2aff7557f7365 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp @@ -278,6 +278,34 @@ class jit_gelu_tanh_emitter : public jit_emitter { void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; }; +class jit_greater_emitter : public jit_emitter { +public: + jit_greater_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); + + jit_greater_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& n); + + size_t get_inputs_count() const override; + + size_t get_aux_vecs_count() const override; + + size_t get_aux_gprs_count() const override; + + 
static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); + +private: + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + + void register_table_entries() override; +}; + class jit_greater_equal_emitter : public jit_emitter { public: jit_greater_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp index 586e7f0705643f..3f1031255d1775 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp @@ -28,6 +28,7 @@ bool JitEltwiseExecutor::isSupported( Algorithm::EltwiseFloor, Algorithm::EltwiseGeluErf, Algorithm::EltwiseGeluTanh, + Algorithm::EltwiseGreater, Algorithm::EltwiseGreaterEqual, Algorithm::EltwiseHswish, Algorithm::EltwiseIsFinite, diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp index 98eb279bb26d48..04286a0c8aaf68 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp @@ -660,6 +660,7 @@ std::shared_ptr jit_uni_eltwise_generic::create_eltwise_emitte OV_CASE(Algorithm::EltwiseMish, ov::intel_cpu::aarch64::jit_mish_emitter), OV_CASE(Algorithm::EltwiseGeluErf, ov::intel_cpu::aarch64::jit_gelu_erf_emitter), OV_CASE(Algorithm::EltwiseGeluTanh, ov::intel_cpu::aarch64::jit_gelu_tanh_emitter), + OV_CASE(Algorithm::EltwiseGreater, ov::intel_cpu::aarch64::jit_greater_emitter), OV_CASE(Algorithm::EltwiseGreaterEqual, ov::intel_cpu::aarch64::jit_greater_equal_emitter), OV_CASE(Algorithm::EltwiseMulAdd, ov::intel_cpu::aarch64::jit_mul_add_emitter), OV_CASE(Algorithm::EltwiseMod, ov::intel_cpu::aarch64::jit_mod_emitter), @@ -829,6 +830,7 @@ std::set> eltwise_precision_helper::get_supported_pre OV_CASE(Algorithm::EltwiseFloor, jit_floor_emitter), OV_CASE(Algorithm::EltwiseGeluErf, jit_gelu_erf_emitter), OV_CASE(Algorithm::EltwiseGeluTanh, jit_gelu_tanh_emitter), + OV_CASE(Algorithm::EltwiseGreater, jit_greater_emitter), OV_CASE(Algorithm::EltwiseGreaterEqual, jit_greater_equal_emitter), OV_CASE(Algorithm::EltwiseHswish, jit_hswish_emitter), OV_CASE(Algorithm::EltwiseIsFinite, jit_is_finite_emitter), From c8415c38592e2fe1d18678433ad3f014ac9760da Mon Sep 17 00:00:00 2001 From: Piotr Kowalczyk Date: Fri, 25 Oct 2024 08:58:34 +0200 Subject: [PATCH 024/233] [ref]: SearchSorted: Made ref impl parallel. (#27181) ### Details: - Made ref impl parallel. Scaling is done via values space. 
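The "scaling via values space" means the work is split over the elements of the `values` input: every output element is an independent binary search over the read-only `sorted` sequence, so the per-value loop needs no synchronization. A minimal standalone sketch of that idea (1-D case only, illustrative names, not the OpenVINO source):

```cpp
#include <algorithm>
#include <cstdint>
#include <execution>
#include <numeric>
#include <vector>

// Sketch: the actual reference runs ov::parallel_for over shape_size(values_shape)
// and handles N-D inputs; this shows only the 1-D parallelization idea.
std::vector<int64_t> search_sorted_sketch(const std::vector<float>& sorted,
                                          const std::vector<float>& values,
                                          bool right_mode) {
    std::vector<int64_t> out(values.size());
    std::vector<size_t> idx(values.size());
    std::iota(idx.begin(), idx.end(), size_t{0});
    // Each iteration writes only out[i] and reads shared immutable data, so it is data-race free.
    std::for_each(std::execution::par, idx.begin(), idx.end(), [&](size_t i) {
        const auto it = right_mode ? std::upper_bound(sorted.begin(), sorted.end(), values[i])
                                   : std::lower_bound(sorted.begin(), sorted.end(), values[i]);
        out[i] = static_cast<int64_t>(it - sorted.begin());
    });
    return out;
}
```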
### Tickets: - *CVS-155707* Co-authored-by: Michal Lukaszewski --- .../include/openvino/reference/search_sorted.hpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/core/reference/include/openvino/reference/search_sorted.hpp b/src/core/reference/include/openvino/reference/search_sorted.hpp index ca5361c388c621..7ea8ec1078a2a1 100644 --- a/src/core/reference/include/openvino/reference/search_sorted.hpp +++ b/src/core/reference/include/openvino/reference/search_sorted.hpp @@ -4,6 +4,7 @@ #pragma once +#include "openvino/core/parallel.hpp" #include "openvino/core/shape.hpp" #include "openvino/reference/utils/coordinate_index.hpp" #include "openvino/reference/utils/coordinate_transform.hpp" @@ -30,7 +31,13 @@ void search_sorted(const T* sorted, }; } - for (const Coordinate& values_coord : values_transform) { + const size_t size = shape_size(values_shape); + + auto func = [&](size_t i) { + auto it = values_transform.begin(); + it += i; + const Coordinate& values_coord = *it; + const auto values_index = coordinate_index(values_coord, values_shape); const T value = values[values_index]; @@ -48,7 +55,9 @@ void search_sorted(const T* sorted, const ptrdiff_t sorted_index = (idx_ptr - sorted) - sorted_index_begin; out[values_index] = static_cast(sorted_index); - } + }; + + ov::parallel_for(size, func); } } // namespace reference From 051e676d2a1b9ac34360993f771504ccbf55b46e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 25 Oct 2024 07:07:55 +0000 Subject: [PATCH 025/233] Bump actions/cache from 4.1.1 to 4.1.2 (#27242) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [actions/cache](https://github.com/actions/cache) from 4.1.1 to 4.1.2.
Release notes for v4.1.2:
- Add GitHub Enterprise Cloud instances hostname filters to inform API endpoint choices (#1474)
- Security fix: Bump braces from 3.0.2 to 3.0.3 (#1475)

You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build_doc.yml | 2 +- .github/workflows/job_cpu_functional_tests.yml | 4 ++-- .github/workflows/mo.yml | 2 +- .github/workflows/ovc.yml | 2 +- .github/workflows/windows_conditional_compilation.yml | 2 +- .github/workflows/windows_vs2019_release.yml | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build_doc.yml b/.github/workflows/build_doc.yml index 8bf61839116bbd..6623f5ea182da1 100644 --- a/.github/workflows/build_doc.yml +++ b/.github/workflows/build_doc.yml @@ -63,7 +63,7 @@ jobs: - name: Cache documentation id: cache_sphinx_docs - uses: actions/cache@3624ceb22c1c5a301c8db4169662070a689d9ea8 # v4.1.1 + uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 with: path: build/docs/_build/.doctrees key: sphinx-docs-cache diff --git a/.github/workflows/job_cpu_functional_tests.yml b/.github/workflows/job_cpu_functional_tests.yml index 9e8dde29f7701d..0366ec47ff437e 100644 --- a/.github/workflows/job_cpu_functional_tests.yml +++ b/.github/workflows/job_cpu_functional_tests.yml @@ -89,7 +89,7 @@ jobs: run: python3 -m pip install -r ${INSTALL_TEST_DIR}/functional_test_utils/layer_tests_summary/requirements.txt - name: Restore tests execution time - uses: actions/cache/restore@3624ceb22c1c5a301c8db4169662070a689d9ea8 # v4.1.1 + uses: actions/cache/restore@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 with: path: ${{ env.PARALLEL_TEST_CACHE }} key: ${{ runner.os }}-${{ runner.arch }}-tests-functional-cpu-stamp-${{ github.sha }} @@ -109,7 +109,7 @@ jobs: timeout-minutes: 25 - name: Save tests execution time - uses: actions/cache/save@3624ceb22c1c5a301c8db4169662070a689d9ea8 # v4.1.1 + uses: actions/cache/save@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 if: github.ref_name == 'master' with: path: ${{ env.PARALLEL_TEST_CACHE }} diff --git a/.github/workflows/mo.yml b/.github/workflows/mo.yml index 0f2fb5fd57752a..2405103755b552 100644 --- a/.github/workflows/mo.yml +++ b/.github/workflows/mo.yml @@ -32,7 +32,7 @@ jobs: python-version: '3.10' - name: Cache pip - uses: actions/cache@3624ceb22c1c5a301c8db4169662070a689d9ea8 # v4.1.1 + uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 with: path: ~/.cache/pip key: ${{ runner.os }}-pip-${{ hashFiles('tools/mo/requirements*.txt') }} diff --git a/.github/workflows/ovc.yml b/.github/workflows/ovc.yml index 2745b2f232406c..7d18643def3ce6 100644 --- a/.github/workflows/ovc.yml +++ b/.github/workflows/ovc.yml @@ -27,7 +27,7 @@ jobs: python-version: '3.10' - name: Cache pip - uses: actions/cache@3624ceb22c1c5a301c8db4169662070a689d9ea8 # v4.1.1 + uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 with: path: ~/.cache/pip key: ${{ runner.os }}-pip-${{ hashFiles('src/bindings/python/requirements*.txt') }} diff --git a/.github/workflows/windows_conditional_compilation.yml b/.github/workflows/windows_conditional_compilation.yml index fb53404a736558..d93c737606a2b4 100644 --- a/.github/workflows/windows_conditional_compilation.yml +++ b/.github/workflows/windows_conditional_compilation.yml @@ -387,7 +387,7 @@ jobs: run: python3 -m pip install -r ${{ env.INSTALL_TEST_DIR }}/layer_tests_summary/requirements.txt - name: Restore tests execution time - uses: actions/cache/restore@3624ceb22c1c5a301c8db4169662070a689d9ea8 # v4.1.1 + uses: actions/cache/restore@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 with: path: ${{ 
env.PARALLEL_TEST_CACHE }} key: ${{ runner.os }}-tests-functional-cpu-stamp-${{ github.sha }} diff --git a/.github/workflows/windows_vs2019_release.yml b/.github/workflows/windows_vs2019_release.yml index 22a91f5dc69b49..2f15efc443879f 100644 --- a/.github/workflows/windows_vs2019_release.yml +++ b/.github/workflows/windows_vs2019_release.yml @@ -481,7 +481,7 @@ jobs: run: python3 -m pip install -r ${{ github.workspace }}\install\tests\functional_test_utils\layer_tests_summary\requirements.txt - name: Restore tests execution time - uses: actions/cache/restore@3624ceb22c1c5a301c8db4169662070a689d9ea8 # v4.1.1 + uses: actions/cache/restore@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 with: path: ${{ env.PARALLEL_TEST_CACHE }} key: ${{ runner.os }}-tests-functional-cpu-stamp-${{ github.sha }} @@ -495,7 +495,7 @@ jobs: timeout-minutes: 60 - name: Save tests execution time - uses: actions/cache/save@3624ceb22c1c5a301c8db4169662070a689d9ea8 # v4.1.1 + uses: actions/cache/save@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 if: github.ref_name == 'master' with: path: ${{ env.PARALLEL_TEST_CACHE }} From f1dc976a0ab2ee8e52eea987556a67cc1fb74396 Mon Sep 17 00:00:00 2001 From: Alexander Suslov Date: Fri, 25 Oct 2024 11:46:29 +0400 Subject: [PATCH 026/233] Support for INT4 decompression patterns from NNCF (#27048) ### Details: - [torch.compile] Added translation decompression subgraph to u4 in `translate_stack_fx` - [torch.compile] Added capture NNCF INT4 decompression patterns in `make_partitions` - [pytorch frontend] Added ov::frontend::pytorch::pass::U4ConvertReshape transformation to support NNCF INT4 decompression patterns ![U4ConvertReshape](https://github.com/user-attachments/assets/b25d5e89-047d-4ee6-a850-45e87fad66f8) - Updated `u4_compression_stack` transformation to support Torch FX flow. 
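For context on the pattern itself: NNCF stores two unsigned 4-bit weights per byte; the decompression subgraph recovers the low nibble with a bitwise AND against 15 and the high nibble with a right shift by 4 (the bitwise_and / bitwise_right_shift pair these patterns match), and a fixed zero point of 8, when present, can be folded into the packed constant by XOR-ing each byte with 0x88, since `(v - 8) mod 16 == v ^ 8` for any 4-bit `v`. A small standalone sketch of that arithmetic (illustrative only, not the frontend code):

```cpp
#include <cassert>
#include <cstdint>

int main() {
    const uint8_t packed = 0xB3;        // two u4 weights: low nibble = 3, high nibble = 11
    const uint8_t low = packed & 0x0F;  // the bitwise_and(x, 15) branch of the pattern
    const uint8_t high = packed >> 4;   // the bitwise_right_shift(x, 4) branch
    assert(low == 3 && high == 11);

    // Folding "subtract zero point 8" into the constant: XOR 0x88 adjusts both nibbles at once.
    const uint8_t i4_pair = packed ^ 0x88;
    assert((i4_pair & 0x0F) == ((3 - 8) & 0x0F));  // low nibble now holds -5 in two's complement
    assert((i4_pair >> 4) == ((11 - 8) & 0x0F));   // high nibble now holds 3
    return 0;
}
```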
### Tickets: - ref: 153918 --- .../frontend/pytorch/torchdynamo/partition.py | 19 ++ src/frontends/pytorch/src/frontend.cpp | 1 + src/frontends/pytorch/src/op/cat.cpp | 12 +- .../src/transforms/u4_block_repack.cpp | 75 ++++++++ .../src/transforms/u4_block_repack.hpp | 6 + src/frontends/pytorch/src/utils_quantize.cpp | 7 +- .../pytorch_tests/pytorch_layer_test_class.py | 4 + .../pytorch_tests/test_compressed_mm.py | 170 ++++++++++++++++++ 8 files changed, 290 insertions(+), 4 deletions(-) create mode 100644 tests/layer_tests/pytorch_tests/test_compressed_mm.py diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/partition.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/partition.py index a99fdb4ebc6d45..bb272b4f9adb53 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/partition.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/partition.py @@ -114,9 +114,28 @@ def capture_gptq_patterns(self, graph_module: GraphModule): for pattern_op in enabled_ops: self.supported_ops.enable_by_name(pattern_op) + def capture_nncf_patterns(self, graph_module: GraphModule): + const_node = PatternNode + const_node.op_types["get_attr"] = None + bitwise_right_shift_node = PatternNode + bitwise_right_shift_node.op_types["call_function:aten.bitwise_right_shift.Tensor_Scalar"] = [const_node] + bitwise_and_node = PatternNode + bitwise_and_node.op_types["call_function:aten.bitwise_and.Scalar"] = [const_node,] + stack_node = PatternNode + stack_node.op_types["call_function:aten.stack.default"] = [bitwise_and_node, bitwise_right_shift_node] + + for node in graph_module.graph.nodes: + if str(node.op) == "call_function" and str(node.target) == "aten.stack.default": + enabled_ops = [] + pattern_match = self.check_pattern(node, bitwise_and_node, enabled_ops) + if pattern_match: + for pattern_op in enabled_ops: + self.supported_ops.enable_by_name(pattern_op) + def make_partitions(self, graph_module: GraphModule, options) -> GraphModule: allow_single_node_partition = _is_testing(options) self.capture_gptq_patterns(graph_module) + self.capture_nncf_patterns(graph_module) partitioner = CapabilityBasedPartitioner( graph_module, self.supported_ops, allows_single_node_partition=allow_single_node_partition) partitions = partitioner.propose_partitions() diff --git a/src/frontends/pytorch/src/frontend.cpp b/src/frontends/pytorch/src/frontend.cpp index 5906043e51262d..2b0ab6db9d3a09 100644 --- a/src/frontends/pytorch/src/frontend.cpp +++ b/src/frontends/pytorch/src/frontend.cpp @@ -320,6 +320,7 @@ void FrontEnd::normalize(const std::shared_ptr& model) const { model->get_rt_info().erase("symmetric_quantization"); } manager.register_pass(sym); + manager.register_pass(); manager.register_pass(); manager.run_passes(model); diff --git a/src/frontends/pytorch/src/op/cat.cpp b/src/frontends/pytorch/src/op/cat.cpp index 4ae2c4ebc81af4..d4f12cae258ad8 100644 --- a/src/frontends/pytorch/src/op/cat.cpp +++ b/src/frontends/pytorch/src/op/cat.cpp @@ -146,9 +146,17 @@ OutputVector translate_stack_fx(const NodeContext& context) { num_elements -= 1; } + OutputVector stack_inputs; for (size_t i = 0; i < num_elements; i++) { - auto stack_input = - context.mark_node(std::make_shared(context.get_input(static_cast(i)), dim)); + stack_inputs.push_back(context.get_input(static_cast(i))); + } + + // returns the u4 constant if the stack operation is a part of the decompression pattern + if (const auto& u4_const = u4_compression_stack(stack_inputs, axis)) + return 
{u4_const}; + + for (size_t i = 0; i < num_elements; i++) { + auto stack_input = context.mark_node(std::make_shared(stack_inputs[i], dim)); list_elems.push_back(stack_input); } return translate_cat_common(context, list_elems, axis, true); diff --git a/src/frontends/pytorch/src/transforms/u4_block_repack.cpp b/src/frontends/pytorch/src/transforms/u4_block_repack.cpp index 5130424d0c60ed..797fa531c43b60 100644 --- a/src/frontends/pytorch/src/transforms/u4_block_repack.cpp +++ b/src/frontends/pytorch/src/transforms/u4_block_repack.cpp @@ -6,10 +6,12 @@ #include "openvino/core/rt_info.hpp" #include "openvino/op/constant.hpp" +#include "openvino/op/multiply.hpp" #include "openvino/op/reshape.hpp" #include "openvino/op/subtract.hpp" #include "openvino/op/transpose.hpp" #include "openvino/pass/pattern/matcher.hpp" +#include "openvino/pass/pattern/op/or.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "utils.hpp" #include "utils_quantize.hpp" @@ -122,6 +124,79 @@ U4BlockRepack::U4BlockRepack(bool is_symmetrical) { }); }; +U4ConvertReshape::U4ConvertReshape() { + const auto& m_constant = wrap_type(type_matches(element::u4)); + const auto& m_convert = wrap_type({m_constant}); + + const auto& m_constant_1 = wrap_type(); + const auto& m_convert_1 = wrap_type({m_constant_1}); + const auto& m_constant_8 = wrap_type(); + const auto& m_convert_8 = wrap_type({m_constant_8}); + const auto& m_multiply = wrap_type({m_convert_8, m_convert_1}); + + const auto& m_converted_constant_8 = + std::make_shared(ov::OutputVector{m_multiply, m_convert_8}); + const auto& m_subtract = wrap_type({m_convert, m_converted_constant_8}); + const auto& m_converted_constant = + std::make_shared(ov::OutputVector{m_subtract, m_constant}); + const auto& m_reshape = wrap_type({m_converted_constant, any_input()}); + + register_matcher( + std::make_shared(m_reshape, "ov::frontend::pytorch::pass::U4ConvertReshape"), + [=](Matcher& m) { + auto& pattern_to_output = m.get_pattern_value_map(); + auto u4_const = + std::dynamic_pointer_cast(pattern_to_output[m_constant].get_node_shared_ptr()); + if (!u4_const) + return false; + + if (u4_const->get_element_type() != element::u4) + return false; + + auto reshape = pattern_to_output[m_reshape].get_node_shared_ptr(); + auto dst_shape = reshape->get_output_shape(0); + + std::shared_ptr new_const; + if (pattern_to_output.count(m_constant_8)) { + auto constant_8 = std::dynamic_pointer_cast( + pattern_to_output[m_constant_8].get_node_shared_ptr()); + if (ov::shape_size(constant_8->get_output_shape(0)) != 1 || + constant_8->get_output_element_type(0).is_real() || constant_8->cast_vector()[0] != 8) + return false; + + if (pattern_to_output.count(m_constant_1)) { + auto constant_1 = std::dynamic_pointer_cast( + pattern_to_output[m_constant_1].get_node_shared_ptr()); + if (ov::shape_size(constant_1->get_output_shape(0)) != 1 || + constant_1->get_output_element_type(0).is_real() || constant_1->cast_vector()[0] != 1) + return false; + } + + new_const = std::make_shared(element::i4, dst_shape); + auto dst = const_cast(reinterpret_cast(new_const->get_data_ptr())); + + auto src = u4_const->get_data_ptr(); + auto num_elements = ov::shape_size(u4_const->get_output_shape(0)); + + for (size_t i = 0; i < num_elements / 2; ++i) { + // subtracting 8 from 2 int4 elements + dst[i] = src[i] ^ 0b10001000; + } + } else { + new_const = std::make_shared(*u4_const, dst_shape); + } + + NodeVector pattern_nodes; + for (auto const& iout : pattern_to_output) + pattern_nodes.push_back(std::move(iout.first)); + + 
copy_runtime_info(pattern_nodes, new_const); + replace_node(reshape, new_const); + + return true; + }); +}; + } // namespace pass } // namespace pytorch } // namespace frontend diff --git a/src/frontends/pytorch/src/transforms/u4_block_repack.hpp b/src/frontends/pytorch/src/transforms/u4_block_repack.hpp index 6ab65a5d1c3838..99742ff148813a 100644 --- a/src/frontends/pytorch/src/transforms/u4_block_repack.hpp +++ b/src/frontends/pytorch/src/transforms/u4_block_repack.hpp @@ -18,6 +18,12 @@ class U4BlockRepack : public ov::pass::MatcherPass { U4BlockRepack(bool is_symmetrical = false); }; +class U4ConvertReshape : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ov::frontend::pytorch::pass::U4ConvertReshape"); + U4ConvertReshape(); +}; + } // namespace pass } // namespace pytorch } // namespace frontend diff --git a/src/frontends/pytorch/src/utils_quantize.cpp b/src/frontends/pytorch/src/utils_quantize.cpp index e48c61314f4c0d..ad8f91bcda25a2 100644 --- a/src/frontends/pytorch/src/utils_quantize.cpp +++ b/src/frontends/pytorch/src/utils_quantize.cpp @@ -237,12 +237,15 @@ std::shared_ptr u4_compression_stack(const OutputVector& list_elems, int64 return nullptr; } - auto bitwise_shift = cast_fw_node(list_elems[1].get_node_shared_ptr(), "aten::bitwise_right_shift"); + auto bitwise_shift = cast_fw_node(list_elems[1].get_node_shared_ptr(), + {"aten::bitwise_right_shift", "aten.bitwise_right_shift.Tensor_Scalar"}); if (!bitwise_shift) return nullptr; auto weights_u8 = std::dynamic_pointer_cast(bitwise_and->get_input_node_shared_ptr(0)); - if (weights_u8 != std::dynamic_pointer_cast(bitwise_shift->get_input_node_shared_ptr(0))) + auto weights_u8_bitwise_shift = + std::dynamic_pointer_cast(bitwise_shift->get_input_node_shared_ptr(0)); + if (weights_u8->get_data_ptr() != weights_u8_bitwise_shift->get_data_ptr()) return nullptr; if (weights_u8->get_output_element_type(0) != element::u8) diff --git a/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py b/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py index a44ca8c0117a4b..0672ad05dec2bb 100644 --- a/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py +++ b/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py @@ -151,6 +151,8 @@ def numpy_to_torch_recursively(x): config = {} if ie_device == "GPU" and precision == "FP32": config[hints.inference_precision] = Type.f32 + if "dynamic_quantization_group_size" in kwargs: + config["DYNAMIC_QUANTIZATION_GROUP_SIZE"] = str(kwargs["dynamic_quantization_group_size"]) compiled = core.compile_model(converted_model, ie_device, config) infer_res = compiled(deepcopy(ov_inputs)) @@ -289,6 +291,8 @@ def torch_compile_backend_test(self, model, inputs, **kwargs): options={"testing": 1,} if ("aot_autograd" in kwargs): options.update({"aot_autograd": True,}) + if "dynamic_quantization_group_size" in kwargs: + options["config"] = {"DYNAMIC_QUANTIZATION_GROUP_SIZE": str(kwargs["dynamic_quantization_group_size"])} dynamic = False if ("dynamic" in kwargs): dynamic = kwargs["dynamic"] diff --git a/tests/layer_tests/pytorch_tests/test_compressed_mm.py b/tests/layer_tests/pytorch_tests/test_compressed_mm.py new file mode 100644 index 00000000000000..a4261f90e2d2d6 --- /dev/null +++ b/tests/layer_tests/pytorch_tests/test_compressed_mm.py @@ -0,0 +1,170 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import pytest +import torch + +from pytorch_layer_test_class import PytorchLayerTest + + +def pack_uint4(tensor): + packed_tensor = 
tensor.contiguous() + packed_tensor = packed_tensor.reshape(-1, 2) + packed_tensor = torch.bitwise_and(packed_tensor[..., ::2], 15) | packed_tensor[..., 1::2] << 4 + return packed_tensor + + +def unpack_uint4(packed_tensor): + return torch.stack((torch.bitwise_and(packed_tensor, 15), torch.bitwise_right_shift(packed_tensor, 4)), dim=-1) + + +def pack_int4(tensor): + tensor = tensor + 8 + return pack_uint4(tensor.type(torch.uint8)) + + +def unpack_int4(packed_tensor): + t = unpack_uint4(packed_tensor) + return t.type(torch.int8) - 8 + + +def decompress_asymmetric(input, scale, zero_point): + input = input.type(dtype=scale.dtype) + zero_point = zero_point.type(dtype=scale.dtype) + decompressed_input = (input - zero_point) * scale + return decompressed_input + + +def decompress_symmetric(input, scale): + input = input.type(dtype=scale.dtype) + decompressed_input = input * scale + return decompressed_input + + +class TestMatMulU4Weights(PytorchLayerTest): + rng = np.random.default_rng(seed=123) + + def _prepare_input(self): + return (np.round(5.00 * self.rng.random([2, 4], dtype=np.float32) - 2.50, 4),) + + def create_model(self, group_size): + class aten_mm_u4(torch.nn.Module): + def __init__(self, compressed_weight, scale, zero_point, weight_shape): + super(aten_mm_u4, self).__init__() + self.compressed_weight_shape = compressed_weight.shape + self.packed_weight = torch.nn.Parameter(pack_uint4(compressed_weight), requires_grad=False) + + self.register_buffer("_scale", scale.type(dtype=torch.float16)) + + self.zero_point_shape = zero_point.shape + self.register_buffer("_zero_point", pack_uint4(zero_point)) + + self.weight_shape = weight_shape + + def forward(self, x): + # NNCF UINT4 asymmetric decompression pattern + # https://github.com/openvinotoolkit/nncf/blob/develop/nncf/torch/quantization/layers.py + compressed_weight = unpack_uint4(self.packed_weight) + compressed_weight = compressed_weight.reshape(self.compressed_weight_shape) + + zero_point = unpack_uint4(self._zero_point) + zero_point = zero_point.reshape(self.zero_point_shape) + + weight = decompress_asymmetric(compressed_weight, self._scale, zero_point) + weight = weight.reshape(self.weight_shape) + weight = weight.type(dtype=torch.float32) + + return torch.matmul(x, weight) + + ref_net = None + + weight_shape = (4, 2) + ngroups = weight_shape[0] // group_size + compressed_weight_shape = (ngroups, group_size, weight_shape[1]) + zero_point_shape = scale_shape = (ngroups, 1, weight_shape[1]) + + compressed_weight = (15.00 * self.rng.random(compressed_weight_shape, dtype=np.float32)).astype(dtype=np.uint8) + scale = np.round(10.00 * self.rng.random(scale_shape, dtype=np.float32) - 5.00) + zero_point = (15.00 * self.rng.random(zero_point_shape, dtype=np.float32)).astype(dtype=np.uint8) + + t_compressed_weight = torch.from_numpy(compressed_weight) + t_scale = torch.from_numpy(scale) + t_zero_point = torch.from_numpy(zero_point) + + return aten_mm_u4(t_compressed_weight, t_scale, t_zero_point, weight_shape), ref_net, ["aten::matmul"] + + @pytest.mark.nightly + @pytest.mark.precommit + @pytest.mark.precommit_fx_backend + @pytest.mark.parametrize("group_size", [2, 4]) + def test_matmul_u4(self, group_size, ie_device, precision, ir_version): + self._test( + *self.create_model(group_size), + ie_device, + precision, + ir_version, + trace_model=True, + dynamic_quantization_group_size=0 + ) + + +class TestMatMulI4Weights(PytorchLayerTest): + rng = np.random.default_rng(seed=123) + + def _prepare_input(self): + return (np.round(5.00 * 
self.rng.random([2, 4], dtype=np.float32) - 2.50, 4),) + + def create_model(self, group_size): + class aten_mm_i4(torch.nn.Module): + def __init__(self, compressed_weight, scale, weight_shape): + super(aten_mm_i4, self).__init__() + self.compressed_weight_shape = compressed_weight.shape + self.packed_weight = torch.nn.Parameter(pack_int4(compressed_weight), requires_grad=False) + + self.register_buffer("_scale", scale.type(dtype=torch.float16)) + + self.weight_shape = weight_shape + + def forward(self, x): + # NNCF INT4 symmetric decompression pattern + # https://github.com/openvinotoolkit/nncf/blob/develop/nncf/torch/quantization/layers.py + compressed_weight = unpack_int4(self.packed_weight) + compressed_weight = compressed_weight.reshape(self.compressed_weight_shape) + + weight = decompress_symmetric(compressed_weight, self._scale) + weight = weight.reshape(self.weight_shape) + weight = weight.type(dtype=torch.float32) + + return torch.matmul(x, weight) + + ref_net = None + + weight_shape = (4, 2) + ngroups = weight_shape[0] // group_size + compressed_weight_shape = (ngroups, group_size, weight_shape[1]) + scale_shape = (ngroups, 1, weight_shape[1]) + + compressed_weight = (16.00 * self.rng.random(compressed_weight_shape, dtype=np.float32) - 8.00).astype( + dtype=np.int8 + ) + scale = np.round(10.00 * self.rng.random(scale_shape, dtype=np.float32) - 5.00) + + t_compressed_weight = torch.from_numpy(compressed_weight) + t_scale = torch.from_numpy(scale) + + return aten_mm_i4(t_compressed_weight, t_scale, weight_shape), ref_net, ["aten::matmul"] + + @pytest.mark.nightly + @pytest.mark.precommit + @pytest.mark.precommit_fx_backend + @pytest.mark.parametrize("group_size", [2, 4]) + def test_matmul_i4(self, group_size, ie_device, precision, ir_version): + self._test( + *self.create_model(group_size), + ie_device, + precision, + ir_version, + trace_model=True, + dynamic_quantization_group_size=0 + ) From 940481990dd3deac9c05ebb8433b789880c9e42b Mon Sep 17 00:00:00 2001 From: Sungeun Kim Date: Fri, 25 Oct 2024 17:35:49 +0900 Subject: [PATCH 027/233] [GPU] add XE3 to build (#26811) - update cmake to add xe3 - update onednn rls-v3.7-pc --- .../intel_gpu/include/intel_gpu/runtime/device_info.hpp | 1 + .../intel_gpu/src/kernel_selector/kernel_selector_params.h | 1 + .../src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp | 6 ++++-- src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp | 1 + src/plugins/intel_gpu/thirdparty/CMakeLists.txt | 2 +- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 6 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp index 0c83877851b48b..d44b8c0536fe4a 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp @@ -33,6 +33,7 @@ enum class gpu_arch { xe_hpg = 5, xe_hpc = 6, xe2 = 7, + xe3 = 8, }; /// @brief Defines version of GFX IP diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h index 7daccaf99a2fa6..3a0c2224dd7358 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h @@ -369,6 +369,7 @@ enum class gpu_arch { xe_hpg = 5, xe_hpc = 6, xe2 = 7, + xe3 = 8, }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp 
b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp index 04ebf2f2165973..974d4532c84e60 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp @@ -30,7 +30,8 @@ size_t subgroup_size(gpu_arch arch) { case gpu_arch::xe_hp: case gpu_arch::xe_hpg: return 8; case gpu_arch::xe_hpc: - case gpu_arch::xe2: return 16; + case gpu_arch::xe2: + case gpu_arch::xe3: return 16; default: return 0; } } @@ -205,7 +206,8 @@ void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Packag break; } case gpu_arch::xe_hpc: - case gpu_arch::xe2: { + case gpu_arch::xe2: + case gpu_arch::xe3: { config = choose_config_xehpc(static_cast(head_size), static_cast(n_keys.v), thin_q); break; } diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp index be8b48b157b421..88801b8b2b4e61 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp @@ -61,6 +61,7 @@ gpu_arch convert_ngen_arch(ngen::HW gpu_arch) { case ngen::HW::XeHPG: return gpu_arch::xe_hpg; case ngen::HW::XeHPC: return gpu_arch::xe_hpc; case ngen::HW::Xe2: return gpu_arch::xe2; + case ngen::HW::Xe3: return gpu_arch::xe3; case ngen::HW::Gen10: case ngen::HW::Unknown: return gpu_arch::unknown; } diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt index d54ccd4e4e6fe2..3bf6853346d61a 100644 --- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt @@ -13,7 +13,7 @@ if(ENABLE_ONEDNN_FOR_GPU) set(ONEDNN_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_install" CACHE PATH "Installation path for oneDNN GPU library") set(ONEDNN_PREFIX_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_root") set(ONEDNN_ENABLED_PRIMITIVES "CONCAT;CONVOLUTION;DECONVOLUTION;INNER_PRODUCT;MATMUL;REORDER;POOLING;REDUCTION;SDPA") - set(ONEDNN_ENABLED_ISA "XEHPG;XEHPC;XE2") + set(ONEDNN_ENABLED_ISA "XEHPG;XEHPC;XE2;XE3") set(DNNL_GPU_LIBRARY_NAME "openvino_onednn_gpu" CACHE STRING "Name of oneDNN library for Intel GPU Plugin") if(X86_64) diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index 44160a115b0443..e99a84e4914a81 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit 44160a115b0443e94e15d799b267b1d05dbf274d +Subproject commit e99a84e4914a818c64165a4b52785f606e405c2b From 8d6f51746780fea2d229720afc83fad0a1815c9f Mon Sep 17 00:00:00 2001 From: Denis Orlov Date: Fri, 25 Oct 2024 10:47:01 +0100 Subject: [PATCH 028/233] [GHA] Increase timeout for PyTorch Models tests in precommit (#27247) ### Tickets: - 155619 --- .github/workflows/job_pytorch_models_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/job_pytorch_models_tests.yml b/.github/workflows/job_pytorch_models_tests.yml index dd15ae183d692c..40ee93dd08b9f2 100644 --- a/.github/workflows/job_pytorch_models_tests.yml +++ b/.github/workflows/job_pytorch_models_tests.yml @@ -22,7 +22,7 @@ permissions: read-all jobs: PyTorch_Models_Tests: name: PyTorch Models tests - timeout-minutes: ${{ inputs.model_scope == 'precommit' && 35 || 400 }} + timeout-minutes: ${{ inputs.model_scope == 'precommit' && 40 || 400 }} runs-on: ${{ inputs.runner }} container: ${{ 
fromJSON(inputs.container) }} defaults: From 2daa8b97264fb36e752e93670d511648bf119b46 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Fri, 25 Oct 2024 12:33:08 +0200 Subject: [PATCH 029/233] [Transformation] LoraSubgraph fusion (#27068) ### Details: - *Introduced `LoraSubgraph` operation, which is used for LoRA subgraphs fusion for further optimizations (for the details, please refer to the description in the header)* - *Introduced `LoraSubgraphFusion` pass* - *The changes are covered by transformation tests* ### Tickets: - *CVS-153035* - *CVS-155112* --- .../include/ov_ops/lora_subgraph.hpp | 38 +++ .../lora_subgraph_fusion.hpp | 25 ++ .../src/ov_ops/lora_subgraph.cpp | 41 +++ .../lora_subgraph_fusion.cpp | 108 ++++++++ .../lora_subgraph_fusion.cpp | 245 ++++++++++++++++++ 5 files changed, 457 insertions(+) create mode 100644 src/common/transformations/include/ov_ops/lora_subgraph.hpp create mode 100644 src/common/transformations/include/transformations/common_optimizations/lora_subgraph_fusion.hpp create mode 100644 src/common/transformations/src/ov_ops/lora_subgraph.cpp create mode 100644 src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp create mode 100644 src/common/transformations/tests/common_optimizations/lora_subgraph_fusion.cpp diff --git a/src/common/transformations/include/ov_ops/lora_subgraph.hpp b/src/common/transformations/include/ov_ops/lora_subgraph.hpp new file mode 100644 index 00000000000000..75aaa16a5d280e --- /dev/null +++ b/src/common/transformations/include/ov_ops/lora_subgraph.hpp @@ -0,0 +1,38 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/op/op.hpp" +#include "openvino/op/util/sub_graph_base.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace op { +namespace internal { +/** + * @interface LoraSubgraph + * @brief LoraSubgraph operation, which is used for LoRA subgraphs fusion. + * It always has only 1 output, and the following inputs, whose order is fixed: + * 1. main_flow_input: input from original model. + * 2. LoRA_input: input to which the Low-Rank adaptation is applied. + * The adapted input is combined with `main_flow_input`. + * 3. LoRA_matrices: 3 Low-Rank adaptation matrices applied to `LoRA_input`. + * The fused subgraph can be optimized in runtime based on LoRA semantic. + * For instance, `main_flow_input` can be fast-forwarded to output in case of empty `LoRA_matrices`. 
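+ * With illustrative naming of the three states as A, alpha and B (the `LoRA_matrices` above),
+ * the fused body computes, optional transposes omitted:
+ *     output = main_flow_input + MatMul(Multiply(MatMul(LoRA_input, A), alpha), B)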
+ */ +class TRANSFORMATIONS_API LoraSubgraph : public ov::op::util::SubGraphOp { +public: + OPENVINO_OP("LoraSubgraph", "ie_internal_opset", ov::op::util::SubGraphOp); + + LoraSubgraph() = default; + LoraSubgraph(const OutputVector& args, const std::shared_ptr& body); + + void validate_and_infer_types() override; + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; +}; + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/include/transformations/common_optimizations/lora_subgraph_fusion.hpp b/src/common/transformations/include/transformations/common_optimizations/lora_subgraph_fusion.hpp new file mode 100644 index 00000000000000..8422ad95f262c6 --- /dev/null +++ b/src/common/transformations/include/transformations/common_optimizations/lora_subgraph_fusion.hpp @@ -0,0 +1,25 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "openvino/pass/matcher_pass.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { + +class TRANSFORMATIONS_API LoraSubgraphFusion; + +} // namespace pass +} // namespace ov + +class ov::pass::LoraSubgraphFusion : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("LoraSubgraphFusion", "0"); + LoraSubgraphFusion(); +}; diff --git a/src/common/transformations/src/ov_ops/lora_subgraph.cpp b/src/common/transformations/src/ov_ops/lora_subgraph.cpp new file mode 100644 index 00000000000000..8a7a5a75c69c7e --- /dev/null +++ b/src/common/transformations/src/ov_ops/lora_subgraph.cpp @@ -0,0 +1,41 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/lora_subgraph.hpp" + +#include "itt.hpp" + +namespace ov { +namespace op { +namespace internal { + +LoraSubgraph::LoraSubgraph(const OutputVector& args, const std::shared_ptr& body) : SubGraphOp(args) { + SubGraphOp::set_function(body); + for (size_t i = 0; i < body->get_parameters().size(); ++i) + m_input_descriptions[0].push_back(std::make_shared(i, i)); + for (size_t i = 0; i < body->get_output_size(); ++i) + m_output_descriptions[0].push_back(std::make_shared(i, i)); + constructor_validate_and_infer_types(); +} + +std::shared_ptr LoraSubgraph::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(internal_LoraSubgraph_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args, get_function()->clone()); +} + +void LoraSubgraph::validate_and_infer_types() { + INTERNAL_OP_SCOPE(internal_LoraSubgraph_validate_and_infer_types); + OPENVINO_ASSERT(get_input_size() == 5, "LoraSubgraph must have 5 inputs whereas it has ", get_input_size()); + OPENVINO_ASSERT(get_output_size() == 1, "LoraSubgraph must have 1 output whereas it has ", get_output_size()); + const auto& body = get_function(); + OPENVINO_ASSERT(body, "LoraSubgraph must have initialized body"); + validate_and_infer_type_body(body, m_input_descriptions[0]); + for (size_t i = 0; i < get_output_size(); ++i) + set_output_type(i, body->get_output_element_type(i), body->get_output_partial_shape(i)); +} + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp new file mode 100644 index 00000000000000..366ce00894242e --- /dev/null +++ 
b/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp @@ -0,0 +1,108 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/common_optimizations/lora_subgraph_fusion.hpp" + +#include +#include + +#include "itt.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/convolution.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/util/read_value_base.hpp" +#include "openvino/pass/pattern/op/optional.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "ov_ops/lora_subgraph.hpp" +#include "transformations/utils/utils.hpp" + +ov::pass::LoraSubgraphFusion::LoraSubgraphFusion() { + MATCHER_SCOPE(LoraSubgraphFusion); + using namespace pass::pattern; + auto lora_input_m = any_input(); + auto transpose_const1_m = wrap_type(consumers_count(1)); + auto transpose1_m = optional({lora_input_m, transpose_const1_m}, consumers_count(1)); + auto read_value1_m = wrap_type(); + auto matmul1_m = wrap_type({transpose1_m, read_value1_m}, consumers_count(1)); + auto read_value2_m = wrap_type(); + auto multiply_m = wrap_type({matmul1_m, read_value2_m}, consumers_count(1)); + auto read_value3_m = wrap_type(); + auto matmul2_m = wrap_type({multiply_m, read_value3_m}, consumers_count(1)); + auto transpose_const2_m = wrap_type(consumers_count(1)); + auto transpose2_m = optional({matmul2_m, transpose_const2_m}, consumers_count(1)); + auto main_flow_m = wrap_type({lora_input_m, any_input()}); + auto add_m = wrap_type({transpose2_m, main_flow_m}); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + const auto& lora_input = pattern_map.at(lora_input_m); + const auto& matmul1 = pattern_map.at(matmul1_m); + const auto& read_value1 = pattern_map.at(read_value1_m); + const auto& multiply = pattern_map.at(multiply_m); + const auto& read_value2 = pattern_map.at(read_value2_m); + const auto& matmul2 = pattern_map.at(matmul2_m); + const auto& read_value3 = pattern_map.at(read_value3_m); + const auto& main_flow = pattern_map.at(main_flow_m); + const auto& add = pattern_map.at(add_m); + + const auto add_node = add.get_node_shared_ptr(); + if (transformation_callback(add_node)) { + return false; + } + + auto find_connected_input = [](ov::Node* child, ov::Node* parent) { + for (size_t i = 0; i < child->get_input_size(); ++i) { + auto input = child->input(i); + if (input.get_source_output().get_node() == parent) + return input; + } + OPENVINO_THROW("Ops are not connected"); + }; + + // Note: internal_inputs/external_connections order corresponds to LoraSubgraph semantic + const std::vector> internal_inputs{ + // For commutative eltwise ops, input idx may be any, so it must be computed + find_connected_input(add.get_node(), main_flow.get_node()), + pattern_map.count(transpose1_m) ? 
pattern_map.at(transpose1_m).get_node()->input(0) + : matmul1.get_node()->input(0), + matmul1.get_node()->input(1), + find_connected_input(multiply.get_node(), read_value2.get_node()), + matmul2.get_node()->input(1), + }; + const ov::OutputVector external_connections{ + main_flow, + lora_input, + read_value1, + read_value2, + read_value3, + }; + + ov::ParameterVector subgraph_parameters; + subgraph_parameters.reserve(internal_inputs.size()); + for (auto& in : internal_inputs) { + auto new_parameter = std::make_shared(in.get_element_type(), in.get_partial_shape()); + subgraph_parameters.push_back(new_parameter); + in.replace_source_output(new_parameter); + } + // Note: lora consumers should be taken before lora_subgraph creation, + // because only original consumers should be replaced with lora's output + const auto& lora_consumers = add.get_target_inputs(); + const auto lora_subgraph = std::make_shared(ov::OutputVector{add}, subgraph_parameters); + const auto lora_node = std::make_shared(external_connections, lora_subgraph); + ov::copy_runtime_info(m.get_matched_nodes(), lora_node); + lora_node->set_friendly_name(add_node->get_friendly_name()); + + for (const auto& consumer : lora_consumers) + consumer.replace_source_output(lora_node->output(0)); + if (!add.get_names().empty()) + lora_node->output(0).set_names(add.get_names()); + return true; + }; + + auto m = std::make_shared(add_m, matcher_name); + this->register_matcher(m, callback); +} diff --git a/src/common/transformations/tests/common_optimizations/lora_subgraph_fusion.cpp b/src/common/transformations/tests/common_optimizations/lora_subgraph_fusion.cpp new file mode 100644 index 00000000000000..6557f763c6b368 --- /dev/null +++ b/src/common/transformations/tests/common_optimizations/lora_subgraph_fusion.cpp @@ -0,0 +1,245 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/common_optimizations/lora_subgraph_fusion.hpp" + +#include + +#include + +#include "common_test_utils/node_builders/convolution.hpp" +#include "common_test_utils/ov_test_utils.hpp" +#include "openvino/core/model.hpp" +#include "openvino/opsets/opset15.hpp" +#include "ov_ops/lora_subgraph.hpp" +#include "transformations/utils/utils.hpp" + +using namespace testing; +using namespace ov; + +static constexpr auto netType = ov::element::f32; + +std::pair create_states(const std::vector& shapes) { + ov::OutputVector read_values; + ov::SinkVector assigns; + size_t idx = 0; + auto create_state = [&](const ov::PartialShape& shape) { + auto variable = + std::make_shared(ov::op::util::VariableInfo{shape, netType, std::to_string(idx++)}); + auto read_value = std::make_shared(variable); + auto assign = std::make_shared(read_value, variable); + read_values.push_back(read_value); + assigns.push_back(assign); + }; + for (const auto& shape : shapes) + create_state(shape); + return std::make_pair(read_values, assigns); +} + +std::shared_ptr create_lora_subgraph(const ov::Output& main_flow, + const ov::Output& lora_input, + const ov::OutputVector& states, + bool add_transposes, + size_t mul_read_value_idx = 1, + size_t add_data_flow_idx = 0) { + OPENVINO_ASSERT(states.size() == 3, "get_lora_subgraph expects states size == 3"); + OPENVINO_ASSERT(mul_read_value_idx == 0 || mul_read_value_idx == 1, "mul_read_value_idx must be 0 or 1"); + OPENVINO_ASSERT(add_data_flow_idx == 0 || add_data_flow_idx == 1, "add_data_flow_idx must be 0 or 1"); + + auto create_transpose = [](const ov::Output& input) -> ov::Output { + auto 
constant = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{4}, {2, 3, 0, 1}); + return std::make_shared(input, constant); + }; + + const auto& mm1_input = add_transposes ? create_transpose(lora_input) : lora_input; + auto mm1 = std::make_shared(mm1_input, states[0], false, true); + + const auto& mul_in_0 = mul_read_value_idx == 0 ? states[1] : mm1->output(0); + const auto& mul_in_1 = mul_read_value_idx == 0 ? mm1->output(0) : states[1]; + auto mul = std::make_shared(mul_in_0, mul_in_1); + + auto mm2 = std::make_shared(mul, states[2], false, true); + + const auto& add_sec_input = add_transposes ? create_transpose(mm2) : mm2; + const auto& add_in_0 = add_data_flow_idx == 0 ? main_flow : add_sec_input; + const auto& add_in_1 = add_data_flow_idx == 0 ? add_sec_input : main_flow; + return std::make_shared(add_in_0, add_in_1); +} + +class LoraSubgraphFusionTests : public TransformationTestsF { +public: + LoraSubgraphFusionTests() : TransformationTestsF() { + // TODO: remove when these flags will be enabled in TransformationTestsF (ticket XXX-98039) + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::NAMES); + } + + void SetUp() override { + TransformationTestsF::SetUp(); + manager.register_pass(); + } +}; + +class LoraSubgraphFusionMatMulTests : public LoraSubgraphFusionTests { +public: + const ov::Dimension K = 563; + const ov::Dimension N = 2048; + ov::PartialShape shape_x = {-1, -1, K}; + ov::PartialShape shape_w = {N, K}; + ov::PartialShape shape_state_1 = {-1, K}; + ov::PartialShape shape_state_2 = {1, -1}; + ov::PartialShape shape_state_3 = {N, -1}; +}; + +TEST_F(LoraSubgraphFusionMatMulTests, StandardPattern) { + { + auto param_lora = std::make_shared(netType, shape_x); + auto param_w = std::make_shared(netType, shape_w); + auto main_mm = std::make_shared(param_lora, param_w, false, true); + main_mm->set_friendly_name("main_mm"); + auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); + auto lora_subgraph = create_lora_subgraph(main_mm, param_lora, states.first, false); + lora_subgraph->set_friendly_name("lora_subgraph"); + model = std::make_shared(OutputVector{lora_subgraph, main_mm}, + states.second, + ParameterVector{param_lora, param_w}); + } + { + auto param_lora = std::make_shared(netType, shape_x); + auto param_w = std::make_shared(netType, shape_w); + auto main_mm = std::make_shared(param_lora, param_w, false, true); + main_mm->set_friendly_name("main_mm"); + + auto inner_param_lora = std::make_shared(netType, shape_x); + auto inner_state_1 = std::make_shared(netType, shape_state_1); + auto inner_state_2 = std::make_shared(netType, shape_state_2); + auto inner_state_3 = std::make_shared(netType, shape_state_3); + auto inner_param_mm = std::make_shared(netType, main_mm->get_output_partial_shape(0)); + + ov::OutputVector states_outs{inner_state_1, inner_state_2, inner_state_3}; + auto lora_subgraph = create_lora_subgraph(inner_param_mm, inner_param_lora, states_outs, false); + lora_subgraph->set_friendly_name("lora_subgraph"); + ov::ParameterVector inner_params{inner_param_mm, inner_param_lora, inner_state_1, inner_state_2, inner_state_3}; + auto inner_model = std::make_shared(OutputVector{lora_subgraph}, inner_params); + + auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); + ov::OutputVector lora_inputs{main_mm, param_lora, states.first[0], states.first[1], states.first[2]}; + auto lora = 
std::make_shared(lora_inputs, inner_model); + lora->set_friendly_name("lora_subgraph"); + + model_ref = + std::make_shared(OutputVector{lora, main_mm}, states.second, ParameterVector{param_lora, param_w}); + } +} + +TEST_F(LoraSubgraphFusionMatMulTests, ReshaffledEltwiseInputs) { + { + auto param_lora = std::make_shared(netType, shape_x); + auto param_w = std::make_shared(netType, shape_w); + auto main_mm = std::make_shared(param_lora, param_w, false, true); + main_mm->set_friendly_name("main_mm"); + + auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); + auto lora_subgraph = create_lora_subgraph(main_mm, param_lora, states.first, false, 0, 1); + lora_subgraph->set_friendly_name("lora_subgraph"); + + model = std::make_shared(OutputVector{lora_subgraph, main_mm}, + states.second, + ParameterVector{param_lora, param_w}); + } + { + auto param_lora = std::make_shared(netType, shape_x); + auto param_w = std::make_shared(netType, shape_w); + auto main_mm = std::make_shared(param_lora, param_w, false, true); + main_mm->set_friendly_name("main_mm"); + + auto inner_param_lora = std::make_shared(netType, shape_x); + auto inner_state_1 = std::make_shared(netType, shape_state_1); + auto inner_state_2 = std::make_shared(netType, shape_state_2); + auto inner_state_3 = std::make_shared(netType, shape_state_3); + auto inner_param_mm = std::make_shared(netType, main_mm->get_output_partial_shape(0)); + + ov::OutputVector states_outs{inner_state_1, inner_state_2, inner_state_3}; + auto lora_subgraph = create_lora_subgraph(inner_param_mm, inner_param_lora, states_outs, false, 0, 1); + lora_subgraph->set_friendly_name("lora_subgraph"); + ov::ParameterVector inner_params{inner_param_mm, inner_param_lora, inner_state_1, inner_state_2, inner_state_3}; + auto inner_model = std::make_shared(OutputVector{lora_subgraph}, inner_params); + + auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); + ov::OutputVector lora_inputs{main_mm, param_lora, states.first[0], states.first[1], states.first[2]}; + auto lora = std::make_shared(lora_inputs, inner_model); + lora->set_friendly_name("lora_subgraph"); + + model_ref = + std::make_shared(OutputVector{lora, main_mm}, states.second, ParameterVector{param_lora, param_w}); + } +} + +class LoraSubgraphFusionConvolutionTests : public LoraSubgraphFusionTests { +public: + const ov::Dimension num_channels = 320; + ov::PartialShape shape_x = {-1, num_channels, -1, -1}; + ov::PartialShape shape_state_1 = {-1, num_channels}; + ov::PartialShape shape_state_2 = {1, -1}; + ov::PartialShape shape_state_3 = {num_channels, -1}; +}; + +TEST_F(LoraSubgraphFusionConvolutionTests, StandardPattern) { + { + auto param_lora = std::make_shared(netType, shape_x); + auto main_conv = ov::test::utils::make_convolution(param_lora, + netType, + {1, 1}, + {1, 1}, + {0, 0}, + {0, 0}, + {1, 1}, + ov::op::PadType::EXPLICIT, + num_channels.get_length()); + main_conv->set_friendly_name("main_conv"); + auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); + auto lora_subgraph = create_lora_subgraph(main_conv, param_lora, states.first, true); + lora_subgraph->set_friendly_name("lora_subgraph"); + model = + std::make_shared(OutputVector{lora_subgraph, main_conv}, states.second, ParameterVector{param_lora}); + } + { + auto param_lora = std::make_shared(netType, shape_x); + auto main_conv = ov::test::utils::make_convolution(param_lora, + netType, + {1, 1}, + {1, 1}, + {0, 0}, + {0, 0}, + {1, 1}, + ov::op::PadType::EXPLICIT, + 
num_channels.get_length()); + main_conv->set_friendly_name("main_conv"); + + auto inner_param_lora = std::make_shared(netType, shape_x); + auto inner_state_1 = std::make_shared(netType, shape_state_1); + auto inner_state_2 = std::make_shared(netType, shape_state_2); + auto inner_state_3 = std::make_shared(netType, shape_state_3); + auto inner_param_conv = + std::make_shared(netType, main_conv->get_output_partial_shape(0)); + + ov::OutputVector states_outs{inner_state_1, inner_state_2, inner_state_3}; + auto lora_subgraph = create_lora_subgraph(inner_param_conv, inner_param_lora, states_outs, true); + lora_subgraph->set_friendly_name("lora_subgraph"); + ov::ParameterVector inner_params{inner_param_conv, + inner_param_lora, + inner_state_1, + inner_state_2, + inner_state_3}; + auto inner_model = std::make_shared(OutputVector{lora_subgraph}, inner_params); + + auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); + ov::OutputVector lora_inputs{main_conv, param_lora, states.first[0], states.first[1], states.first[2]}; + auto lora = std::make_shared(lora_inputs, inner_model); + lora->set_friendly_name("lora_subgraph"); + + model_ref = std::make_shared(OutputVector{lora, main_conv}, states.second, ParameterVector{param_lora}); + } +} From 326dfa06c9d0994915711aa477609bda36985958 Mon Sep 17 00:00:00 2001 From: Georgy Krivoruchko Date: Fri, 25 Oct 2024 14:39:11 +0400 Subject: [PATCH 030/233] [ONNX] Switched to ONNX 1.17.0 (#26916) ### Details: - Switched to ONNX 1.17.0 ### Tickets: - N/A --- conan.lock | 4 ++-- conanfile.txt | 2 +- src/bindings/python/constraints.txt | 2 +- src/frontends/onnx/tests/__init__.py | 3 +++ .../onnx/tests/ci_utils/onnxruntime/version | 2 +- .../onnx/tests/tests_python/test_backend.py | 16 +++++++++++++++- thirdparty/onnx/onnx | 2 +- tools/constraints.txt | 2 +- 8 files changed, 25 insertions(+), 8 deletions(-) diff --git a/conan.lock b/conan.lock index f0cf4c64529cfc..f78ad37f8f69ac 100644 --- a/conan.lock +++ b/conan.lock @@ -10,7 +10,7 @@ "opencl-icd-loader/2023.04.17#5f73dd9f0c023d416a7f162e320b9c77%1692732261.088", "opencl-headers/2023.04.17#3d98f2d12a67c2400de6f11d5335b5a6%1683936272.16", "opencl-clhpp-headers/2023.04.17#7c62fcc7ac2559d4839150d2ebaac5c8%1685450803.672", - "onnx/1.16.2#b5e8d35b10d454b26751762922465eb8%1712404811.278", + "onnx/1.17.0#c79fdfca3ae149874153de15a20f4598%1727864447.241", "onetbb/2021.10.0#cbb2fc43088070b48f6e4339bc8fa0e1%1693812561.235", "ittapi/3.24.0#9246125f13e7686dee2b0c992b71db94%1682969872.743", "hwloc/2.9.2#1c63e2eccac57048ae226e6c946ebf0e%1688677682.002", @@ -33,4 +33,4 @@ ], "python_requires": [], "config_requires": [] -} \ No newline at end of file +} diff --git a/conanfile.txt b/conanfile.txt index b35a8bda22543a..46d6d8d65d34e9 100644 --- a/conanfile.txt +++ b/conanfile.txt @@ -7,7 +7,7 @@ opencl-icd-loader/[>=2023.04.17] rapidjson/[>=1.1.0] xbyak/[>=6.62] snappy/[>=1.1.7] -onnx/1.16.2 +onnx/1.17.0 pybind11/[>=2.12.0] flatbuffers/[>=22.9.24] diff --git a/src/bindings/python/constraints.txt b/src/bindings/python/constraints.txt index 4e603a41c886cc..25e774bbbbdb59 100644 --- a/src/bindings/python/constraints.txt +++ b/src/bindings/python/constraints.txt @@ -23,4 +23,4 @@ paddlepaddle==2.6.0 tensorflow>=1.15.5,<2.18.0 six~=1.16.0 protobuf>=3.18.1,<4.0.0 -onnx==1.16.0 +onnx==1.17.0 diff --git a/src/frontends/onnx/tests/__init__.py b/src/frontends/onnx/tests/__init__.py index fb29faa38b46ec..ef8cebfa361e3f 100644 --- a/src/frontends/onnx/tests/__init__.py +++ b/src/frontends/onnx/tests/__init__.py @@ -182,3 
+182,6 @@ def xfail_test(reason="Mark the test as expected to fail", strict=True): xfail_issue_139936 = xfail_test(reason = "MaxPool accuracy fails") xfail_issue_139937 = xfail_test(reason = "GroupNorm, QLinearMatMul, DequantizeLinear translation failed") xfail_issue_139938 = xfail_test(reason = "QLinearMatMul accuracy fails") + +# ONNX 1.17 +skip_issue_119896 = pytest.mark.skip(reason="Unsupported element type: FLOAT8") diff --git a/src/frontends/onnx/tests/ci_utils/onnxruntime/version b/src/frontends/onnx/tests/ci_utils/onnxruntime/version index 85c399f04d78df..774958e19512de 100644 --- a/src/frontends/onnx/tests/ci_utils/onnxruntime/version +++ b/src/frontends/onnx/tests/ci_utils/onnxruntime/version @@ -1 +1 @@ -rel-1.18.1 +rel-1.19.2 diff --git a/src/frontends/onnx/tests/tests_python/test_backend.py b/src/frontends/onnx/tests/tests_python/test_backend.py index ca4f90ed5d94be..39b9788d720af3 100644 --- a/src/frontends/onnx/tests/tests_python/test_backend.py +++ b/src/frontends/onnx/tests/tests_python/test_backend.py @@ -57,6 +57,7 @@ xfail_issue_113506, skip_dynamic_model, xfail_issue_119896, + skip_issue_119896, xfail_issue_119900, xfail_issue_119903, xfail_issue_119906, @@ -246,7 +247,11 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None "OnnxBackendNodeModelTest.test_maxunpool_export_with_output_shape_cpu", "OnnxBackendNodeModelTest.test_maxunpool_export_without_output_shape_cpu", ), - (xfail_issue_38724, "OnnxBackendNodeModelTest.test_resize_tf_crop_and_resize_cpu"), + ( + xfail_issue_38724, + "OnnxBackendNodeModelTest.test_resize_tf_crop_and_resize_cpu", + "OnnxBackendNodeModelTest.test_resize_tf_crop_and_resize_extrapolation_value_cpu" + ), ( xfail_issue_33606, "OnnxBackendNodeModelTest.test_det_2d_cpu", @@ -454,6 +459,7 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None "OnnxBackendNodeModelTest.test_resize_upsample_sizes_nearest_axes_2_3_cpu", "OnnxBackendNodeModelTest.test_resize_upsample_sizes_nearest_axes_3_2_cpu", "OnnxBackendNodeModelTest.test_resize_upsample_sizes_nearest_not_larger_cpu", + "OnnxBackendNodeModelTest.test_resize_upsample_sizes_nearest_not_smaller_cpu", ), ( xfail_issue_99970, @@ -520,6 +526,13 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None "OnnxBackendNodeModelTest.test_dequantizelinear_e4m3fn_float16_cpu", "OnnxBackendNodeModelTest.test_dequantizelinear_e4m3fn_zero_point_cpu", ), + ( + skip_issue_119896, + "OnnxBackendNodeModelTest.test_cast_no_saturate_FLOAT16_to_FLOAT8E4M3FN_cpu", + "OnnxBackendNodeModelTest.test_cast_no_saturate_FLOAT16_to_FLOAT8E5M2_cpu", + "OnnxBackendNodeModelTest.test_cast_no_saturate_FLOAT_to_FLOAT8E4M3FN_cpu", + "OnnxBackendNodeModelTest.test_cast_no_saturate_FLOAT_to_FLOAT8E5M2_cpu", + ), ( xfail_issue_119900, "OnnxBackendNodeModelTest.test_resize_downsample_scales_linear_half_pixel_symmetric_cpu", @@ -626,6 +639,7 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None skip_misalignment, "OnnxBackendNodeModelTest.test_gelu_default_2_expanded_cpu", "OnnxBackendNodeModelTest.test_reduce_log_sum_exp_empty_set_expanded_cpu", + "OnnxBackendNodeModelTest.test_reduce_max_empty_set_cpu", "OnnxBackendNodeModelTest.test_group_normalization_epsilon_cpu", "OnnxBackendNodeModelTest.test_group_normalization_example_cpu", "OnnxBackendNodeModelTest.test_qlinearmatmul_3D_int8_float16_cpu", diff --git a/thirdparty/onnx/onnx b/thirdparty/onnx/onnx index 990217f043af72..b8baa844668649 160000 --- a/thirdparty/onnx/onnx +++ b/thirdparty/onnx/onnx @@ -1 +1 @@ -Subproject commit 
990217f043af7222348ca8f0301e17fa7b841781 +Subproject commit b8baa8446686496da4cc8fda09f2b6fe65c2a02c diff --git a/tools/constraints.txt b/tools/constraints.txt index b19b18fc844de4..b86029feaab4d4 100644 --- a/tools/constraints.txt +++ b/tools/constraints.txt @@ -3,7 +3,7 @@ # files because the version differs between them: # tensorflow, numpy h5py>=3.1.0,<3.11.0 -onnx>=1.8.1,<=1.16.0 +onnx>=1.8.1,<=1.17.0 networkx<=3.1.0 pytest>=5.0,<8.4 protobuf>=3.18.1,<4.0.0 From e279492be8ec67606993ac83684289a1bbf2f503 Mon Sep 17 00:00:00 2001 From: Tingqian Li Date: Fri, 25 Oct 2024 19:06:55 +0800 Subject: [PATCH 031/233] [CPU] Add per-token asym INT8 dynamic quantization support to QKV/MLP node (#27001) ### Details: - to use AMX-INT8 to boost performance of QKV/MLP layers in LLM, we need dynamic per-token INT8 quantization according to many research papers (SmoothQuant for example) and reference implementations (xFT for example), here we add the support, when following conditions are met: - platform support AMX-INT8 - QKV&MLP weights are symmetrically per-OC quantized as INT8 - add AMX-FP16 support to QKV/MLP layers on GNR platform. - optimize single batch 2nd token special case. ### Tickets: - *ticket-id* --------- Signed-off-by: Luo Cheng Co-authored-by: Luo Cheng --- src/plugins/intel_cpu/CMakeLists.txt | 2 +- src/plugins/intel_cpu/src/graph.cpp | 4 +- .../src/nodes/kernels/x64/mlp_kernel.cpp | 383 +++++++++++++---- .../src/nodes/kernels/x64/mlp_kernel.hpp | 359 ++++++++++++---- .../src/nodes/kernels/x64/mlp_utils.cpp | 178 +++++++- .../src/nodes/kernels/x64/mlp_utils.hpp | 35 +- src/plugins/intel_cpu/src/nodes/llm_mlp.cpp | 386 ++++++++++++++---- src/plugins/intel_cpu/src/nodes/llm_mlp.h | 17 +- src/plugins/intel_cpu/src/nodes/qkv_proj.cpp | 297 ++++++++++---- src/plugins/intel_cpu/src/nodes/qkv_proj.h | 20 +- .../cpu_opset/x64/op/llm_mlp.cpp | 11 +- .../cpu_opset/x64/op/llm_mlp.hpp | 5 + .../cpu_opset/x64/op/qkv_proj.cpp | 11 +- .../cpu_opset/x64/op/qkv_proj.hpp | 23 +- .../cpu_opset/x64/pass/mlp_fusion.cpp | 149 ++++++- .../cpu_opset/x64/pass/qkv_proj_fusion.cpp | 181 +++++++- .../cpu_opset/x64/pass/qkv_proj_fusion.hpp | 6 + .../transformation_pipeline.cpp | 26 +- .../subgraph_tests/src/x64/mlp_fusion.cpp | 49 ++- .../src/x64/qkv_proj_fusion.cpp | 55 ++- 20 files changed, 1787 insertions(+), 410 deletions(-) diff --git a/src/plugins/intel_cpu/CMakeLists.txt b/src/plugins/intel_cpu/CMakeLists.txt index 3f349bebb4db82..6965b7a25ce512 100644 --- a/src/plugins/intel_cpu/CMakeLists.txt +++ b/src/plugins/intel_cpu/CMakeLists.txt @@ -326,7 +326,7 @@ cross_compiled_file(${TARGET_NAME} ARCH AVX512F ANY src/nodes/kernels/x64/mlp_utils.cpp API src/nodes/kernels/x64/mlp_utils.hpp - NAME llm_mlp_transpose_epi32_16x16 + NAME llm_mlp_transpose_epi32_16x16 llm_mlp_quantize_bf16_i8 llm_mlp_quantize_f16_i8 llm_mlp_dequantize_i32_f32 NAMESPACE ov::Extensions::Cpu::XARCH ) diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index fa3502468a9cf3..45118763a3eaf9 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -1688,7 +1688,9 @@ void Graph::EnforceInferencePrecision() { Type::MatMul, // bert nets Type::ROIPooling, // object detection nets Type::Interpolate, // super resolution nets - Type::PagedAttention))// page attention + Type::PagedAttention, // page attention + Type::QKVProjection, + Type::LLMMLP)) continue; // stop at significant nodes } else if (inferPrec == ov::element::f16) { /* list of node types that must be forced to be 
executed in FP16 precision diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_kernel.cpp index 24528e7c5afc3d..fd00fca8431ff0 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_kernel.cpp @@ -5,6 +5,7 @@ #include "mlp_kernel.hpp" #include "emitters/plugin/x64/jit_dnnl_emitters.hpp" #include "mlp_utils.hpp" +#include "openvino/core/parallel.hpp" using namespace dnnl::impl; using namespace dnnl::impl::utils; @@ -16,7 +17,7 @@ using TileConfiger = ov::Extensions::Cpu::TileConfiger; namespace ov { namespace intel_cpu { -void MKernel::generate() { +void MKernel::generate_2x2() { Xbyak::Reg64 reg_A_addr = abi_param2; Xbyak::Reg64 reg_A_stride = abi_param3; Xbyak::Reg64 reg_B_addr = abi_param4; @@ -95,27 +96,27 @@ void MKernel::generate() { tileloadd(tmmB0, ptr[reg_B_addr + reg_B_stride]); lea(reg_B_addr, ptr[reg_B_addr + 1024]); - tdpbf16ps(tmmC00, tmmA0, tmmB0); + tmul(tmmC00, tmmA0, tmmB0); if (cur_PFB < num_PFB) { prefetcht2(ptr[reg_prefetch + cur_PFB * 64]); cur_PFB++; } tileloadd(tmmA1, ptr[reg_A1_addr + reg_A_stride]); - tdpbf16ps(tmmC10, tmmA1, tmmB0); + tmul(tmmC10, tmmA1, tmmB0); if (cur_PFB < num_PFB) { prefetcht2(ptr[reg_prefetch + cur_PFB * 64]); cur_PFB++; } tileloadd(tmmB1, ptr[reg_B_addr + reg_B_stride]); - tdpbf16ps(tmmC01, tmmA0, tmmB1); + tmul(tmmC01, tmmA0, tmmB1); if (cur_PFB < num_PFB) { prefetcht2(ptr[reg_prefetch + cur_PFB * 64]); cur_PFB++; } - tdpbf16ps(tmmC11, tmmA1, tmmB1); + tmul(tmmC11, tmmA1, tmmB1); if (cur_PFB < num_PFB) { for (int pi = cur_PFB; pi < num_PFB; pi++) { prefetcht2(ptr[reg_prefetch + pi * 64]); @@ -168,6 +169,93 @@ void MKernel::tile_config_M(TileConfig& tile_cfg, int M) { }); } +void MKernel::generate_1x2() { + Xbyak::Reg64 reg_A_addr = abi_param2; + Xbyak::Reg64 reg_A_stride = abi_param3; + Xbyak::Reg64 reg_B_addr = abi_param4; + Xbyak::Reg64 reg_C_addr = abi_param5; + Xbyak::Reg64 reg_C_stride = abi_param6; + + Xbyak::Reg64 reg_ktiles = rax; + Xbyak::Reg64 reg_B_stride = r10; + // Xbyak::Reg64 reg_prefetch = r12; + + Xbyak::Tmm tmmC00 = tmm0; + Xbyak::Tmm tmmC01 = tmm1; + Xbyak::Tmm tmmA0 = tmm4; + Xbyak::Tmm tmmB0 = tmm6; + Xbyak::Tmm tmmB1 = tmm7; + + Xbyak::Label loop_over_ktiles; + Xbyak::Label skip_load; + + { + auto reg_tmp = reg_B_stride; + + mov(reg_A_addr, ptr[abi_param1 + offsetof(call_args, pA)]); + mov(reg_A_stride, ptr[abi_param1 + offsetof(call_args, strideA)]); + mov(reg_B_addr, ptr[abi_param1 + offsetof(call_args, pB)]); + mov(reg_C_addr, ptr[abi_param1 + offsetof(call_args, pC)]); + mov(reg_C_stride, ptr[abi_param1 + offsetof(call_args, strideC)]); + mov(reg_ktiles, ptr[abi_param1 + offsetof(call_args, k_tiles)]); + + mov(reg_tmp, ptr[abi_param1 + offsetof(call_args, do_accumulation)]); + // new: bit0: 0-skip load from mem, 1-load from mem; bit1: 0-skip tilezero, 1-tilezero; bit2: 0-skip store, 1-store + mov(abi_param1, reg_tmp); + and_(reg_tmp, 1); + jz(skip_load); + { + tileloadd(tmmC00, ptr[reg_C_addr + reg_C_stride]); + tileloadd(tmmC01, ptr[reg_C_addr + reg_C_stride + 64]); + } + L(skip_load); + mov(reg_tmp, abi_param1); + and_(reg_tmp, 2); + Xbyak::Label skip_zero; + jz(skip_zero); + { + tilezero(tmmC00); + tilezero(tmmC01); + } + L(skip_zero); + } + + mov(reg_B_stride, 64); + + auto const_A_steps = 64; + + align(64, false); + L(loop_over_ktiles); + { + // B: 1x2 tiles + // A : 2x1 tiles C: 2x2 tiles + tileloadd(tmmA0, ptr[reg_A_addr + reg_A_stride]); + tileloadd(tmmB0, 
ptr[reg_B_addr + reg_B_stride]); + lea(reg_B_addr, ptr[reg_B_addr + 1024]); + + tmul(tmmC00, tmmA0, tmmB0); + + tileloadd(tmmB1, ptr[reg_B_addr + reg_B_stride]); + tmul(tmmC01, tmmA0, tmmB1); + + lea(reg_A_addr, ptr[reg_A_addr + const_A_steps]); + lea(reg_B_addr, ptr[reg_B_addr + 1024]); + } + dec(reg_ktiles); + jnz(loop_over_ktiles, T_NEAR); + + and_(abi_param1, 4); + Xbyak::Label skip_store; + jz(skip_store); + { + tilestored(ptr[reg_C_addr + reg_C_stride], tmmC00); + tilestored(ptr[reg_C_addr + reg_C_stride + 64], tmmC01); + } + L(skip_store); + + ret(); +} + class FP16ToBF16Kernel : public dnnl::impl::cpu::x64::jit_generator { public: DECLARE_CPU_JIT_AUX_FUNCTIONS(FP16ToBF16Kernel) @@ -190,95 +278,186 @@ class FP16ToBF16Kernel : public dnnl::impl::cpu::x64::jit_generator { } }; -template -void MKernel::repackB(ov::bfloat16* dst, T* src, int N_stride, int N, int K) { +template +static typename std::enable_if::value || std::is_same::value>::type +repackB(Tdst* dst, ov::float16* src, int N_stride, int N, int K) { static FP16ToBF16Kernel fp16_to_bf16; - - if (N == 16 && K == 32 && (std::is_same::value || std::is_same::value)) { + if (N == 16 && K == 32) { // SIMD optimized version - ov::Extensions::Cpu::XARCH::llm_mlp_transpose_epi32_16x16(dst, src, N_stride * sizeof(T)); - if (std::is_same::value) { + ov::Extensions::Cpu::XARCH::llm_mlp_transpose_epi32_16x16(dst, src, N_stride * sizeof(Tdst)); + if (std::is_same::value) fp16_to_bf16(dst); - } return; } assert(K <= 32); assert(N <= 16); int k = 0; - ov::bfloat16 bf16zero(0.0f); + Tdst zero(0.0f); for (; k < 32; k += 2) { int n = 0; bool is_k0_valid = (k) < K; bool is_k1_valid = (k + 1) < K; auto* psrc = src + k; for (; n < 16 && n < N; n++, psrc += N_stride) { - *dst++ = is_k0_valid ? ov::bfloat16(psrc[0]) : bf16zero; - *dst++ = is_k1_valid ? ov::bfloat16(psrc[1]) : bf16zero; + *dst++ = is_k0_valid ? static_cast(psrc[0]) : zero; + *dst++ = is_k1_valid ? static_cast(psrc[1]) : zero; + } + for (; n < 16; n++) { + *dst++ = 0; + *dst++ = 0; + } + } +} + +static void repackB(int8_t* dst, int8_t* src, int N_stride, int N, int K) { + if (N == 16 && K == 64) { + // SIMD optimized version + ov::Extensions::Cpu::XARCH::llm_mlp_transpose_epi32_16x16(dst, src, N_stride * sizeof(int8_t)); + return; + } + + assert(K <= 64); + assert(N <= 16); + for (int k = 0; k < 64; k += 4) { + bool is_k0_valid = (k) < K; + bool is_k1_valid = (k + 1) < K; + bool is_k2_valid = (k + 2) < K; + bool is_k3_valid = (k + 3) < K; + auto* psrc = src + k; + int n = 0; + for (; n < 16 && n < N; n++, psrc += N_stride) { + *dst++ = is_k0_valid ? psrc[0] : 0; + *dst++ = is_k1_valid ? psrc[1] : 0; + *dst++ = is_k2_valid ? psrc[2] : 0; + *dst++ = is_k3_valid ? 
psrc[3] : 0; } for (; n < 16; n++) { *dst++ = 0; *dst++ = 0; + *dst++ = 0; + *dst++ = 0; } } } -template -void MKernel::prepareB(PlainTensor& ret, ov::bfloat16* dst, T* p_weight, int stride, int N, int K) { +template +void MKernel::BMatrix::setup(Tdst* ext_buff, ov::float16* p_weight, int weight_stride_in_bytes, int N, int K) { OPENVINO_ASSERT((N % 32) == 0); OPENVINO_ASSERT((K % 32) == 0); - // weight matrix is in unit of [N/32, Kx32] - ret.resize({static_cast(N / 32), static_cast(K * 32)}, dst); - - auto N_stride = stride / sizeof(T); - for (int n = 0, blkn = 0; n < N; n += 32, blkn++) { - auto* dst_base = ret.ptr(blkn, 0); - for (int k = 0, blkk = 0; k < K; k += 32, blkk++, dst_base += 1024) { - // two adjacent 32x16 (512) block of weight: dst0 & dst1 - auto* dst0 = dst_base; - auto* dst1 = dst0 + 16 * 32; - auto valid_k = (K - k) < 32 ? (K - k) : 32; - - auto* src0 = p_weight + n * N_stride + k; - auto valid_n0 = (N - n) < 16 ? (N - n) : 16; - repackB(dst0, src0, N_stride, valid_n0, valid_k); - - auto* src1 = p_weight + (n + 16) * N_stride + k; - auto valid_n1 = (N - (n + 16)) < 16 ? (N - (n + 16)) : 16; - repackB(dst1, src1, N_stride, valid_n1, valid_k); + + this->ptr = reinterpret_cast(ext_buff); + this->Bpair_rows = K/32; + this->Bpair_cols = N/32; + + const int k_step = 32; + auto N_stride = weight_stride_in_bytes / sizeof(Tdst); + auto* pdst = reinterpret_cast(ext_buff); + for (int n = 0; n < N; n += 32) { + auto* src0 = p_weight + n * N_stride; + auto* src1 = p_weight + (n + 16) * N_stride; + auto valid_n0 = std::min((N - n), 16); + auto valid_n1 = std::min((N - (n + 16)), 16); + for (int k = 0, blkk = 0; k < K; k += k_step, blkk++) { + auto valid_k = std::min((K - k), k_step); + repackB(reinterpret_cast(pdst), src0 + k, N_stride, valid_n0, valid_k); + pdst += 1024; + repackB(reinterpret_cast(pdst), src1 + k, N_stride, valid_n1, valid_k); + pdst += 1024; + } + } +} + +template void MKernel::BMatrix::setup(ov::bfloat16*, ov::float16*, int, int, int); +template void MKernel::BMatrix::setup(ov::float16*, ov::float16*, int, int, int); + +void MKernel::BMatrix::setup(int8_t* ext_buff, int8_t* p_weight, int weight_stride_in_bytes, int N, int K) { + OPENVINO_ASSERT((N % 32) == 0); + OPENVINO_ASSERT((K % 64) == 0); + + this->ptr = reinterpret_cast(ext_buff); + this->Bpair_rows = K/64; + this->Bpair_cols = N/32; + + const int k_step = 64; + auto N_stride = weight_stride_in_bytes / sizeof(int8_t); + auto* pdst = reinterpret_cast(ext_buff); + for (int n = 0; n < N; n += 32) { + auto* src0 = p_weight + n * N_stride; + auto* src1 = p_weight + (n + 16) * N_stride; + auto valid_n0 = std::min((N - n), 16); + auto valid_n1 = std::min((N - (n + 16)), 16); + for (int k = 0, blkk = 0; k < K; k += k_step, blkk++) { + auto valid_k = std::min((K - k), k_step); + repackB(reinterpret_cast(pdst), src0 + k, N_stride, valid_n0, valid_k); + pdst += 1024; + repackB(reinterpret_cast(pdst), src1 + k, N_stride, valid_n1, valid_k); + pdst += 1024; } } } // interleaving two weights into one in unit of 16-column -template -void MKernel::prepareB(PlainTensor& ret, ov::bfloat16* dst, T* p_weight1, T* p_weight2, int stride, int N, int K) { +template +void MKernel::BMatrix::setup(Tdst* ext_buff, + ov::float16* p_weight_B0, + ov::float16* p_weight_B1, + int weight_stride_in_bytes, + int N, + int K) { OPENVINO_ASSERT((N % 32) == 0); OPENVINO_ASSERT((K % 32) == 0); - // weight matrix is in unit of [N/32, Kx32] - ret.resize({static_cast(N / 32), static_cast(K * 32)}, dst); - auto N_stride = stride / sizeof(T); + this->ptr 
= reinterpret_cast(ext_buff); + this->Bpair_rows = K / 32; + this->Bpair_cols = N / 32; + + const int k_step = 32; + auto N_stride = weight_stride_in_bytes / sizeof(Tdst); auto N2 = N / 2; - for (int n = 0, blkn = 0; n < N2; n += 16, blkn++) { - for (int k = 0, blkk = 0; k < K; k += 32, blkk++) { - // two adjacent 32x16 (512) block of weight: dst0 & dst1 - auto* dst0 = ret.ptr(blkn, blkk * 1024); - auto* dst1 = dst0 + 16 * 32; - auto valid_k = (K - k) < 32 ? (K - k) : 32; - - auto* src0 = p_weight1 + n * N_stride + k; - auto valid_n0 = (N2 - n) < 16 ? (N2 - n) : 16; - repackB(dst0, src0, N_stride, valid_n0, valid_k); - - auto* src1 = p_weight2 + n * N_stride + k; - repackB(dst1, src1, N_stride, valid_n0, valid_k); + auto* pdst = reinterpret_cast(ext_buff); + for (int n = 0; n < N2; n += 16) { + auto valid_n0 = std::min((N2 - n), 16); + for (int k = 0; k < K; k += k_step) { + auto valid_k = std::min((K - k), k_step); + repackB(reinterpret_cast(pdst), p_weight_B0 + n * N_stride + k, N_stride, valid_n0, valid_k); + pdst += 1024; + repackB(reinterpret_cast(pdst), p_weight_B1 + n * N_stride + k, N_stride, valid_n0, valid_k); + pdst += 1024; } } } +template void MKernel::BMatrix::setup(ov::bfloat16*, ov::float16*, ov::float16*, int, int, int); +template void MKernel::BMatrix::setup(ov::float16*, ov::float16*, ov::float16*, int, int, int); + +void MKernel::BMatrix::setup(int8_t* ext_buff, + int8_t* p_weight_B0, + int8_t* p_weight_B1, + int weight_stride_in_bytes, + int N, + int K) { + OPENVINO_ASSERT((N % 32) == 0); + OPENVINO_ASSERT((K % 64) == 0); -template void MKernel::prepareB(PlainTensor& ret, ov::bfloat16* dst, ov::float16* p_weight, int stride, int N, int K); -template void MKernel::prepareB(PlainTensor& ret, ov::bfloat16* dst, ov::float16* p_weight1, ov::float16* p_weight2, int stride, int N, int K); + this->ptr = reinterpret_cast(ext_buff); + this->Bpair_rows = K / 64; + this->Bpair_cols = N / 32; + + const int k_step = 64; + auto N_stride = weight_stride_in_bytes / sizeof(int8_t); + auto N2 = N / 2; + auto* pdst = reinterpret_cast(ext_buff); + for (int n = 0; n < N2; n += 16) { + auto valid_n0 = std::min((N2 - n), 16); + for (int k = 0; k < K; k += k_step) { + auto valid_k = std::min((K - k), k_step); + repackB(reinterpret_cast(pdst), p_weight_B0 + n * N_stride + k, N_stride, valid_n0, valid_k); + pdst += 1024; + repackB(reinterpret_cast(pdst), p_weight_B1 + n * N_stride + k, N_stride, valid_n0, valid_k); + pdst += 1024; + } + } +} // run L2 cache blocking kernel with size: // [BM, BK]*[BK, BN] => [BM, BN] @@ -288,26 +467,24 @@ template void MKernel::prepareB(PlainTensor& ret, ov::bfloat16* dst // but prefetch of next B must be specified by caller. 
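+// Iteration scheme of run() below: A is processed in 32-row slices; within each slice the
+// JIT kernel is invoked once per B tile-pair column (32 output columns), advancing pB by
+// Bpair_rows * Bpair_size bytes and the prefetch pointer by prefetch_step between calls.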
// void MKernel::run(int M, // actual M - uint8_t* pA, - int strideA, // A [M, K] - PlainTensor& repacked_B, // B [N/32, K*32] ov::bfloat16 - uint8_t* pC, - int strideC, // C [M, N] - uint8_t* prefetch_B, // prefetch B - bool do_accumulation) { + uint8_t* pA, + int strideA, // A [M, K] + BMatrix& repacked_B, // B [N/32, K*32] ov::bfloat16 + uint8_t* pC, + int strideC, // C [M, N] + uint8_t* prefetch_B, // prefetch B + bool do_accumulation) { call_args args; - // number of blocks in N dimension (in unit of 32 columns) - auto num_blkN = static_cast(repacked_B.size(0)); - auto K = repacked_B.size(1) / 32; - auto* pB = repacked_B.ptr(); - auto strideB = repacked_B.stride_bytes(0); + + auto* pB = repacked_B.ptr; + auto strideB = repacked_B.Bpair_rows * repacked_B.Bpair_size; + auto num_blkN = repacked_B.Bpair_cols; args.do_accumulation = do_accumulation; - args.k_tiles = K / 32; + args.k_tiles = repacked_B.Bpair_rows; args.strideA = strideA; args.strideC = strideC; args.prefetch = prefetch_B; - assert((K % 32) == 0); auto prefetch_step = m_prefetch_Blines * 64 * args.k_tiles; @@ -316,13 +493,47 @@ void MKernel::run(int M, // actual M args.pB = pB; args.M = std::min(M - m, 32); args.pA = pA; - for (int ni = 0; ni < num_blkN; ni++, args.pB += strideB, args.prefetch += prefetch_step) { + for (size_t ni = 0; ni < num_blkN; ni++, args.pB += strideB, args.prefetch += prefetch_step) { args.pC = pC + ni * 32 * sizeof(float); (*this)(&args); } } } +void MatrixDynQuantPerRow::quantize(size_t BM, ov::bfloat16* psrc, int src_stride) { + assert(BM <= M); + parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { + size_t start{0}, end{0}; + splitter(BM, nthr, ithr, start, end); + ov::Extensions::Cpu::XARCH::llm_mlp_quantize_bf16_i8(psrc + start * src_stride, + src_stride, + data + start * K, + K, + end - start, + K, + scale + start, + zp + start, + asym); + }); +} + +void MatrixDynQuantPerRow::quantize(size_t BM, ov::float16* psrc, int src_stride) { + assert(BM <= M); + parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { + size_t start{0}, end{0}; + splitter(BM, nthr, ithr, start, end); + ov::Extensions::Cpu::XARCH::llm_mlp_quantize_f16_i8(psrc + start * src_stride, + src_stride, + data + start * K, + K, + end - start, + K, + scale + start, + zp + start, + asym); + }); +} + void GateUpCombine::generate() { Xbyak::Label loop_begin; @@ -365,7 +576,11 @@ void GateUpCombine::generate() { injector->compute_vector(zmm_silu.getIdx()); vmovups(zmm_up, ptr[src + loop_i * 8 + 16 * 4]); vmulps(zmm_up, zmm_up, zmm_silu); - vcvtneps2bf16(ymm_dst, zmm_up); + if (m_to_f16) { + vcvtps2ph(ymm_dst, zmm_up, 0x4); + } else { + vcvtneps2bf16(ymm_dst, zmm_up); + } prefetchwt1(ptr[prefetch_dst + loop_i * 2]); vmovdqu(ptr[dst + loop_i * 2], ymm_dst); } @@ -401,9 +616,15 @@ void ReduceAdd2bh::generate() { vmovups(zmm3, ptr[src1 + loop_i * 4 + 16 * 4]); vaddps(zmm0, zmm0, zmm1); vaddps(zmm2, zmm2, zmm3); - vcvtne2ps2bf16(zmm4, zmm2, zmm0); - prefetchwt1(ptr[prefetch_dst + loop_i * 2]); - vmovups(ptr[dst + loop_i * 2], zmm4); + if (m_to_f16) { + vcvtps2ph(ptr[dst + loop_i * 2], zmm0, 0x4); + vcvtps2ph(ptr[dst + loop_i * 2 + 32], zmm2, 0x4); + prefetchwt1(ptr[prefetch_dst + loop_i * 2]); + } else { + vcvtne2ps2bf16(zmm4, zmm2, zmm0); + prefetchwt1(ptr[prefetch_dst + loop_i * 2]); + vmovups(ptr[dst + loop_i * 2], zmm4); + } } add(loop_i, 32); cmp(loop_i, BN); @@ -426,9 +647,15 @@ void ReduceAdd2bh::generate() { { vmovups(zmm0, ptr[src0 + loop_i * 4]); vmovups(zmm2, ptr[src0 + loop_i * 4 + 16 * 4]); - 
vcvtne2ps2bf16(zmm4, zmm2, zmm0); - prefetchwt1(ptr[prefetch_dst + loop_i * 2]); - vmovups(ptr[dst + loop_i * 2], zmm4); + if (m_to_f16) { + vcvtps2ph(ptr[dst + loop_i * 2], zmm0, 0x4); + vcvtps2ph(ptr[dst + loop_i * 2 + 32], zmm2, 0x4); + prefetchwt1(ptr[prefetch_dst + loop_i * 2]); + } else { + vcvtne2ps2bf16(zmm4, zmm2, zmm0); + prefetchwt1(ptr[prefetch_dst + loop_i * 2]); + vmovups(ptr[dst + loop_i * 2], zmm4); + } } add(loop_i, 32); cmp(loop_i, BN); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_kernel.hpp index 1e9634b0d56332..26b6c1b0cc17a2 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_kernel.hpp @@ -11,6 +11,7 @@ // register blocking size for K dimension (1x2 AMX B-tiles) #define REG_BLK_K_SIZE 32 +#define REG_BLK_K_SIZE_I8 64 // register blocking size for N dimension (1x2 AMX B-tiles) #define REG_BLK_N_SIZE 32 @@ -42,23 +43,67 @@ class AutoTileConfiger { void* last_cfg = nullptr; }; +enum class TMUL_TYPE { SSD = 1, USD = 2, SUD = 3, UUD = 4, FP16 = 5, BF16 = 6 }; + class MKernel : public dnnl::impl::cpu::x64::jit_generator { public: DECLARE_CPU_JIT_AUX_FUNCTIONS(MKernel) int m_prefetch_Blines; - - MKernel(int M_hint = 256) : jit_generator("MKernel") { + const TMUL_TYPE m_tmul_type; + int m_tile_reg_ksize; + int m_M_hint; + + MKernel(int M_hint, TMUL_TYPE tmul_type) : jit_generator("MKernel"), m_tmul_type(tmul_type), m_M_hint(M_hint) { + if (m_tmul_type == TMUL_TYPE::FP16 || m_tmul_type == TMUL_TYPE::BF16) + m_tile_reg_ksize = 32; + else + m_tile_reg_ksize = 64; setup(M_hint); } - void generate() override; + void tmul(const Xbyak::Tmm& x1, const Xbyak::Tmm& x2, const Xbyak::Tmm& x3) { + switch (m_tmul_type) { + case TMUL_TYPE::SSD: + tdpbssd(x1, x2, x3); + break; + case TMUL_TYPE::USD: + tdpbusd(x1, x2, x3); + break; + case TMUL_TYPE::SUD: + tdpbsud(x1, x2, x3); + break; + case TMUL_TYPE::UUD: + tdpbuud(x1, x2, x3); + break; + case TMUL_TYPE::FP16: + tdpfp16ps(x1, x2, x3); + break; + case TMUL_TYPE::BF16: + tdpbf16ps(x1, x2, x3); + break; + } + } + + void generate() override { + if (m_M_hint <= 16) { + generate_1x2(); + } else { + generate_2x2(); + } + } + + void generate_2x2(); + void generate_1x2(); // M_hint is only a hint for prefetching, set to 0 to avoid prefetch void setup(int M_hint = 0) { if (M_hint == 0) { m_prefetch_Blines = 0; } else { + // next block size: 32 * N * sizeof(ov::bfloat16), + // call number: N / 32 * M / 32 + // each call needs fetch: 32 * N * sizeof(ov::bfloat16) / (N / 32 * M / 32) = 32 * 1024 * sizeof(ov::bfloat16) / M m_prefetch_Blines = 32768 * sizeof(ov::bfloat16) / 64 / M_hint; } @@ -71,26 +116,14 @@ class MKernel : public dnnl::impl::cpu::x64::jit_generator { // - tile config controls behaviour of tileload & TMUL void tile_config_M(ov::Extensions::Cpu::TileConfig& tile_cfg, int M); - // row data is in layout [N, K], maybe smaller than [32, 16] - template - void repackB(ov::bfloat16* dst, T* src, int N_stride, int N, int K); - - // weight is supposed to be of shape[N, K], stride in unit of bytes - template - void prepareB(PlainTensor& ret, ov::bfloat16* dst, T* p_weight, int stride, int N, int K); - - // interleaving two weights into one in unit of 16-column - template - void prepareB(PlainTensor& ret, ov::bfloat16* dst, T* p_weight1, T* p_weight2, int stride, int N, int K); - // to save push/pop: do not use `abi_save_gpr_regs` uint8_t* prefetch_next_A_addr; struct call_args { - const uint8_t* pA; 
// bfloat16 + const uint8_t* pA; // bfloat16/int8 int64_t strideA; // in bytes - const uint8_t* pB; // bfloat16 - const uint8_t* pC; // float32 + const uint8_t* pB; // bfloat16/int8 + const uint8_t* pC; // float32/int32 int64_t strideC; // in bytes const uint8_t* prefetch; int64_t k_tiles; // K / 32 @@ -98,6 +131,27 @@ class MKernel : public dnnl::impl::cpu::x64::jit_generator { int64_t M; }; + // each 32x16(64x16) sub-matrix in [K, N]-shaped BMatrix to be loaded as B-tile is packed into AMX B-tile layout + // and two neighboring B-tiles in same row are grouped as a pair (B0-B1), and all such pairs are arranged in [nN, nK] shape + struct BMatrix { + uint8_t * ptr; + // Bpair is two 1KB sub-matrixes repacked in AMX-Btile layout + const size_t Bpair_size = 2048; + size_t Bpair_rows; + size_t Bpair_cols; + + // convert + template + void setup(Tdst* ext_buff, ov::float16* p_weight, int stride, int N, int K); + + void setup(int8_t* ext_buff, int8_t* p_weight, int stride, int N, int K); + // two B tiles in each pair (B0 & B1) comes from different raw weight matrix + template + void setup(Tdst* ext_buff, ov::float16* p_weight_B0, ov::float16* p_weight_B1, int stride, int N, int K); + + void setup(int8_t* ext_buff, int8_t* p_weight_B0, int8_t* p_weight_B1, int stride, int N, int K); + }; + // run L2 cache blocking kernel with size: // [BM, BK]*[BK, BN] => [BM, BN] // @@ -107,17 +161,19 @@ class MKernel : public dnnl::impl::cpu::x64::jit_generator { // void run(int M, // actual M uint8_t* pA, - int strideA, // A [M, K] - PlainTensor& repacked_B, // B [N/32, K*32] ov::bfloat16 + int strideA, // A [M, K] + BMatrix& repacked_B, // B uint8_t* pC, int strideC, // C [M, N] uint8_t* prefetch_B, // prefetch B bool do_accumulation); }; - struct Work { - std::vector weights; // ov::bfloat16 weights for current thread + std::vector weights; + + // will be used only when activation is being quantized asymmetrically + PlainTensor w_sum_per_oc; std::shared_ptr sync_flag; int n0 = 0; @@ -127,29 +183,57 @@ struct Work { int BN = 0; int blk_K_size = 0; int output_id; - ov::float16* p_raw_weights; + void* p_raw_weights; operator bool() { return BN > 0; } + bool quant_i8 = false; + bool is_f16 = false; + MKernel& get_MKernel() { constexpr int BM = 256; - static MKernel jit_amx0(BM); - return jit_amx0; + static MKernel jit_amx_bf16(BM, TMUL_TYPE::BF16); + static MKernel jit_amx_f16(BM, TMUL_TYPE::FP16); + static MKernel jit_amx_i8(BM, TMUL_TYPE::SSD); + if (quant_i8) return jit_amx_i8; + if (is_f16) return jit_amx_f16; + return jit_amx_bf16; + } + + MKernel& get_MKernel_1x2() { + static MKernel jit_amx_bf16(16, TMUL_TYPE::BF16); + static MKernel jit_amx_f16(16, TMUL_TYPE::FP16); + static MKernel jit_amx_i8(16, TMUL_TYPE::SSD); + if (quant_i8) return jit_amx_i8; + if (is_f16) return jit_amx_f16; + return jit_amx_bf16; } // input : weight [N, K], setup repacks range of N [n_start, n_end) - template - void setup(ov::bfloat16* dst, T* p_weight, int stride) { + template + void setup(Tdst* dst, Tsrc* p_weight, int stride_in_bytes, bool do_sum_per_oc = false) { auto& mkernel = get_MKernel(); auto num_blk_K = (k1 - k0 + blk_K_size - 1) / blk_K_size; - auto* pw = p_weight + n0 * stride / sizeof(T); + auto* pw = p_weight + n0 * stride_in_bytes / sizeof(Tsrc); + + if (do_sum_per_oc) { + w_sum_per_oc.resize({static_cast(n1 - n0)}); + auto * p_wsum_per_oc = w_sum_per_oc.ptr(); + auto* pw_temp = pw; + for (int n = n0; n < n1; n++, pw_temp += stride_in_bytes / sizeof(Tsrc)) { + float fsum = 0; + for (int k = k0; k < k1; k++) + fsum 
+= pw_temp[k]; + *p_wsum_per_oc++ = fsum; + } + } // weight is divided along K dimension into equal size blk_K_size, except last block. weights.resize(num_blk_K); for (int k = k0, ki = 0; k < k1;) { auto subK = std::min(blk_K_size, k1 - k); - mkernel.prepareB(weights[ki], dst, pw + k, stride, BN, subK); + weights[ki].setup(dst, pw + k, stride_in_bytes, BN, subK); dst += BN*subK; k += subK; ki++; @@ -160,18 +244,40 @@ struct Work { } } - template - void setup(ov::bfloat16* dst, T* p_weight1, T* p_weight2, int stride) { + // two weight matrix interleaved in unit of B-tiles + // in each Bpair, p_weight1 stored in B0, p_weight2 stored in B1 + template + void setup(Tdst* dst, Tsrc* p_weight1, Tsrc* p_weight2, int stride_in_bytes, bool do_sum_per_oc = false) { auto& mkernel = get_MKernel(); auto num_blk_K = (k1 - k0 + blk_K_size - 1) / blk_K_size; - auto* pw1 = p_weight1 + (n0/2) * stride / sizeof(T); - auto* pw2 = p_weight2 + (n0/2) * stride / sizeof(T); + auto* pw1 = p_weight1 + (n0/2) * stride_in_bytes / sizeof(Tsrc); + auto* pw2 = p_weight2 + (n0/2) * stride_in_bytes / sizeof(Tsrc); + + if (do_sum_per_oc) { + w_sum_per_oc.resize({static_cast(n1 - n0)}); + auto * p_wsum_per_oc = w_sum_per_oc.ptr(); + auto* pw1_temp = pw1; + auto* pw2_temp = pw2; + auto stride_temp = stride_in_bytes / sizeof(Tsrc); + for (int n = n0; n < n1; n+=32) { + for (int dn = 0; dn < 16; dn++, pw1_temp += stride_temp) { + float fsum = 0; + for (int k = k0; k < k1; k++) fsum += pw1_temp[k]; + *p_wsum_per_oc++ = fsum; + } + for (int dn = 0; dn < 16; dn++, pw2_temp += stride_temp) { + float fsum = 0; + for (int k = k0; k < k1; k++) fsum += pw2_temp[k]; + *p_wsum_per_oc++ = fsum; + } + } + } // weight is divided along K dimension into equal size blk_K_size, except last block. weights.resize(num_blk_K); for (int k = k0, ki = 0; k < k1;) { auto subK = std::min(blk_K_size, k1 - k); - mkernel.prepareB(weights[ki], dst, pw1 + k, pw2 + k, stride, BN, subK); + weights[ki].setup(dst, pw1 + k, pw2 + k, stride_in_bytes, BN, subK); dst += BN*subK; k += subK; ki++; @@ -198,7 +304,7 @@ struct Work { void run(int M, uint8_t* pA, int strideA) { auto& mkernel = get_MKernel(); - int num_blk_K = weights.size(); + auto num_blk_K = weights.size(); auto Mtails = M % 32; auto Mbody = M - Mtails; @@ -208,36 +314,75 @@ struct Work { OPENVINO_ASSERT(C_M * C_stride_bytes <= m_C.stride_bytes(0) * m_C.size(0)); auto pC = reinterpret_cast(m_C.ptr_v()); - pA += k0 * sizeof(ov::bfloat16); - bool do_accumulation = false; - - for (int ki = 0; ki < num_blk_K; ki++) { - PlainTensor& blockB = weights[ki]; - PlainTensor& blockB1 = weights[(ki + 1) < num_blk_K ? (ki + 1) : ki]; - if (Mbody) { - m_tile_configer.do_config(&m_tcfg[0]); - mkernel.run(Mbody, - pA + ki * blk_K_size * sizeof(ov::bfloat16), - strideA, - blockB, - pC, - C_stride_bytes, - reinterpret_cast(blockB1.ptr_v()), - do_accumulation); + auto element_size = quant_i8 ? sizeof(int8_t) : sizeof(ov::bfloat16); + + pA += k0 * element_size; + + if (M > 16 || num_blk_K ==1) { + bool do_accumulation = false; + for (size_t ki = 0; ki < num_blk_K; ki++) { + auto& blockB = weights[ki]; + auto& blockB1 = weights[(ki + 1) < num_blk_K ? 
(ki + 1) : ki]; + if (Mbody) { + m_tile_configer.do_config(&m_tcfg[0]); + mkernel.run(Mbody, + pA + ki * blk_K_size * element_size, + strideA, + blockB, + pC, + C_stride_bytes, + blockB1.ptr, + do_accumulation); + } + + if (Mtails) { + m_tile_configer.do_config(&m_tcfg[Mtails]); + mkernel.run(Mtails, + pA + ki * blk_K_size * element_size + Mbody * strideA, + strideA, + blockB, + pC + Mbody * C_stride_bytes, + C_stride_bytes, + blockB1.ptr, + do_accumulation); + } + do_accumulation = true; } - - if (Mtails) { - m_tile_configer.do_config(&m_tcfg[Mtails]); - mkernel.run(Mtails, - pA + ki * blk_K_size * sizeof(ov::bfloat16) + Mbody * strideA, - strideA, - blockB, - pC + Mbody * C_stride_bytes, - C_stride_bytes, - reinterpret_cast(blockB1.ptr_v()), - do_accumulation); + } else { + auto& jit = get_MKernel_1x2(); + auto& blockB = weights[0]; + // number of blocks in N dimension (in unit of 32 columns) + auto num_blkN = blockB.Bpair_cols; + m_tile_configer.do_config(&m_tcfg[Mtails]); + // original: bit0: 0-tilezero+skip load from mem, 1-tilezero+load from mem; tilestore + // new: bit0: 0-skip load from mem, 1-load from mem; bit1: 0-skip tilezero, 1-tilezero; bit2: 0-skip store, 1-store + // if M > 32, firstK: 1 1 0(store, tilezero, skip load) + // the otherK except last: 1 0 1(store, skip tilezero, load) lastK: 1 0 1 + // else + // firstK: 0 1 0(skip store, tilezero, skip load), the otherK except last: 0 0 0(skip all), + // lastK: 1 0 0(store, skip tile zero, skip load) + int do_accumulation; + MKernel::call_args args; + args.strideA = strideA; + args.strideC = C_stride_bytes; + args.M = Mtails; + for (size_t ni = 0; ni < num_blkN; ni++) { + args.pC = pC + ni * 32 * sizeof(float); + do_accumulation = 0b010; + for (size_t ki = 0; ki < num_blk_K; ki++) { + auto& blockB = weights[ki]; + args.k_tiles = blockB.Bpair_rows; + args.pA = pA + ki * blk_K_size * element_size; + args.pB = blockB.ptr + ni * blockB.Bpair_rows * blockB.Bpair_size; + args.do_accumulation = do_accumulation; + // prefetch next N block. In memory bound, it seems no prefetch will be better. + // args.prefetch = args.pB + (ni == num_blkN - 1 ? 0 : strideB); + // args.prefetch = args.pB; + // [M, K] * [K, N]: [1..32, 256] * [256, 32] + jit(&args); + do_accumulation = (ki == num_blk_K - 2) ? 
0b100 : 0; + } } - do_accumulation = true; } m_tile_configer.do_config(nullptr); } @@ -247,19 +392,75 @@ struct Work { struct WeightBuffer { PlainTensor buffer; std::vector offsets; - void alloc(std::vector& works) { - size_t weight_cnt = 0; + void alloc(std::vector& works, int element_size) { + size_t weight_size = 0; for (auto& work : works) { - offsets.push_back(weight_cnt); - weight_cnt += (work.n1 - work.n0) * (work.k1 - work.k0); + offsets.push_back(weight_size); + weight_size += (work.n1 - work.n0) * (work.k1 - work.k0) * element_size; } - buffer.resize({weight_cnt}); + buffer.resize({weight_size}); } - ov::bfloat16* get(int work_id) { - return buffer.ptr() + offsets[work_id]; + template + T* get(int work_id) { + return reinterpret_cast(buffer.ptr() + offsets[work_id]); } }; +struct ScratchBuffAllocator { + using CallBack = std::function; + std::vector m_allocs; + std::vector m_sizes; + size_t m_total_size = 0; + ScratchBuffAllocator() = default; + + // register size / allocate totally size / inform consumers + void register_allocation(size_t size, CallBack cb) { + m_allocs.push_back(cb); + m_total_size += size; + m_sizes.push_back(size); + } + size_t size() { + return m_total_size; + } + void finalize(void* base) { + auto* ptr = reinterpret_cast(base); + for (size_t i = 0; i < m_allocs.size(); i++) { + m_allocs[i](ptr); + ptr += m_sizes[i]; + } + } +}; + +struct MatrixDynQuantPerRow { + // M x K + int M; + int K; + int8_t * data; + float * scale; + float * zp; + bool asym = true; + + MatrixDynQuantPerRow() = default; + + size_t size() { + // size of data & scale & zp + return M*K + M*sizeof(float)*2; + } + + size_t stride() { + return K; + } + + void setup(void * ext_buf) { + data = reinterpret_cast(ext_buf); + scale = reinterpret_cast(data + M*K); + zp = reinterpret_cast(scale + M); + } + + void quantize(size_t BM, ov::bfloat16* psrc, int src_stride); + void quantize(size_t BM, ov::float16* psrc, int src_stride); +}; + // combine gate_proj & up_proj using activation algo, then convert to bf16 // ConvertFP32toBF16(act_fn(gate) * up) class GateUpCombine : public dnnl::impl::cpu::x64::jit_generator { @@ -267,13 +468,16 @@ class GateUpCombine : public dnnl::impl::cpu::x64::jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(GateUpCombine) const dnnl_alg_kind_t m_act_alg; - GateUpCombine(dnnl_alg_kind_t act_alg) : jit_generator(jit_name()), m_act_alg(act_alg) { + const bool m_to_f16; + + GateUpCombine(dnnl_alg_kind_t act_alg, bool to_f16) : jit_generator(jit_name()), m_act_alg(act_alg), m_to_f16(to_f16) { create_kernel(); } void generate() override; - void call(float* src, size_t src_stride, ov::bfloat16 * dst, size_t dst_stride, int num_rows, int num_cols) { + void call(float* src, size_t src_stride, void * pv_dst, size_t dst_stride, int num_rows, int num_cols) { + auto* dst = reinterpret_cast(pv_dst); for (int m = 0; m < num_rows; m++, src += src_stride, dst += dst_stride) { auto* prefetch_dst = (m + 1 < num_rows) ? 
(dst + dst_stride) : (dst); @@ -296,15 +500,17 @@ class ReduceAdd2bh : public dnnl::impl::cpu::x64::jit_generator { public: DECLARE_CPU_JIT_AUX_FUNCTIONS(ReduceAdd2bh) - bool m_do_reduce2; - ReduceAdd2bh(bool do_reduce2) : jit_generator(jit_name()), m_do_reduce2(do_reduce2) { + const bool m_do_reduce2; + const bool m_to_f16; + ReduceAdd2bh(bool do_reduce2, bool to_f16) : jit_generator(jit_name()), m_do_reduce2(do_reduce2), m_to_f16(to_f16) { create_kernel(); } void generate() override; // add two float input eltwise and convert to bf16 : ConvertFP32toBF16(src0 + src1) - void call(float * src0, float * src1, size_t src_stride, ov::bfloat16 * dst, size_t dst_stride, int num_rows, int num_cols) { + void call(float * src0, float * src1, size_t src_stride, void * pf16_dst, size_t dst_stride, int num_rows, int num_cols) { + auto* dst = reinterpret_cast(pf16_dst); for (int m = 0; m < num_rows; m++, src0 += src_stride, src1 += src_stride, dst += dst_stride) { // the prefetch distance is increased to ensure by the time store happens // prefetch has done and no HW prefetcher is triggered @@ -314,7 +520,8 @@ class ReduceAdd2bh : public dnnl::impl::cpu::x64::jit_generator { } // convert tensor to bf16: ConvertFP32toBF16(src0) - void call(float * src0, size_t src_stride, ov::bfloat16 * dst, size_t dst_stride, int num_rows, int num_cols) { + void call(float * src0, size_t src_stride, void * pf16_dst, size_t dst_stride, int num_rows, int num_cols) { + auto* dst = reinterpret_cast(pf16_dst); for (int m = 0; m < num_rows; m++, src0 += src_stride, dst += dst_stride) { // the prefetch distance is increased to ensure by the time store happens // prefetch has done and no HW prefetcher is triggered diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_utils.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_utils.cpp index 996561a04af153..f95d8da62111dc 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_utils.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_utils.cpp @@ -4,10 +4,9 @@ #include "mlp_utils.hpp" - #include #if defined(HAVE_AVX512F) -#include +# include #endif #include "../scaled_attn/transpose_kernel.hpp" @@ -17,12 +16,181 @@ namespace Cpu { namespace XARCH { void llm_mlp_transpose_epi32_16x16(void* dst, void* src, int stride) { - transpose_16x16_kernel(reinterpret_cast(dst), reinterpret_cast(src), 16, stride/sizeof(uint32_t)); + transpose_16x16_kernel(reinterpret_cast(dst), + reinterpret_cast(src), + 16, + stride / sizeof(uint32_t)); +} + +template +void llm_mlp_quantize_to_i8(T* psrc, + int src_stride, + int8_t* pdst, + int dst_stride, + int rows, + int cols, + float* p_scales, + float* p_zp, + bool asym) { + auto clamp_i8 = [](float x) { + auto v = static_cast(std::round(x)); + if (v < -128) + return -128; + if (v > 127) + return 127; + return v; + }; + + for (int y = 0; y < rows; y++, psrc += src_stride, pdst += dst_stride) { + int x = 0; + float f_min, f_max; +#if defined(HAVE_AVX512F) + auto v_max = mm512_uni_loadu_ps(psrc + 0); + auto v_min = mm512_uni_loadu_ps(psrc + 0); + for (; x + 16 <= cols; x += 16) { + auto v = mm512_uni_loadu_ps(psrc + x); + v_max = _mm512_max_ps(v, v_max); + v_min = _mm512_min_ps(v, v_min); + } + f_max = _mm512_reduce_max_ps(v_max); + f_min = _mm512_reduce_min_ps(v_min); +#else + f_min = psrc[0]; + f_max = psrc[0]; +#endif + for (; x < cols; x++) { + auto f_cur = static_cast(psrc[x]); + f_min = std::min(f_min, f_cur); + f_max = std::max(f_max, f_cur); + } + // (q - z) * s = f + // (-128 - z) * s = f_min; + // ( 127 - z) * s = f_max; + 
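+        // Solving the two endpoint equations above for the asymmetric case gives
+        //     scale = (f_max - f_min) / 255   and   zp = 127 - f_max / scale,
+        // so f_max maps to +127 and f_min maps to -128; the symmetric branch below
+        // instead keeps zp = 0 and uses scale = max(|f_min|, |f_max|) / 127.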
float scale, zp; + if (f_max == f_min || std::isnan(f_max) || std::isnan(f_min)) { + // special case + p_zp[y] = 0; + p_scales[y] = std::isnan(f_min) ? 0 : f_min; +#if defined(HAVE_AVX512F) + auto vi8x16 = _mm_set1_epi8(1); + for (; x + 16 <= cols; x += 16) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(pdst + x), vi8x16); + } +#endif + for (; x < cols; x++) { + pdst[x] = 1; + } + continue; + } else if (asym) { + scale = (f_max - f_min) / 255.0f; + zp = 127 - (f_max / scale); + } else { + auto fx = std::max(std::abs(f_max), std::abs(f_min)); + scale = fx / 127.0f; + zp = 0; + } + p_zp[y] = zp; + p_scales[y] = scale; + x = 0; +#if defined(HAVE_AVX512F) + auto vscale = _mm512_set1_ps(1.0f / scale); + auto vzp = _mm512_set1_ps(zp); + for (; x + 16 <= cols; x += 16) { + auto v = mm512_uni_loadu_ps(psrc + x); + v = _mm512_fmadd_ps(v, vscale, vzp); + auto vi32x16 = _mm512_cvtps_epi32(v); + auto vi8x16 = _mm512_cvtepi32_epi8(vi32x16); + _mm_storeu_si128(reinterpret_cast<__m128i*>(pdst + x), vi8x16); + } +#endif + for (; x < cols; x++) { + pdst[x] = clamp_i8(psrc[x] / scale + zp); + } + } +} + +void llm_mlp_quantize_bf16_i8(ov::bfloat16* psrc, + int src_stride, + int8_t* pdst, + int dst_stride, + int rows, + int cols, + float* p_scales, + float* p_zp, + bool asym) { + llm_mlp_quantize_to_i8(psrc, src_stride, pdst, dst_stride, rows, cols, p_scales, p_zp, asym); +} + +void llm_mlp_quantize_f16_i8(ov::float16* psrc, + int src_stride, + int8_t* pdst, + int dst_stride, + int rows, + int cols, + float* p_scales, + float* p_zp, + bool asym) { + llm_mlp_quantize_to_i8(psrc, src_stride, pdst, dst_stride, rows, cols, p_scales, p_zp, asym); +} + +void llm_mlp_dequantize_i32_f32(int Batch, + int OC, + int32_t* src, + int stride_src, + float* dst, + int stride_dst, + float* p_src_scale_per_row, + float* p_src_zp_per_row, + float* p_wsum_per_oc, + float* p_wscale_per_oc, + bool asym) { + for (int b = 0; b < Batch; b++, src += stride_src, dst += stride_dst) { + float s1 = p_src_scale_per_row[b]; + float z1s1 = p_src_zp_per_row[b] * s1; + int oc = 0; +#if defined(HAVE_AVX512F) + if (asym) { + auto vs1 = _mm512_set1_ps(s1); + auto vz1s1 = _mm512_set1_ps(z1s1); + for (; oc + 16 <= OC; oc += 16) { + auto vs2 = mm512_uni_loadu_ps(p_wscale_per_oc + oc); + auto vi32x16 = _mm512_loadu_si512(src + oc); + auto vsrc = _mm512_cvtepi32_ps(vi32x16); + auto vwsum = mm512_uni_loadu_ps(p_wsum_per_oc + oc); + auto vwsum_z1s1 = _mm512_mul_ps(vwsum, vz1s1); + vsrc = _mm512_fmsub_ps(vsrc, vs1, vwsum_z1s1); + vsrc = _mm512_mul_ps(vsrc, vs2); + mm512_uni_storeu_ps(dst + oc, vsrc); + } + } else { + auto vs1 = _mm512_set1_ps(s1); + for (; oc + 16 <= OC; oc += 16) { + auto vs2 = mm512_uni_loadu_ps(p_wscale_per_oc + oc); + auto vi32x16 = _mm512_loadu_si512(src + oc); + auto vsrc = _mm512_cvtepi32_ps(vi32x16); + vsrc = _mm512_mul_ps(vsrc, vs1); + vsrc = _mm512_mul_ps(vsrc, vs2); + mm512_uni_storeu_ps(dst + oc, vsrc); + } + } +#endif + for (; oc < OC; oc++) { + // + // fdst = sum{(qi-z1)*s1 * wi*s2} + // = sum{[qi*wi] *(s1*s2) - z1*s1*wi*s2} + // = sum{qi*wi} *(s1*s2) - sum{wi}*(z1*s1*s2) + // + float s2 = p_wscale_per_oc[oc]; + if (asym) { + dst[oc] = src[oc] * (s1 * s2) - p_wsum_per_oc[oc] * (z1s1 * s2); + } else { + dst[oc] = src[oc] * (s1 * s2); // - sum_w *(z1 * s1 * s2); + } + } + } } } // namespace XARCH } // namespace Cpu } // namespace Extensions } // namespace ov - - diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_utils.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_utils.hpp index 
075717971f43f4..d60cf143d374e5 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_utils.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_utils.hpp @@ -2,9 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // +#include #include #include -#include + +#include "openvino/core/type/bfloat16.hpp" +#include "openvino/core/type/float16.hpp" namespace ov { namespace Extensions { @@ -12,7 +15,35 @@ namespace Cpu { namespace XARCH { void llm_mlp_transpose_epi32_16x16(void* dst, void* src, int stride); - +void llm_mlp_quantize_bf16_i8(ov::bfloat16* psrc, + int src_stride, + int8_t* pdst, + int dst_stride, + int rows, + int cols, + float* p_scales, + float* p_zp, + bool asym); +void llm_mlp_quantize_f16_i8(ov::float16* psrc, + int src_stride, + int8_t* pdst, + int dst_stride, + int rows, + int cols, + float* p_scales, + float* p_zp, + bool asym); +void llm_mlp_dequantize_i32_f32(int Batch, + int OC, + int32_t* src, + int stride_src, + float* dst, + int stride_dst, + float* p_src_scale_per_row, + float* p_src_zp_per_row, + float* p_wsum_per_oc, + float* p_wscale_per_oc, + bool asym); } // namespace XARCH } // namespace Cpu } // namespace Extensions diff --git a/src/plugins/intel_cpu/src/nodes/llm_mlp.cpp b/src/plugins/intel_cpu/src/nodes/llm_mlp.cpp index 72f9ca872003ee..13c46a7c976cfd 100644 --- a/src/plugins/intel_cpu/src/nodes/llm_mlp.cpp +++ b/src/plugins/intel_cpu/src/nodes/llm_mlp.cpp @@ -14,8 +14,11 @@ #include "shape_inference/shape_inference_internal_dyn.hpp" #include "utils/plain_tensor.hpp" +#include "openvino/core/parallel.hpp" + #if defined(OPENVINO_ARCH_X86_64) #include "kernels/x64/mlp_kernel.hpp" +#include "kernels/x64/mlp_utils.hpp" #endif namespace ov { @@ -24,6 +27,7 @@ namespace node { #if defined(OPENVINO_ARCH_X86_64) +template class LinearKsplit2 { public: std::vector works; @@ -35,22 +39,24 @@ class LinearKsplit2 { LinearKsplit2() {} ReduceAdd2bh * p_jit_reduce2bh; + // weight [N, K] // Gate & Up are interleaved in N dimension: 16-gate / 16-up // and post-ops will compute silu(gate)*up in unit of 16 elements // and store out as bfloat16. - template - void setup(T* p_weight, int stride, int N, int K) { - static ReduceAdd2bh jit_reduce2bh_2(true); + void setup(void* p_weight, int stride, int N, int K, const LLMMLPNode::Config& config) { + bool is_quantized = config.down_quantized; + + auto reg_blk_K_size = is_quantized ? REG_BLK_K_SIZE_I8 : REG_BLK_K_SIZE; + auto cache_blk_k_size = is_quantized ? CACHE_BLK_K_SIZE : CACHE_BLK_K_SIZE; + auto weight_element_size = is_quantized ? 
sizeof(int8_t) : sizeof(ov::float16); OPENVINO_ASSERT((N % REG_BLK_N_SIZE) == 0); - OPENVINO_ASSERT((K % REG_BLK_K_SIZE) == 0); + OPENVINO_ASSERT((K % reg_blk_K_size) == 0); auto nthr = parallel_get_max_threads(); auto num_blk_N = N / REG_BLK_N_SIZE; works.resize(nthr); - p_jit_reduce2bh = &jit_reduce2bh_2; - auto K_splits = 2; // split task on more cores is better on TBB auto valid_nthr = nthr / 2; @@ -70,7 +76,7 @@ class LinearKsplit2 { // split K dimension in unit of 32 evenly among 2 worker-threads auto start_blkK = 0; - auto num_blk_K = K / REG_BLK_K_SIZE; + auto num_blk_K = K / reg_blk_K_size; auto blkK_per_thread = (num_blk_K + 1) / 2; for (int ik = 0; ik < K_splits; ik++) { auto blk_K = std::min(num_blk_K - start_blkK, blkK_per_thread); @@ -78,13 +84,15 @@ class LinearKsplit2 { auto& work = works[ithr + ik]; work.sync_flag = shared_atomic; - work.blk_K_size = CACHE_BLK_K_SIZE; + work.blk_K_size = cache_blk_k_size; work.n0 = (start_blkN) * REG_BLK_N_SIZE; work.n1 = (start_blkN + blkN) * REG_BLK_N_SIZE; work.BN = blkN * REG_BLK_N_SIZE; - work.k0 = start_blkK * REG_BLK_K_SIZE; - work.k1 = (start_blkK + blk_K) * REG_BLK_K_SIZE; + work.k0 = start_blkK * reg_blk_K_size; + work.k1 = (start_blkK + blk_K) * reg_blk_K_size; + work.quant_i8 = is_quantized; + work.is_f16 = std::is_same::value; start_blkK += blk_K; used_nthr++; @@ -96,39 +104,70 @@ class LinearKsplit2 { DEBUG_LOG("Linear N,K=", N, ",", K, " used_nthr=", used_nthr); - wbuffer.alloc(works); + wbuffer.alloc(works, weight_element_size); ov::parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; if (work) { - work.setup(wbuffer.get(ithr), p_weight, stride); + if (is_quantized) { + work.setup(wbuffer.get(ithr), reinterpret_cast(p_weight), stride, true); + } else { + work.setup(wbuffer.get(ithr), reinterpret_cast(p_weight), stride); + } } }); DEBUG_LOG(" setup is done. weight @ ", static_cast(p_weight)); } - void run(uint8_t* pA, int strideA, int M, ov::bfloat16* dstC, int strideC) { + void run(uint8_t* pA, int strideA, int M, T* dstC, int strideC, + const LLMMLPNode::Config& config, + MatrixDynQuantPerRow& src_dq, + float * w_scale) { + static ReduceAdd2bh jit_reduce2cvt(true, std::is_same::value); + ov::parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; auto& workC = work.m_C; if (work) { work.run(M, pA, strideA); + if (config.down_quantized) { + // de-quantize i32 results in-place into f32 + auto* ptr_c = work.m_C.template ptr(); + auto* ptr_wsum = work.w_sum_per_oc.template ptr(); + auto stride_c = work.m_C.stride(0); + ov::Extensions::Cpu::XARCH::llm_mlp_dequantize_i32_f32( + M, + work.BN, + reinterpret_cast(ptr_c), + stride_c, + ptr_c, + stride_c, + src_dq.scale, + src_dq.zp, + ptr_wsum, + w_scale + work.n0, + src_dq.asym); + } + auto sync_id = work.sync_flag->fetch_add(1); // (0,1) (2,3) if (sync_id & 1) { auto peer_ithr = (ithr & 1) ? 
(ithr - 1) : (ithr + 1); - auto& peerC = works[peer_ithr].m_C; + auto* p_peerC = works[peer_ithr].m_C.template ptr(); // the other one has finished, we can do the reduce sum - p_jit_reduce2bh->call(workC.ptr(), peerC.ptr(), workC.stride(0), - dstC + work.n0, strideC / sizeof(*dstC), - M, work.BN); + auto* p_curC = workC.template ptr(); + jit_reduce2cvt.call(p_curC, p_peerC, + workC.stride(0), + dstC + work.n0, strideC / sizeof(*dstC), + M, work.BN); } } }); } }; +template class LinearGateUp { public: std::vector works; @@ -145,10 +184,9 @@ class LinearGateUp { // Gate & Up are interleaved in N dimension: 16-gate / 16-up // and post-ops will compute silu(gate)*up in unit of 16 elements // and store out as bfloat16. - template - void setup(T* p_weight_gate, T* p_weight_up, int stride, int N, int K, const LLMMLPNode::Config& config) { - static GateUpCombine jit_gateup_silu(dnnl_eltwise_swish); - static GateUpCombine jit_gateup_gelu(dnnl_eltwise_gelu_tanh); + void setup(void* p_weight_gate, void* p_weight_up, int stride, int N, int K, const LLMMLPNode::Config& config) { + static GateUpCombine jit_gateup_silu(dnnl_eltwise_swish, std::is_same::value); + static GateUpCombine jit_gateup_gelu(dnnl_eltwise_gelu_tanh, std::is_same::value); if (config.act == LLMMLPNode::ACT_FN::GELU) jit_gateup = &jit_gateup_gelu; @@ -157,10 +195,16 @@ class LinearGateUp { else OPENVINO_THROW("unsupported act in GateUpCombine"); + bool quantized_int8 = config.gate_up_quantized; + + auto reg_blk_K_size = quantized_int8 ? REG_BLK_K_SIZE_I8 : REG_BLK_K_SIZE; + auto cache_blk_k_size = quantized_int8 ? CACHE_BLK_K_SIZE : CACHE_BLK_K_SIZE; + auto weight_element_size = quantized_int8 ? sizeof(int8_t) : sizeof(ov::float16); + // prepare weights, split N among threads // in unit of 32 OPENVINO_ASSERT((N % REG_BLK_N_SIZE) == 0); - OPENVINO_ASSERT((K % REG_BLK_K_SIZE) == 0); + OPENVINO_ASSERT((K % reg_blk_K_size) == 0); auto nthr = parallel_get_max_threads(); auto num_blk_N = N / REG_BLK_N_SIZE; works.resize(nthr); @@ -182,41 +226,78 @@ class LinearGateUp { auto shared_atomic = std::make_shared(0); auto& work = works[ithr]; work.sync_flag = shared_atomic; - work.blk_K_size = CACHE_BLK_K_SIZE; + work.blk_K_size = cache_blk_k_size; work.n0 = (start_blkN) * REG_BLK_N_SIZE; work.n1 = (start_blkN + blkN) * REG_BLK_N_SIZE; work.BN = blkN * REG_BLK_N_SIZE; work.k0 = 0; work.k1 = K; + work.quant_i8 = quantized_int8; + work.is_f16 = std::is_same::value; used_nthr++; } start_blkN += blkN; } - wbuffer.alloc(works); + wbuffer.alloc(works, weight_element_size); DEBUG_LOG("Linear N,K=", N, ",", K, " used_nthr=", used_nthr); ov::parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; if (work) { - work.setup(wbuffer.get(ithr), p_weight_gate, p_weight_up, stride); + if (quantized_int8) + work.setup(wbuffer.get(ithr), + reinterpret_cast(p_weight_gate), + reinterpret_cast(p_weight_up), + stride, true); + else + work.setup(wbuffer.get(ithr), + reinterpret_cast(p_weight_gate), + reinterpret_cast(p_weight_up), + stride); } }); DEBUG_LOG(" setup is done. 
weight @ ", static_cast(p_weight_gate)); } // gate & up are interleaved: 16 gates + 16 up - void runGateUp(uint8_t* pA, int strideA, int M, - ov::bfloat16* dstC, int strideC, - const LLMMLPNode::Config& config) { + void runGateUp(uint8_t* pA, int strideA_in_bytes, int M, + T* dstC, int strideC, + const LLMMLPNode::Config& config, + MatrixDynQuantPerRow& src_dq, + float * w_scale) { ov::parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; if (work) { - work.run(M, pA, strideA); + work.run(M, pA, strideA_in_bytes); + // K reduce is done, results of [M, BN] sub-block is ready in L2. // combine Gate & Up - jit_gateup->call(work.m_C.ptr(), work.m_C.stride(0), + float * ptr_c; + size_t stride_c; + if (config.gate_up_quantized) { + // dequantize m_C in-place + ptr_c = work.m_C.template ptr(); + stride_c = work.m_C.stride(0); + auto* p_wsum = work.w_sum_per_oc.template ptr(); + ov::Extensions::Cpu::XARCH::llm_mlp_dequantize_i32_f32( + M, + work.BN, + reinterpret_cast(ptr_c), + stride_c, + ptr_c, + stride_c, + src_dq.scale, + src_dq.zp, + p_wsum, + w_scale + work.n0, + src_dq.asym); + } else { + ptr_c = work.m_C.template ptr(); + stride_c = work.m_C.stride(0); + } + jit_gateup->call(ptr_c, stride_c, dstC + (work.n0 / 2), strideC / sizeof(*dstC), M, work.BN); } @@ -224,31 +305,68 @@ class LinearGateUp { } }; -struct LLMMLP::Impl { +template +struct LLMMLP::Executor : public LLMMLP::ExecutorBase { + LLMMLP* m_pnode; const LLMMLPNode::Config m_config; DnnlScratchPadPtr m_scrachPad; MemoryPtr m_scratchMem; uint8_t* m_scratch_base = nullptr; - LinearGateUp gate_up; - LinearKsplit2 down; + LinearGateUp gate_up; + LinearKsplit2 down; int m_N; int m_M = 0; // MLP is not supposed to run in parallel PlainTensor m_actUp; + // quantized input: in scratch buffer + MatrixDynQuantPerRow m_quant_act; + MatrixDynQuantPerRow m_quant_up_act; + + PlainTensor m_w_scale_gateup; + + bool m_rt_prec_f16; + // [M, K] x [N, K] => [M, N] x [K, N] => [M, K] // w_gate/w_up : [N, K] // w_down : [K, N] - Impl(PlainTensor w_gate, PlainTensor w_up, PlainTensor w_down, const LLMMLPNode::Config& config, DnnlScratchPadPtr scrachPad) - : m_config(config), m_scrachPad(scrachPad) { + Executor(LLMMLP* pnode, const LLMMLPNode::Config& config, DnnlScratchPadPtr scrachPad) + : m_pnode(pnode), m_config(config), m_scrachPad(scrachPad) { + PlainTensor w_gate(pnode->getSrcMemoryAtPort(1)); + PlainTensor w_up(pnode->getSrcMemoryAtPort(2)); + PlainTensor w_down(pnode->getSrcMemoryAtPort(3)); + + m_rt_prec_f16 = std::is_same::value; + // [N, K] [N, K] interleave (16-16-...) 
into [2*N, K] auto K = w_gate.size(1); auto N = w_gate.size(0); OPENVINO_ASSERT(w_gate.stride_bytes(0) == w_up.stride_bytes(0)); - gate_up.setup(w_gate.ptr(), w_up.ptr(), w_up.stride_bytes(0), N * 2, K, config); - down.setup(w_down.ptr(), w_down.stride_bytes(0), K, N); + if (m_config.gate_up_combined) { + N = w_gate.size(0) / 2; + gate_up.setup(w_gate.ptr_v(), w_up.ptr_v(N, 0), w_up.stride_bytes(0), N * 2, K, config); + } else { + gate_up.setup(w_gate.ptr_v(), w_up.ptr_v(), w_up.stride_bytes(0), N * 2, K, config); + } + down.setup(w_down.ptr_v(), w_down.stride_bytes(0), K, N, config); + + if (m_config.gate_up_quantized) { + m_w_scale_gateup.resize({N * 2}); + auto* w_scale_gate = pnode->getSrcMemoryAtPort(4)->getDataAs(); + auto* w_scale_up = pnode->getSrcMemoryAtPort(5)->getDataAs(); + auto* dst = m_w_scale_gateup.ptr(); + if (m_config.gate_up_combined) { + w_scale_up = w_scale_gate + N; + } + for (size_t i = 0; i < N; i += 16) { + memcpy(dst, w_scale_gate + i, 16 * sizeof(float)); + dst += 16; + memcpy(dst, w_scale_up + i, 16 * sizeof(float)); + dst += 16; + } + } m_N = N; } @@ -259,69 +377,125 @@ struct LLMMLP::Impl { cur_scratch_base = m_scratchMem->getDataAs(); // new M larger than previous or the scratch pointer is changed after the following allocation if (m_M < M || cur_scratch_base != m_scratch_base) { - size_t total_scratch_size = M * m_N * sizeof(ov::bfloat16); - std::vector scratch_offsets; + ScratchBuffAllocator allocator; + + allocator.register_allocation(M * m_N * sizeof(T), [&](void* ptr) { + m_actUp.resize({static_cast(M), static_cast(m_N)}, + reinterpret_cast(ptr)); + }); + auto nthr = parallel_get_max_threads(); for (int ithr = 0; ithr < nthr; ithr++) { - scratch_offsets.push_back(total_scratch_size); auto C1_size = gate_up.works[ithr].set_C(M, reinterpret_cast(cur_scratch_base)); auto C2_size = down.works[ithr].set_C(M, reinterpret_cast(cur_scratch_base)); auto max_C_size = std::max(C1_size, C2_size); - total_scratch_size += max_C_size; + allocator.register_allocation(max_C_size, [this, ithr, M](void* ptr) { + // these two op runs at different time step, so can share same scratch buffer + gate_up.works[ithr].set_C(M, reinterpret_cast(ptr)); + down.works[ithr].set_C(M, reinterpret_cast(ptr)); + }); } - auto newMemDesc = std::make_shared(ov::element::u8, Shape{total_scratch_size}); - m_scratchMem = m_scrachPad->createScratchPadMem(newMemDesc); + if (m_config.gate_up_quantized) { + m_quant_act.M = M; + m_quant_act.K = m_config.hidden_size; + allocator.register_allocation(m_quant_act.size(), [&](void* ptr){ + m_quant_act.setup(ptr); + }); + } + if (m_config.down_quantized) { + m_quant_up_act.M = M; + m_quant_up_act.K = m_config.up_size; + allocator.register_allocation(m_quant_up_act.size(), [&](void* ptr){ + m_quant_up_act.setup(ptr); + }); + } + + auto newMemDesc = std::make_shared(ov::element::u8, Shape{allocator.size()}); + m_scratchMem = m_scrachPad->createScratchPadMem(newMemDesc); m_scratch_base = m_scratchMem->getDataAs(); - m_actUp.resize({static_cast(M), static_cast(m_N)}, reinterpret_cast(m_scratch_base)); - for (int ithr = 0; ithr < nthr; ithr++) { - auto C_base = reinterpret_cast(m_scratch_base + scratch_offsets[ithr]); - gate_up.works[ithr].set_C(M, C_base); - down.works[ithr].set_C(M, C_base); - } + allocator.finalize(m_scratch_base); m_M = M; } } - void execute(LLMMLP* pnode) { - auto input = pnode->getSrcMemoryAtPort(0); + void execute() override { + auto input = m_pnode->getSrcMemoryAtPort(0); const auto& ishape = input->getStaticDims(); uint8_t* pA = 
input->getDataAs(); const auto& srcStrides = input->getDescWithType()->getStrides(); - int strideA = srcStrides[srcStrides.size() - 2] * sizeof(ov::bfloat16); + int strideA = srcStrides[srcStrides.size() - 2]; + int strideA_in_bytes = strideA * sizeof(T); int M = shape_size(ishape) / ishape[ishape.size() - 1]; - auto output = pnode->getDstMemoryAtPort(0); - auto* dstC = output->getDataAs(); + auto output = m_pnode->getDstMemoryAtPort(0); + auto* dstC = output->getDataAs(); const auto& dstStrides = output->getDescWithType()->getStrides(); - int strideC = dstStrides[dstStrides.size() - 2] * sizeof(ov::bfloat16); + int strideC = dstStrides[dstStrides.size() - 2] * sizeof(T); + + float* p_w_scale_down = nullptr; + if (m_config.down_quantized) { + p_w_scale_down = m_pnode->getSrcMemoryAtPort(6)->getDataAs(); + } for (int m = 0; m < M;) { int BM = std::min(M - m, CACHE_BLK_M_SIZE); setM(BM); - gate_up.runGateUp(pA, strideA, BM, m_actUp.ptr(), m_actUp.stride_bytes(0), m_config); - down.run(reinterpret_cast(m_actUp.ptr()), m_actUp.stride_bytes(0), BM, dstC, strideC); + + uint8_t* psrc = pA; + auto stride_src_in_bytes = strideA_in_bytes; + auto strideA_in_bytes = strideA * sizeof(T); + if (m_config.gate_up_quantized) { + m_quant_act.quantize(BM, reinterpret_cast(pA), strideA); + psrc = reinterpret_cast(m_quant_act.data); + stride_src_in_bytes = m_quant_act.K; + } + + // dequantize is fused into gate_up + gate_up.runGateUp(psrc, + stride_src_in_bytes, + BM, + m_actUp.ptr(), + m_actUp.stride_bytes(0), + m_config, + m_quant_act, + m_w_scale_gateup.ptr()); + + uint8_t * p_up_act = reinterpret_cast(m_actUp.ptr()); + size_t stride_up_act = m_actUp.stride_bytes(0); + if (m_config.down_quantized) { + m_quant_up_act.quantize(BM, m_actUp.ptr(), m_actUp.stride(0)); + p_up_act = reinterpret_cast(m_quant_up_act.data); + stride_up_act = m_quant_up_act.stride(); + } + + down.run(p_up_act, stride_up_act, BM, dstC, strideC, + m_config, + m_quant_up_act, + p_w_scale_down); m += BM; - pA += BM * strideA; - dstC += BM * strideC / sizeof(ov::bfloat16); + pA += BM * strideA_in_bytes; + dstC += BM * strideC / sizeof(T); } } }; #else -struct LLMMLP::Impl { - Impl(PlainTensor w_gate, PlainTensor w_up, PlainTensor w_down, const LLMMLPNode::Config& config, DnnlScratchPadPtr scrachPad) {} - void execute(LLMMLP* pnode) {} +template +struct LLMMLP::Executor : public LLMMLP::ExecutorBase { + Executor(LLMMLP* pnode, const LLMMLPNode::Config& config, DnnlScratchPadPtr scrachPad) {} + void execute() {} }; #endif LLMMLP::LLMMLP(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, NgraphShapeInferFactory(op, EMPTY_PORT_MASK)) { std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { + const auto & config = context->getConfig(); + if (!isSupportedOperation(op, errorMessage, config.fcDynamicQuantizationGroupSize)) { OPENVINO_THROW("CPU: " + errorMessage); } const auto node_mlp = std::dynamic_pointer_cast(op); @@ -332,39 +506,73 @@ void LLMMLP::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - auto rtPrecision = ov::element::bf16; - auto weightPrecision = ov::element::f16; - - // initialize input ports std::vector inPortConfigs; - inPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getInputShapeAtPort(0), false, -1); // input - inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(1), false, -1); // gate - inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(2), false, -1); // up - 
inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(3), false, -1); // down - - // initialize output port std::vector outPortConfigs; - outPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getOutputShapeAtPort(0), false, -1); + auto rtPrecision = getOriginalInputPrecisionAtPort(0); + + if (rtPrecision == ov::element::f32) { + // fallback to supported precision if possible + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx_fp16)) { + rtPrecision = ov::element::f16; + } else if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) { + rtPrecision = ov::element::bf16; + } + } + + OPENVINO_ASSERT(rtPrecision == ov::element::bf16 || rtPrecision == ov::element::f16, "Unexpected rtPrecision:", rtPrecision); + + if (m_mlp_config.gate_up_quantized) { + auto weightPrecision = ov::element::i8; + + // initialize input ports + inPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getInputShapeAtPort(0), false, -1); // input + inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(1), false, -1); // gate + inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(2), false, -1); // up + inPortConfigs.emplace_back(LayoutType::ncsp, m_mlp_config.down_quantized ? ov::element::i8 : ov::element::f16, + getInputShapeAtPort(3), false, -1); // down + inPortConfigs.emplace_back(LayoutType::ncsp, ov::element::f32, getInputShapeAtPort(4), false, -1); // gate_weight scales per OC + inPortConfigs.emplace_back(LayoutType::ncsp, ov::element::f32, getInputShapeAtPort(5), false, -1); // up_weight scales per OC + if (m_mlp_config.down_quantized) + inPortConfigs.emplace_back(LayoutType::ncsp, ov::element::f32, getInputShapeAtPort(6), false, -1); // down_weight scales per OC + + // initialize output port + outPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getOutputShapeAtPort(0), false, -1); + } else { + auto weightPrecision = ov::element::f16; + + // initialize input ports + inPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getInputShapeAtPort(0), false, -1); // input + inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(1), false, -1); // gate + inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(2), false, -1); // up + inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(3), false, -1); // down + + // initialize output port + outPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getOutputShapeAtPort(0), false, -1); + } addSupportedPrimDesc(inPortConfigs, outPortConfigs, impl_desc_type::ref_any); } -void LLMMLP::prepareParams() { - if (!m_pimpl) { - m_pimpl = std::make_shared(getSrcMemoryAtPort(1), - getSrcMemoryAtPort(2), - getSrcMemoryAtPort(3), - m_mlp_config, - context->getScratchPad()); +void LLMMLP::createPrimitive() { + auto rtPrecision = getInputPrecisions()[0]; +#ifdef OPENVINO_ARCH_X86_64 + if (rtPrecision == ov::element::bf16) { + m_executor = std::make_shared>(this, m_mlp_config, context->getScratchPad()); + } else if (rtPrecision == ov::element::f16) { + m_executor = std::make_shared>(this, m_mlp_config, context->getScratchPad()); + } +#endif + if (!m_executor) { + OPENVINO_THROW("LLMMLP Executor creation fails with precision " + rtPrecision.to_string()); } } void LLMMLP::execute(dnnl::stream strm) { MAYBE_UNUSED(strm); - m_pimpl->execute(this); + m_executor->execute(); } -bool LLMMLP::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { 
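(For orientation while reading the quantized path above: the arithmetic that Work::setup's w_sum_per_oc, llm_mlp_quantize_bf16_i8/llm_mlp_quantize_f16_i8 and llm_mlp_dequantize_i32_f32 implement together can be sketched as the scalar reference below. The function name mlp_int8_reference, the std::vector containers and the naive loops are illustrative placeholders only, not part of the patch; the patch performs the same computation with AMX INT8 tiles and the per-row/per-OC buffers shown in the diff.)

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Reference (scalar) version of the per-token dynamic-quantization path:
    //   C[m][n] = (sum_k q[m][k]*w[n][k]) * s1[m]*s2[n] - (sum_k w[n][k]) * z1[m]*s1[m]*s2[n]
    void mlp_int8_reference(const std::vector<std::vector<float>>& A,    // [M, K] activations
                            const std::vector<std::vector<int8_t>>& Wq,  // [N, K] per-OC quantized weights
                            const std::vector<float>& w_scale,           // [N] per-OC weight scales (s2)
                            std::vector<std::vector<float>>& C) {        // [M, N] output
        const size_t M = A.size(), K = A[0].size(), N = Wq.size();

        // per-OC weight sums, the analogue of Work::w_sum_per_oc
        std::vector<float> w_sum(N, 0.0f);
        for (size_t n = 0; n < N; n++)
            for (size_t k = 0; k < K; k++)
                w_sum[n] += Wq[n][k];

        for (size_t m = 0; m < M; m++) {
            // per-row asymmetric quantization (the f_max == f_min special case is omitted here)
            float f_min = *std::min_element(A[m].begin(), A[m].end());
            float f_max = *std::max_element(A[m].begin(), A[m].end());
            float s1 = (f_max - f_min) / 255.0f;   // per-row scale
            float z1 = 127.0f - f_max / s1;        // per-row zero-point
            std::vector<int8_t> q(K);
            for (size_t k = 0; k < K; k++) {
                float v = std::round(A[m][k] / s1 + z1);
                q[k] = static_cast<int8_t>(std::max(-128.0f, std::min(127.0f, v)));
            }

            for (size_t n = 0; n < N; n++) {
                int32_t acc = 0;                   // what the AMX INT8 kernel accumulates
                for (size_t k = 0; k < K; k++)
                    acc += static_cast<int32_t>(q[k]) * Wq[n][k];
                // dequantize, removing the zero-point contribution via the precomputed weight sum
                C[m][n] = acc * (s1 * w_scale[n]) - w_sum[n] * (z1 * s1 * w_scale[n]);
            }
        }
    }

Precomputing w_sum per output channel is what lets the inner product stay a pure i8 x i8 -> i32 accumulation: the activation zero-point is folded back in only once per output element during dequantization.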
+bool LLMMLP::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage, uint64_t fcDynamicQuantizationGroupSize) noexcept { #if defined(OPENVINO_ARCH_X86_64) try { const auto node_mlp = std::dynamic_pointer_cast(op); @@ -377,6 +585,18 @@ bool LLMMLP::isSupportedOperation(const std::shared_ptr& op, std } auto down_size = down_proj_w_pshape[0].get_length(); auto up_size = down_proj_w_pshape[1].get_length(); + + auto& config = node_mlp->get_config(); + if (config.gate_up_quantized && (fcDynamicQuantizationGroupSize < static_cast(config.hidden_size))) { + errorMessage = "LLMMLPNode gate-up-proj only support per-token dynamic quantization"; + return false; + } + + if (config.down_quantized && (fcDynamicQuantizationGroupSize < static_cast(config.up_size))) { + errorMessage = "LLMMLPNode down_proj only support per-token dynamic quantization"; + return false; + } + if (down_size % REG_BLK_K_SIZE) { errorMessage = "LLMMLPNode down_proj size is not multiple of register blocking size"; return false; diff --git a/src/plugins/intel_cpu/src/nodes/llm_mlp.h b/src/plugins/intel_cpu/src/nodes/llm_mlp.h index 3cdf6a0f1f7295..47f9faf70c5cc3 100644 --- a/src/plugins/intel_cpu/src/nodes/llm_mlp.h +++ b/src/plugins/intel_cpu/src/nodes/llm_mlp.h @@ -20,18 +20,27 @@ class LLMMLP : public Node { bool created() const override { return getType() == Type::LLMMLP; } - void prepareParams() override; + bool needPrepareParams() const override { + return false; + } + void createPrimitive() override; void executeDynamicImpl(dnnl::stream strm) override { execute(strm); } void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + static bool isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage, + uint64_t fcDynamicQuantizationGroupSize = 0) noexcept; private: - struct Impl; + struct ExecutorBase { + virtual void execute() = 0; + virtual ~ExecutorBase() = default; + }; + std::shared_ptr m_executor; + template struct Executor; LLMMLPNode::Config m_mlp_config; - std::shared_ptr m_pimpl; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/qkv_proj.cpp b/src/plugins/intel_cpu/src/nodes/qkv_proj.cpp index 27ed49b2301c2b..3260b12f1b5b4b 100644 --- a/src/plugins/intel_cpu/src/nodes/qkv_proj.cpp +++ b/src/plugins/intel_cpu/src/nodes/qkv_proj.cpp @@ -7,6 +7,7 @@ #include #include +#include "common/primitive_hashing_utils.hpp" #include "common/bfloat16.hpp" #include "common/cpu_memcpy.h" #include "cpu/x64/cpu_isa_traits.hpp" @@ -14,6 +15,15 @@ #include "shape_inference/shape_inference_internal_dyn.hpp" #include "utils/plain_tensor.hpp" +#if defined(OPENVINO_ARCH_X86_64) +#include "kernels/x64/mlp_utils.hpp" +#endif + +#include "openvino/core/parallel.hpp" + +using namespace dnnl::impl; +using namespace dnnl::impl::utils; + namespace ov { namespace intel_cpu { namespace node { @@ -42,7 +52,8 @@ static std::vector allocate_workers(const std::vector& grouped_works, return g_workers; } -struct QKVProjection::Impl { +template +struct QKVProjection::Executor : public QKVProjection::ExecutorBase { std::vector works; QKVProjection * m_node; DnnlScratchPadPtr m_scrachPad; @@ -50,23 +61,32 @@ struct QKVProjection::Impl { uint8_t* m_scratch_base = nullptr; int m_M = 0; + MatrixDynQuantPerRow m_quant_act; + WeightBuffer wbuffer; - Impl(QKVProjection * pnode, DnnlScratchPadPtr scrachPad) : m_node(pnode), m_scrachPad(scrachPad) { + Executor(QKVProjection * 
pnode, DnnlScratchPadPtr scrachPad) : m_node(pnode), m_scrachPad(scrachPad) { PlainTensor w0(pnode->getSrcMemoryAtPort(1)); PlainTensor w1(pnode->getSrcMemoryAtPort(2)); PlainTensor w2(pnode->getSrcMemoryAtPort(3)); + // in quantized mode, weights are already quantized in per-OC mode into INT8 + // and activations will be dynamically per-token quantized and using AMX-INT8 to get the result + bool quantized_int8 = m_node->m_config.quantized; + + auto cache_blk_k_size = quantized_int8 ? CACHE_BLK_K_SIZE : CACHE_BLK_K_SIZE; + auto weight_element_size = quantized_int8 ? sizeof(int8_t) : sizeof(ov::float16); + auto K = w0.size(1); - OPENVINO_ASSERT((K % CACHE_BLK_K_SIZE) == 0); + OPENVINO_ASSERT((K % cache_blk_k_size) == 0); auto nthr = parallel_get_max_threads(); - auto num_blk_K = K / CACHE_BLK_K_SIZE; - int stride = K * sizeof(ov::float16); + auto num_blk_K = K / cache_blk_k_size; + int stride_in_bytes = K * weight_element_size; works.resize(nthr); int cur_work_id = 0; - auto create_works = [&](ov::float16* pw, int output_id, int N, int valid_nthr) { + auto create_works = [&](void* pw, int output_id, int N, int valid_nthr) { // split task on more cores is better on TBB OPENVINO_ASSERT((N % REG_BLK_N_SIZE) == 0); auto num_blk_N = N / REG_BLK_N_SIZE; @@ -82,37 +102,51 @@ struct QKVProjection::Impl { } if (blkN) { auto& work = works[cur_work_id++]; - work.blk_K_size = CACHE_BLK_K_SIZE; + work.blk_K_size = cache_blk_k_size; work.n0 = (start_blkN) * REG_BLK_N_SIZE; work.n1 = (start_blkN + blkN) * REG_BLK_N_SIZE; work.BN = blkN * REG_BLK_N_SIZE; work.k0 = 0; - work.k1 = CACHE_BLK_K_SIZE * num_blk_K; + work.k1 = cache_blk_k_size * num_blk_K; work.output_id = output_id; work.p_raw_weights = pw; + work.quant_i8 = quantized_int8; + work.is_f16 = std::is_same::value; } start_blkN += blkN; } }; - auto proj_size0 = static_cast(w0.size(0)); - auto proj_size1 = static_cast(w1.size(0)); - auto proj_size2 = static_cast(w2.size(0)); + auto proj_size0 = m_node->m_config.proj_size0; + auto proj_size1 = m_node->m_config.proj_size1; + auto proj_size2 = m_node->m_config.proj_size2; auto n_group_workers = allocate_workers({proj_size0, proj_size1, proj_size2}, nthr); - create_works(w0.ptr(), 0, proj_size0, n_group_workers[0]); - create_works(w1.ptr(), 1, proj_size1, n_group_workers[1]); - create_works(w2.ptr(), 2, proj_size2, n_group_workers[2]); + if (m_node->m_config.weights_combined) { + auto* ptr_weights = reinterpret_cast(w0.ptr_v()); + create_works(ptr_weights, 0, proj_size0, n_group_workers[0]); + ptr_weights += proj_size0 * stride_in_bytes; + create_works(ptr_weights, 1, proj_size1, n_group_workers[1]); + ptr_weights += proj_size1 * stride_in_bytes; + create_works(ptr_weights, 2, proj_size2, n_group_workers[2]); + } else { + create_works(w0.ptr_v(), 0, proj_size0, n_group_workers[0]); + create_works(w1.ptr_v(), 1, proj_size1, n_group_workers[1]); + create_works(w2.ptr_v(), 2, proj_size2, n_group_workers[2]); + } DEBUG_LOG("QKVProj hidden_size=", K, " proj_sizes=", proj_size0, ",", proj_size1, ",", proj_size2, " used_nthr=", cur_work_id); - wbuffer.alloc(works); + wbuffer.alloc(works, weight_element_size); ov::parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; if (work) { - work.setup(wbuffer.get(ithr), work.p_raw_weights, stride); + if (quantized_int8) + work.setup(wbuffer.get(ithr), reinterpret_cast(work.p_raw_weights), stride_in_bytes, true); + else + work.setup(wbuffer.get(ithr), reinterpret_cast(work.p_raw_weights), stride_in_bytes); } }); } @@ -123,145 +157,245 
@@ struct QKVProjection::Impl { cur_scratch_base = m_scratchMem->getDataAs(); // new M larger than previous or the scratch pointer is changed after the following allocation if (m_M < M || cur_scratch_base != m_scratch_base) { - size_t total_scratch_size = 0; - std::vector scratch_offsets; + ScratchBuffAllocator allocator; for (auto& work : works) { if (work) { - scratch_offsets.push_back(total_scratch_size); auto C_size = work.set_C(M, reinterpret_cast(cur_scratch_base)); - total_scratch_size += C_size; + allocator.register_allocation(C_size, [&](void* ptr){ + work.set_C(M, reinterpret_cast(ptr)); + }); } } - auto newMemDesc = std::make_shared(ov::element::u8, Shape{total_scratch_size}); - m_scratchMem = m_scrachPad->createScratchPadMem(newMemDesc); + if (m_node->m_config.quantized) { + m_quant_act.M = M; + m_quant_act.K = m_node->m_config.hidden_size; + allocator.register_allocation(m_quant_act.size(), [&](void* ptr){ + m_quant_act.setup(ptr); + }); + } + // make sure scratch is big enough + auto newMemDesc = std::make_shared(ov::element::u8, Shape{allocator.size()}); + m_scratchMem = m_scrachPad->createScratchPadMem(newMemDesc); m_scratch_base = m_scratchMem->getDataAs(); - for (size_t ithr = 0; ithr < works.size(); ithr++) { - auto& work = works[ithr]; - if (work) { - work.set_C(M, reinterpret_cast(m_scratch_base + scratch_offsets[ithr])); - } - } + allocator.finalize(m_scratch_base); m_M = M; } } - void execute() { - static ReduceAdd2bh jit_2bh(false); + void execute() override { + static ReduceAdd2bh jit_cvt(false, std::is_same::value); + auto input = m_node->getSrcMemoryAtPort(0); const auto& ishape = input->getStaticDims(); - uint8_t* pA = input->getDataAs(); + uint8_t* psrc0 = input->getDataAs(); int M = shape_size(ishape) / ishape[ishape.size() - 1]; - auto* dst0 = m_node->getDstMemoryAtPort(0)->getDataAs(); - auto* dst1 = m_node->getDstMemoryAtPort(1)->getDataAs(); - auto* dst2 = m_node->getDstMemoryAtPort(2)->getDataAs(); + auto* dst0 = m_node->getDstMemoryAtPort(0)->getDataAs(); + auto* dst1 = m_node->getDstMemoryAtPort(1)->getDataAs(); + auto* dst2 = m_node->getDstMemoryAtPort(2)->getDataAs(); + + float* w_scale[3]; + + if (m_node->m_config.quantized) { + w_scale[0] = m_node->getSrcMemoryAtPort(4)->getDataAs(); + if (m_node->m_config.weights_combined) { + w_scale[1] = w_scale[0] + m_node->m_config.proj_size0; + w_scale[2] = w_scale[1] + m_node->m_config.proj_size1; + } else { + w_scale[1] = m_node->getSrcMemoryAtPort(5)->getDataAs(); + w_scale[2] = m_node->getSrcMemoryAtPort(6)->getDataAs(); + } + } const auto& srcStrides = input->getDescWithType()->getStrides(); const auto& dstStrides0 = m_node->getDstMemoryAtPort(0)->getDescWithType()->getStrides(); const auto& dstStrides1 = m_node->getDstMemoryAtPort(1)->getDescWithType()->getStrides(); const auto& dstStrides2 = m_node->getDstMemoryAtPort(2)->getDescWithType()->getStrides(); - int strideA = srcStrides[1] * sizeof(ov::bfloat16); - auto stride_0 = dstStrides0[1]; - auto stride_1 = dstStrides1[1]; - auto stride_2 = dstStrides2[1]; + int stride_src = srcStrides[1] * sizeof(T); + auto stride_dst_0 = dstStrides0[1]; + auto stride_dst_1 = dstStrides1[1]; + auto stride_dst_2 = dstStrides2[1]; + auto asym = true; for (int m = 0; m < M;) { int BM = std::min(M - m, CACHE_BLK_M_SIZE); setM(BM); + // dynamic quantize input tensor A[m0:m1, :] into scratch buffer + // because it's being shared by all kernels + uint8_t* pA = psrc0; + auto strideA = stride_src; + if (m_node->m_config.quantized) { + // quantize psrc0 into m_quantized_act 
buffer + // per-token asym + m_quant_act.quantize(BM, reinterpret_cast(psrc0), srcStrides[1]); + pA = reinterpret_cast(m_quant_act.data); + strideA = m_quant_act.K; + } + ov::parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; if (work) { work.run(BM, pA, strideA); - // compress accumulation result into target - auto* src = work.m_C.ptr(); - auto stride_src = work.m_C.stride(0); - ov::bfloat16* dst = nullptr; + // determine destination buffer + T* dst = nullptr; int stride_dst = 0; + if (work.output_id == 0) { dst = dst0 + work.n0; - stride_dst = stride_0; + stride_dst = stride_dst_0; } if (work.output_id == 1) { dst = dst1 + work.n0; - stride_dst = stride_1; + stride_dst = stride_dst_1; } if (work.output_id == 2) { dst = dst2 + work.n0; - stride_dst = stride_2; + stride_dst = stride_dst_2; } + auto* src = work.m_C.template ptr(); + auto stride_src = work.m_C.stride(0); + if (m_node->m_config.quantized) { + // dequantize output & convert to f32 in-place + auto* p_wsum = work.w_sum_per_oc.template ptr(); + ov::Extensions::Cpu::XARCH::llm_mlp_dequantize_i32_f32( + BM, + work.BN, + reinterpret_cast(src), + stride_src, + src, + stride_src, + m_quant_act.scale, + m_quant_act.zp, + p_wsum, + w_scale[work.output_id] + work.n0, + asym); + } + // compress accumulation result into target for (int mi = 0; mi < BM; mi++, src += stride_src, dst += stride_dst) { // the prefetch distance is increased to ensure by the time store happens // prefetch has done and no HW prefetcher is triggered auto* prefetch_dst = (mi + 2 < BM) ? (dst + 2 * stride_dst) : (dst); - jit_2bh(src, dst, prefetch_dst, work.BN); + jit_cvt(src, dst, prefetch_dst, work.BN); } } }); m += BM; - pA += BM * strideA; - dst0 += BM * stride_0; - dst1 += BM * stride_1; - dst2 += BM * stride_2; + psrc0 += BM * stride_src; + dst0 += BM * stride_dst_0; + dst1 += BM * stride_dst_1; + dst2 += BM * stride_dst_2; } } }; #else -struct QKVProjection::Impl { - Impl(QKVProjection * pnode, DnnlScratchPadPtr scrachPad) {} - void execute() {} +template +struct QKVProjection::Executor : public QKVProjection::ExecutorBase { + QKVProjection * m_pnode; + Executor(QKVProjection * pnode) : m_pnode(pnode) {} + void execute() override {} }; #endif -void QKVProjection::prepareParams() { - if (!m_pimpl) { - m_pimpl = std::make_shared(this, context->getScratchPad()); +void QKVProjection::createPrimitive() { + auto rtPrecision = getInputPrecisions()[0]; +#ifdef OPENVINO_ARCH_X86_64 + if (rtPrecision == ov::element::bf16) { + m_executor = std::make_shared>(this, context->getScratchPad()); + } else if (rtPrecision == ov::element::f16) { + m_executor = std::make_shared>(this, context->getScratchPad()); + } +#endif + if (!m_executor) { + OPENVINO_THROW("QKVProjection Executor creation fails with precision " + rtPrecision.to_string()); } } void QKVProjection::execute(dnnl::stream strm) { MAYBE_UNUSED(strm); - m_pimpl->execute(); + m_executor->execute(); } QKVProjection::QKVProjection(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, NgraphShapeInferFactory(op, EMPTY_PORT_MASK)) { std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { + + const auto & config = context->getConfig(); + size_t concurrency = config.streamExecutorConfig.get_threads_per_stream(); + if (concurrency == 0) + concurrency = parallel_get_max_threads(); + + if (!isSupportedOperation(op, errorMessage, concurrency, config.fcDynamicQuantizationGroupSize)) { OPENVINO_THROW("CPU: " + errorMessage); } + const auto node 
= std::dynamic_pointer_cast(op); + m_config = node->get_config(); } void QKVProjection::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - auto rtPrecision = ov::element::bf16; - auto weightPrecision = ov::element::f16; - - // initialize input ports std::vector inPortConfigs; - inPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getInputShapeAtPort(0), false, -1); // input - inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(1), false, -1); // q_proj - inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(2), false, -1); // k_proj - inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(3), false, -1); // v_proj - - // initialize output port std::vector outPortConfigs; - outPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getOutputShapeAtPort(0), false, -1); - outPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getOutputShapeAtPort(1), false, -1); - outPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getOutputShapeAtPort(2), false, -1); + + auto rtPrecision = getOriginalInputPrecisionAtPort(0); + + if (rtPrecision == ov::element::f32) { + // fallback to supported precision if possible + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx_fp16)) { + rtPrecision = ov::element::f16; + } else if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) { + rtPrecision = ov::element::bf16; + } + } + + OPENVINO_ASSERT(rtPrecision == ov::element::bf16 || rtPrecision == ov::element::f16, "Unexpected rtPrecision:", rtPrecision); + + if (m_config.quantized) { + auto weightPrecision = ov::element::i8; + auto wScalePrecision = ov::element::f32; + + inPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getInputShapeAtPort(0), false, -1); // input + inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(1), false, -1); // q_proj + inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(2), false, -1); // k_proj + inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(3), false, -1); // v_proj + inPortConfigs.emplace_back(LayoutType::ncsp, wScalePrecision, getInputShapeAtPort(4), false, -1); // q_proj deq-scale per-OC + inPortConfigs.emplace_back(LayoutType::ncsp, wScalePrecision, getInputShapeAtPort(5), false, -1); // k_proj deq-scale per-OC + inPortConfigs.emplace_back(LayoutType::ncsp, wScalePrecision, getInputShapeAtPort(6), false, -1); // v_proj deq-scale per-OC + + // initialize output port + outPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getOutputShapeAtPort(0), false, -1); + outPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getOutputShapeAtPort(1), false, -1); + outPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getOutputShapeAtPort(2), false, -1); + } else { + auto weightPrecision = ov::element::f16; + + // initialize input ports + inPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getInputShapeAtPort(0), false, -1); // input + inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(1), false, -1); // q_proj + inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(2), false, -1); // k_proj + inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(3), false, -1); // v_proj + + // initialize output port + outPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getOutputShapeAtPort(0), false, -1); + 
outPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getOutputShapeAtPort(1), false, -1); + outPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getOutputShapeAtPort(2), false, -1); + } addSupportedPrimDesc(inPortConfigs, outPortConfigs, impl_desc_type::ref_any); } -bool QKVProjection::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage, int concurrency) noexcept { +bool QKVProjection::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage, + int concurrency, + uint64_t fcDynamicQuantizationGroupSize) noexcept { #if defined(OPENVINO_ARCH_X86_64) try { const auto node_qkv = std::dynamic_pointer_cast(op); @@ -277,22 +411,27 @@ bool QKVProjection::isSupportedOperation(const std::shared_ptr& return false; } } - auto proj_pshape1 = op->input_value(1).get_shape(); - auto proj_pshape2 = op->input_value(2).get_shape(); - auto proj_pshape3 = op->input_value(3).get_shape(); - if ((proj_pshape1[1] % CACHE_BLK_K_SIZE) != 0) { + const auto& config = node_qkv->get_config(); + if ((config.hidden_size % CACHE_BLK_K_SIZE) != 0) { errorMessage = "QKVProjection input channel size is not multiple of cache blocking size"; return false; } - if ((proj_pshape1[0] % REG_BLK_K_SIZE) != 0) { + + if (config.quantized && (fcDynamicQuantizationGroupSize < static_cast(config.hidden_size))) { + errorMessage = "QKVProjection input channel only support per-token dynamic quantization"; + return false; + } + + auto reg_blk_k_size = node_qkv->get_config().quantized ? REG_BLK_K_SIZE_I8 : REG_BLK_K_SIZE; + if ((config.proj_size0 % reg_blk_k_size) != 0) { errorMessage = "QKVProjection 1st proj output channel size is not multiple of register blocking size"; return false; } - if ((proj_pshape2[0] % REG_BLK_K_SIZE) != 0) { + if ((config.proj_size1 % reg_blk_k_size) != 0) { errorMessage = "QKVProjection 2nd proj output channel size is not multiple of register blocking size"; return false; } - if ((proj_pshape3[0] % REG_BLK_K_SIZE) != 0) { + if ((config.proj_size2 % reg_blk_k_size) != 0) { errorMessage = "QKVProjection 3rd proj output channel size is not multiple of register blocking size"; return false; } diff --git a/src/plugins/intel_cpu/src/nodes/qkv_proj.h b/src/plugins/intel_cpu/src/nodes/qkv_proj.h index 7717d00e5de206..7a94b3e704f976 100644 --- a/src/plugins/intel_cpu/src/nodes/qkv_proj.h +++ b/src/plugins/intel_cpu/src/nodes/qkv_proj.h @@ -23,17 +23,29 @@ class QKVProjection : public Node { bool created() const override { return getType() == Type::QKVProjection; } - void prepareParams() override; + bool needPrepareParams() const override { + return false; + } + void createPrimitive() override; void executeDynamicImpl(dnnl::stream strm) override { execute(strm); } void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage, int concurrency = 0) noexcept; + static bool isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage, + int concurrency = 0, + uint64_t fcDynamicQuantizationGroupSize = 0) noexcept; private: - struct Impl; - std::shared_ptr m_pimpl; + struct ExecutorBase { + virtual void execute() = 0; + virtual ~ExecutorBase() = default; + }; + std::shared_ptr m_executor; + template struct Executor; + + QKVProjectionNode::Config m_config; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/llm_mlp.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/llm_mlp.cpp index 
2ad643d3ad4380..0706f13c3a5450 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/llm_mlp.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/llm_mlp.cpp @@ -25,6 +25,11 @@ bool LLMMLPNode::visit_attributes(ov::AttributeVisitor& visitor) { INTERNAL_OP_SCOPE(LLMMLPNode_visit_attributes); visitor.start_structure("config"); visitor.on_attribute("act", m_config.act); + visitor.on_attribute("gate_up_quantized", m_config.gate_up_quantized); + visitor.on_attribute("down_quantized", m_config.down_quantized); + visitor.on_attribute("hidden_size", m_config.hidden_size); + visitor.on_attribute("up_size", m_config.up_size); + visitor.on_attribute("gate_up_combined", m_config.gate_up_combined); visitor.finish_structure(); return true; } @@ -32,7 +37,10 @@ bool LLMMLPNode::visit_attributes(ov::AttributeVisitor& visitor) { void LLMMLPNode::validate_and_infer_types() { INTERNAL_OP_SCOPE(LLMMLPNode_validate_and_infer_types); const auto input_size = get_input_size(); - NODE_VALIDATION_CHECK(this, input_size == 4); + size_t expect_input_size = 4; + if (m_config.gate_up_quantized) expect_input_size += 2; + if (m_config.down_quantized) expect_input_size += 1; + NODE_VALIDATION_CHECK(this, input_size == expect_input_size); const auto& ishape = get_input_partial_shape(0); const auto& itype = get_input_element_type(0); @@ -44,6 +52,7 @@ void LLMMLPNode::validate_and_infer_types() { const auto length = ishape[1]; const auto feature = ishape[2]; NODE_VALIDATION_CHECK(this, feature.is_static()); + NODE_VALIDATION_CHECK(this, itype.is_real(), "feature data type must be real"); auto oshape = ishape; oshape[oshape.size() - 1] = w_down_shape[0]; diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/llm_mlp.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/llm_mlp.hpp index f4f3c72b7e4448..fa1569dbb8e5d9 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/llm_mlp.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/llm_mlp.hpp @@ -20,6 +20,11 @@ class LLMMLPNode : public ov::op::Op { struct Config { ACT_FN act; + bool gate_up_quantized; + bool down_quantized; + int hidden_size; + int up_size; + bool gate_up_combined; }; // args: diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/qkv_proj.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/qkv_proj.cpp index cb9cc543c40a1e..e770a7f530da30 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/qkv_proj.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/qkv_proj.cpp @@ -11,7 +11,7 @@ namespace intel_cpu { void QKVProjectionNode::validate_and_infer_types() { INTERNAL_OP_SCOPE(QKVProjection_validate_and_infer_types); const auto input_size = get_input_size(); - NODE_VALIDATION_CHECK(this, input_size == 4); + NODE_VALIDATION_CHECK(this, input_size == (m_config.quantized ? 
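The input-count rule introduced for LLMMLPNode above can be summarized as: 4 base inputs (src plus gate/up/down weights), plus 2 per-OC scales when gate/up are quantized, plus 1 more when down is quantized too. A tiny sketch of that rule:

    def expected_llm_mlp_inputs(gate_up_quantized: bool, down_quantized: bool) -> int:
        n = 4
        if gate_up_quantized:
            n += 2
        if down_quantized:
            n += 1
        return n

    assert expected_llm_mlp_inputs(False, False) == 4
    assert expected_llm_mlp_inputs(True, False) == 6
    assert expected_llm_mlp_inputs(True, True) == 7
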
7 : 4)); const auto& ishape = get_input_partial_shape(0); const auto& itype = get_input_element_type(0); @@ -23,9 +23,10 @@ void QKVProjectionNode::validate_and_infer_types() { auto oshape0 = ishape; auto oshape1 = ishape; auto oshape2 = ishape; - oshape0[oshape0.size()-1] = get_input_partial_shape(1)[0]; - oshape1[oshape1.size()-1] = get_input_partial_shape(2)[0]; - oshape2[oshape2.size()-1] = get_input_partial_shape(3)[0]; + oshape0[oshape0.size()-1] = m_config.proj_size0; + oshape1[oshape1.size()-1] = m_config.proj_size1; + oshape2[oshape2.size()-1] = m_config.proj_size2; + set_output_type(0, itype, oshape0); set_output_type(1, itype, oshape1); set_output_type(2, itype, oshape2); @@ -34,7 +35,7 @@ void QKVProjectionNode::validate_and_infer_types() { std::shared_ptr QKVProjectionNode::clone_with_new_inputs(const ov::OutputVector& new_args) const { INTERNAL_OP_SCOPE(QKVProjection_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args); + return std::make_shared(new_args, m_config); } } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/qkv_proj.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/qkv_proj.hpp index 98f226ccdb4f21..d107515635c8a5 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/qkv_proj.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/qkv_proj.hpp @@ -16,18 +16,29 @@ class QKVProjectionNode : public ov::op::Op { QKVProjectionNode() = default; - // args: - // 0: input - // 1: gate_proj - // 2: up_proj - // 3: down_proj - QKVProjectionNode(const OutputVector& args) : Op(args) { + struct Config { + bool quantized; + int hidden_size; + int proj_size0; + int proj_size1; + int proj_size2; + bool weights_combined; + }; + + QKVProjectionNode(const OutputVector& args, const Config& cfg) : Op(args), m_config(cfg) { validate_and_infer_types(); } void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + const Config& get_config() const { + return m_config; + } + +private: + Config m_config; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/mlp_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/mlp_fusion.cpp index d1efb226ff1dec..6b31f7d4987cb4 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/mlp_fusion.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/mlp_fusion.cpp @@ -31,18 +31,68 @@ ov::intel_cpu::MLPFusion::MLPFusion() { auto gate_proj_weight_compressed = makePattern({}); // [up_size, down_size] auto gate_proj_weight = makePattern({gate_proj_weight_compressed}, {{"destination_type", "f32"}}); + auto up_proj_weight_compressed = makePattern({}); // [up_size, down_size] auto up_proj_weight = makePattern({up_proj_weight_compressed}, {{"destination_type", "f32"}}); + auto down_proj_weight_compressed = makePattern({}); // [down_size, up_size] auto down_proj_weight = makePattern({down_proj_weight_compressed}, {{"destination_type", "f32"}}); - auto mlp_gate_proj = makePattern({input, gate_proj_weight | gate_proj_weight_compressed}, + + // symmetrically INT8 quantized version + // all 3 layers must be quantized at the same time (checked in callback) + auto gate_proj_weight_i8 = + makeConst(ov::element::i8, ov::PartialShape({ov::Dimension(), ov::Dimension()}), nullptr); + auto gate_proj_weight_scales_per_OC = makeConst(ov::element::f32, 
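With the Config now carrying the projection sizes, output shape propagation no longer reads the weight constants: each output keeps the input's leading dimensions and only the last one is replaced. Illustrative sketch (sizes are hypothetical):

    def qkv_output_shapes(input_shape, proj_sizes):
        # e.g. input_shape = (batch, seq, hidden), proj_sizes = (proj_size0, proj_size1, proj_size2)
        return [tuple(input_shape[:-1]) + (p,) for p in proj_sizes]

    print(qkv_output_shapes((1, 8, 4096), (4096, 512, 512)))
    # [(1, 8, 4096), (1, 8, 512), (1, 8, 512)]
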
ov::PartialShape({ov::Dimension(), 1}), nullptr); + auto gate_proj_weight_f32 = makePattern({gate_proj_weight_i8}, {{"destination_type", "f32"}}); + auto gate_proj_weight_deq = + makePattern({gate_proj_weight_f32, gate_proj_weight_scales_per_OC}, {{"auto_broadcast", "numpy"}}); + + auto up_proj_weight_i8 = + makeConst(ov::element::i8, ov::PartialShape({ov::Dimension(), ov::Dimension()}), nullptr); + auto up_proj_weight_scales_per_OC = makeConst(ov::element::f32, ov::PartialShape({ov::Dimension(), 1}), nullptr); + auto up_proj_weight_f32 = makePattern({up_proj_weight_i8}, {{"destination_type", "f32"}}); + auto up_proj_weight_deq = + makePattern({up_proj_weight_f32, up_proj_weight_scales_per_OC}, {{"auto_broadcast", "numpy"}}); + + auto down_proj_weight_i8 = + makeConst(ov::element::i8, ov::PartialShape({ov::Dimension(), ov::Dimension()}), nullptr); + auto down_proj_weight_scales_per_OC = makeConst(ov::element::f32, ov::PartialShape({ov::Dimension(), 1}), nullptr); + auto down_proj_weight_f32 = makePattern({down_proj_weight_i8}, {{"destination_type", "f32"}}); + auto down_proj_weight_deq = + makePattern({down_proj_weight_f32, down_proj_weight_scales_per_OC}, {{"auto_broadcast", "numpy"}}); + + // gate-up weights are combined + auto gate_up_proj_weight = makeConst(ov::element::f16, ov::PartialShape({ov::Dimension(), ov::Dimension()}), nullptr); + auto gate_up_proj_weight_f32 = makePattern({gate_up_proj_weight}, {{"destination_type", "f32"}}); + + auto gate_up_proj_weight_const_i8 = + makeConst(ov::element::i8, ov::PartialShape({ov::Dimension(), ov::Dimension()}), nullptr); + auto gate_up_proj_weight_cvt_f32 = makePattern({gate_up_proj_weight_const_i8}, {{"destination_type", "f32"}}); + auto gate_up_proj_weight_scales_per_OC = makeConst(ov::element::f32, ov::PartialShape({ov::Dimension(), 1}), nullptr); + auto gate_up_proj_weight_deq = makePattern({gate_up_proj_weight_cvt_f32, gate_up_proj_weight_scales_per_OC}, + {{"auto_broadcast", "numpy"}}); + + auto gate_up_proj = makePattern({input, gate_up_proj_weight_f32 | gate_up_proj_weight_deq}, + {{"transpose_a", false}, {"transpose_b", true}}); + auto gate_up_split_lengths = makeConst(ov::element::i32, + ov::Shape({ + 2, + }), + nullptr); + auto gate_up_proj_split = makePattern({gate_up_proj, -1, gate_up_split_lengths}); + gate_up_proj_split->set_output_size(2); + + auto mlp_gate_proj = makePattern({input, gate_proj_weight | gate_proj_weight_compressed | gate_proj_weight_deq}, {{"transpose_a", false}, {"transpose_b", true}}); // [?,?,up_size] - auto mlp_silu_gate = makePattern({mlp_gate_proj}); - auto mlp_gelu_gate = makePattern({mlp_gate_proj}); - auto mlp_up_proj = makePattern({input, up_proj_weight | up_proj_weight_compressed}, + auto mlp_silu_gate = makePattern({mlp_gate_proj | gate_up_proj_split->output(0)}); + auto mlp_gelu_gate = makePattern({mlp_gate_proj | gate_up_proj_split->output(0)}); + auto mlp_up_proj = makePattern({input, up_proj_weight | up_proj_weight_compressed | up_proj_weight_deq}, {{"transpose_a", false}, {"transpose_b", true}}); - auto mlp_gated_up = makePattern({mlp_silu_gate | mlp_gelu_gate, mlp_up_proj}, {{"auto_broadcast", "numpy"}}); - auto down_proj = makePattern({mlp_gated_up, down_proj_weight | down_proj_weight_compressed}, + + auto mlp_gated_up = + makePattern({mlp_silu_gate | mlp_gelu_gate, mlp_up_proj | gate_up_proj_split->output(1)}, + {{"auto_broadcast", "numpy"}}); + auto down_proj = makePattern({mlp_gated_up, down_proj_weight | down_proj_weight_compressed | down_proj_weight_deq}, {{"transpose_a", false}, 
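Putting the matched subgraph in plain terms: the pattern covers down_proj( act(x @ Wg^T) * (x @ Wu^T) ), where the gate/up weights may be two separate constants or a single combined [2*up_size, hidden] constant whose MatMul output is split in two by VariadicSplit. A minimal numpy reference of the combined variant (shapes are illustrative):

    import numpy as np

    def silu(v):
        return v / (1.0 + np.exp(-v))

    rng = np.random.default_rng(0)
    hidden, up_size = 32, 64
    x = rng.standard_normal((3, hidden)).astype(np.float32)
    w_gate_up = rng.standard_normal((2 * up_size, hidden)).astype(np.float32)  # combined gate+up weights
    w_down = rng.standard_normal((hidden, up_size)).astype(np.float32)

    gate_up = x @ w_gate_up.T                    # single MatMul, [3, 2*up_size]
    g, u = np.split(gate_up, 2, axis=-1)         # what the matched VariadicSplit produces
    y = (silu(g) * u) @ w_down.T                 # gated activation, then down projection
    print(y.shape)                               # (3, 32)
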
{"transpose_b", true}}); // [?,?,down_size] auto result = down_proj; @@ -55,10 +105,56 @@ ov::intel_cpu::MLPFusion::MLPFusion() { const auto& pattern_map = m.get_pattern_value_map(); auto root = m.get_match_root(); - - auto gate_proj_w = pattern_map.at(gate_proj_weight_compressed); - auto up_proj_w = pattern_map.at(up_proj_weight_compressed); - auto down_proj_w = pattern_map.at(down_proj_weight_compressed); + auto src = pattern_map.at(input); + if (!src.get_element_type().is_real()) { + // FakeQuantize, should skip fusion + return false; + } + Output gate_proj_w; + Output up_proj_w; + Output down_proj_w; + + // down projection is harder to quantize w/o causing accuracy problem, so it may be un-quantized instead + bool is_gate_up_quantized_int8 = false; + bool is_down_proj_int8 = false; + bool is_gate_up_combined = false; + if (pattern_map.count(gate_up_proj_weight_const_i8) > 0 && pattern_map.count(down_proj_weight_compressed) > 0) { + //gate-up combined & quantized + is_gate_up_quantized_int8 = true; + is_gate_up_combined = true; + gate_proj_w = pattern_map.at(gate_up_proj_weight_const_i8); + up_proj_w = pattern_map.at(gate_up_proj_weight_const_i8); + down_proj_w = pattern_map.at(down_proj_weight_compressed); + } else if (pattern_map.count(gate_up_proj_weight) > 0 && pattern_map.count(down_proj_weight_compressed) > 0) { + //gate-up combined + is_gate_up_combined = true; + gate_proj_w = pattern_map.at(gate_up_proj_weight); + up_proj_w = pattern_map.at(gate_up_proj_weight); + down_proj_w = pattern_map.at(down_proj_weight_compressed); + } else if (pattern_map.count(gate_proj_weight_compressed) > 0 && pattern_map.count(up_proj_weight_compressed) > 0 && + pattern_map.count(down_proj_weight_compressed) > 0) { + is_gate_up_quantized_int8 = false; + is_down_proj_int8 = false; + gate_proj_w = pattern_map.at(gate_proj_weight_compressed); + up_proj_w = pattern_map.at(up_proj_weight_compressed); + down_proj_w = pattern_map.at(down_proj_weight_compressed); + } else if (pattern_map.count(gate_proj_weight_i8) > 0 && pattern_map.count(up_proj_weight_i8) > 0 && + pattern_map.count(gate_proj_weight_scales_per_OC) > 0 && pattern_map.count(up_proj_weight_scales_per_OC) > 0) { + is_gate_up_quantized_int8 = true; + gate_proj_w = pattern_map.at(gate_proj_weight_i8); + up_proj_w = pattern_map.at(up_proj_weight_i8); + + if (pattern_map.count(down_proj_weight_i8) > 0) { + if (pattern_map.count(down_proj_weight_scales_per_OC) == 0) return false; + is_down_proj_int8 = true; + down_proj_w = pattern_map.at(down_proj_weight_i8); + } else { + is_down_proj_int8 = false; + down_proj_w = pattern_map.at(down_proj_weight_compressed); + } + } else { + return false; + } auto gate_proj_w_pshape = gate_proj_w.get_partial_shape(); auto up_proj_w_pshape = up_proj_w.get_partial_shape(); @@ -84,7 +180,7 @@ ov::intel_cpu::MLPFusion::MLPFusion() { if (down_shape.size() != 2) return false; - auto up_size = up_shape[0]; + auto up_size = is_gate_up_combined ? 
(up_shape[0] / 2) : (up_shape[0]); auto down_size = up_shape[1]; if (down_shape[0] != down_size) return false; @@ -94,6 +190,12 @@ ov::intel_cpu::MLPFusion::MLPFusion() { LLMMLPNode::Config config; OutputVector new_args; std::shared_ptr gate_act; + + config.gate_up_quantized = is_gate_up_quantized_int8; + config.down_quantized = is_down_proj_int8; + config.hidden_size = down_size; + config.up_size = up_size; + config.gate_up_combined = is_gate_up_combined; if (pattern_map.count(mlp_silu_gate) > 0) { config.act = LLMMLPNode::ACT_FN::SILU; gate_act = mlp_silu_gate; @@ -104,20 +206,35 @@ ov::intel_cpu::MLPFusion::MLPFusion() { return false; } - new_args.push_back(pattern_map.at(input)); + new_args.push_back(src); new_args.push_back(gate_proj_w); new_args.push_back(up_proj_w); new_args.push_back(down_proj_w); + if (is_gate_up_quantized_int8) { + if (is_gate_up_combined) { + new_args.push_back(pattern_map.at(gate_up_proj_weight_scales_per_OC)); + new_args.push_back(pattern_map.at(gate_up_proj_weight_scales_per_OC)); + } else { + new_args.push_back(pattern_map.at(gate_proj_weight_scales_per_OC)); + new_args.push_back(pattern_map.at(up_proj_weight_scales_per_OC)); + } + } + if (is_down_proj_int8) { + new_args.push_back(pattern_map.at(down_proj_weight_scales_per_OC)); + } auto old_node = root; auto new_node = std::make_shared(new_args, config); new_node->set_friendly_name(old_node->get_friendly_name()); - ov::copy_runtime_info({pattern_map.at(mlp_gate_proj).get_node_shared_ptr(), - pattern_map.at(gate_act).get_node_shared_ptr(), - pattern_map.at(mlp_up_proj).get_node_shared_ptr(), + ov::copy_runtime_info({pattern_map.at(gate_act).get_node_shared_ptr(), pattern_map.at(down_proj).get_node_shared_ptr()}, new_node); - + if (is_gate_up_combined) { + ov::copy_runtime_info({pattern_map.at(gate_up_proj).get_node_shared_ptr()}, new_node); + } else { + ov::copy_runtime_info({pattern_map.at(mlp_gate_proj).get_node_shared_ptr(), + pattern_map.at(mlp_up_proj).get_node_shared_ptr()}, new_node); + } // callback is for plugin implementation to check if it can be supported if (!transformation_callback(new_node)) { return false; diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/qkv_proj_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/qkv_proj_fusion.cpp index 11ef6ca09ff4a0..10e3f39801a30a 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/qkv_proj_fusion.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/qkv_proj_fusion.cpp @@ -18,8 +18,8 @@ #include "openvino/pass/pattern/op/wrap_type.hpp" #include "ov_ops/type_relaxed.hpp" #include "transformations/cpu_opset/x64/op/qkv_proj.hpp" -#include "transformations/utils/utils.hpp" #include "transformations/utils/gen_pattern.hpp" +#include "transformations/utils/utils.hpp" using namespace ov::gen_pattern; @@ -28,10 +28,18 @@ ov::intel_cpu::QKVProjFusion::QKVProjFusion() { auto input = makePattern("[?,?,?]"); - auto q_proj_weight = makePattern({}); + auto q_proj_weight_const = makePattern({}); + + auto q_proj_weight_const_i8 = + makeConst(ov::element::i8, ov::PartialShape({ov::Dimension(), ov::Dimension()}), nullptr); + auto q_proj_weight_f32 = makePattern({q_proj_weight_const_i8}, {{"destination_type", "f32"}}); + auto q_proj_weight_scales_per_OC = makeConst(ov::element::f32, ov::PartialShape({ov::Dimension(), 1}), nullptr); + auto q_proj_weight_deq = + makePattern({q_proj_weight_f32, q_proj_weight_scales_per_OC}, {{"auto_broadcast", "numpy"}}); + auto q_proj_weight_cvt = - 
makePattern({q_proj_weight}, {{"destination_type", "f32"}}); // [4096,4096] - auto q_proj = makePattern({input, q_proj_weight_cvt | q_proj_weight}, + makePattern({q_proj_weight_const}, {{"destination_type", "f32"}}); // [4096,4096] + auto q_proj = makePattern({input, q_proj_weight_cvt | q_proj_weight_const | q_proj_weight_deq}, {{"transpose_a", false}, {"transpose_b", true}}); // [?,?,4096] auto result = q_proj; @@ -56,7 +64,10 @@ ov::intel_cpu::QKVProjFusion::QKVProjFusion() { return false; } + bool is_quantized_int8 = pattern_map.count(q_proj_weight_const_i8); + OutputVector args = {src}; + OutputVector deq_scales; OutputVector outputs; size_t hidden_size = 0; std::vector proj_size; @@ -69,16 +80,42 @@ ov::intel_cpu::QKVProjFusion::QKVProjFusion() { if (mm->get_transpose_a() != false || mm->get_transpose_b() != true) { return false; } - auto constw = ov::as_type_ptr(mm->input_value(1).get_node_shared_ptr()); - if (!constw) { - auto cvt = ov::as_type_ptr(mm->input_value(1).get_node_shared_ptr()); - if (!cvt) { + + auto mm_input1 = mm->input_value(1).get_node_shared_ptr(); + + std::shared_ptr constw; + std::shared_ptr deq_scale; + + if (is_quantized_int8) { + auto deq_mul = ov::as_type_ptr(mm_input1); + if (!deq_mul) return false; - } + + auto deq_mul_in0 = deq_mul->input_value(0).get_node_shared_ptr(); + auto deq_mul_in1 = deq_mul->input_value(1).get_node_shared_ptr(); + + auto cvt = ov::as_type_ptr(deq_mul_in0); + if (!cvt) + return false; + constw = ov::as_type_ptr(cvt->input_value(0).get_node_shared_ptr()); - } - if (!constw) { - return false; + if (!constw || constw->get_element_type() != ov::element::i8) + return false; + + deq_scale = ov::as_type_ptr(deq_mul_in1); + if (!deq_scale || deq_scale->get_element_type() != ov::element::f32) + return false; + } else { + constw = ov::as_type_ptr(mm_input1); + if (!constw) { + if (auto cvt = ov::as_type_ptr(mm_input1)) { + constw = ov::as_type_ptr(cvt->input_value(0).get_node_shared_ptr()); + } else { + return false; + } + } + if (!constw) + return false; } // input feature size should be the same @@ -91,6 +128,7 @@ ov::intel_cpu::QKVProjFusion::QKVProjFusion() { proj_size.push_back(wshape[0]); args.push_back(constw); + deq_scales.push_back(deq_scale); outputs.push_back(mm->get_default_output()); } @@ -101,9 +139,22 @@ ov::intel_cpu::QKVProjFusion::QKVProjFusion() { if (args.size() != 4) { return false; } + // append dequantize scales at the end + if (is_quantized_int8) { + for (auto& d : deq_scales) + args.push_back(d); + } + + QKVProjectionNode::Config config; + config.quantized = is_quantized_int8; + config.hidden_size = hidden_size; + config.weights_combined = false; + config.proj_size0 = proj_size[0]; + config.proj_size1 = proj_size[1]; + config.proj_size2 = proj_size[2]; auto old_node = root; - auto new_node = std::make_shared(args); + auto new_node = std::make_shared(args, config); new_node->set_friendly_name(old_node->get_friendly_name()); ov::copy_runtime_info({old_node}, new_node); @@ -125,3 +176,107 @@ ov::intel_cpu::QKVProjFusion::QKVProjFusion() { auto m = std::make_shared(result, matcher_name); this->register_matcher(m, callback); } + +ov::intel_cpu::QKVProjFusion2::QKVProjFusion2() { + MATCHER_SCOPE(QKVProjFusion2); + + auto input = makePattern("[?,?,?]"); + + auto qkv_proj_weight_const = makePattern({}); + auto qkv_proj_cvt = makePattern({qkv_proj_weight_const}, {{"destination_type", "f32"}}); + + auto qkv_proj_weight_const_i8 = + makeConst(ov::element::i8, ov::PartialShape({ov::Dimension(), ov::Dimension()}), nullptr); + auto 
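In short, QKVProjFusion looks for three MatMuls that consume the same activation tensor (optionally through the i8 Convert+Multiply dequantize subgraph) and folds them into one QKVProjection node, so the activations are read once for all three projections. A numpy sketch of the graph being replaced (hypothetical sizes):

    import numpy as np

    rng = np.random.default_rng(0)
    x = rng.standard_normal((2, 7, 64)).astype(np.float32)     # [batch, seq, hidden]
    wq = rng.standard_normal((64, 64)).astype(np.float32)
    wk = rng.standard_normal((16, 64)).astype(np.float32)
    wv = rng.standard_normal((16, 64)).astype(np.float32)

    q, k, v = x @ wq.T, x @ wk.T, x @ wv.T                     # three MatMuls sharing x
    print(q.shape, k.shape, v.shape)                           # (2, 7, 64) (2, 7, 16) (2, 7, 16)
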
qkv_proj_weight_f32 = makePattern({qkv_proj_weight_const_i8}, {{"destination_type", "f32"}}); + auto qkv_proj_weight_scales_per_OC = makeConst(ov::element::f32, ov::PartialShape({ov::Dimension(), 1}), nullptr); + auto qkv_proj_weight_deq = makePattern({qkv_proj_weight_f32, qkv_proj_weight_scales_per_OC}, + {{"auto_broadcast", "numpy"}}); + + auto qkv_proj = makePattern({input, qkv_proj_cvt | qkv_proj_weight_deq}, + {{"transpose_a", false}, {"transpose_b", true}}); + auto qkv_split_lengths = makePattern({}, {}, "i32[3]"); + auto qkv_split = makePattern({qkv_proj, 2, qkv_split_lengths}); + + auto result = qkv_split->output(0); + + matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + PatternValidator validator(m); + if (!validator) { + return false; + } + + const auto& pattern_map = m.get_pattern_value_map(); + auto root = m.get_match_root(); + + auto node_split_lengths = + ov::as_type_ptr(pattern_map.at(qkv_split_lengths).get_node_shared_ptr()); + if (!node_split_lengths) + return false; + auto split_lengths = node_split_lengths->get_vector(); + if (split_lengths.size() != 3) + return false; + + auto proj_size = split_lengths[0]; + if (split_lengths[1] != proj_size) + return false; + if (split_lengths[2] != proj_size) + return false; + + bool is_quantized_int8 = pattern_map.count(qkv_proj_weight_const_i8); + + std::shared_ptr qkv_proj_weight_node; + if (is_quantized_int8) { + qkv_proj_weight_node = + ov::as_type_ptr(pattern_map.at(qkv_proj_weight_const_i8).get_node_shared_ptr()); + } else { + qkv_proj_weight_node = + ov::as_type_ptr(pattern_map.at(qkv_proj_weight_const).get_node_shared_ptr()); + } + if (!qkv_proj_weight_node) + return false; + + auto w_shape = qkv_proj_weight_node->get_shape(); + if (w_shape[0] != static_cast(proj_size * 3)) + return false; + + QKVProjectionNode::Config config; + config.quantized = is_quantized_int8; + config.hidden_size = w_shape[1]; + config.weights_combined = true; + config.proj_size0 = split_lengths[0]; + config.proj_size1 = split_lengths[1]; + config.proj_size2 = split_lengths[2]; + + OutputVector args = {pattern_map.at(input), qkv_proj_weight_node, qkv_proj_weight_node, qkv_proj_weight_node}; + if (is_quantized_int8) { + auto scales = pattern_map.at(qkv_proj_weight_scales_per_OC).get_node_shared_ptr(); + args.push_back(scales); + args.push_back(scales); + args.push_back(scales); + } + auto old_node = root; + auto new_node = std::make_shared(args, config); + new_node->set_friendly_name(old_node->get_friendly_name()); + ov::copy_runtime_info({old_node}, new_node); + + // callback is for plugin implementation to check if it can be supported + if (!transformation_callback(new_node)) { + return false; + } + + auto vsplit = pattern_map.at(qkv_split).get_node_shared_ptr(); + + for (size_t i = 0; i < vsplit->get_output_size(); i++) { + vsplit->output(i).replace(new_node->output(i)); + } + + new_node->add_node_control_dependents(vsplit); + new_node->add_node_control_dependencies(vsplit); + vsplit->clear_control_dependents(); + + return true; + }; + + auto m = std::make_shared(result, matcher_name); + this->register_matcher(m, callback); +} \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/qkv_proj_fusion.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/qkv_proj_fusion.hpp index 7127fdabf21626..4c398cb478a340 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/qkv_proj_fusion.hpp +++ 
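QKVProjFusion2 targets the variant where q/k/v share a single combined [3*proj_size, hidden] weight: one MatMul followed by a VariadicSplit along the last axis into three equal chunks (unequal split lengths are rejected). Illustrative numpy equivalent:

    import numpy as np

    rng = np.random.default_rng(0)
    hidden, proj = 64, 32
    x = rng.standard_normal((2, 5, hidden)).astype(np.float32)
    w_qkv = rng.standard_normal((3 * proj, hidden)).astype(np.float32)  # combined q/k/v weight

    qkv = x @ w_qkv.T                          # single MatMul, [2, 5, 3*proj]
    q, k, v = np.split(qkv, 3, axis=2)         # the matched VariadicSplit with equal lengths
    print(q.shape, k.shape, v.shape)           # (2, 5, 32) x3
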
b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/qkv_proj_fusion.hpp @@ -15,5 +15,11 @@ class QKVProjFusion: public ov::pass::MatcherPass { QKVProjFusion(); }; +class QKVProjFusion2: public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("QKVProjFusion2", "0"); + QKVProjFusion2(); +}; + } // namespace intel_cpu } // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index c12782831ef5c3..fcf38440b8aa4b 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -842,28 +842,42 @@ void Transformations::PostLpt() { CPU_REGISTER_PASS_ARM64(postLPTPassManager, ov::pass::RoPEFusion, true); CPU_REGISTER_PASS_X64(postLPTPassManager, CausalMaskPreprocessFusion); +#if defined(OPENVINO_ARCH_X86_64) // MLP & QKV fusion optimizations is focused on throughput, only enabled on AMX-bf16 & LLM serving use cases. - auto can_use_amx_bf16 = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx) && (config.inferencePrecision == element::bf16); - if (can_use_amx_bf16) { + auto can_use_amx_bf16_int8 = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx) && (config.inferencePrecision == element::bf16); + auto can_use_amx_fp16 = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx_fp16) && (config.inferencePrecision == element::f16); + + if (can_use_amx_bf16_int8 || can_use_amx_fp16) { + const auto fcDynamicQuantizationGroupSize = config.fcDynamicQuantizationGroupSize; CPU_REGISTER_PASS_X64(postLPTPassManager, MLPFusion); CPU_SET_CALLBACK_X64(postLPTPassManager, - [](const_node_ptr &node) -> bool { + [fcDynamicQuantizationGroupSize](const_node_ptr &node) -> bool { std::string errorMsg; - return node::LLMMLP::isSupportedOperation(node, errorMsg); + return node::LLMMLP::isSupportedOperation(node, errorMsg, fcDynamicQuantizationGroupSize); }, MLPFusion); size_t concurrency = config.streamExecutorConfig.get_threads_per_stream(); if (concurrency == 0) concurrency = parallel_get_max_threads(); + CPU_REGISTER_PASS_X64(postLPTPassManager, QKVProjFusion); CPU_SET_CALLBACK_X64(postLPTPassManager, - [concurrency](const_node_ptr &node) -> bool { + [=](const_node_ptr &node) -> bool { std::string errorMsg; - return node::QKVProjection::isSupportedOperation(node, errorMsg, concurrency); + return node::QKVProjection::isSupportedOperation(node, errorMsg, concurrency, fcDynamicQuantizationGroupSize); }, QKVProjFusion); + + CPU_REGISTER_PASS_X64(postLPTPassManager, QKVProjFusion2); + CPU_SET_CALLBACK_X64(postLPTPassManager, + [=](const_node_ptr &node) -> bool { + std::string errorMsg; + return node::QKVProjection::isSupportedOperation(node, errorMsg, concurrency, fcDynamicQuantizationGroupSize); + }, + QKVProjFusion2); } +#endif // OPENVINO_ARCH_X86_64 CPU_REGISTER_PASS_COMMON(postLPTPassManager, ov::pass::transpose_sinking::TSShapeOfForward); CPU_REGISTER_PASS_COMMON(postLPTPassManager, StatefulSDPAFusion); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mlp_fusion.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mlp_fusion.cpp index 7a3745d9e1e4bd..9f74cacb45460f 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mlp_fusion.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mlp_fusion.cpp @@ -17,6 +17,7 @@ 
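The gating added to PostLpt() boils down to: register the MLP/QKV fusions only when the platform and inference precision pair is AMX with bf16 (which also enables the int8 weight path) or AMX-FP16 with f16. A sketch of that predicate:

    def mlp_qkv_fusions_enabled(has_amx: bool, has_amx_fp16: bool, inference_precision: str) -> bool:
        can_use_amx_bf16_int8 = has_amx and inference_precision == "bf16"
        can_use_amx_fp16 = has_amx_fp16 and inference_precision == "f16"
        return can_use_amx_bf16_int8 or can_use_amx_fp16

    assert mlp_qkv_fusions_enabled(True, False, "bf16") is True
    assert mlp_qkv_fusions_enabled(False, True, "f16") is True
    assert mlp_qkv_fusions_enabled(True, False, "f32") is False
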
struct LLMMLPFusionParams { size_t down_size; size_t up_size; std::string act_type; + bool use_dynamic_quant; }; class LLMMLPFusionTest : public testing::WithParamInterface, public ov::test::SubgraphBaseTest { @@ -32,6 +33,7 @@ class LLMMLPFusionTest : public testing::WithParamInterface, result << "down_size=" << obj.param.down_size << "_"; result << "up_size=" << obj.param.up_size << "_"; result << "act_type=" << obj.param.act_type << "_"; + result << "use_dynamic_quant=" << obj.param.use_dynamic_quant << "_"; result << obj.index; return result.str(); } @@ -48,19 +50,42 @@ class LLMMLPFusionTest : public testing::WithParamInterface, auto src = std::make_shared(ov::element::f32, inputDynamicShapes[0]); - auto create_const = [](ov::Shape shape, int resolution) { + auto create_const = [&](size_t OC, size_t IC, int resolution) -> std::shared_ptr { + if (param.use_dynamic_quant) { + ov::test::utils::InputGenerateData in_data; + // range [-128, +127] + in_data.start_from = -64; + in_data.range = 63; + in_data.resolution = 128; + auto tensor = ov::test::utils::create_and_fill_tensor(ov::element::i8, ov::Shape{OC, IC}, in_data); + auto weight_const_i8 = std::make_shared(tensor); + auto weight_const_f32 = std::make_shared(weight_const_i8, ov::element::f32); + + // range after dequantize, [-1, +1] + in_data.start_from = 0; + in_data.range = 1; + in_data.resolution = 128; + auto tensor_scale_per_oc = ov::test::utils::create_and_fill_tensor(ov::element::f32, ov::Shape{OC, 1}, in_data); + auto scale_per_oc = std::make_shared(tensor_scale_per_oc); + + auto weight_deq = std::make_shared(weight_const_f32, scale_per_oc); + return weight_deq; + } + ov::test::utils::InputGenerateData in_data; in_data.start_from = -0.5; in_data.range = 1; in_data.resolution = resolution; - auto tensor = ov::test::utils::create_and_fill_tensor(ov::element::f32, shape, in_data); + auto tensor = ov::test::utils::create_and_fill_tensor(ov::element::f32, ov::Shape{OC, IC}, in_data); return std::make_shared(tensor); }; + if (param.use_dynamic_quant) + configuration.insert({ov::hint::dynamic_quantization_group_size.name(), std::numeric_limits::max()}); - auto gate_weight = create_const(ov::Shape{param.up_size, param.down_size}, 100); - auto up_weight = create_const(ov::Shape{param.up_size, param.down_size}, 100); + auto gate_weight = create_const(param.up_size, param.down_size, 100); + auto up_weight = create_const(param.up_size, param.down_size, 100); // down_proj has special cache blocking along K dimension requires lower weight resolution - auto down_weight = create_const(ov::Shape{param.down_size, param.up_size}, 16); + auto down_weight = create_const(param.down_size, param.up_size, 16); auto gate_proj = std::make_shared(src, gate_weight, false, true); auto up_proj = std::make_shared(src, up_weight, false, true); @@ -99,15 +124,13 @@ TEST_P(LLMMLPFusionTest, CompareWithRefs) { namespace { +static ov::test::InputShape ishape{ov::PartialShape{-1, -1, 4096 / 4}, {ov::Shape{1, 8, 4096 / 4}, ov::Shape{5, 37, 4096 / 4}}}; + const std::vector mlp_params = { - {ov::test::InputShape{ov::PartialShape{-1, -1, 4096 / 4}, {ov::Shape{1, 8, 4096 / 4}, ov::Shape{5, 37, 4096 / 4}}}, - 4096 / 4, - 11008 / 4, - "Gelu"}, - {ov::test::InputShape{ov::PartialShape{-1, -1, 4096 / 4}, {ov::Shape{1, 8, 4096 / 4}, ov::Shape{5, 37, 4096 / 4}}}, - 4096 / 4, - 11008 / 4, - "Swish"}, + {ishape, 4096 / 4, 11008 / 4, "Gelu", false}, + {ishape, 4096 / 4, 11008 / 4, "Gelu", true}, + {ishape, 4096 / 4, 11008 / 4, "Swish", false}, + {ishape, 4096 / 4, 11008 / 4, 
"Swish", true}, }; INSTANTIATE_TEST_SUITE_P(smoke_LLMMLPFusion, diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/qkv_proj_fusion.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/qkv_proj_fusion.cpp index 4bb6eaa188b951..854969abfaaeb4 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/qkv_proj_fusion.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/qkv_proj_fusion.cpp @@ -18,6 +18,7 @@ struct QKVProjFusionParams { size_t q_proj_size; size_t k_proj_size; size_t v_proj_size; + bool use_dynamic_quant; }; class QKVProjFusionTest : public testing::WithParamInterface, @@ -35,6 +36,7 @@ class QKVProjFusionTest : public testing::WithParamInterface(ov::element::f32, inputDynamicShapes[0]); - auto create_const = [](ov::Shape shape, int resolution) { + auto create_const = [&](size_t OC, size_t IC) -> std::shared_ptr { + if (param.use_dynamic_quant) { + ov::test::utils::InputGenerateData in_data; + // range [-128, +127] + in_data.start_from = -128; + in_data.range = 256; + in_data.resolution = 256; + auto tensor = ov::test::utils::create_and_fill_tensor(ov::element::i8, ov::Shape{OC, IC}, in_data); + auto weight_const_i8 = std::make_shared(tensor); + auto weight_const_f32 = std::make_shared(weight_const_i8, ov::element::f32); + + // range after dequantize, [-1, +1] + in_data.start_from = 0; + in_data.range = 1; + in_data.resolution = 128; + auto tensor_scale_per_oc = ov::test::utils::create_and_fill_tensor(ov::element::f32, ov::Shape{OC, 1}, in_data); + auto scale_per_oc = std::make_shared(tensor_scale_per_oc); + + auto weight_deq = std::make_shared(weight_const_f32, scale_per_oc); + return weight_deq; + } ov::test::utils::InputGenerateData in_data; in_data.start_from = -0.5; in_data.range = 1; - in_data.resolution = resolution; - auto tensor = ov::test::utils::create_and_fill_tensor(ov::element::f32, shape, in_data); + in_data.resolution = 128; + auto tensor = ov::test::utils::create_and_fill_tensor(ov::element::f32, ov::Shape{OC, IC}, in_data); return std::make_shared(tensor); }; + if (param.use_dynamic_quant) + configuration.insert({ov::hint::dynamic_quantization_group_size.name(), std::numeric_limits::max()}); - auto q_proj_weight = create_const(ov::Shape{param.q_proj_size, param.hidden}, 128); - auto k_proj_weight = create_const(ov::Shape{param.k_proj_size, param.hidden}, 128); - auto v_proj_weight = create_const(ov::Shape{param.v_proj_size, param.hidden}, 128); + auto q_proj_weight = create_const(param.q_proj_size, param.hidden); + auto k_proj_weight = create_const(param.k_proj_size, param.hidden); + auto v_proj_weight = create_const(param.v_proj_size, param.hidden); auto q_proj = std::make_shared(src, q_proj_weight, false, true); auto k_proj = std::make_shared(src, k_proj_weight, false, true); @@ -93,19 +117,16 @@ TEST_P(QKVProjFusionTest, CompareWithRefs) { namespace { // the shape size is divided by a const to reduce test time +static ov::test::InputShape ishape_llama2_7b{ov::PartialShape{-1, -1, 4096 / 4}, {ov::Shape{1, 8, 4096 / 4}, ov::Shape{5, 7, 4096 / 4}}}; +static ov::test::InputShape ishape_qwen2_7b{ov::test::InputShape{ov::PartialShape{-1, -1, 3584 / 2}, {ov::Shape{1, 8, 3584 / 2}, ov::Shape{5, 7, 3584 / 2}}}}; + const std::vector qkv_params = { - // Llama-7B - {ov::test::InputShape{ov::PartialShape{-1, -1, 4096 / 4}, {ov::Shape{1, 8, 4096 / 4}, ov::Shape{5, 7, 4096 / 4}}}, - 4096 / 4, - 4096 / 4, - 4096 / 4, - 4096 / 4}, + // Llama-7B with reduced size + 
{ishape_llama2_7b, 4096 / 4, 4096 / 4, 4096 / 4, 4096 / 4, false}, + {ishape_llama2_7b, 4096 / 4, 4096 / 4, 4096 / 4, 4096 / 4, true}, // Qwen2-7B: hidden_size_per_head:128, num_attention_heads:28, num_key_value_heads:4 - {ov::test::InputShape{ov::PartialShape{-1, -1, 3584 / 2}, {ov::Shape{1, 8, 3584 / 2}, ov::Shape{5, 7, 3584 / 2}}}, - 3584 / 2, - 128 * 28 / 2, - 128 * 4 / 2, - 128 * 4 / 2}, + {ishape_qwen2_7b, 3584 / 2, 128 * 28 / 2, 128 * 4 / 2, 128 * 4 / 2, false}, + {ishape_qwen2_7b, 3584 / 2, 128 * 28 / 2, 128 * 4 / 2, 128 * 4 / 2, true}, }; INSTANTIATE_TEST_SUITE_P(smoke_QKVProjFusion, From 3dfebf82f92cc57afe15b56cb4b7f49882e9231b Mon Sep 17 00:00:00 2001 From: Katarzyna Mitrus Date: Fri, 25 Oct 2024 13:08:22 +0200 Subject: [PATCH 032/233] [Opset15] Finalize Opset15 (#27141) ### Details: - Finalize Opset15 with remaining ops - Update python api, opset15_tbl and docs ### Tickets: - 138273 --------- Co-authored-by: Sebastian Golebiewski Co-authored-by: Michal Lukaszewski --- .../operation-sets/available-opsets.rst | 6 +- .../available-opsets/opset15.rst | 216 ++++++++++++++++++ .../operation-sets/operation-specs.rst | 11 + .../src/openvino/runtime/opset15/__init__.py | 194 +++++++++++++++- .../include/openvino/opsets/opset15_tbl.hpp | 213 ++++++++++++++++- src/core/tests/opset.cpp | 2 +- .../openvino/frontend/extension/op.hpp | 2 +- 7 files changed, 633 insertions(+), 11 deletions(-) create mode 100644 docs/articles_en/documentation/openvino-ir-format/operation-sets/available-opsets/opset15.rst diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/available-opsets.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/available-opsets.rst index a3028755299b45..ed77c97e21b0ee 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/available-opsets.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/available-opsets.rst @@ -10,6 +10,7 @@ Available Operation Sets :maxdepth: 1 :hidden: + available-opsets/opset15 available-opsets/opset14 available-opsets/opset13 available-opsets/opset12 @@ -34,7 +35,9 @@ This topic provides a complete list of available sets of operations supported in :header-rows: 1 * - OpenVINO™ Version - - Actual Operations Set + - Operation Set + * - 2024.5 + - :doc:`opset15 ` * - 2024.0 - :doc:`opset14 ` * - 2023.2 @@ -71,4 +74,3 @@ See Also * :doc:`Operation Sets in OpenVINO <../operation-sets>` * :doc:`OpenVINO IR format <../../openvino-ir-format>` - diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/available-opsets/opset15.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/available-opsets/opset15.rst new file mode 100644 index 00000000000000..b32d7b71580177 --- /dev/null +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/available-opsets/opset15.rst @@ -0,0 +1,216 @@ +opset15 +======= + + +.. meta:: + :description: Explore the examples of operation instances expressed as IR + XML snippets in the opset15 operation set, supported in OpenVINO™ + toolkit. + +This specification document describes the ``opset15`` operation set supported in OpenVINO™. +Support for each particular operation from the list below depends on the capabilities of an inference plugin +and may vary among different hardware platforms and devices. Examples of operation instances are provided as IR xml +snippets. The semantics match corresponding OpenVINO operation classes declared in ``namespace opset15``. 
+ + +Table of Contents +################## + +* :doc:`Abs <../operation-specs/arithmetic/abs-1>` +* :doc:`Acos <../operation-specs/arithmetic/acos-1>` +* :doc:`Acosh <../operation-specs/arithmetic/acosh-3>` +* :doc:`AdaptiveAvgPool <../operation-specs/pooling/adaptive-avg-pool-8>` +* :doc:`AdaptiveMaxPool <../operation-specs/pooling/adaptive-max-pool-8>` +* :doc:`Add <../operation-specs/arithmetic/add-1>` +* :doc:`Asin <../operation-specs/arithmetic/asin-1>` +* :doc:`Asinh <../operation-specs/arithmetic/asinh-3>` +* :doc:`Assign <../operation-specs/infrastructure/assign-3>` +* :doc:`Atan <../operation-specs/arithmetic/atan-1>` +* :doc:`Atanh <../operation-specs/arithmetic/atanh-3>` +* :doc:`AvgPool <../operation-specs/pooling/avg-pool-14>` +* :doc:`BatchNormInference <../operation-specs/normalization/batch-norm-inference-5>` +* :doc:`BatchToSpace <../operation-specs/movement/batch-to-space-2>` +* :doc:`BinaryConvolution <../operation-specs/convolution/binary-convolution-1>` +* :doc:`BitwiseAnd <../operation-specs/bitwise/bitwise-and-13>` +* :doc:`BitwiseOr <../operation-specs/bitwise/bitwise-or-13>` +* :doc:`BitwiseXor <../operation-specs/bitwise/bitwise-xor-13>` +* :doc:`BitwiseLeftShift <../operation-specs/bitwise/bitwise-left-shift-15>` +* :doc:`BitwiseRightShift <../operation-specs/bitwise/bitwise-right-shift-15>` +* :doc:`BitwiseNot <../operation-specs/bitwise/bitwise-not-13>` +* :doc:`Broadcast <../operation-specs/movement/broadcast-3>` +* :doc:`Bucketize <../operation-specs/condition/bucketize-3>` +* :doc:`CTCGreedyDecoder <../operation-specs/sequence/ctc-greedy-decoder-1>` +* :doc:`CTCGreedyDecoderSeqLen <../operation-specs/sequence/ctc-greedy-decoder-seq-len-6>` +* :doc:`CTCLoss <../operation-specs/sequence/ctc-loss-4>` +* :doc:`Ceiling <../operation-specs/arithmetic/ceiling-1>` +* :doc:`Clamp <../operation-specs/activation/clamp-1>` +* :doc:`Col2Im <../operation-specs/movement/col2im-15>` +* :doc:`Concat <../operation-specs/movement/concat-1>` +* :doc:`Constant <../operation-specs/infrastructure/constant-1>` +* :doc:`Convert <../operation-specs/type/convert-1>` +* :doc:`ConvertLike <../operation-specs/type/convert-like-1>` +* :doc:`ConvertPromoteTypes <../operation-specs/type/convert-promote-types-14>` +* :doc:`Convolution <../operation-specs/convolution/convolution-1>` +* :doc:`ConvolutionBackpropData <../operation-specs/convolution/convolution-backprop-data-1>` +* :doc:`Cos <../operation-specs/arithmetic/cos-1>` +* :doc:`Cosh <../operation-specs/arithmetic/cosh-1>` +* :doc:`CumSum <../operation-specs/arithmetic/cumsum-3>` +* :doc:`DeformableConvolution <../operation-specs/convolution/deformable-convolution-8>` +* :doc:`DeformablePSROIPooling <../operation-specs/detection/deformable-psroi-pooling-1>` +* :doc:`DepthToSpace <../operation-specs/movement/depth-to-space-1>` +* :doc:`DetectionOutput <../operation-specs/detection/detectionoutput-8>` +* :doc:`DFT <../operation-specs/signals/dft-7>` +* :doc:`Divide <../operation-specs/arithmetic/divide-1>` +* :doc:`Einsum <../operation-specs/matrix/einsum-7>` +* :doc:`Elu <../operation-specs/activation/elu-1>` +* :doc:`EmbeddingBagOffsetsSum <../operation-specs/sparse/embedding-bag-offsets-sum-3>` +* :doc:`EmbeddingBagOffsets <../operation-specs/sparse/embedding-bag-offsets-15>` +* :doc:`EmbeddingBagPackedSum <../operation-specs/sparse/embedding-bag-packed-sum-3>` +* :doc:`EmbeddingBagPacked <../operation-specs/sparse/embedding-bag-packed-15>` +* :doc:`EmbeddingSegmentsSum <../operation-specs/sparse/embedding-segments-sum-3>` +* 
:doc:`Equal <../operation-specs/comparison/equal-1>` +* :doc:`Erf <../operation-specs/arithmetic/erf-1>` +* :doc:`Exp <../operation-specs/activation/exp-1>` +* :doc:`ExperimentalDetectronDetectionOutput_6 <../operation-specs/detection/experimental-detectron-detection-output-6>` +* :doc:`ExperimentalDetectronGenerateProposalsSingleImage_6 <../operation-specs/detection/experimental-detectron-generate-proposals-single-image-6>` +* :doc:`ExperimentalDetectronPriorGridGenerator_6 <../operation-specs/detection/experimental-detectron-prior-grid-generator-6>` +* :doc:`ExperimentalDetectronROIFeatureExtractor_6 <../operation-specs/detection/experimental-detectron-roi-feature-extractor-6>` +* :doc:`ExperimentalDetectronTopKROIs_6 <../operation-specs/sort/experimental-detectron-top-krois-6>` +* :doc:`ExtractImagePatches <../operation-specs/movement/extract-image-patches-3>` +* :doc:`Eye <../operation-specs/generation/eye-9>` +* :doc:`FakeConvert <../operation-specs/quantization/fake-convert-13>` +* :doc:`FakeQuantize <../operation-specs/quantization/fake-quantize-1>` +* :doc:`Floor <../operation-specs/arithmetic/floor-1>` +* :doc:`FloorMod <../operation-specs/arithmetic/floormod-1>` +* :doc:`Gather <../operation-specs/movement/gather-8>` +* :doc:`GatherElements <../operation-specs/movement/gather-elements-6>` +* :doc:`GatherND <../operation-specs/movement/gather-nd-8>` +* :doc:`GatherTree <../operation-specs/movement/gather-tree-1>` +* :doc:`Gelu <../operation-specs/activation/gelu-7>` +* :doc:`GenerateProposals <../operation-specs/detection/generate-proposals-9>` +* :doc:`Greater <../operation-specs/comparison/greater-1>` +* :doc:`GreaterEqual <../operation-specs/comparison/greater-equal-1>` +* :doc:`GridSample <../operation-specs/image/grid-sample-9>` +* :doc:`GRN <../operation-specs/normalization/grn-1>` +* :doc:`GroupConvolution <../operation-specs/convolution/group-convolution-1>` +* :doc:`GroupConvolutionBackpropData <../operation-specs/convolution/group-convolution-backprop-data-1>` +* :doc:`GroupNormalization <../operation-specs/normalization/group-normalization-12>` +* :doc:`GRUCell <../operation-specs/sequence/gru-cell-3>` +* :doc:`GRUSequence <../operation-specs/sequence/gru-sequence-5>` +* :doc:`HardSigmoid <../operation-specs/activation/hard-sigmoid-1>` +* :doc:`HSigmoid <../operation-specs/activation/hsigmoid-5>` +* :doc:`HSwish <../operation-specs/activation/hswish-4>` +* :doc:`IDFT <../operation-specs/signals/idft-7>` +* :doc:`I420toBGR <../operation-specs/image/i420-to-bgr-8>` +* :doc:`I420toRGB <../operation-specs/image/i420-to-rgb-8>` +* :doc:`If <../operation-specs/condition/if-8>` +* :doc:`Interpolate <../operation-specs/image/interpolate-11>` +* :doc:`Inverse <../operation-specs/matrix/inverse-14>` +* :doc:`IRDFT <../operation-specs/signals/irdft-9>` +* :doc:`IsInf <../operation-specs/comparison/isinf-10>` +* :doc:`IsNaN <../operation-specs/comparison/isnan-10>` +* :doc:`Less <../operation-specs/comparison/less-1>` +* :doc:`LessEqual <../operation-specs/comparison/lessequal-1>` +* :doc:`Log <../operation-specs/arithmetic/log-1>` +* :doc:`LogicalAnd <../operation-specs/logical/logical-and-1>` +* :doc:`LogicalNot <../operation-specs/logical/logical-not-1>` +* :doc:`LogicalOr <../operation-specs/logical/logical-or-1>` +* :doc:`LogicalXor <../operation-specs/logical/logical-xor-1>` +* :doc:`LogSoftmax <../operation-specs/activation/log-soft-max-5>` +* :doc:`Loop <../operation-specs/infrastructure/loop-5>` +* :doc:`LRN <../operation-specs/normalization/lrn-1>` +* :doc:`LSTMCell 
<../operation-specs/sequence/lstm-cell-1>` +* :doc:`LSTMSequence <../operation-specs/sequence/lstm-sequence-1>` +* :doc:`MatMul <../operation-specs/matrix/matmul-1>` +* :doc:`MatrixNMS <../operation-specs/sort/matrix-non-max-suppression-8>` +* :doc:`MaxPool <../operation-specs/pooling/max-pool-14>` +* :doc:`Maximum <../operation-specs/arithmetic/maximum-1>` +* :doc:`Minimum <../operation-specs/arithmetic/minimum-1>` +* :doc:`Mish <../operation-specs/activation/mish-4>` +* :doc:`Mod <../operation-specs/arithmetic/mod-1>` +* :doc:`MVN <../operation-specs/normalization/mvn-6>` +* :doc:`MulticlassNMS <../operation-specs/sort/multiclass-non-max-suppression-9>` +* :doc:`Multinomial <../operation-specs/generation/multinomial-13>` +* :doc:`Multiply <../operation-specs/arithmetic/multiply-1>` +* :doc:`Negative <../operation-specs/arithmetic/negative-1>` +* :doc:`NMSRotated <../operation-specs/sort/nms-rotated-13>` +* :doc:`NonMaxSuppression <../operation-specs/sort/non-max-suppression-9>` +* :doc:`NonZero <../operation-specs/condition/nonzero-3>` +* :doc:`NormalizeL2 <../operation-specs/normalization/normalize-l2-1>` +* :doc:`NotEqual <../operation-specs/comparison/notequal-1>` +* :doc:`NV12toBGR <../operation-specs/image/nv12-to-bgr-8>` +* :doc:`NV12toRGB <../operation-specs/image/nv12-to-rgb-8>` +* :doc:`OneHot <../operation-specs/sequence/one-hot-1>` +* :doc:`Pad <../operation-specs/movement/pad-12>` +* :doc:`Parameter <../operation-specs/infrastructure/parameter-1>` +* :doc:`Power <../operation-specs/arithmetic/power-1>` +* :doc:`PReLU <../operation-specs/activation/prelu-1>` +* :doc:`PriorBoxClustered <../operation-specs/detection/prior-box-clustered-1>` +* :doc:`PriorBox <../operation-specs/detection/prior-box-8>` +* :doc:`Proposal <../operation-specs/detection/proposal-4>` +* :doc:`PSROIPooling <../operation-specs/detection/psroi-pooling-1>` +* :doc:`RandomUniform <../operation-specs/generation/random-uniform-8>` +* :doc:`Range <../operation-specs/generation/range-4>` +* :doc:`RDFT <../operation-specs/signals/rdft-9>` +* :doc:`ReLU <../operation-specs/activation/relu-1>` +* :doc:`ReadValue <../operation-specs/infrastructure/read-value-3>` +* :doc:`ReduceL1 <../operation-specs/reduction/reduce-l1-4>` +* :doc:`ReduceL2 <../operation-specs/reduction/reduce-l2-4>` +* :doc:`ReduceLogicalAnd <../operation-specs/reduction/reduce-logical-and-1>` +* :doc:`ReduceLogicalOr <../operation-specs/reduction/reduce-logical-or-1>` +* :doc:`ReduceMax <../operation-specs/reduction/reduce-max-1>` +* :doc:`ReduceMean <../operation-specs/reduction/reduce-mean-1>` +* :doc:`ReduceMin <../operation-specs/reduction/reduce-min-1>` +* :doc:`ReduceProd <../operation-specs/reduction/reduce-prod-1>` +* :doc:`ReduceSum <../operation-specs/reduction/reduce-sum-1>` +* :doc:`RegionYolo <../operation-specs/detection/region-yolo-1>` +* :doc:`ReorgYolo <../operation-specs/detection/reorg-yolo-1>` +* :doc:`Reshape <../operation-specs/shape/reshape-1>` +* :doc:`Result <../operation-specs/infrastructure/result-1>` +* :doc:`ReverseSequence <../operation-specs/movement/reverse-sequence-1>` +* :doc:`RNNCell <../operation-specs/sequence/rnn-cell-3>` +* :doc:`RNNSequence <../operation-specs/sequence/rnn-sequence-5>` +* :doc:`ROIAlign <../operation-specs/detection/roi-align-9>` +* :doc:`ROIAlignRotated <../operation-specs/detection/roi-align-rotated-15>` +* :doc:`ROIPooling <../operation-specs/detection/roi-pooling-1>` +* :doc:`Roll <../operation-specs/movement/roll-7>` +* :doc:`Round <../operation-specs/arithmetic/round-5>` +* 
:doc:`ScaledDotProductAttention <../operation-specs/sequence/scaled-dot-product-attention>` +* :doc:`ScatterElementsUpdate <../operation-specs/movement/scatter-elements-update-12>` +* :doc:`ScatterNDUpdate <../operation-specs/movement/scatter-nd-update-15>` +* :doc:`ScatterUpdate <../operation-specs/movement/scatter-update-3>` +* :doc:`SearchSorted <../operation-specs/sort/search-sorted-15>` +* :doc:`Select <../operation-specs/condition/select-1>` +* :doc:`Selu <../operation-specs/activation/selu-1>` +* :doc:`ShapeOf <../operation-specs/shape/shape-of-3>` +* :doc:`ShuffleChannels <../operation-specs/movement/shuffle-channels-1>` +* :doc:`Sigmoid <../operation-specs/activation/sigmoid-1>` +* :doc:`Sign <../operation-specs/arithmetic/sign-1>` +* :doc:`Sin <../operation-specs/arithmetic/sin-1>` +* :doc:`Sinh <../operation-specs/arithmetic/sinh-1>` +* :doc:`Slice <../operation-specs/movement/slice-8>` +* :doc:`SliceScatter <../operation-specs/movement/slice-scatter-15>` +* :doc:`SoftMax <../operation-specs/activation/softmax-8>` +* :doc:`SoftPlus <../operation-specs/activation/softplus-4>` +* :doc:`SoftSign <../operation-specs/activation/softsign-9>` +* :doc:`SpaceToBatch <../operation-specs/movement/space-to-batch-2>` +* :doc:`SpaceToDepth <../operation-specs/movement/space-to-depth-1>` +* :doc:`Split <../operation-specs/movement/split-1>` +* :doc:`Sqrt <../operation-specs/arithmetic/sqrt-1>` +* :doc:`SquaredDifference <../operation-specs/arithmetic/squared-difference-1>` +* :doc:`Squeeze <../operation-specs/shape/squeeze-1>` +* :doc:`STFT <../operation-specs/signals/stft-15>` +* :doc:`StridedSlice <../operation-specs/movement/strided-slice-1>` +* :doc:`StringTensorPack <../operation-specs/type/string-tensor-pack-15>` +* :doc:`StringTensorUnpack <../operation-specs/type/string-tensor-unpack-15>` +* :doc:`Subtract <../operation-specs/arithmetic/subtract-1>` +* :doc:`Swish <../operation-specs/activation/swish-4>` +* :doc:`Tan <../operation-specs/arithmetic/tan-1>` +* :doc:`Tanh <../operation-specs/arithmetic/tanh-1>` +* :doc:`TensorIterator <../operation-specs/infrastructure/tensor-iterator-1>` +* :doc:`Tile <../operation-specs/movement/tile-1>` +* :doc:`TopK <../operation-specs/sort/top-k-11>` +* :doc:`Transpose <../operation-specs/movement/transpose-1>` +* :doc:`Unique <../operation-specs/movement/unique-10>` +* :doc:`Unsqueeze <../operation-specs/shape/unsqueeze-1>` +* :doc:`VariadicSplit <../operation-specs/movement/variadic-split-1>` diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst index 7ac47116595621..6ecbf2695699f9 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst @@ -29,8 +29,10 @@ Operation Specifications BatchToSpace-2 BinaryConvolution-1 BitwiseAnd-13 + BitwiseLeftShift-15 BitwiseNot-13 BitwiseOr-13 + BitwiseRightShift-15 BitwiseXor-13 Broadcast-1 Broadcast-3 @@ -39,6 +41,7 @@ Operation Specifications CTCGreedyDecoderSeqLen-6 Ceiling-1 Clamp-1 + Col2Im-15 Concat-1 Constant-1 ConvertLike-1 @@ -61,7 +64,9 @@ Operation Specifications Einsum-7 Elu-1 EmbeddingBagOffsetsSum-3 + EmbeddingBagOffsets-15 EmbeddingBagPackedSum-3 + EmbeddingBagPacked-15 EmbeddingSegmentsSum-3 Equal-1 Erf-1 @@ -189,6 +194,7 @@ Operation Specifications RNNSequence-5 ROIAlign-3 ROIAlign-9 + ROIAlignRotated-15 ROIPooling-1 Roll-7 Round-5 @@ 
-196,6 +202,7 @@ Operation Specifications ScatterElementsUpdate-3 ScatterElementsUpdate-12 ScatterNDUpdate-3 + ScatterNDUpdate-15 ScatterUpdate-3 SearchSorted-15 Select-1 @@ -208,6 +215,7 @@ Operation Specifications Sin-1 Sinh-1 Slice-8 + SliceScatter SoftMax-1 SoftMax-8 SoftPlus-4 @@ -218,7 +226,10 @@ Operation Specifications Sqrt-1 SquaredDifference-1 Squeeze-1 + STFT-15 StridedSlice-1 + StringTensorPack-15 + StringTensorUnpack-15 Subtract-1 Swish-4 Tan-1 diff --git a/src/bindings/python/src/openvino/runtime/opset15/__init__.py b/src/bindings/python/src/openvino/runtime/opset15/__init__.py index a12225f719a55c..6cc9c24827a85f 100644 --- a/src/bindings/python/src/openvino/runtime/opset15/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset15/__init__.py @@ -2,10 +2,7 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -# Inlcudes new operators added in Opset15 - -# TODO (ticket 138273): Add previous opset operators at the end of opset15 development -from openvino.runtime.opset1.ops import parameter +# New operations added in Opset15 from openvino.runtime.opset15.ops import col2im from openvino.runtime.opset15.ops import embedding_bag_offsets from openvino.runtime.opset15.ops import embedding_bag_packed @@ -16,5 +13,192 @@ from openvino.runtime.opset15.ops import bitwise_left_shift from openvino.runtime.opset15.ops import bitwise_right_shift from openvino.runtime.opset15.ops import slice_scatter -from openvino.runtime.opset15.ops import stft + +# Operators from previous opsets +from openvino.runtime.opset1.ops import absolute +from openvino.runtime.opset1.ops import absolute as abs +from openvino.runtime.opset1.ops import acos +from openvino.runtime.opset4.ops import acosh +from openvino.runtime.opset8.ops import adaptive_avg_pool +from openvino.runtime.opset8.ops import adaptive_max_pool +from openvino.runtime.opset1.ops import add +from openvino.runtime.opset1.ops import asin +from openvino.runtime.opset4.ops import asinh +from openvino.runtime.opset6.ops import assign +from openvino.runtime.opset1.ops import atan +from openvino.runtime.opset4.ops import atanh +from openvino.runtime.opset14.ops import avg_pool +from openvino.runtime.opset5.ops import batch_norm_inference +from openvino.runtime.opset2.ops import batch_to_space +from openvino.runtime.opset1.ops import binary_convolution +from openvino.runtime.opset13.ops import bitwise_and +from openvino.runtime.opset13.ops import bitwise_not +from openvino.runtime.opset13.ops import bitwise_or +from openvino.runtime.opset13.ops import bitwise_xor +from openvino.runtime.opset3.ops import broadcast +from openvino.runtime.opset3.ops import bucketize +from openvino.runtime.opset1.ops import ceiling +from openvino.runtime.opset1.ops import ceiling as ceil +from openvino.runtime.opset1.ops import clamp +from openvino.runtime.opset1.ops import concat +from openvino.runtime.opset13.ops import constant +from openvino.runtime.opset1.ops import convert +from openvino.runtime.opset1.ops import convert_like +from openvino.runtime.opset14.ops import convert_promote_types +from openvino.runtime.opset1.ops import convolution +from openvino.runtime.opset1.ops import convolution_backprop_data +from openvino.runtime.opset1.ops import cos +from openvino.runtime.opset1.ops import cosh +from openvino.runtime.opset1.ops import ctc_greedy_decoder +from openvino.runtime.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.runtime.opset4.ops import ctc_loss +from openvino.runtime.opset3.ops import cum_sum +from 
openvino.runtime.opset3.ops import cum_sum as cumsum +from openvino.runtime.opset8.ops import deformable_convolution +from openvino.runtime.opset1.ops import deformable_psroi_pooling +from openvino.runtime.opset1.ops import depth_to_space +from openvino.runtime.opset8.ops import detection_output +from openvino.runtime.opset7.ops import dft +from openvino.runtime.opset1.ops import divide +from openvino.runtime.opset7.ops import einsum +from openvino.runtime.opset1.ops import elu +from openvino.runtime.opset3.ops import embedding_bag_offsets_sum +from openvino.runtime.opset3.ops import embedding_bag_packed_sum +from openvino.runtime.opset3.ops import embedding_segments_sum +from openvino.runtime.opset3.ops import extract_image_patches +from openvino.runtime.opset1.ops import equal +from openvino.runtime.opset1.ops import erf +from openvino.runtime.opset1.ops import exp +from openvino.runtime.opset9.ops import eye +from openvino.runtime.opset13.ops import fake_convert +from openvino.runtime.opset13.ops import fake_quantize +from openvino.runtime.opset1.ops import floor +from openvino.runtime.opset1.ops import floor_mod +from openvino.runtime.opset8.ops import gather +from openvino.runtime.opset6.ops import gather_elements +from openvino.runtime.opset8.ops import gather_nd +from openvino.runtime.opset1.ops import gather_tree +from openvino.runtime.opset7.ops import gelu +from openvino.runtime.opset9.ops import generate_proposals +from openvino.runtime.opset1.ops import greater +from openvino.runtime.opset1.ops import greater_equal +from openvino.runtime.opset9.ops import grid_sample +from openvino.runtime.opset1.ops import grn +from openvino.runtime.opset1.ops import group_convolution +from openvino.runtime.opset1.ops import group_convolution_backprop_data +from openvino.runtime.opset12.ops import group_normalization +from openvino.runtime.opset3.ops import gru_cell +from openvino.runtime.opset5.ops import gru_sequence +from openvino.runtime.opset1.ops import hard_sigmoid +from openvino.runtime.opset5.ops import hsigmoid +from openvino.runtime.opset4.ops import hswish +from openvino.runtime.opset7.ops import idft +from openvino.runtime.opset8.ops import if_op +from openvino.runtime.opset11.ops import interpolate +from openvino.runtime.opset14.ops import inverse +from openvino.runtime.opset9.ops import irdft +from openvino.runtime.opset10.ops import is_finite +from openvino.runtime.opset10.ops import is_inf +from openvino.runtime.opset10.ops import is_nan +from openvino.runtime.opset8.ops import i420_to_bgr +from openvino.runtime.opset8.ops import i420_to_rgb +from openvino.runtime.opset1.ops import less +from openvino.runtime.opset1.ops import less_equal +from openvino.runtime.opset1.ops import log +from openvino.runtime.opset1.ops import logical_and +from openvino.runtime.opset1.ops import logical_not +from openvino.runtime.opset1.ops import logical_or +from openvino.runtime.opset1.ops import logical_xor +from openvino.runtime.opset5.ops import log_softmax +from openvino.runtime.opset5.ops import loop +from openvino.runtime.opset1.ops import lrn +from openvino.runtime.opset4.ops import lstm_cell +from openvino.runtime.opset5.ops import lstm_sequence +from openvino.runtime.opset1.ops import matmul +from openvino.runtime.opset8.ops import matrix_nms +from openvino.runtime.opset14.ops import max_pool +from openvino.runtime.opset1.ops import maximum +from openvino.runtime.opset1.ops import minimum +from openvino.runtime.opset4.ops import mish +from openvino.runtime.opset1.ops import mod +from 
openvino.runtime.opset9.ops import multiclass_nms +from openvino.runtime.opset13.ops import multinomial +from openvino.runtime.opset1.ops import multiply +from openvino.runtime.opset6.ops import mvn +from openvino.runtime.opset1.ops import negative +from openvino.runtime.opset13.ops import nms_rotated +from openvino.runtime.opset9.ops import non_max_suppression +from openvino.runtime.opset3.ops import non_zero +from openvino.runtime.opset1.ops import normalize_l2 +from openvino.runtime.opset1.ops import not_equal +from openvino.runtime.opset8.ops import nv12_to_bgr +from openvino.runtime.opset8.ops import nv12_to_rgb +from openvino.runtime.opset1.ops import one_hot +from openvino.runtime.opset12.ops import pad +from openvino.runtime.opset1.ops import parameter +from openvino.runtime.opset1.ops import power +from openvino.runtime.opset1.ops import prelu +from openvino.runtime.opset8.ops import prior_box +from openvino.runtime.opset1.ops import prior_box_clustered +from openvino.runtime.opset1.ops import psroi_pooling +from openvino.runtime.opset4.ops import proposal +from openvino.runtime.opset4.ops import range +from openvino.runtime.opset8.ops import random_uniform +from openvino.runtime.opset9.ops import rdft +from openvino.runtime.opset6.ops import read_value +from openvino.runtime.opset4.ops import reduce_l1 +from openvino.runtime.opset4.ops import reduce_l2 +from openvino.runtime.opset1.ops import reduce_logical_and +from openvino.runtime.opset1.ops import reduce_logical_or +from openvino.runtime.opset1.ops import reduce_max +from openvino.runtime.opset1.ops import reduce_mean +from openvino.runtime.opset1.ops import reduce_min +from openvino.runtime.opset1.ops import reduce_prod +from openvino.runtime.opset1.ops import reduce_sum +from openvino.runtime.opset1.ops import region_yolo +from openvino.runtime.opset2.ops import reorg_yolo +from openvino.runtime.opset1.ops import relu +from openvino.runtime.opset1.ops import reshape +from openvino.runtime.opset13.ops import result +from openvino.runtime.opset1.ops import reverse_sequence +from openvino.runtime.opset3.ops import rnn_cell +from openvino.runtime.opset5.ops import rnn_sequence +from openvino.runtime.opset9.ops import roi_align +from openvino.runtime.opset2.ops import roi_pooling +from openvino.runtime.opset7.ops import roll +from openvino.runtime.opset5.ops import round +from openvino.runtime.opset13.ops import scaled_dot_product_attention +from openvino.runtime.opset12.ops import scatter_elements_update +from openvino.runtime.opset3.ops import scatter_update from openvino.runtime.opset15.ops import search_sorted +from openvino.runtime.opset1.ops import select +from openvino.runtime.opset1.ops import selu +from openvino.runtime.opset3.ops import shape_of +from openvino.runtime.opset3.ops import shuffle_channels +from openvino.runtime.opset1.ops import sigmoid +from openvino.runtime.opset1.ops import sign +from openvino.runtime.opset1.ops import sin +from openvino.runtime.opset1.ops import sinh +from openvino.runtime.opset8.ops import slice +from openvino.runtime.opset8.ops import softmax +from openvino.runtime.opset4.ops import softplus +from openvino.runtime.opset9.ops import softsign +from openvino.runtime.opset2.ops import space_to_batch +from openvino.runtime.opset1.ops import space_to_depth +from openvino.runtime.opset1.ops import split +from openvino.runtime.opset1.ops import sqrt +from openvino.runtime.opset1.ops import squared_difference +from openvino.runtime.opset1.ops import squeeze +from openvino.runtime.opset15.ops 
import stft +from openvino.runtime.opset1.ops import strided_slice +from openvino.runtime.opset1.ops import subtract +from openvino.runtime.opset4.ops import swish +from openvino.runtime.opset1.ops import tan +from openvino.runtime.opset1.ops import tanh +from openvino.runtime.opset1.ops import tensor_iterator +from openvino.runtime.opset1.ops import tile +from openvino.runtime.opset11.ops import topk +from openvino.runtime.opset1.ops import transpose +from openvino.runtime.opset10.ops import unique +from openvino.runtime.opset1.ops import unsqueeze +from openvino.runtime.opset1.ops import variadic_split diff --git a/src/core/include/openvino/opsets/opset15_tbl.hpp b/src/core/include/openvino/opsets/opset15_tbl.hpp index a18093c4ef3f5c..9a49e421f9ad8e 100644 --- a/src/core/include/openvino/opsets/opset15_tbl.hpp +++ b/src/core/include/openvino/opsets/opset15_tbl.hpp @@ -7,11 +7,220 @@ # define _OPENVINO_OP_REG(x, y) #endif -// Previous opsets operators -_OPENVINO_OP_REG(Parameter, ov::op::v0) +_OPENVINO_OP_REG(Abs, ov::op::v0) +_OPENVINO_OP_REG(Acos, ov::op::v0) +_OPENVINO_OP_REG(Add, ov::op::v1) +_OPENVINO_OP_REG(Asin, ov::op::v0) +_OPENVINO_OP_REG(Atan, ov::op::v0) +_OPENVINO_OP_REG(AvgPool, ov::op::v14) +_OPENVINO_OP_REG(BatchNormInference, ov::op::v5) +_OPENVINO_OP_REG(BinaryConvolution, ov::op::v1) +_OPENVINO_OP_REG(Broadcast, ov::op::v3) +_OPENVINO_OP_REG(Bucketize, ov::op::v3) +_OPENVINO_OP_REG(CTCGreedyDecoder, ov::op::v0) +_OPENVINO_OP_REG(Ceiling, ov::op::v0) +_OPENVINO_OP_REG(Clamp, ov::op::v0) +_OPENVINO_OP_REG(Concat, ov::op::v0) +_OPENVINO_OP_REG(Constant, ov::op::v0) _OPENVINO_OP_REG(Convert, ov::op::v0) +_OPENVINO_OP_REG(ConvertLike, ov::op::v1) +_OPENVINO_OP_REG(Convolution, ov::op::v1) +_OPENVINO_OP_REG(ConvolutionBackpropData, ov::op::v1) +_OPENVINO_OP_REG(Cos, ov::op::v0) +_OPENVINO_OP_REG(Cosh, ov::op::v0) +_OPENVINO_OP_REG(CumSum, ov::op::v0) +_OPENVINO_OP_REG(DeformablePSROIPooling, ov::op::v1) +_OPENVINO_OP_REG(DepthToSpace, ov::op::v0) +_OPENVINO_OP_REG(Divide, ov::op::v1) +_OPENVINO_OP_REG(Elu, ov::op::v0) +_OPENVINO_OP_REG(Erf, ov::op::v0) +_OPENVINO_OP_REG(Equal, ov::op::v1) +_OPENVINO_OP_REG(Exp, ov::op::v0) +_OPENVINO_OP_REG(ExtractImagePatches, ov::op::v3) +_OPENVINO_OP_REG(FakeQuantize, ov::op::v0) +_OPENVINO_OP_REG(Floor, ov::op::v0) +_OPENVINO_OP_REG(FloorMod, ov::op::v1) +_OPENVINO_OP_REG(GatherTree, ov::op::v1) +_OPENVINO_OP_REG(Greater, ov::op::v1) +_OPENVINO_OP_REG(GreaterEqual, ov::op::v1) +_OPENVINO_OP_REG(GridSample, ov::op::v9) +_OPENVINO_OP_REG(GroupConvolution, ov::op::v1) +_OPENVINO_OP_REG(GroupConvolutionBackpropData, ov::op::v1) +_OPENVINO_OP_REG(GRN, ov::op::v0) +_OPENVINO_OP_REG(HardSigmoid, ov::op::v0) +_OPENVINO_OP_REG(Less, ov::op::v1) +_OPENVINO_OP_REG(LessEqual, ov::op::v1) +_OPENVINO_OP_REG(Log, ov::op::v0) +_OPENVINO_OP_REG(LogicalAnd, ov::op::v1) +_OPENVINO_OP_REG(LogicalNot, ov::op::v1) +_OPENVINO_OP_REG(LogicalOr, ov::op::v1) +_OPENVINO_OP_REG(LogicalXor, ov::op::v1) +_OPENVINO_OP_REG(LRN, ov::op::v0) +_OPENVINO_OP_REG(LSTMCell, ov::op::v4) +_OPENVINO_OP_REG(MatMul, ov::op::v0) +_OPENVINO_OP_REG(Maximum, ov::op::v1) +_OPENVINO_OP_REG(Minimum, ov::op::v1) +_OPENVINO_OP_REG(Mod, ov::op::v1) +_OPENVINO_OP_REG(Multiply, ov::op::v1) +_OPENVINO_OP_REG(Negative, ov::op::v0) +_OPENVINO_OP_REG(NormalizeL2, ov::op::v0) +_OPENVINO_OP_REG(NotEqual, ov::op::v1) +_OPENVINO_OP_REG(OneHot, ov::op::v1) +_OPENVINO_OP_REG(PRelu, ov::op::v0) +_OPENVINO_OP_REG(PSROIPooling, ov::op::v0) +_OPENVINO_OP_REG(Parameter, ov::op::v0) +_OPENVINO_OP_REG(Power, 
ov::op::v1) +_OPENVINO_OP_REG(PriorBoxClustered, ov::op::v0) +_OPENVINO_OP_REG(Proposal, ov::op::v4) +_OPENVINO_OP_REG(Range, ov::op::v4) +_OPENVINO_OP_REG(Relu, ov::op::v0) +_OPENVINO_OP_REG(ReduceMax, ov::op::v1) +_OPENVINO_OP_REG(ReduceLogicalAnd, ov::op::v1) +_OPENVINO_OP_REG(ReduceLogicalOr, ov::op::v1) +_OPENVINO_OP_REG(ReduceMean, ov::op::v1) +_OPENVINO_OP_REG(ReduceMin, ov::op::v1) +_OPENVINO_OP_REG(ReduceProd, ov::op::v1) +_OPENVINO_OP_REG(ReduceSum, ov::op::v1) +_OPENVINO_OP_REG(RegionYolo, ov::op::v0) +_OPENVINO_OP_REG(ReorgYolo, ov::op::v0) +_OPENVINO_OP_REG(Reshape, ov::op::v1) +_OPENVINO_OP_REG(Result, ov::op::v0) +_OPENVINO_OP_REG(ReverseSequence, ov::op::v0) +_OPENVINO_OP_REG(ROIPooling, ov::op::v0) +_OPENVINO_OP_REG(Select, ov::op::v1) +_OPENVINO_OP_REG(Selu, ov::op::v0) +_OPENVINO_OP_REG(Sign, ov::op::v0) +_OPENVINO_OP_REG(Sigmoid, ov::op::v0) +_OPENVINO_OP_REG(Sin, ov::op::v0) +_OPENVINO_OP_REG(Sinh, ov::op::v0) +_OPENVINO_OP_REG(Sqrt, ov::op::v0) +_OPENVINO_OP_REG(SpaceToDepth, ov::op::v0) +_OPENVINO_OP_REG(Split, ov::op::v1) +_OPENVINO_OP_REG(SquaredDifference, ov::op::v0) +_OPENVINO_OP_REG(Squeeze, ov::op::v0) +_OPENVINO_OP_REG(StridedSlice, ov::op::v1) +_OPENVINO_OP_REG(Subtract, ov::op::v1) +_OPENVINO_OP_REG(Tan, ov::op::v0) +_OPENVINO_OP_REG(Tanh, ov::op::v0) +_OPENVINO_OP_REG(TensorIterator, ov::op::v0) +_OPENVINO_OP_REG(Tile, ov::op::v0) +_OPENVINO_OP_REG(Transpose, ov::op::v1) +_OPENVINO_OP_REG(Unsqueeze, ov::op::v0) +_OPENVINO_OP_REG(VariadicSplit, ov::op::v1) + +// New operations added in opset2 +_OPENVINO_OP_REG(BatchToSpace, ov::op::v1) +_OPENVINO_OP_REG(SpaceToBatch, ov::op::v1) + +// New operations added in opset3 +_OPENVINO_OP_REG(EmbeddingBagPackedSum, ov::op::v3) +_OPENVINO_OP_REG(EmbeddingSegmentsSum, ov::op::v3) +_OPENVINO_OP_REG(EmbeddingBagOffsetsSum, ov::op::v3) +_OPENVINO_OP_REG(GRUCell, ov::op::v3) +_OPENVINO_OP_REG(NonZero, ov::op::v3) +_OPENVINO_OP_REG(RNNCell, ov::op::v0) +_OPENVINO_OP_REG(ScatterUpdate, ov::op::v3) +_OPENVINO_OP_REG(ShuffleChannels, ov::op::v0) _OPENVINO_OP_REG(ShapeOf, ov::op::v3) +// New operations added in opset4 +_OPENVINO_OP_REG(Acosh, ov::op::v3) +_OPENVINO_OP_REG(Asinh, ov::op::v3) +_OPENVINO_OP_REG(Atanh, ov::op::v3) +_OPENVINO_OP_REG(CTCLoss, ov::op::v4) +_OPENVINO_OP_REG(HSwish, ov::op::v4) +_OPENVINO_OP_REG(Mish, ov::op::v4) +_OPENVINO_OP_REG(ReduceL1, ov::op::v4) +_OPENVINO_OP_REG(ReduceL2, ov::op::v4) +_OPENVINO_OP_REG(SoftPlus, ov::op::v4) +_OPENVINO_OP_REG(Swish, ov::op::v4) + +// New operations added in opset5 +_OPENVINO_OP_REG(GRUSequence, ov::op::v5) +_OPENVINO_OP_REG(HSigmoid, ov::op::v5) +_OPENVINO_OP_REG(LogSoftmax, ov::op::v5) +_OPENVINO_OP_REG(Loop, ov::op::v5) +_OPENVINO_OP_REG(LSTMSequence, ov::op::v5) +_OPENVINO_OP_REG(RNNSequence, ov::op::v5) +_OPENVINO_OP_REG(Round, ov::op::v5) + +// New operations added in opset6 +_OPENVINO_OP_REG(CTCGreedyDecoderSeqLen, ov::op::v6) +_OPENVINO_OP_REG(ExperimentalDetectronDetectionOutput, ov::op::v6) +_OPENVINO_OP_REG(ExperimentalDetectronGenerateProposalsSingleImage, ov::op::v6) +_OPENVINO_OP_REG(ExperimentalDetectronPriorGridGenerator, ov::op::v6) +_OPENVINO_OP_REG(ExperimentalDetectronROIFeatureExtractor, ov::op::v6) +_OPENVINO_OP_REG(ExperimentalDetectronTopKROIs, ov::op::v6) +_OPENVINO_OP_REG(GatherElements, ov::op::v6) +_OPENVINO_OP_REG(MVN, ov::op::v6) +_OPENVINO_OP_REG(Assign, ov::op::v6) // new version +_OPENVINO_OP_REG(ReadValue, ov::op::v6) // new version + +// New operations added in opset7 +_OPENVINO_OP_REG(DFT, ov::op::v7) +_OPENVINO_OP_REG(Einsum, 
ov::op::v7) +_OPENVINO_OP_REG(Gelu, ov::op::v7) +_OPENVINO_OP_REG(IDFT, ov::op::v7) +_OPENVINO_OP_REG(Roll, ov::op::v7) + +// New operations added in opset8 +_OPENVINO_OP_REG(Gather, ov::op::v8) +_OPENVINO_OP_REG(GatherND, ov::op::v8) +_OPENVINO_OP_REG(AdaptiveAvgPool, ov::op::v8) +_OPENVINO_OP_REG(AdaptiveMaxPool, ov::op::v8) +_OPENVINO_OP_REG(DeformableConvolution, ov::op::v8) +_OPENVINO_OP_REG(DetectionOutput, ov::op::v8) +_OPENVINO_OP_REG(I420toBGR, ov::op::v8) +_OPENVINO_OP_REG(I420toRGB, ov::op::v8) +_OPENVINO_OP_REG(MatrixNms, ov::op::v8) +_OPENVINO_OP_REG(MaxPool, ov::op::v14) +_OPENVINO_OP_REG(NV12toBGR, ov::op::v8) +_OPENVINO_OP_REG(NV12toRGB, ov::op::v8) +_OPENVINO_OP_REG(RandomUniform, ov::op::v8) +_OPENVINO_OP_REG(Slice, ov::op::v8) +_OPENVINO_OP_REG(Softmax, ov::op::v8) +_OPENVINO_OP_REG(If, ov::op::v8) +_OPENVINO_OP_REG(PriorBox, ov::op::v8) + +// New operations added in opset9 +_OPENVINO_OP_REG(IRDFT, ov::op::v9) +_OPENVINO_OP_REG(RDFT, ov::op::v9) +_OPENVINO_OP_REG(Eye, ov::op::v9) +_OPENVINO_OP_REG(NonMaxSuppression, ov::op::v9) +_OPENVINO_OP_REG(ROIAlign, ov::op::v9) +_OPENVINO_OP_REG(SoftSign, ov::op::v9) +_OPENVINO_OP_REG(GenerateProposals, ov::op::v9) +_OPENVINO_OP_REG(MulticlassNms, ov::op::v9) + +// New operations added in opset10 +_OPENVINO_OP_REG(IsFinite, ov::op::v10) +_OPENVINO_OP_REG(IsInf, ov::op::v10) +_OPENVINO_OP_REG(IsNaN, ov::op::v10) +_OPENVINO_OP_REG(Unique, ov::op::v10) + +// New operations added in opset11 +_OPENVINO_OP_REG(Interpolate, ov::op::v11) +_OPENVINO_OP_REG(TopK, ov::op::v11) + +// New operations added in opset12 +_OPENVINO_OP_REG(GroupNormalization, ov::op::v12) +_OPENVINO_OP_REG(Pad, ov::op::v12) +_OPENVINO_OP_REG(ScatterElementsUpdate, ov::op::v12) + +// New operations added in opset13 +_OPENVINO_OP_REG(BitwiseAnd, ov::op::v13) +_OPENVINO_OP_REG(BitwiseNot, ov::op::v13) +_OPENVINO_OP_REG(BitwiseOr, ov::op::v13) +_OPENVINO_OP_REG(BitwiseXor, ov::op::v13) +_OPENVINO_OP_REG(NMSRotated, ov::op::v13) +_OPENVINO_OP_REG(Multinomial, ov::op::v13) +_OPENVINO_OP_REG(ScaledDotProductAttention, ov::op::v13) +_OPENVINO_OP_REG(FakeConvert, ov::op::v13) + +// New operations added in opset14 +_OPENVINO_OP_REG(ConvertPromoteTypes, ov::op::v14) +_OPENVINO_OP_REG(Inverse, ov::op::v14) + // New operations added in opset15 _OPENVINO_OP_REG(ROIAlignRotated, ov::op::v15) _OPENVINO_OP_REG(ScatterNDUpdate, ov::op::v15) diff --git a/src/core/tests/opset.cpp b/src/core/tests/opset.cpp index 2df8bade6a2f2c..3a2a590de43edf 100644 --- a/src/core/tests/opset.cpp +++ b/src/core/tests/opset.cpp @@ -75,7 +75,7 @@ INSTANTIATE_TEST_SUITE_P(opset, OpsetTestParams{ov::get_opset12, 178}, OpsetTestParams{ov::get_opset13, 186}, OpsetTestParams{ov::get_opset14, 188}, - OpsetTestParams{ov::get_opset15, 15}), + OpsetTestParams{ov::get_opset15, 199}), OpsetTestNameGenerator{}); class MyOpOld : public ov::op::Op { diff --git a/src/frontends/common/include/openvino/frontend/extension/op.hpp b/src/frontends/common/include/openvino/frontend/extension/op.hpp index 4198c411082e42..55fa919447406a 100644 --- a/src/frontends/common/include/openvino/frontend/extension/op.hpp +++ b/src/frontends/common/include/openvino/frontend/extension/op.hpp @@ -25,7 +25,7 @@ inline const ov::OpSet& get_opset_by_name(const std::string& opset_name) { if (opsets.find(opset_name) != opsets.end()) return opsets.at(opset_name)(); if (opset_name.empty() || opset_name == "latest") { - return ov::get_opset14(); // TODO (ticket 138273): Update at the end of the opset15 development + return ov::get_opset15(); } else 
{ FRONT_END_GENERAL_CHECK(false, "Unsupported opset name: ", opset_name); } From 2f3a93e4fb5090f2c1e7ab9165ae82de165ac365 Mon Sep 17 00:00:00 2001 From: Aleksandr Voron Date: Fri, 25 Oct 2024 13:34:19 +0200 Subject: [PATCH 033/233] [CPU][ACL] Refactor weights packaging in ACL FC executor (#27065) ### Details: - Fixed issue: empty `arm_compute::WeightsInfo` was passed into `has_opt_impl`, so preferred weight format was never selected. - Performance improved: ACL reorder, transpose and convert were replaced with oneDNN primitives. TODO: - [x] Run performance checks ### Tickets: - CVS-155912 --- .../executors/acl/acl_common_executor.cpp | 1 - .../executors/acl/acl_common_executor.hpp | 4 + .../executors/acl/acl_fullyconnected.cpp | 362 ++++++++++-------- .../executors/acl/acl_fullyconnected.hpp | 26 +- 4 files changed, 213 insertions(+), 180 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp index 5779147a5b3352..646cf47c1bcf6c 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp @@ -83,7 +83,6 @@ bool ACLCommonExecutor::update(const MemoryArgs &memory) { updateTensorsShapes(aclMemoryShapes); // Initialize arm_compute::TensorInfo objects - ACLInfos aclMemoryInfos; for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; i++) { aclMemoryInfos[i] = initTensorInfo(aclMemoryShapes[i], aclDataType[i], aclDataLayout[i]); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp index 854130d6f884bb..1a5a00c7a85f7a 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp @@ -43,12 +43,16 @@ class ACLCommonExecutor : public Executor { } void execute(const MemoryArgs& memory) override; bool update(const MemoryArgs& memory) override; + arm_compute::TensorInfo& getTensorInfo(ACLArgs index) { + return *aclMemoryInfos[index].get(); + } ~ACLCommonExecutor(); protected: ACLTensorAttrs aclTensorAttrs; private: ACLTensors aclMemoryTensors; + ACLInfos aclMemoryInfos; ACLFunction iFunction = nullptr; }; diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp index 6f464abf33d036..cc42691950a3ff 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp @@ -2,6 +2,10 @@ // SPDX-License-Identifier: Apache-2.0 // +#include +#include + +#include "ov_optional.hpp" #include "acl_fullyconnected.hpp" #include "acl_utils.hpp" #include "nodes/executors/executor.hpp" @@ -9,7 +13,10 @@ #include "utils/debug_capabilities.h" #include "nodes/executors/debug_messages.hpp" #include "nodes/executors/implementation_utils.hpp" +#include "nodes/convert.h" #include "nodes/common/cpu_convert.h" +#include "nodes/common/cpu_memcpy.h" +#include "nodes/common/reorder_prim.h" #include "memory_desc/cpu_memory_desc_utils.h" namespace ov { @@ -46,104 +53,133 @@ static VectorDims makeDummyOutputDims(const VectorDims& inShape, const VectorDim return outputShape; } -static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, - const ExecutorContext::CPtr context, - const FCAttrs &attrs, - const ACLFCAttrs& aclfcAttrs, - const PostOps &postOps) { - 
DEBUG_LOG("ACLFullyConnectedExecutor: prepack weights"); - const auto& wgtDims = memory.at(ARG_WEI)->getStaticDims(); - const auto N = wgtDims[0]; - const auto K = wgtDims[1]; +static DnnlMemoryDescPtr makeTransposedWeightDescriptor(const DnnlMemoryDescPtr srcDesc, + const DnnlMemoryDescPtr dstDesc) { + const auto& weiDesc = srcDesc->getDnnlDesc(); + const auto reorderedWeiDesc = dnnl::memory::desc{weiDesc.get_dims(), weiDesc.get_data_type(), dnnl::memory::format_tag::ba}; + const auto transposedWeiDesc = reorderedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims()); - auto create = [&]() { - MemoryPtr final_ptr = memory.at(ARG_WEI); - // Convert weights precision - if (aclfcAttrs.isConvertedWeights) { - MemoryArgs memoryArgs; - memoryArgs[ARG_SRC_0] = memory.at(ARG_WEI); - memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), - memoryArgs[ARG_SRC_0]->getDescPtr()->cloneWithNewPrecision( - aclfcAttrs.inputPrecision)); - auto aclWeightsConverter = std::make_shared(); - if (aclWeightsConverter->update(memoryArgs)) { - aclWeightsConverter->execute(memoryArgs); - } else { - auto count_wei_elem = std::accumulate(memoryArgs[ARG_SRC_0]->getStaticDims().begin(), - memoryArgs[ARG_SRC_0]->getStaticDims().end(), - 1, - std::multiplies<>()); - cpu_convert(memoryArgs[ARG_SRC_0]->getData(), - memoryArgs[ARG_DST]->getData(), - memoryArgs[ARG_SRC_0]->getPrecision(), - memoryArgs[ARG_DST]->getPrecision(), - count_wei_elem); - } - final_ptr = memoryArgs[ARG_DST]; + return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc); +} + +static ov::optional convertWeightPrecision(MemoryPtr input, MemoryPtr output, ov::element::Type weightPrecision) { + MemoryArgs memoryArgs; + memoryArgs[ARG_SRC] = input; + memoryArgs[ARG_DST] = output; + + auto aclWeightsConverter = std::make_shared(); + if (aclWeightsConverter->update(memoryArgs)) { + aclWeightsConverter->execute(memoryArgs); + return ov::optional(memoryArgs.at(ARG_DST)); + } + + if (!node::Convert::isSupportedDesc(input->getDesc()) || + !node::Convert::isSupportedDesc(output->getDesc())) { + return {}; + } + + auto data = static_cast(input->getData()); + std::vector tmpBuff; + tmpBuff.resize(output->getSize()); + cpu_convert(data, tmpBuff.data(), DnnlExtensionUtils::DataTypeToElementType(input->getDataType()), + weightPrecision, input->getSize() / input->getDesc().getPrecision().size()); + + return ov::optional(std::make_shared(output->getPrimitive().get_engine(), + output->getDesc().cloneWithNewPrecision(weightPrecision), + tmpBuff.data())); +} + +static ov::optional reorderDataFallback(MemoryPtr input, MemoryPtr output, ExecutorContext::CPtr context) { + if (output->getDataType() == input->getDataType()) { + return {}; + } + const auto inPrc = DnnlExtensionUtils::DataTypeToElementType(input->getDataType()); + auto convertedDstMemoryDesc = output->getDesc().cloneWithNewPrecision(inPrc); + dnnl::reorder reorderWithoutConvert = getReorderPrim(context->getRuntimeCache(), + output->getPrimitive().get_engine(), + input->getPrimitive().get_desc(), + MemoryDescUtils::convertToDnnlMemoryDesc(convertedDstMemoryDesc)->getDnnlDesc()); + + if (reorderWithoutConvert && parse_impl_name(reorderWithoutConvert.get_primitive_desc()->impl()->name()) != ref_any) { + auto convertOutput = convertWeightPrecision(input, output, inPrc); + if (!convertOutput) { + return {}; + } + input = *convertOutput; + + if (reorderWithoutConvert) { + dnnl::stream loc_stream(output->getPrimitive().get_engine(), dnnl::stream::flags::in_order); + reorderWithoutConvert.execute(loc_stream, 
{{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}}); + return ov::optional(output); } - // Packed weights - { - arm_compute::WeightFormat expectedWeightFormat; - bool isNeededReorder; - { - MemoryArgs memoryArgs; - memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS); - memoryArgs[ARG_WEI] = final_ptr; - if (memory.at(ARG_SRC_0)->getShape().isDynamic()) { - const auto& inShape = memory.at(ARG_SRC_0)->getShape(); - const auto& wShape = final_ptr->getShape(); - const auto& inDymmyDims = makeDummyInputDims(inShape, wShape); - const auto& outDymmyDims = makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank()); - memoryArgs[ARG_SRC_0] = std::make_shared(context->getEngine(), - memory.at(ARG_SRC_0)->getDescPtr()->cloneWithNewDims(inDymmyDims)); - memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), - memory.at(ARG_DST)->getDescPtr()->cloneWithNewDims(outDymmyDims)); - } else { - memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0); - memoryArgs[ARG_DST] = memory.at(ARG_DST); - } - auto aclWeightsRepack = std::make_shared(attrs, postOps, memoryArgs); - isNeededReorder = aclWeightsRepack->update(memoryArgs); - expectedWeightFormat = aclWeightsRepack->getOptImplWeightFormat(); - } - if (isNeededReorder) { - MemoryArgs memoryArgs; - memoryArgs[ARG_SRC_0] = final_ptr; - memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), - memoryArgs[ARG_SRC_0]->getDescPtr()->clone()); - auto aclWeightsReorder = std::make_shared( - arm_compute::WeightFormat::OHWI, expectedWeightFormat); - if (aclWeightsReorder->update(memoryArgs)) { - aclWeightsReorder->execute(memoryArgs); - final_ptr = memoryArgs[ARG_DST]; - } - } + } + return {}; +} + +static MemoryPtr reorderData(DnnlMemoryDescPtr srcWeightDesc, + DnnlMemoryDescPtr dstWeightDesc, + MemoryCPtr weightsMem, + ExecutorContext::CPtr context) { + MemoryPtr input = std::make_shared(context->getEngine(), srcWeightDesc, weightsMem->getData()); + MemoryPtr output = std::make_shared(context->getEngine(), dstWeightDesc); + if (!input->getDesc().isDefined() || !output->getDesc().isDefined()) + OPENVINO_THROW("Can't reorder data with dynamic shapes"); + + if (input->getShape().hasZeroDims() || output->getShape().hasZeroDims()) { + return output; + } + + if (input->getDesc().isCompatible(output->getDesc())) { + auto srcPtr = static_cast(input->getData()); + auto dstPtr = static_cast(output->getData()); + auto copySize = output->getSize(); + cpu_memcpy(dstPtr, srcPtr, copySize); + return output; + } + + // try directly reorder + auto engine = output->getPrimitive().get_engine(); + dnnl::reorder directReorder = getReorderPrim(context->getRuntimeCache(), + engine, + input->getPrimitive().get_desc(), + output->getPrimitive().get_desc()); + + if (!directReorder || parse_impl_name(directReorder.get_primitive_desc()->impl()->name()) == ref_any) { + // try precision conversion then do the reorder + auto fallbackOutput = reorderDataFallback(input, output, context); + if (fallbackOutput) { + return *fallbackOutput; } - // Transpose weights - if (!aclfcAttrs.weightsNonTransposed) { - auto reverse_weights_dims = memory.at(ARG_WEI)->getStaticDims(); - if (reverse_weights_dims.size() == 3) { - reverse_weights_dims = VectorDims( - {reverse_weights_dims[0] * reverse_weights_dims[1], reverse_weights_dims[2]}); - } - std::reverse(reverse_weights_dims.begin(), reverse_weights_dims.end()); - MemoryArgs memoryArgs; - memoryArgs[ARG_SRC_0] = final_ptr; - memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), - 
CpuBlockedMemoryDesc(final_ptr->getPrecision(), - intel_cpu::Shape(reverse_weights_dims))); - auto aclWeightsTranspose = std::make_shared(); - if (aclWeightsTranspose->update(memoryArgs)) { - aclWeightsTranspose->execute(memoryArgs); - final_ptr = memoryArgs[ARG_DST]; - } + } + // if precision conversion does not work then do direct reference reorder + if (directReorder) { + dnnl::stream loc_stream(engine, dnnl::stream::flags::in_order); + directReorder.execute(loc_stream, {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}}); + } else { + OPENVINO_THROW("Could not make onednn reorder."); + } + return output; +} + +static MemoryPtr reorderWeights(const MemoryArgs &memory, + const ExecutorContext::CPtr context, + ACLFCAttrs& aclfcAttrs, + DnnlMemoryDescPtr dnnlSrcDesc, + DnnlMemoryDescPtr dnnlDstDesc) { + auto create = [&]() { + MemoryPtr weightsMemory = memory.at(ARG_WEI); + if (aclfcAttrs.isWeightsRepacked || aclfcAttrs.isConvertedWeights) { + weightsMemory = reorderData(dnnlSrcDesc, dnnlDstDesc, memory.at(ARG_WEI), context); + DEBUG_LOG("ACLFullyConnectedExecutor: cache miss, perform packing"); } - DEBUG_LOG("ACLFullyConnectedExecutor: cache miss, perform packing"); - return final_ptr; + return weightsMemory; }; auto weightCache = context->getWeightsCache(); if (weightCache != nullptr) { + const auto& wgtDims = memory.at(ARG_WEI)->getStaticDims(); + const auto N = wgtDims[0]; + const auto K = wgtDims[1]; std::string format = "fc_acl_" + std::to_string(N) + "_" + std::to_string(K); const std::string string_hash = format + "_" + std::to_string(memory.at(ARG_WEI)->getSize()) + "_" + std::to_string(reinterpret_cast(memory.at(ARG_WEI)->getData())); @@ -155,6 +191,63 @@ static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, return create(); } +static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, + const ExecutorContext::CPtr context, + const FCAttrs &attrs, + ACLFCAttrs& aclfcAttrs, + const PostOps &postOps, + arm_compute::WeightFormat& expectedWeightFormat, + arm_compute::TensorInfo& weiTensorInfo) { + MemoryArgs memoryArgs; + memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS); + memoryArgs[ARG_WEI] = memory.at(ARG_WEI); + if (memory.at(ARG_SRC_0)->getShape().isDynamic()) { + const auto& inShape = memory.at(ARG_SRC_0)->getShape(); + const auto& wShape = memory.at(ARG_WEI)->getShape(); + const auto& inDymmyDims = makeDummyInputDims(inShape, wShape); + const auto& outDymmyDims = makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank()); + memoryArgs[ARG_SRC_0] = std::make_shared(context->getEngine(), + memory.at(ARG_SRC_0)->getDescPtr()->cloneWithNewDims(inDymmyDims)); + memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), + memory.at(ARG_DST)->getDescPtr()->cloneWithNewDims(outDymmyDims)); + } else { + memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0); + memoryArgs[ARG_DST] = memory.at(ARG_DST); + } + // TODO: ACLWeightFormatGenerator should be replaced with Reorder executor + // that calls ACL NEReorder + NETranspose or dnnl::reorder depending on backend availability + auto aclWeightsRepack = std::make_shared(attrs, postOps, memoryArgs); + bool isNeededReorder = aclWeightsRepack->update(memoryArgs); + expectedWeightFormat = isNeededReorder ? 
aclWeightsRepack->getOptImplWeightFormat() : arm_compute::WeightFormat::UNSPECIFIED; + weiTensorInfo = aclWeightsRepack->getTensorInfo(ACLArgs::ACL_WEI); + + MemoryPtr dstMemPtr = std::make_shared(context->getEngine(), + memory.at(ARG_WEI)->getDescPtr()->cloneWithNewPrecision(aclfcAttrs.inputPrecision)); + auto dstDesc = dstMemPtr->getDescPtr(); + auto dnnlDstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(dstDesc); + auto weiDesc = memory.at(ARG_WEI)->getDescPtr(); + auto dnnlSrcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(weiDesc); + + if (isNeededReorder) { + dnnl::impl::dim_t o_dim = 0; + dnnl::impl::dim_t inner_dim = 1; + std::vector remaining_dims = {}; + auto weights_md_ = dnnlDstDesc->getDnnlDesc().get(); + dnnl::impl::cpu::acl::acl_utils::reorder_to_weight_format(weiTensorInfo, *weights_md_, expectedWeightFormat, + inner_dim, o_dim, remaining_dims, {}); + if (aclfcAttrs.weightsNonTransposed) { + dnnlSrcDesc = makeTransposedWeightDescriptor(dnnlSrcDesc, dnnlDstDesc); + } + aclfcAttrs.isWeightsRepacked = true; + return reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc); + } + if (!aclfcAttrs.weightsNonTransposed) { + dnnlDstDesc = makeTransposedWeightDescriptor(dnnlDstDesc, dnnlSrcDesc); + aclfcAttrs.isWeightsRepacked = true; + } + return reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc); +} + static bool checkPostOps(const PostOps &postOps) { if (postOps.empty()) { return true; @@ -199,7 +292,7 @@ ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, const MemoryArgs &memory, const ExecutorContext::CPtr context) { initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps); - packedWeights = prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps); + packedWeights = prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps, expectedWeightFormat, weiTensorInfo); } bool ACLFullyConnectedExecutor::supports(const FCConfig &config) { @@ -212,25 +305,15 @@ bool ACLFullyConnectedExecutor::supports(const FCConfig &config) { return true; } -static void updateFCTensorsShapes(ACLShapes& aclMemoryShapes) { - if (aclMemoryShapes[ACLArgs::ACL_WEI].num_dimensions() == 3U) { - aclMemoryShapes[ACLArgs::ACL_WEI] = arm_compute::TensorShape( - {aclMemoryShapes[ACLArgs::ACL_WEI][0] * aclMemoryShapes[ACLArgs::ACL_WEI][1], - aclMemoryShapes[ACLArgs::ACL_WEI][2]}); - } - - if (one_of(aclMemoryShapes[ACLArgs::ACL_SRC_0].num_dimensions(), 3U, 4U)) { - aclMemoryShapes[ACLArgs::ACL_SRC_0] = arm_compute::TensorShape({ - aclMemoryShapes[ACLArgs::ACL_WEI][0], - aclMemoryShapes[ACLArgs::ACL_SRC_0].total_size() / aclMemoryShapes[ACLArgs::ACL_WEI][0]}); - } - - if (one_of(aclMemoryShapes[ACLArgs::ACL_DST].num_dimensions(), 3U, 4U)) { - aclMemoryShapes[ACLArgs::ACL_DST] = arm_compute::TensorShape({ - aclMemoryShapes[ACLArgs::ACL_WEI][1], - aclMemoryShapes[ACLArgs::ACL_SRC_0][1]}); - } +static arm_compute::TensorShape normalizeDimsTo2D(const arm_compute::TensorShape shape) { + size_t norm_dim = std::accumulate(shape.begin() + 1, shape.end(), 1, std::multiplies()); + return arm_compute::TensorShape(shape[0], norm_dim); +} +static void updateFCTensorsShapes(ACLShapes& aclMemoryShapes) { + aclMemoryShapes[ACLArgs::ACL_WEI] = normalizeDimsTo2D(aclMemoryShapes[ACLArgs::ACL_WEI]); + aclMemoryShapes[ACLArgs::ACL_SRC_0] = normalizeDimsTo2D(aclMemoryShapes[ACLArgs::ACL_SRC_0]); + aclMemoryShapes[ACLArgs::ACL_DST] = normalizeDimsTo2D(aclMemoryShapes[ACLArgs::ACL_DST]); std::swap(aclMemoryShapes[ACLArgs::ACL_WEI][0], 
aclMemoryShapes[ACLArgs::ACL_WEI][1]); } @@ -242,26 +325,33 @@ arm_compute::Status ACLFullyConnectedExecutor::validateTensorsInfo(const ACLInfo if (aclfcAttrs.isConvertedWeights) { aclMemoryInfos[ACLArgs::ACL_WEI]->set_data_type(aclMemoryInfos[ACLArgs::ACL_SRC_0]->data_type()); } + int ic_total = aclMemoryInfos[ACLArgs::ACL_SRC_0]->dimension(0); return arm_compute::NEFullyConnectedLayer::validate( aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_WEI].get(), + &weiTensorInfo, aclMemoryInfos[ACLArgs::ACL_BIAS].get(), aclMemoryInfos[ACLArgs::ACL_DST].get(), fullyConnectedLayerInfo, - weightsInfo); + expectedWeightFormat == arm_compute::WeightFormat::UNSPECIFIED ? + arm_compute::WeightsInfo() : + arm_compute::WeightsInfo(false, 1, 1, ic_total, false, expectedWeightFormat)); } ACLFunction ACLFullyConnectedExecutor::configureFunction(const ACLTensors & aclMemoryTensors) { auto neFC = std::make_unique(); + aclMemoryTensors[ACLArgs::ACL_WEI]->allocator()->init(weiTensorInfo); + int icTotal = aclMemoryTensors[ACLArgs::ACL_WEI]->info()->dimension(0); neFC->configure( aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), aclMemoryTensors[ACLArgs::ACL_WEI].get(), aclMemoryTensors[ACLArgs::ACL_BIAS].get(), aclMemoryTensors[ACLArgs::ACL_DST].get(), fullyConnectedLayerInfo, - weightsInfo); - - if (aclfcAttrs.isConvertedWeights || !aclfcAttrs.weightsNonTransposed) { + expectedWeightFormat == arm_compute::WeightFormat::UNSPECIFIED ? + arm_compute::WeightsInfo() : + arm_compute::WeightsInfo(false, 1, 1, icTotal, false, expectedWeightFormat)); + // TODO: get rid of those flags and decide whether to import memory or not just based on input type + if (aclfcAttrs.isWeightsRepacked || aclfcAttrs.isConvertedWeights) { aclTensorAttrs.memoryUsageIndicator[ACLArgs::ACL_WEI] = false; aclMemoryTensors[ACLArgs::ACL_WEI]->allocator()->import_memory(packedWeights->getData()); } @@ -282,19 +372,6 @@ ACLFunction acl_fc_executor::ACLWeightsConverter::configureFunction(const ACLTen return neCast; } - -arm_compute::Status acl_fc_executor::ACLWeightsTranspose::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { - return arm_compute::NETranspose::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get()); -} - -ACLFunction acl_fc_executor::ACLWeightsTranspose::configureFunction(const ACLTensors &aclMemoryTensors) { - auto neTranspose = std::make_unique(); - neTranspose->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get()); - return neTranspose; -} - acl_fc_executor::ACLWeightFormatGenerator::ACLWeightFormatGenerator(const FCAttrs &attrs, const PostOps &postOps, const MemoryArgs &memory) { @@ -309,6 +386,7 @@ arm_compute::Status acl_fc_executor::ACLWeightFormatGenerator::validateTensorsIn if (aclfcAttrs.isConvertedWeights) { aclMemoryInfos[ACLArgs::ACL_WEI]->set_data_type(aclMemoryInfos[ACLArgs::ACL_SRC_0]->data_type()); } + int icTotal = aclMemoryInfos[ACLArgs::ACL_SRC_0]->dimension(0); return arm_compute::NEFullyConnectedLayer::has_opt_impl( expectedWeightFormat, aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), @@ -316,40 +394,12 @@ arm_compute::Status acl_fc_executor::ACLWeightFormatGenerator::validateTensorsIn aclMemoryInfos[ACLArgs::ACL_BIAS].get(), aclMemoryInfos[ACLArgs::ACL_DST].get(), fullyConnectedLayerInfo, - weightsInfo); + arm_compute::WeightsInfo(false, 1, 1, icTotal, false, arm_compute::WeightFormat::ANY)); } ACLFunction acl_fc_executor::ACLWeightFormatGenerator::configureFunction(const ACLTensors &aclMemoryTensors) { return 
std::make_unique(); } -arm_compute::Status acl_fc_executor::ACLWeightsReorder::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { -#if defined(OPENVINO_ARCH_ARM64) - return arm_compute::NEReorderLayer::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get(), - inWeightFormat, - outWeightFormat); -#else - return arm_compute::NECopy::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get()); -#endif -} - -ACLFunction acl_fc_executor::ACLWeightsReorder::configureFunction(const ACLTensors &aclMemoryTensors) { -#if defined(OPENVINO_ARCH_ARM64) - auto neReorderLayer = std::make_unique(); - neReorderLayer->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get(), - inWeightFormat, - outWeightFormat); - return neReorderLayer; -#else - auto neCopy = std::make_unique(); - neCopy->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get()); - return neCopy; -#endif -} - } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp index 4d7f2e5ef91480..fcbcb1475efa15 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp @@ -13,6 +13,7 @@ namespace intel_cpu { struct ACLFCAttrs { ov::element::Type inputPrecision; bool isConvertedWeights = false; + bool isWeightsRepacked = false; bool weightsNonTransposed; }; @@ -26,14 +27,6 @@ class ACLWeightsConverter : public ACLCommonExecutor { ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; }; -class ACLWeightsTranspose : public ACLCommonExecutor { -public: - ACLWeightsTranspose() = default; - void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; -}; - class ACLWeightFormatGenerator : public ACLCommonExecutor { public: ACLWeightFormatGenerator(const FCAttrs& attrs, @@ -47,24 +40,10 @@ class ACLWeightFormatGenerator : public ACLCommonExecutor { } private: arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo; - arm_compute::WeightsInfo weightsInfo; ACLFCAttrs aclfcAttrs; arm_compute::WeightFormat expectedWeightFormat; }; -class ACLWeightsReorder : public ACLCommonExecutor { -public: - ACLWeightsReorder(arm_compute::WeightFormat inWeightFormat, - arm_compute::WeightFormat outWeightFormat) - : inWeightFormat(inWeightFormat), outWeightFormat(outWeightFormat) {} - void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; -private: - arm_compute::WeightFormat inWeightFormat; - arm_compute::WeightFormat outWeightFormat; -}; - } // namespace acl_fc_executor class ACLFullyConnectedExecutor : public ACLCommonExecutor { @@ -84,9 +63,10 @@ class ACLFullyConnectedExecutor : public ACLCommonExecutor { private: arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo; - arm_compute::WeightsInfo weightsInfo; + arm_compute::WeightFormat expectedWeightFormat; MemoryCPtr packedWeights; ACLFCAttrs aclfcAttrs; + arm_compute::TensorInfo weiTensorInfo; }; using ACLFullyConnectedExecutorPtr = std::shared_ptr; From 
884ac4a129ac1d9c2b34e411cb7d53f97b29c879 Mon Sep 17 00:00:00 2001 From: Pawel Raasz Date: Fri, 25 Oct 2024 13:47:55 +0200 Subject: [PATCH 034/233] [Transformations] SharedOpOptimization and SymbolicOptimization update (#27243) ### Details: - SharedOpOptimization: optimization keeps the same topological order. It allows for seamless optimization in one run over the model - OptimizeSymbolsUsedAsValues: partial values containing symbols and constant values are now participating in the value optimization too - Continuation of PR 27153 to fix tests ### Tickets: - N/A ### Related PR: - #27153 --------- Co-authored-by: Evgeniia Nugmanova --- .../shared_ops_optimization.cpp | 14 +- .../symbol_optimization.cpp | 75 +++++++++ .../shared_ops_optimization.cpp | 55 +++++++ .../symbol_optimization.cpp | 153 ++++++++++++++++++ .../cpu_opset/common/pass/ngram_fusion.cpp | 21 ++- 5 files changed, 312 insertions(+), 6 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/shared_ops_optimization.cpp b/src/common/transformations/src/transformations/common_optimizations/shared_ops_optimization.cpp index 226093143e68d8..188d0b07684098 100644 --- a/src/common/transformations/src/transformations/common_optimizations/shared_ops_optimization.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/shared_ops_optimization.cpp @@ -103,8 +103,11 @@ bool nodes_are_equal(const std::shared_ptr& lhs, const std::shared_ptr& model) { bool rewritten = false; - - for (const auto& op : model->get_ordered_ops()) { + std::unordered_map, size_t> index_map; + const auto& order = model->get_ordered_ops(); + for (size_t i = 0; i < order.size(); ++i) + index_map[order[i]] = i; + for (const auto& op : order) { // Recursively apply transformation for sub-graph based operations if (auto multi_subgraph_op = dynamic_pointer_cast(op)) { for (const auto& sub_graph : multi_subgraph_op->get_functions()) { @@ -124,6 +127,13 @@ bool shared_node_optimization(const shared_ptr& model) { auto& shared_nodes = item.second; if (shared_nodes.size() < 2) continue; + // sort shared_nodes so that root would be the earliest in the topological order + // it is critical for continuous application of this optimization + std::sort(shared_nodes.begin(), + shared_nodes.end(), + [&index_map](const std::shared_ptr& a, const std::shared_ptr& b) { + return index_map[a] < index_map[b]; + }); std::vector visited_nodes(shared_nodes.size(), false); for (size_t i = 0; i < visited_nodes.size(); ++i) { diff --git a/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp b/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp index 27790904f4360b..55f0794e0ee008 100644 --- a/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp +++ b/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp @@ -7,6 +7,7 @@ #include "itt.hpp" #include "openvino/core/bound_evaluation_util.hpp" #include "openvino/core/rt_info.hpp" +#include "openvino/core/tensor_util.hpp" #include "openvino/core/validation_util.hpp" #include "openvino/op/add.hpp" #include "openvino/op/concat.hpp" @@ -354,12 +355,85 @@ void save_shape_sources(const std::shared_ptr& op, STS_map& symbol_sha } } } + +struct OutputValue { + std::vector value; + + bool operator==(const OutputValue& other) const { + return value == other.value; + } + + bool operator<(const OutputValue& other) const { + return 
std::lexicographical_compare( + std::begin(value), + std::end(value), + std::begin(other.value), + std::end(other.value), + [](const ov::Any& a, const ov::Any& b) { + // each element is either a symbol or an integer. in case they differ any integer is less than a symbol. + if (a.is>() && b.is>()) + return a.as>() < b.as>(); + if (a.is() && b.is()) + return a.as() < b.as(); + return a.is(); + }); + } + + static ov::optional make(const ov::Output& output) { + auto symbols = output.get_tensor().get_value_symbol(); + if (symbols.empty() || symbols.size() == 1) + return {}; + + const auto& lower_value = ov::util::to_vector(output.get_tensor().get_lower_value()); + const auto& upper_value = ov::util::to_vector(output.get_tensor().get_upper_value()); + const auto& et = output.get_element_type(); + bool use_values = lower_value && upper_value && (et == ov::element::i64 || et == ov::element::i32); + + std::vector symbols_as_any(symbols.size(), nullptr); + for (size_t i = 0; i < symbols_as_any.size(); ++i) { + if (use_values && lower_value->at(i) == upper_value->at(i)) + symbols_as_any[i] = lower_value->at(i); + else if (symbols.at(i) != nullptr) + symbols_as_any[i] = ov::symbol::ancestor_of(symbols.at(i)); + else + return {}; + } + return {OutputValue{std::move(symbols_as_any)}}; + } +}; + +void save_and_update_value_sources(const std::shared_ptr& op, + std::map>& multi_symbol_source) { + for (auto& output : op->outputs()) { + if (output.get_tensor().get_value_symbol().size() < 2) + continue; // singular values are handled by optimize_value_usage helper + + if (auto result = OutputValue::make(output)) { + if (multi_symbol_source.count(*result)) { + auto alternative_source = multi_symbol_source[*result]; + if (output.get_element_type() != alternative_source.get_element_type()) { + auto convert = std::make_shared(alternative_source, output.get_element_type()); + ov::copy_runtime_info(output.get_node_shared_ptr(), convert); + alternative_source = convert->output(0); + } + if (output.get_partial_shape().is_dynamic() || + output.get_partial_shape() != alternative_source.get_partial_shape()) + continue; + output.replace(alternative_source); + } else { + multi_symbol_source[*result] = output; + } + } + } +} + } // namespace bool ov::pass::OptimizeSymbolsUsedAsValues::run_on_model(const std::shared_ptr& m) { RUN_ON_FUNCTION_SCOPE(OptimizeSymbolsUsedAsValues); STS_map symbol_shape_source; STS_map symbol_value_source; + std::map> multi_symbol_source; for (const auto& op : topological_order(m)) { // Result has output port which has shared (during validate_and_infer_type) tensor with input port. // Transformations may replace input of Result. 
After replacement and before Result::validate_and_infer_type -- @@ -375,6 +449,7 @@ bool ov::pass::OptimizeSymbolsUsedAsValues::run_on_model(const std::shared_ptroutputs()) optimize_value_usage(output, symbol_shape_source, symbol_value_source); save_shape_sources(op, symbol_shape_source); + save_and_update_value_sources(op, multi_symbol_source); } return true; } diff --git a/src/common/transformations/tests/common_optimizations/shared_ops_optimization.cpp b/src/common/transformations/tests/common_optimizations/shared_ops_optimization.cpp index 3f717726b757bc..dcba44313c8c7e 100644 --- a/src/common/transformations/tests/common_optimizations/shared_ops_optimization.cpp +++ b/src/common/transformations/tests/common_optimizations/shared_ops_optimization.cpp @@ -6,6 +6,7 @@ #include #include "common_test_utils/ov_test_utils.hpp" +#include "openvino/op/add.hpp" #include "openvino/op/broadcast.hpp" #include "openvino/op/ceiling.hpp" #include "openvino/op/concat.hpp" @@ -716,3 +717,57 @@ TEST_F(SharedTransformationTestsF, SharedMaxPool) { model_ref = std::make_shared(OutputVector{concat}, ParameterVector{data}); } } + +TEST_F(SharedTransformationTestsF, TopologicalOrder) { + { + auto data = std::make_shared(element::f32, PartialShape{-1, -1, -1, -1}); + + auto shape_of = std::make_shared(data); + + auto gather_0 = std::make_shared(shape_of, + v0::Constant::create(element::i32, {1}, {0}), + v0::Constant::create(element::i32, {}, {0})); + + auto gather_1 = std::make_shared(shape_of, + v0::Constant::create(element::i32, {1}, {0}), + v0::Constant::create(element::i32, {}, {0})); + + auto gather_2 = std::make_shared(shape_of, + v0::Constant::create(element::i32, {1}, {0}), + v0::Constant::create(element::i32, {}, {0})); + + auto add_0 = std::make_shared(gather_0, gather_0); + auto add_1 = std::make_shared(gather_1, gather_1); + auto add_2 = std::make_shared(gather_2, gather_2); + + auto concat_0 = + std::make_shared(OutputVector{gather_0, add_0, v0::Constant::create(element::i64, {1}, {0})}, + 0); + auto concat_1 = + std::make_shared(OutputVector{gather_1, add_1, v0::Constant::create(element::i64, {1}, {0})}, + 0); + auto concat_2 = + std::make_shared(OutputVector{gather_2, add_2, v0::Constant::create(element::i64, {1}, {0})}, + 0); + + auto concat = std::make_shared(OutputVector{concat_0, concat_1}, 0); + auto output = std::make_shared(OutputVector{concat, concat_2}, 0); + + model = std::make_shared(OutputVector{output}, ParameterVector{data}); + manager.register_pass(); + } + { + auto data = std::make_shared(element::f32, PartialShape{-1, -1, -1, -1}); + auto shape_of = std::make_shared(data); + auto gather_0 = std::make_shared(shape_of, + v0::Constant::create(element::i32, {1}, {0}), + v0::Constant::create(element::i32, {}, {0})); + auto add_0 = std::make_shared(gather_0, gather_0); + auto concat_0 = + std::make_shared(OutputVector{gather_0, add_0, v0::Constant::create(element::i64, {1}, {0})}, + 0); + auto concat = std::make_shared(OutputVector{concat_0, concat_0}, 0); + auto output = std::make_shared(OutputVector{concat, concat_0}, 0); + model_ref = std::make_shared(OutputVector{output}, ParameterVector{data}); + } +} diff --git a/src/common/transformations/tests/symbolic_transformations/symbol_optimization.cpp b/src/common/transformations/tests/symbolic_transformations/symbol_optimization.cpp index 590290fb19d84b..e4653ec084bafb 100644 --- a/src/common/transformations/tests/symbolic_transformations/symbol_optimization.cpp +++ 
b/src/common/transformations/tests/symbolic_transformations/symbol_optimization.cpp @@ -9,11 +9,15 @@ #include "common_test_utils/ov_test_utils.hpp" #include "openvino/op/add.hpp" #include "openvino/op/concat.hpp" +#include "openvino/op/convert.hpp" #include "openvino/op/gather.hpp" +#include "openvino/op/range.hpp" #include "openvino/op/reshape.hpp" #include "openvino/op/shape_of.hpp" +#include "openvino/op/squeeze.hpp" #include "openvino/pass/manager.hpp" #include "openvino/pass/visualize_tree.hpp" +#include "transformations/common_optimizations/shared_ops_optimization.hpp" #include "transformations/symbolic_transformations/symbolic_optimizations.hpp" #include "transformations/symbolic_transformations/utils.hpp" @@ -95,3 +99,152 @@ TEST_F(TransformationTestsF, ApplySymbolEquivalence_Concat_Values) { model_ref = make_shared(NodeVector{reshape}, ParameterVector{input_2, input_1}); } } + +Output get_dim_by_idx(const Output& source, const int64_t& idx, element::Type type = element::i64) { + auto shape = make_shared(source, type); + auto gather = make_shared(shape, + v0::Constant::create(element::i64, {}, {idx}), + v0::Constant::create(element::i64, {}, {0})); + return gather->output(0); +} + +Output get_dim_by_idx(const Output& source, + initializer_list idx, + element::Type type = element::i64) { + auto shape = make_shared(source, type); + auto gather = make_shared(shape, + v0::Constant::create(element::i64, {idx.size()}, idx), + v0::Constant::create(element::i64, {}, {0})); + return gather->output(0); +} + +TEST_F(TransformationTestsF, ValueOptimizationSingleValue) { + { + auto input = make_shared(element::f32, PartialShape::dynamic(4)); + + auto dim_0 = get_dim_by_idx(input, {-1}, element::i64); + auto dim_1 = get_dim_by_idx(input, {3}, element::i32); + auto dim_2 = get_dim_by_idx(input, -1, element::i32); + + auto reshape_0 = make_shared( + input, + make_shared(OutputVector{v0::Constant::create(element::i64, {1}, {-1}), dim_0}, 0), + false); + auto reshape_1 = make_shared( + input, + make_shared(OutputVector{v0::Constant::create(element::i32, {1}, {0}), dim_1}, 0), + false); + auto range = make_shared(v0::Constant::create(element::i32, {}, {0}), + dim_2, + v0::Constant::create(element::i32, {}, {1}), + element::i32); + + model = make_shared(NodeVector{reshape_0, reshape_1, range}, ParameterVector{input}); + + manager.set_per_pass_validation(false); + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + } + { + auto input = make_shared(element::f32, PartialShape::dynamic(4)); + auto dim_1 = get_dim_by_idx(input, {3}, element::i32); + auto dim_0 = std::make_shared(dim_1, element::i64); + auto dim_2 = std::make_shared(dim_1); + auto reshape_0 = make_shared( + input, + make_shared(OutputVector{v0::Constant::create(element::i64, {1}, {-1}), dim_0}, 0), + false); + auto reshape_1 = make_shared( + input, + make_shared(OutputVector{v0::Constant::create(element::i32, {1}, {0}), dim_1}, 0), + false); + auto range = make_shared(v0::Constant::create(element::i32, {}, {0}), + dim_2, + v0::Constant::create(element::i32, {}, {1}), + element::i32); + + model_ref = make_shared(NodeVector{reshape_0, reshape_1, range}, ParameterVector{input}); + } + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); +} + +TEST_F(TransformationTestsF, ValueOptimizationDoubleValue) { + { + auto input = make_shared(element::f32, PartialShape::dynamic(4)); + + auto dim_0 = get_dim_by_idx(input, {-1, -2}, element::i64); + auto dim_1 = get_dim_by_idx(input, {3, 2}, element::i32); + + 
auto reshape_0 = make_shared( + input, + make_shared(OutputVector{v0::Constant::create(element::i64, {1}, {-1}), dim_0}, 0), + false); + auto reshape_1 = make_shared( + input, + make_shared(OutputVector{v0::Constant::create(element::i32, {1}, {0}), dim_1}, 0), + false); + + model = make_shared(NodeVector{reshape_0, reshape_1}, ParameterVector{input}); + + manager.set_per_pass_validation(false); + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + } + { + auto input = make_shared(element::f32, PartialShape::dynamic(4)); + auto dim_0 = get_dim_by_idx(input, {3, 2}, element::i32); + auto dim_1 = std::make_shared(dim_0, element::i64); + + auto reshape_0 = make_shared( + input, + make_shared(OutputVector{v0::Constant::create(element::i64, {1}, {-1}), dim_1}, 0), + false); + auto reshape_1 = make_shared( + input, + make_shared(OutputVector{v0::Constant::create(element::i32, {1}, {0}), dim_0}, 0), + false); + + model_ref = make_shared(NodeVector{reshape_0, reshape_1}, ParameterVector{input}); + } + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); +} + +TEST_F(TransformationTestsF, ValueOptimizationSymbolAndValue) { + { + auto input = make_shared(element::f32, PartialShape({-1, -1, 4, -1})); + + auto dim_0 = get_dim_by_idx(input, {-1, -2}, element::i64); + auto dim_1 = get_dim_by_idx(input, {3, 2}, element::i32); + + auto reshape_0 = make_shared( + input, + make_shared(OutputVector{v0::Constant::create(element::i64, {1}, {-1}), dim_0}, 0), + false); + auto reshape_1 = make_shared( + input, + make_shared(OutputVector{v0::Constant::create(element::i32, {1}, {-1}), dim_1}, 0), + false); + + model = make_shared(NodeVector{reshape_0, reshape_1}, ParameterVector{input}); + + manager.set_per_pass_validation(false); + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + } + { + auto input = make_shared(element::f32, PartialShape({-1, -1, 4, -1})); + auto dim_0 = make_shared( + OutputVector{v0::Constant::create(element::i32, {1}, {-1}), get_dim_by_idx(input, {3, 2}, element::i32)}, + 0); + auto dim_1 = std::make_shared(dim_0, element::i64); + + auto reshape_0 = make_shared(input, dim_1, false); + auto reshape_1 = make_shared(input, dim_0, false); + + model_ref = make_shared(NodeVector{reshape_0, reshape_1}, ParameterVector{input}); + } + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); +} diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/ngram_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/ngram_fusion.cpp index 3fe96815559d9d..ed1e82c8c29e49 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/ngram_fusion.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/ngram_fusion.cpp @@ -124,19 +124,32 @@ ov::intel_cpu::NgramFusion::NgramFusion() { : ov::OutputVector{wrap_type(), idces_m, wrap_type()}; auto idces_concat_m = wrap_type(idces_concat_inputs); + // Concat can be replaced by symbolic optimizations as it can find alternative source for this op. 
+ const auto concat_shape_with_cropped_symbol_m = + [&cropped_shape_symbol](const ov::Output& output) -> bool { + const auto& symbols = output.get_tensor().get_value_symbol(); + return ov::pass::pattern::rank_equals(1)(output) && !symbols.empty() && + symbols[0] == cropped_shape_symbol; + }; + // left equal branch auto crop_left_bias_m = wrap_type(); auto crop_left_cropped_shape_m = std::make_shared( ov::OutputVector{wrap_type({cropped_shape_m, crop_left_bias_m}), cropped_shape_m}); - auto idxes_crop_left_concat_m = wrap_type({crop_left_cropped_shape_m, wrap_type()}); + auto idxes_crop_left_concat_m = std::make_shared( + ov::OutputVector{wrap_type({crop_left_cropped_shape_m, wrap_type()}), + any_input(concat_shape_with_cropped_symbol_m)}); auto idxes_crop_left_m = wrap_type({idces_concat_m, wrap_type(), idxes_crop_left_concat_m, wrap_type()}); // right equal branch auto crop_right_bias_m = wrap_type(); auto crop_right_cropped_shape_m = std::make_shared( ov::OutputVector{wrap_type({cropped_shape_m, crop_right_bias_m}), cropped_shape_m}); - auto idxes_crop_right_concat_m = wrap_type({crop_right_cropped_shape_m, wrap_type()}); - auto idxes_crop_right_m = wrap_type({idces_concat_m, wrap_type(), idxes_crop_right_concat_m, wrap_type()}); + auto idxes_crop_right_concat_m = std::make_shared( + ov::OutputVector{wrap_type({crop_right_cropped_shape_m, wrap_type()}), + any_input(concat_shape_with_cropped_symbol_m)}); + auto idxes_crop_right_m = wrap_type( + {idces_concat_m, wrap_type(), idxes_crop_right_concat_m, wrap_type()}); auto equal_m = wrap_type({idxes_crop_left_m, idxes_crop_right_m}); auto condition_m = wrap_type({equal_m, any_input()}); @@ -153,7 +166,7 @@ ov::intel_cpu::NgramFusion::NgramFusion() { auto then_m = wrap_type({padded_tokens_m, wrap_type(), then_cropped_shape_m, wrap_type()}); // else branch - auto else_target_shape_concat_m = wrap_type({cropped_shape_m, wrap_type()}); + auto else_target_shape_concat_m = any_input(concat_shape_with_cropped_symbol_m); auto else_m = wrap_type({wrap_type(), else_target_shape_concat_m, wrap_type()}); auto select_m = wrap_type { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "select::execute_impl"); auto& stream = instance.get_network().get_stream(); - const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); if (!pass_through_events) { for (auto e : events) { diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/shape_of.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/shape_of.cpp index 634e35e58cbc99..983987f9fed314 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/shape_of.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/shape_of.cpp @@ -38,7 +38,7 @@ struct shape_of_impl : public typed_primitive_impl { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "shape_of::execute_impl"); auto& stream = instance.get_network().get_stream(); - const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); auto output_mem_ptr = instance.output_memory_ptr(); diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/strided_slice.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/strided_slice.cpp index 
318d9dc1d2cc1d..ac9032fe466748 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/strided_slice.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/strided_slice.cpp @@ -87,7 +87,7 @@ struct strided_slice_impl : public typed_primitive_impl { return stream.group_events(events); } - const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); if (!pass_through_events) { for (auto e : events) { diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/tile.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/tile.cpp index 18223449030e36..1f53781aedc2a3 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/tile.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/tile.cpp @@ -50,7 +50,7 @@ struct tile_impl : public typed_primitive_impl { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "tile::execute_impl"); auto& stream = instance.get_network().get_stream(); - const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); if (!pass_through_events) { for (auto e : events) { diff --git a/src/plugins/intel_gpu/src/graph/impls/registry/reorder_impls.cpp b/src/plugins/intel_gpu/src/graph/impls/registry/reorder_impls.cpp index 3b38e2754fbc12..eab8c1f3d9c3f8 100644 --- a/src/plugins/intel_gpu/src/graph/impls/registry/reorder_impls.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/registry/reorder_impls.cpp @@ -36,7 +36,8 @@ const std::vector>& Registrydependencies().begin(), inst->dependencies().end(), + [&](const std::pair& dep) { + if (dep.first->is_constant() || + (dep.first->get_impl() != nullptr && dep.first->get_impl()->is_cpu())) { + return true; + } + // Check if the dependency can be optimized + if (dep.first->can_be_optimized()) { + return check_all_deps_cpu(dep.first); + } + return false; + }); +} + +bool primitive_inst::all_dependencies_cpu_impl() const { + return check_all_deps_cpu(this); +} event::ptr primitive_inst::realloc_if_needed() { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("realloc_if_needed: " + id())); From 664ffc6fc7d68775a44454889dac70e2b265cb1d Mon Sep 17 00:00:00 2001 From: Katarzyna Mitrus Date: Mon, 28 Oct 2024 15:10:21 +0100 Subject: [PATCH 067/233] [STFT][Op][Ref] Extend STFT op and ref to support 1D signal input (#27274) ### Details: - Extend STFT op and ref to support 1D signal input (including CPU, as currently it uses ref impl) - Less code is needed to support such case natively than subgraph for proper Squeeze/Unsqueeze logic - For 1D signal enablement minor changes in shape_infer and ref impl have been added (all of the other updated files are tests): * src/core/reference/src/op/stft.cpp * src/core/shape_inference/include/stft_shape_inference.hpp - Xfail for 1D stft case for PT FE has been removed, the test passed --------------------------------- PR for spec update will be provided separately (in progress). 
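As a quick illustration of the 1D rule described above (illustrative only, not part of this patch): for a 1D signal the batch dimension is dropped and, with `transpose_frames == true`, the output shape follows `[frame_size / 2 + 1, (signal_len - frame_size) / frame_step + 1, 2]`, the same formula used in the updated shape_infer and reference code. A minimal standalone sketch checking the `{48}` signal / frame size 16 / frame step 16 case exercised by the new tests:

```cpp
// Standalone sanity check of the 1D STFT output-shape rule (illustrative only).
#include <cassert>
#include <cstdint>

int main() {
    const int64_t signal_len = 48, frame_size = 16, frame_step = 16;
    const int64_t num_frames = (signal_len - frame_size) / frame_step + 1;  // 3
    const int64_t fft_samples = frame_size / 2 + 1;                         // 9
    // With transpose_frames == true the expected output shape is {9, 3, 2},
    // matching the output_9_3_2_transp reference tensor added later in this patch.
    assert(num_frames == 3 && fft_samples == 9);
    return 0;
}
```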
### Tickets: - 155996 --- src/core/reference/src/op/stft.cpp | 5 ++- .../include/stft_shape_inference.hpp | 39 +++++++++++------- src/core/tests/type_prop/stft.cpp | 13 ++++-- .../single_layer_tests/stft.cpp | 12 ++++++ .../stft_shape_inference_test.cpp | 20 ++++++++++ .../tests/functional/op_reference/stft.cpp | 40 ++++++++++++++++--- tests/layer_tests/pytorch_tests/test_stft.py | 2 - 7 files changed, 104 insertions(+), 27 deletions(-) diff --git a/src/core/reference/src/op/stft.cpp b/src/core/reference/src/op/stft.cpp index d3a93db0e1e937..6fd5583be21a75 100644 --- a/src/core/reference/src/op/stft.cpp +++ b/src/core/reference/src/op/stft.cpp @@ -21,8 +21,9 @@ void stft(const float* signal, const int64_t frame_size, const int64_t frame_step, const bool transpose_frames) { - constexpr size_t signal_axis = 1; - const auto batch_size = signal_shape[0]; + const auto is_signal_1D = signal_shape.size() == 1; + const size_t batch_size = is_signal_1D ? 1 : signal_shape[0]; + const size_t signal_axis = is_signal_1D ? 0 : 1; const auto signal_length = signal_shape[signal_axis]; const auto num_frames = static_cast((signal_length - frame_size) / frame_step) + 1; const auto frame_size_dim = static_cast(frame_size); diff --git a/src/core/shape_inference/include/stft_shape_inference.hpp b/src/core/shape_inference/include/stft_shape_inference.hpp index c856b4f905b7e6..41abf6640ddb96 100644 --- a/src/core/shape_inference/include/stft_shape_inference.hpp +++ b/src/core/shape_inference/include/stft_shape_inference.hpp @@ -25,10 +25,11 @@ std::vector shape_infer(const STFT* op, const auto& frame_size_shape = input_shapes[2]; const auto& frame_step_shape = input_shapes[3]; + const auto signal_shape_rank = signal_shape.rank(); NODE_SHAPE_INFER_CHECK(op, input_shapes, - signal_shape.rank().compatible(2), - "The shape of signal must be 2D [batch, signal_size]."); + signal_shape_rank.compatible(1) || signal_shape_rank.compatible(2), + "The shape of signal must be 1D [signal_size] or 2D [batch, signal_size]."); NODE_SHAPE_INFER_CHECK(op, input_shapes, window_shape.rank().compatible(1), @@ -42,29 +43,36 @@ std::vector shape_infer(const STFT* op, frame_step_shape.rank().compatible(0), "The shape of frame_step must be a scalar."); + if (signal_shape_rank.is_dynamic()) { + return {signal_shape}; + } + const auto frame_size = get_input_const_data_as(op, 2, ta); const auto frame_step = get_input_const_data_as(op, 3, ta); - if (signal_shape.rank().is_dynamic()) { - return {signal_shape}; - } else if (!frame_size || !frame_step) { - return {TRShape{signal_shape[0], TDim(ov::util::dim::inf_bound), TDim(ov::util::dim::inf_bound), 2}}; + const auto is_signal_1D = signal_shape.size() == 1; + if (!frame_size || !frame_step) { + if (is_signal_1D) { + return {TRShape{TDim(ov::util::dim::inf_bound), TDim(ov::util::dim::inf_bound), 2}}; + } else { + return {TRShape{signal_shape[0], TDim(ov::util::dim::inf_bound), TDim(ov::util::dim::inf_bound), 2}}; + } } const auto& frame_size_val = (*frame_size)[0]; const auto& frame_step_val = (*frame_step)[0]; + const TDim& signal_dim = is_signal_1D ? signal_shape[0] : signal_shape[1]; const bool is_frame_size_in_range = - 0 < frame_size_val && - (signal_shape[1].is_static() ? static_cast(frame_size_val) <= signal_shape[1].get_length() - : frame_size_val <= signal_shape[1].get_interval().get_max_val()); + 0 < frame_size_val && (signal_dim.is_static() ? 
static_cast(frame_size_val) <= signal_dim.get_length() + : frame_size_val <= signal_dim.get_interval().get_max_val()); NODE_SHAPE_INFER_CHECK(op, input_shapes, is_frame_size_in_range, "Provided frame size is ", frame_size_val, " but must be in range [1, ", - signal_shape[1], + signal_dim, "]."); NODE_SHAPE_INFER_CHECK(op, @@ -84,9 +92,8 @@ std::vector shape_infer(const STFT* op, frame_size_val, "]."); - const auto& batch_dim = signal_shape[0]; const TDim frame_size_dim = static_cast(frame_size_val); - const TDim signal_frame_size_diff = signal_shape[1] - frame_size_dim; + const TDim signal_frame_size_diff = signal_dim - frame_size_dim; TDim fft_samples_dim = (frame_size_val / 2) + 1; // Divsion opeartor for static Dimension of PartialShape can return non static dimension and ceil instead of floor @@ -95,9 +102,13 @@ std::vector shape_infer(const STFT* op, std::vector output_shapes; if (op->get_transpose_frames()) { - output_shapes.emplace_back(TRShape{batch_dim, std::move(fft_samples_dim), std::move(frames_dim), 2}); + output_shapes.emplace_back(TRShape{std::move(fft_samples_dim), std::move(frames_dim), 2}); } else { - output_shapes.emplace_back(TRShape{batch_dim, std::move(frames_dim), std::move(fft_samples_dim), 2}); + output_shapes.emplace_back(TRShape{std::move(frames_dim), std::move(fft_samples_dim), 2}); + } + if (!is_signal_1D) { + const auto& batch_dim = signal_shape[0]; + output_shapes[0].insert(output_shapes[0].begin(), batch_dim); } return output_shapes; } diff --git a/src/core/tests/type_prop/stft.cpp b/src/core/tests/type_prop/stft.cpp index 702b000d2dd560..2969af4e5a43bd 100644 --- a/src/core/tests/type_prop/stft.cpp +++ b/src/core/tests/type_prop/stft.cpp @@ -104,6 +104,13 @@ INSTANTIATE_TEST_SUITE_P( type_prop_stft_shape, TypePropSTFTTestP, testing::Values( + std::make_tuple(PartialShape{16}, PartialShape{16}, 16, 16, true, PartialShape{9, 1, 2}), + std::make_tuple(PartialShape{48}, PartialShape{16}, 16, 16, false, PartialShape{3, 9, 2}), + std::make_tuple(PartialShape{56}, PartialShape{7}, 11, 3, false, PartialShape{16, 6, 2}), + std::make_tuple(PartialShape{56}, PartialShape{7}, 11, 3, true, PartialShape{6, 16, 2}), + std::make_tuple(PartialShape{48}, PartialShape{8}, 16, 4, true, PartialShape{9, 9, 2}), + std::make_tuple(PartialShape{{48, 56}}, PartialShape{7}, 11, 3, true, PartialShape{6, {13, 16}, 2}), + std::make_tuple(PartialShape{-1}, PartialShape{7}, 11, 3, true, PartialShape{6, {1, -1}, 2}), std::make_tuple(PartialShape{1, 16}, PartialShape{16}, 16, 16, true, PartialShape{1, 9, 1, 2}), std::make_tuple(PartialShape{1, 48}, PartialShape{16}, 16, 16, true, PartialShape{1, 9, 3, 2}), std::make_tuple(PartialShape{1, 48}, PartialShape{16}, 16, 16, false, PartialShape{1, 3, 9, 2}), @@ -139,16 +146,16 @@ TEST_F(TypePropSTFTTest, signal_incompatible_shape) { const auto frame_size = std::make_shared(element::i64, PartialShape{}); const auto frame_step = std::make_shared(element::i64, PartialShape{}); { - const auto signal = std::make_shared(element::f32, PartialShape{48}); + const auto signal = std::make_shared(element::f32, PartialShape{}); OV_EXPECT_THROW(std::ignore = make_op(signal, window, frame_size, frame_step, transform_frames), NodeValidationFailure, - HasSubstr("The shape of signal must be 2D [batch, signal_size]")); + HasSubstr("The shape of signal must be 1D [signal_size] or 2D [batch, signal_size]")); } { const auto signal = std::make_shared(element::f32, PartialShape{-1, 4, 48}); OV_EXPECT_THROW(std::ignore = make_op(signal, window, frame_size, frame_step, 
transform_frames), NodeValidationFailure, - HasSubstr("The shape of signal must be 2D [batch, signal_size]")); + HasSubstr("The shape of signal must be 1D [signal_size] or 2D [batch, signal_size]")); } } diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/stft.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/stft.cpp index b5fce57fb22f0c..91505d131d6aa5 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/stft.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/stft.cpp @@ -16,6 +16,12 @@ const std::vector data_type = {ov::element::f32, ov::element: const std::vector step_size_type = {ov::element::i32, ov::element::i64}; const std::vector> input_shapes = { + { // Static shapes + {{}, {{128}}}, // 1st input + {{}, {{8}}}, // 2nd input + {{}, {{}}}, // 3rd input + {{}, {{}}} // 4th input + }, { // Static shapes {{}, {{1, 128}}}, // 1st input {{}, {{8}}}, // 2nd input @@ -34,6 +40,12 @@ const std::vector> input_shapes = { {{}, {{}}}, // 3rd input {{}, {{}}} // 4th input }, + { // Dynamic dims in the first and second input shape + {{-1}, {{128}}}, // 1st input + {{-1}, {{8}}}, // 2nd input + {{}, {{}}}, // 3rd input + {{}, {{}}} // 4th input + }, { // Dynamic dims in the first and second input shape {{-1, -1}, {{1, 128}, {2, 226}}}, // 1st input {{-1}, {{8}, {16}}}, // 2nd input diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/stft_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/stft_shape_inference_test.cpp index 4c2ec30d16ee95..9d2960e31b71f2 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/stft_shape_inference_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/stft_shape_inference_test.cpp @@ -17,6 +17,26 @@ using testing::HasSubstr; class STFTShapeInferenceTest : public OpStaticShapeInferenceTest {}; +TEST_F(STFTShapeInferenceTest, all_input_as_params_1D_signal) { + const auto data_type = element::f32; + const auto step_size_type = element::i32; + const auto in_signal = std::make_shared(data_type, ov::PartialShape{-1}); + const auto in_window = std::make_shared(data_type, ov::PartialShape{-1}); + const auto in_frame_size = std::make_shared(step_size_type, ov::Shape{}); + const auto in_frame_step = std::make_shared(step_size_type, ov::Shape{}); + const auto op = make_op(in_signal, in_window, in_frame_size, in_frame_step, true); + + std::vector static_input_shapes = {StaticShape{48}, StaticShape{16}, StaticShape{}, StaticShape{}}; + int32_t frame_size = 16; + int32_t frame_step = 16; + + auto const_data = std::unordered_map{{2, {element::i32, Shape{}, &frame_size}}, + {3, {element::i32, Shape{}, &frame_step}}}; + auto acc = make_tensor_accessor(const_data); + auto static_output_shapes = shape_infer(op.get(), static_input_shapes, acc); + ASSERT_EQ(static_output_shapes[0], StaticShape({9, 3, 2})); +} + TEST_F(STFTShapeInferenceTest, all_input_as_params) { const auto data_type = element::f32; const auto step_size_type = element::i32; diff --git a/src/plugins/template/tests/functional/op_reference/stft.cpp b/src/plugins/template/tests/functional/op_reference/stft.cpp index 4f1153b3f91a17..d118fcc181c92f 100644 --- a/src/plugins/template/tests/functional/op_reference/stft.cpp +++ b/src/plugins/template/tests/functional/op_reference/stft.cpp @@ -93,7 +93,8 @@ std::vector generateSTFTParams() { using VT = typename ov::element_type_traits::value_type; using 
INT_T = typename ov::element_type_traits::value_type; - const ov::Shape signal_48_shape{1, 48}; + const ov::Shape signal_48_shape{48}; + const ov::Shape signal_1_48_shape{1, 48}; const ov::Shape signal_2_48_shape{2, 48}; const ov::Shape signal_256_shape{1, 256}; @@ -107,6 +108,16 @@ std::vector generateSTFTParams() { -2.43477, 0.11273, 0.37044, 1.35963, 0.50186, -0.84421, 0.00001, 0.54235, -0.31351, 0.77101, -1.86809, 1.73118, 1.46768, -0.33568, 0.61134, 0.04797}); + reference_tests::Tensor signal_1_48( + signal_1_48_shape, + ET, + std::vector{-0.41676, -0.05627, -2.1362, 1.64027, -1.79344, -0.84175, 0.50288, -1.24529, + -1.05795, -0.90901, 0.55145, 2.29221, 0.04154, -1.11793, 0.53906, -0.59616, + -0.01913, 1.175, -0.74787, 0.00903, -0.87811, -0.15643, 0.25657, -0.98878, + -0.33882, -0.23618, -0.63766, -1.18761, -1.42122, -0.1535, -0.26906, 2.23137, + -2.43477, 0.11273, 0.37044, 1.35963, 0.50186, -0.84421, 0.00001, 0.54235, + -0.31351, 0.77101, -1.86809, 1.73118, 1.46768, -0.33568, 0.61134, 0.04797}); + reference_tests::Tensor signal_2_48( signal_2_48_shape, ET, @@ -255,6 +266,16 @@ std::vector generateSTFTParams() { -0.05574, 1.01868, -0.7169, 0.52739, 4.39323, -0.92417, 1.39751, 0.37859, 1.30337, 0., 0.2294, 0., 0.82838, 0., -4.56982, 0., -1.47752, 0.}); + reference_tests::Tensor output_9_3_2_transp( + Shape{9, 3, 2}, + ET, + std::vector{-2.52411, 0., -3.6289, 0., 1.1366, 0., 1.99743, 2.45799, 1.84867, + -0.67991, 0.26235, 0.25725, -2.243, -1.74288, 0.39666, 0.60667, -0.73965, -0.24622, + 2.91255, -0.82545, 0.03844, 0.45931, -1.29728, -1.50822, -2.56084, 2.24181, -0.92956, + -1.32518, 1.78749, 1.94867, 0.87525, 0.70978, 0.47508, 1.29318, -0.18799, 0.98232, + 2.10241, -2.57882, 0.88504, -1.03814, -1.44897, -2.97866, -1.59965, -0.02599, -1.02171, + 0.17824, 2.46326, 1.82815, -0.44417, 0., 0.24368, 0., -2.81501, 0.}); + reference_tests::Tensor output_1_9_3_2_transp( Shape{1, 9, 3, 2}, ET, @@ -309,6 +330,13 @@ std::vector generateSTFTParams() { std::vector params; params.emplace_back(signal_48, + hann_window_16, + frame_size_16, + frame_step_16, + transpose_frames_true, + output_9_3_2_transp, + "basic_1D_transp"); + params.emplace_back(signal_1_48, hann_window_16, frame_size_16, frame_step_16, @@ -329,35 +357,35 @@ std::vector generateSTFTParams() { transpose_frames_false, output_2_3_9_2_no_transp, "basic_batch_2_no_transp"); - params.emplace_back(signal_48, + params.emplace_back(signal_1_48, hann_window_16, frame_size_16, frame_step_4, transpose_frames_true, output_1_9_9_2_transp, "step_1/4_frame_transp"); - params.emplace_back(signal_48, + params.emplace_back(signal_1_48, hann_window_8, frame_size_16, frame_step_8, transpose_frames_true, output_1_9_5_2_transp, "win_size_<_frame_size_transp"); - params.emplace_back(signal_48, + params.emplace_back(signal_1_48, hann_window_8, frame_size_16, frame_step_4, transpose_frames_true, output_1_9_9_2_transp_win_pad, "step_1/4_frame_&_win_size_<_frame_size_transp"); - params.emplace_back(signal_48, + params.emplace_back(signal_1_48, hann_window_7, frame_size_11, frame_step_3, transpose_frames_true, output_1_6_13_2_transp, "odd_sizes_transp"); - params.emplace_back(signal_48, + params.emplace_back(signal_1_48, hann_window_5, frame_size_9, frame_step_100, diff --git a/tests/layer_tests/pytorch_tests/test_stft.py b/tests/layer_tests/pytorch_tests/test_stft.py index 29d6b94efbfd37..832a624da65626 100644 --- a/tests/layer_tests/pytorch_tests/test_stft.py +++ b/tests/layer_tests/pytorch_tests/test_stft.py @@ -67,8 +67,6 @@ def forward(self, x, window): def 
test_stft(self, n_fft, hop_length, window_size, signal_shape, ie_device, precision, ir_version, trace_model): if ie_device == "GPU": pytest.xfail(reason="STFT op is not supported on GPU yet") - if signal_shape == (128,): - pytest.xfail(reason="STFT op is doesn't support 1D signal yet, please unsqueeze the input.") self._test(*self.create_model(n_fft, hop_length, window_size), ie_device, precision, ir_version, kwargs_to_prepare_input={"win_length": window_size, "signal_shape": signal_shape}, trace_model=trace_model) From 2486a7f84d70d50607990c9e15ab752f6c3f26c4 Mon Sep 17 00:00:00 2001 From: Andrii Staikov Date: Mon, 28 Oct 2024 15:23:49 +0100 Subject: [PATCH 068/233] [TRANSFORMATIONS] Check if FROM types are different from a node's type in ConvertPrecision (#27231) [TRANSFORMATIONS] Check if Select is 'relaxed' only if 0th input is not boolean in ConvertPrecision The ConvertPrecision transformation wraps Select node (its 0th input in particular) into RelaxedType to accept tensors with other element types than boolean. Add a check to perform the appearance of RelaxedType only if the 0th input is not boolean. Tickets: * CVS-155745 * CVS-155990 Signed-off-by: Andrii Staikov --------- Signed-off-by: Andrii Staikov --- .../src/transformations/convert_precision.cpp | 15 +++++----- .../tests/utils/convert_precision.cpp | 28 +++++++++++++++++++ 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/src/common/transformations/src/transformations/convert_precision.cpp b/src/common/transformations/src/transformations/convert_precision.cpp index 6f5166dfd26760..c34e91f835301a 100644 --- a/src/common/transformations/src/transformations/convert_precision.cpp +++ b/src/common/transformations/src/transformations/convert_precision.cpp @@ -254,7 +254,6 @@ bool convert_function_precision(const std::shared_ptr& f, // Register internal constants only after fixing input type that could lead to nodes // replacement register_constants(ops); - for (auto& node : ops) { // skip precision sensitive nodes if (skip_precision_sensitive && fp16_compression_is_disabled(node) && has_fp16_compression) @@ -1000,12 +999,14 @@ bool extend_select_type(const std::shared_ptr& node, const precisions_ type_relaxed->set_origin_input_type(ov::element::boolean, 0); return true; } else if (auto casted = ov::as_type_ptr(node)) { - auto relaxed_op = - std::make_shared>(*casted, - ov::element::TypeVector{ov::element::boolean}, - ov::element::TypeVector{}); - replace_node(node, relaxed_op); - return true; + if (precisions.count(ov::element::boolean) != 0) { + auto relaxed_op = + std::make_shared>(*casted, + ov::element::TypeVector{ov::element::boolean}, + ov::element::TypeVector{}); + replace_node(node, relaxed_op); + return true; + } } return false; } diff --git a/src/common/transformations/tests/utils/convert_precision.cpp b/src/common/transformations/tests/utils/convert_precision.cpp index 03095b28d96a74..9554cf09162d45 100644 --- a/src/common/transformations/tests/utils/convert_precision.cpp +++ b/src/common/transformations/tests/utils/convert_precision.cpp @@ -963,6 +963,34 @@ TEST(TransformationTests, ConvertPrecision_Select) { ASSERT_TRUE(has_type(f)); } +TEST(TransformationTests, ConvertPrecision_Select_Relaxed) { + std::shared_ptr f(nullptr); + { + auto input1 = std::make_shared(element::boolean, Shape{15, 20, 3}); + auto node = std::make_shared(input1); + auto select = std::make_shared(node, input1, input1); + + f = std::make_shared(OutputVector{select}, ParameterVector{input1}); + + // Explicitly setting the element type 
of a node to a different one to + // test the appearance of TypeRelaxed within Select + node->set_output_type(0, ov::element::u8, node->get_output_partial_shape(0)); + + pass::Manager manager; + manager.register_pass(); + manager.register_pass(precisions_map{{element::u8, element::boolean}}); + manager.run_passes(f); + } + OV_ASSERT_NO_THROW(check_rt_info(f)); + ASSERT_FALSE(has_type(f)); + ASSERT_TRUE(has_type(f)); + int counter = 0; + for (const auto& node : f->get_ordered_ops()) + if (std::dynamic_pointer_cast(node)) + ++counter; + ASSERT_EQ(counter, 1); +} + TEST(TransformationTests, ConvertPrecision_TypeRelaxedWithSelect) { std::shared_ptr f(nullptr); { From 0d113d9cf0b89529307aa1c5994e13cbe885872f Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Mon, 28 Oct 2024 18:46:48 +0400 Subject: [PATCH 069/233] [GPU] KV-cache compression support (#27114) ### Details: This PR enables KV-cache compression support Currently, it supports only combinations of the following configurations: * Data types: INT8_SYM / INT8_ASYM * Modes: per-token (quantization of `num_heads * head_size` in a single group) / per-token-per-head (quantization of each `head_size` group for each head per token) ### Tickets: - *ticket-id* --- .../include/ov_ops/dynamic_quantize.hpp | 68 +++- .../src/ov_ops/dynamic_quantize.cpp | 91 ++++- .../intel_gpu/graph/kernel_impl_params.hpp | 2 +- .../include/intel_gpu/op/indirect_sdpa.hpp | 11 + .../include/intel_gpu/op/kv_cache.hpp | 17 +- .../intel_gpu/op/kv_cache_compressed.hpp | 56 +++ .../include/intel_gpu/op/read_value.hpp | 7 + .../include/intel_gpu/op/read_values.hpp | 42 ++ .../intel_gpu/include/intel_gpu/op/sdpa.hpp | 19 + .../plugin/multi_tensor_variable_state.hpp | 26 ++ .../intel_gpu/plugin/primitives_list.hpp | 2 + .../intel_gpu/primitives/dynamic_quantize.hpp | 57 ++- .../include/intel_gpu/primitives/kv_cache.hpp | 64 ++- .../intel_gpu/primitives/read_value.hpp | 27 +- .../scaled_dot_product_attention.hpp | 77 +++- .../intel_gpu/runtime/debug_configuration.hpp | 1 + .../include/intel_gpu/runtime/layout.hpp | 15 + .../intel_gpu/src/graph/dynamic_quantize.cpp | 33 +- .../graph_optimizer/prepare_buffer_fusing.cpp | 38 +- .../src/graph/impls/cpu/read_value.cpp | 18 +- .../src/graph/impls/ocl/dynamic_quantize.cpp | 11 +- .../src/graph/impls/ocl/kv_cache.cpp | 213 +++++++++- .../ocl/scaled_dot_product_attention.cpp | 61 ++- .../src/graph/include/dynamic_quantize_inst.h | 3 +- .../src/graph/include/kv_cache_inst.h | 6 + .../src/graph/include/program_node.h | 2 +- .../src/graph/include/read_value_inst.h | 9 +- src/plugins/intel_gpu/src/graph/kv_cache.cpp | 40 +- .../intel_gpu/src/graph/primitive_inst.cpp | 232 +++++++++-- .../intel_gpu/src/graph/program_node.cpp | 3 +- .../intel_gpu/src/graph/read_value.cpp | 27 +- .../graph/scaled_dot_product_attention.cpp | 9 + .../dynamic_quantize_gpu_kv_cache.cl | 121 ++++++ .../cl_kernels/dynamic_quantize_gpu_ref.cl | 129 +++++- .../kernel_selector/cl_kernels/sdpa_opt.cl | 381 +++++++++++++++--- .../kernel_selector/cl_kernels/sdpa_ref.cl | 55 ++- .../dynamic_quantize_kernel_kv_cache.h | 30 ++ .../dynamic_quantize_kernel_opt.cpp | 9 + .../dynamic_quantize_kernel_opt_kv_cache.cpp | 285 +++++++++++++ .../dynamic_quantize_kernel_ref.cpp | 46 ++- .../dynamic_quantize_kernel_ref.h | 7 + .../dynamic_quantize_kernel_selector.cpp | 2 + .../kernels/sdpa/sdpa_kernel_base.cpp | 18 + .../kernels/sdpa/sdpa_kernel_base.h | 8 + .../kernels/sdpa/sdpa_kernel_micro.cpp | 3 + .../kernels/sdpa/sdpa_kernel_opt.cpp | 33 +- 
.../kernels/sdpa/sdpa_kernel_ref.cpp | 23 +- .../plugin/multi_tensor_variable_state.cpp | 59 +++ .../src/plugin/ops/dynamic_quantize.cpp | 14 +- .../intel_gpu/src/plugin/ops/kv_cache.cpp | 33 +- .../ops/scaled_dot_product_attention.cpp | 8 +- .../intel_gpu/src/plugin/ops/variable.cpp | 26 +- .../src/plugin/sync_infer_request.cpp | 28 +- .../dynamic_quantize_fully_connected.cpp | 9 +- .../transformations/kv_cache_compression.cpp | 292 ++++++++++++++ .../transformations/kv_cache_compression.hpp | 43 ++ .../transformations/op/indirect_sdpa.cpp | 55 ++- .../plugin/transformations/op/kv_cache.cpp | 128 +++++- .../plugin/transformations/op/read_value.cpp | 83 +++- .../src/plugin/transformations/op/sdpa.cpp | 43 +- .../src/plugin/transformations_pipeline.cpp | 5 + .../src/runtime/debug_configuration.cpp | 3 + .../src/runtime/execution_config.cpp | 19 + .../subgraph_tests/dynamic/kv_cache_sdpa.cpp | 37 +- .../unit/dynamic_execution/stateful_model.cpp | 4 +- .../unit/shape_infer/read_value_si_test.cpp | 2 +- .../test_cases/dynamic_quantize_gpu_test.cpp | 108 +++-- .../tests/unit/test_cases/variable.cpp | 10 +- .../transformations/kv_cache_compression.cpp | 344 ++++++++++++++++ 69 files changed, 3447 insertions(+), 343 deletions(-) create mode 100644 src/plugins/intel_gpu/include/intel_gpu/op/kv_cache_compressed.hpp create mode 100644 src/plugins/intel_gpu/include/intel_gpu/op/read_values.hpp create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_kv_cache.cl create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_kv_cache.h create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_kv_cache.cpp create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp create mode 100644 src/plugins/intel_gpu/tests/unit/transformations/kv_cache_compression.cpp diff --git a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp index 69c148305fb94f..2eb79322b84e28 100644 --- a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp +++ b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp @@ -14,29 +14,75 @@ namespace internal { /// \brief Operator performing Dynamic Quantize class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op { public: - OPENVINO_OP("DynamicQuantize", "gpu_opset"); + OPENVINO_OP("DynamicQuantize", "ie_internal_opset"); + + /** + * @brief Configuration for the type of quantization applied to the data: + * - Symmetric: Quantization where the zero point is fixed at zero, and the range is symmetric around zero. + * - Asymmetric: Quantization where the zero point is not fixed at zero. + */ + enum class QuantizationType { Symmetric, Asymmetric }; + + /** + * @brief Configuration for how Activations, Scales and Zero Points will be stored in output buffers: + * - Planar: Activations, Scales, and Zero Points are stored in independent buffers. + * - InterleavedScalesZP: Activations are stored in an independent buffer, while Scales and Zero Points (if any) are + * combined in a separate buffer. 
+ */ + enum class OutputStorageType { Planar, InterleavedScalesZP, /* InterleavedActivationsScalesZP */ }; + + /// \brief Structure that specifies attributes for interpolation + struct Attributes { + QuantizationType quantization_type = QuantizationType::Symmetric; + element::Type quantization_dt = element::undefined; + element::Type scale_dt = element::undefined; + element::Type zp_dt = element::undefined; + + std::vector group_sizes = {}; + std::vector scales_zp_output_order = {}; + OutputStorageType output_storage_type = OutputStorageType::Planar; + }; DynamicQuantize() = default; /// \brief Constructs an DynamicQuantize operation. /// /// \param data Input tensor with data - /// \param group_sizes Group sizes for dynamic quantization - /// \param dt_scale Data type for scale output - DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale); + /// \param config Dynamic quantization configuration + DynamicQuantize(const Output& data, const Attributes& attrs); void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + const Attributes& get_attrs() const { + return m_attrs; + } + + void set_attrs(Attributes attrs) { + m_attrs = std::move(attrs); + } + const std::vector& get_group_sizes() const { - return m_group_sizes; - }; + return m_attrs.group_sizes; + } + + QuantizationType get_quantization_type() const { + return m_attrs.quantization_type; + } + + OutputStorageType get_output_storage_type() const { + return m_attrs.output_storage_type; + } + + const std::vector& get_scales_zp_output_order() const { + return m_attrs.scales_zp_output_order; + } + static std::vector shape_infer(const DynamicQuantize* op, - const std::vector& input_shapes, - const std::vector& group_sizes); + const std::vector& input_shapes); -private: - std::vector m_group_sizes; - element::Type m_dt_scale; +protected: + Attributes m_attrs; }; } // namespace internal diff --git a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp index 74c0498e9a4425..9d1dfa5e5e3f62 100644 --- a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp +++ b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp @@ -7,62 +7,113 @@ #include "openvino/core/partial_shape.hpp" #include "openvino/core/validation_util.hpp" #include "openvino/op/variadic_split.hpp" -#include "variadic_split_shape_inference.hpp" +#include "openvino/util/common_util.hpp" namespace ov { namespace op { namespace internal { -DynamicQuantize::DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale) - : Op({data}), - m_group_sizes(std::move(group_sizes)), - m_dt_scale(dt_scale) { - OPENVINO_ASSERT(data.get_partial_shape().rank() == m_group_sizes.size(), - "FC input rank should be same as the rank of group_size ", +DynamicQuantize::DynamicQuantize(const Output& data, const Attributes& attrs) : Op({data}), m_attrs(attrs) { + if (m_attrs.scales_zp_output_order.empty()) { + m_attrs.scales_zp_output_order.resize(data.get_partial_shape().size()); + std::iota(m_attrs.scales_zp_output_order.begin(), m_attrs.scales_zp_output_order.end(), 0); + } + + OPENVINO_ASSERT(data.get_partial_shape().rank() == m_attrs.group_sizes.size(), + "DQ input rank should be same as the rank of group_size ", data.get_tensor_ptr()->get_partial_shape().rank(), " / ", - m_group_sizes.size()); - set_output_size(2); + m_attrs.group_sizes.size()); + + OPENVINO_ASSERT(data.get_partial_shape().size() == 
m_attrs.scales_zp_output_order.size(), + "DQ input rank should be same as the rank of scales and zero points output order)"); + + size_t outputs_number = 2; + if (m_attrs.quantization_type == QuantizationType::Asymmetric && + m_attrs.output_storage_type == OutputStorageType::Planar) + outputs_number = 3; + + OPENVINO_ASSERT( + (m_attrs.output_storage_type == OutputStorageType::Planar) || + (m_attrs.quantization_type == QuantizationType::Asymmetric && m_attrs.scale_dt == m_attrs.zp_dt), + "Scales and Zero Points should have the same data type to be stored in the single buffer"); + + set_output_size(outputs_number); validate_and_infer_types(); } void DynamicQuantize::validate_and_infer_types() { std::vector input_shapes = {get_input_partial_shape(0)}; - auto out_shapes = shape_infer(this, input_shapes, m_group_sizes); - set_output_type(0, element::i8, out_shapes[0]); - set_output_type(1, m_dt_scale, out_shapes[1]); + auto out_shapes = shape_infer(this, input_shapes); + set_output_type(0, m_attrs.quantization_dt, out_shapes[0]); + set_output_type(1, m_attrs.scale_dt, out_shapes[1]); + + if (m_attrs.quantization_type == QuantizationType::Asymmetric && + m_attrs.output_storage_type == OutputStorageType::Planar) + set_output_type(2, m_attrs.zp_dt, out_shapes[2]); } std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVector& new_args) const { check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_group_sizes, m_dt_scale); + return std::make_shared(new_args.at(0), m_attrs); } std::vector DynamicQuantize::shape_infer(const DynamicQuantize* op, - const std::vector& input_shapes, - const std::vector& group_sizes) { + const std::vector& input_shapes) { std::vector out_shapes; out_shapes.push_back(input_shapes[0]); auto scale_shape = input_shapes[0]; + const auto& group_sizes = op->m_attrs.group_sizes; OPENVINO_ASSERT(scale_shape.size() == group_sizes.size(), "Scale_shape and group_size are supposed to have same rank: ", scale_shape.size(), " / ", group_sizes.size()); for (size_t i = 0; i < scale_shape.size(); i++) { - if (scale_shape[i].is_dynamic()) + if (scale_shape[i].is_dynamic() || scale_shape[i] == 0) continue; - if (group_sizes[i] == UINT64_MAX) + if (group_sizes[i] == UINT64_MAX) { scale_shape[i] = 1; - else { - scale_shape[i] /= group_sizes[i]; // if group_size is larger than shape, scale_shape will be 1 - scale_shape[i] = std::max(static_cast(scale_shape[i].get_length()), 1); + } else { + scale_shape[i] = ov::util::ceil_div(scale_shape[i].get_length(), static_cast(group_sizes[i])); } } out_shapes.push_back(scale_shape); + + // Add zero points shape, same as the scales + if (op->m_attrs.quantization_type == QuantizationType::Asymmetric && + op->m_attrs.output_storage_type == OutputStorageType::Planar) + out_shapes.push_back(scale_shape); + + auto transpose_shape = [](const ov::PartialShape& shape, const std::vector& scales_zp_output_order) { + auto transposed_shape = shape; + for (size_t i = 0; i < scales_zp_output_order.size(); i++) { + OPENVINO_ASSERT(scales_zp_output_order[i] < transposed_shape.size()); + transposed_shape[i] = shape[scales_zp_output_order[i]]; + } + + return transposed_shape; + }; + + // Transpose scales and zero points shapes + const auto& scales_zp_output_order = op->m_attrs.scales_zp_output_order; + for (size_t i = 1; i < out_shapes.size(); i++) { + out_shapes[i] = transpose_shape(out_shapes[i], scales_zp_output_order); + } + + if (op->m_attrs.quantization_type == QuantizationType::Asymmetric && + 
op->m_attrs.output_storage_type != OutputStorageType::Planar) { + // Currently scales and zero points are supposed to be combined over the last dimension only + const auto combine_axis = scales_zp_output_order.empty() ? out_shapes[1].size() - 1 + : scales_zp_output_order[out_shapes[1].size() - 1]; + OPENVINO_ASSERT(group_sizes[combine_axis] != 1); + + out_shapes[1][combine_axis] *= 2; // [scale, zero_point] pairs + } + return out_shapes; } diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp index 3e8887fbb2f7ee..72623f6d120955 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp @@ -53,7 +53,7 @@ struct kernel_impl_params final { optional_layout weights_zero_points_layout = optional_layout(); optional_layout activations_zero_points_layout = optional_layout(); optional_layout compensation_layout = optional_layout(); - optional_layout state_layout = optional_layout(); + std::vector state_layouts; std::map memory_deps = {}; size_t primary_input_idx = 0; diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp index b4d34a3975af6b..7c45c93c7e74f1 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp @@ -29,6 +29,17 @@ class IndirectSDPA : public ov::intel_gpu::op::SDPA { const std::vector& order_out, const ov::element::Type output_type = ov::element::undefined); + IndirectSDPA(const OutputVector& data_inputs, + const ov::Output& beam_table, + const bool is_causal, + const int64_t indirect_axis, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const QuantizationAttribute& quantization_attribute, + const ov::element::Type output_type = ov::element::undefined); + bool visit_attributes(ov::AttributeVisitor &visitor) override; void validate_and_infer_types() override; diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp index 402ff6e46c1607..7048d5229f25db 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp @@ -7,6 +7,7 @@ #include "openvino/op/op.hpp" #include "openvino/op/util/variable.hpp" #include "openvino/op/util/variable_extension.hpp" +#include "ov_ops/dynamic_quantize.hpp" namespace ov { namespace intel_gpu { @@ -22,16 +23,16 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension { KVCache(const Output& past, const Output& new_token_data, - const Output& beam_idx, const std::shared_ptr& past_values, int64_t concat_axis, - int64_t gather_axis, const ov::element::Type output_type = ov::element::undefined); KVCache(const Output& past, const Output& new_token_data, + const Output& beam_idx, const std::shared_ptr& past_values, int64_t concat_axis, + int64_t gather_axis, const ov::element::Type output_type = ov::element::undefined); bool visit_attributes(ov::AttributeVisitor& visitor) override; @@ -53,14 +54,22 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension { bool get_indirect() const { return m_indirect; } -private: +protected: + KVCache(const OutputVector& inputs, + const std::shared_ptr& past_values, + bool indirect, + int64_t concat_axis, + int64_t 
gather_axis, + const ov::element::Type output_type = ov::element::undefined); + int64_t m_concat_axis = 0; int64_t m_gather_axis = 0; bool m_indirect = false; + ov::element::Type m_output_type; }; -std::vector shape_infer(const KVCache* op, std::vector input_shapes); +std::vector shape_infer(const KVCache* op, const std::vector& input_shapes); } // namespace op } // namespace intel_gpu diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache_compressed.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache_compressed.hpp new file mode 100644 index 00000000000000..4ee8cb388b61ea --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache_compressed.hpp @@ -0,0 +1,56 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/op/kv_cache.hpp" +#include "ov_ops/dynamic_quantize.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +/// \brief Operator that implements Key-Values cache subgraph for large language models. +/// This operation updates data of the corresponding Variable +class KVCacheCompressed : public ov::intel_gpu::op::KVCache { +public: + OPENVINO_OP("KVCacheCompressed", "gpu_opset"); + + using QuantizationAttrs = ov::op::internal::DynamicQuantize::Attributes; + + KVCacheCompressed() = default; + + KVCacheCompressed(const OutputVector& inputs, + const std::shared_ptr& past_values, + int64_t concat_axis, + int64_t gather_axis, + const QuantizationAttrs& quantization_attrs, + const ov::element::Type output_type = ov::element::undefined); + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + bool get_kv_compressed() const { return m_compressed; } + bool get_combine_scales_and_zp() const { + return m_quantization_attrs.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && + m_quantization_attrs.output_storage_type != ov::op::internal::DynamicQuantize::OutputStorageType::Planar; + } + + QuantizationAttrs get_quantization_attrs() const { return m_quantization_attrs; } + void set_quantization_attrs(QuantizationAttrs attrs) { m_quantization_attrs = std::move(attrs); } + + std::vector get_scales_zp_output_order() const { return m_quantization_attrs.scales_zp_output_order; } + +private: + bool m_compressed; + QuantizationAttrs m_quantization_attrs = {}; +}; + +std::vector shape_infer(const KVCacheCompressed* op, + const std::vector& input_shapes); + +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp index a9c47f3fa39fa6..419c18118229ff 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp @@ -26,6 +26,7 @@ class ReadValue : public ov::op::Op, public ov::op::util::VariableExtension { bool visit_attributes(ov::AttributeVisitor& visitor) override; void validate_and_infer_types() override; + void validate_and_infer_types(size_t output_idx, const ov::op::util::VariableInfo& variable_info); std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; @@ -33,6 +34,12 @@ class ReadValue : public ov::op::Op, public ov::op::util::VariableExtension { OPENVINO_ASSERT(m_variable, "Variable is not initialized. 
Variable_id is unavailable"); return m_variable->get_info().variable_id; } + +protected: + ReadValue(const std::vector>& variable_initializers, const std::shared_ptr& variable) + : Op(variable_initializers) { + m_variable = variable; + } }; } // namespace op diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/read_values.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/read_values.hpp new file mode 100644 index 00000000000000..ecbda9364d7b3b --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/op/read_values.hpp @@ -0,0 +1,42 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/op/read_value.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +/// \brief This operation handles the OpenVINO GPU Plugin's custom variable +// representation (which can store multiple states in a single variable) at the graph level. +class ReadValues : public ReadValue { +public: + OPENVINO_OP("ReadValues", "gpu_opset"); + + ReadValues() = default; + + ReadValues(const std::shared_ptr& variable, + const std::vector& internal_states_infos); + + ReadValues(const OutputVector& variable_initializers, + const std::shared_ptr& variable, + const std::vector& internal_states_infos); + + bool visit_attributes(ov::AttributeVisitor& visitor) override; + + void validate_and_infer_types() override; + + std::vector get_all_internal_states_info() const; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + +private: + std::vector m_internal_states_infos; +}; + +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp index 9f5d4dad16efd7..f7bc0d780ffd38 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp @@ -8,6 +8,7 @@ #include "openvino/core/partial_shape.hpp" #include "openvino/op/op.hpp" #include "openvino/op/scaled_dot_product_attention.hpp" +#include "ov_ops/dynamic_quantize.hpp" namespace ov { namespace intel_gpu { @@ -17,6 +18,8 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { public: OPENVINO_OP("SDPA", "gpu_opset"); + using QuantizationAttribute = ov::op::internal::DynamicQuantize::Attributes; + SDPA() = default; SDPA(const OutputVector& inputs, @@ -27,6 +30,15 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { const std::vector& order_out, const ov::element::Type output_type = ov::element::undefined); + SDPA(const OutputVector& inputs, + const bool is_causal, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const QuantizationAttribute& quantization_attrs, + const ov::element::Type output_type = ov::element::undefined); + bool visit_attributes(ov::AttributeVisitor &visitor) override; void validate_and_infer_types() override; @@ -41,6 +53,10 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { std::vector get_output_transpose_order() const { return m_order_out; } ov::element::Type get_output_type() const { return m_output_type; } + bool get_kv_compressed() const { return m_compressed; } + QuantizationAttribute get_quantization_attrs() const { return m_quantization_attrs; } + size_t get_compression_inputs_num() const; + static std::vector default_order(size_t rank) { std::vector order(rank); std::iota(order.begin(), order.end(), 0); @@ -54,6 +70,9 @@ class SDPA : public 
ov::op::v13::ScaledDotProductAttention { std::vector m_order_v; std::vector m_order_out; ov::element::Type m_output_type; + + bool m_compressed = false; + QuantizationAttribute m_quantization_attrs = {}; }; std::vector shape_infer(const SDPA* op, diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp index 0cad36f62e47b9..26d15d733102ad 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp @@ -46,5 +46,31 @@ class VariableStateIndirectKVCache : public MultiTensorState { size_t m_concat_axis = 0; }; +// This is multi-tensor state for Indirect KV-Cache + Gemm pattern +// Internally it stores KV Cache state + Beam Table state + compression scales state + (optional compression zero points) +class VariableStateIndirectKVCacheCompressed : public VariableStateIndirectKVCache { +public: + VariableStateIndirectKVCacheCompressed(const VariableStateInfo& info, + std::shared_ptr context, + std::shared_ptr shape_predictor, + const std::vector& output_layouts, + size_t beam_idx, + size_t concat_idx, + bool has_zp_state); + using Ptr = std::shared_ptr; + + void set_state(const ov::SoPtr& state) override; + ov::SoPtr get_state() const override; + + VariableState::Ptr get_compression_scale_state() const; + void set_compression_scale_layout(const cldnn::layout& new_layout); + + VariableState::Ptr get_compression_zp_state() const; + void set_compression_zp_layout(const cldnn::layout& new_layout); + bool has_zp_state() const; + +private: + bool m_has_zp_state = false; +}; } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index 1dbd769444b1aa..27e5540a3786ab 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -283,7 +283,9 @@ REGISTER_FACTORY(internal, FullyConnectedCompressed); REGISTER_FACTORY(internal, RMS); REGISTER_FACTORY(internal, GatherCompressed); REGISTER_FACTORY(internal, KVCache); +REGISTER_FACTORY(internal, KVCacheCompressed); REGISTER_FACTORY(internal, ReadValue); +REGISTER_FACTORY(internal, ReadValues); REGISTER_FACTORY(internal, Gemm); REGISTER_FACTORY(internal, SwiGLU); REGISTER_FACTORY(internal, IndirectGemm); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp index d93e2f86eed144..79af223e32cdaa 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp @@ -5,6 +5,8 @@ #pragma once #include "primitive.hpp" +#include "ov_ops/dynamic_quantize.hpp" + namespace cldnn { /// @brief Dynamic Quantize primitive @@ -12,26 +14,39 @@ namespace cldnn { struct dynamic_quantize : public primitive_base { CLDNN_DECLARE_PRIMITIVE(dynamic_quantize); - dynamic_quantize() : primitive_base("", {}), group_size(0) {} + using Attributes = ov::op::internal::DynamicQuantize::Attributes; + + dynamic_quantize() : primitive_base("", {}) {} /// @brief Constructs dynamic_quantize primitive /// @param id This primitive id /// @param input Input primitive id - /// @param group_size Quantization group size + /// @param group_sizes 
Quantization group size /// @param data_type Output data type of quantized /// @param output_size Output data size of the primitive dynamic_quantize(const primitive_id& id, const input_info& input, - const uint64_t group_size, - const std::vector data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)}) - : primitive_base(id, {input}, 2, data_types), - group_size(group_size) {} + const Attributes& attrs) + : primitive_base(id, {input}) + , attrs(attrs) { + num_outputs = 2; + if (attrs.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && + attrs.output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::Planar) + num_outputs++; + } - uint64_t group_size = 0; + Attributes attrs; size_t hash() const override { size_t seed = primitive::hash(); - seed = hash_combine(seed, group_size); + seed = hash_range(seed, attrs.scales_zp_output_order.begin(), attrs.scales_zp_output_order.end()); + seed = hash_range(seed, attrs.group_sizes.begin(), attrs.group_sizes.end()); + seed = hash_combine(seed, attrs.quantization_type); + seed = hash_combine(seed, attrs.quantization_dt.hash()); + seed = hash_combine(seed, attrs.scale_dt.hash()); + seed = hash_combine(seed, attrs.zp_dt.hash()); + seed = hash_combine(seed, attrs.output_storage_type); + return seed; } @@ -41,17 +56,37 @@ struct dynamic_quantize : public primitive_base { auto rhs_casted = downcast(rhs); - return group_size == rhs_casted.group_size; + return attrs.scales_zp_output_order == rhs_casted.attrs.scales_zp_output_order && + attrs.output_storage_type == rhs_casted.attrs.output_storage_type && + attrs.group_sizes == rhs_casted.attrs.group_sizes && + attrs.quantization_dt == rhs_casted.attrs.quantization_dt && + attrs.scale_dt == rhs_casted.attrs.scale_dt && + attrs.zp_dt == rhs_casted.attrs.zp_dt && + attrs.quantization_type == rhs_casted.attrs.quantization_type;; } void save(BinaryOutputBuffer& ob) const override { primitive_base::save(ob); - ob << group_size; + + ob << make_data(&attrs.quantization_type, sizeof(attrs.quantization_type)); + ob << make_data(&attrs.quantization_dt, sizeof(attrs.quantization_dt)); + ob << make_data(&attrs.scale_dt, sizeof(attrs.scale_dt)); + ob << make_data(&attrs.zp_dt, sizeof(attrs.zp_dt)); + ob << make_data(&attrs.output_storage_type, sizeof(attrs.output_storage_type)); + ob << attrs.scales_zp_output_order; + ob << attrs.group_sizes; } void load(BinaryInputBuffer& ib) override { primitive_base::load(ib); - ib >> group_size; + + ib >> make_data(&attrs.quantization_type, sizeof(attrs.quantization_type)); + ib >> make_data(&attrs.quantization_dt, sizeof(attrs.quantization_dt)); + ib >> make_data(&attrs.scale_dt, sizeof(attrs.scale_dt)); + ib >> make_data(&attrs.zp_dt, sizeof(attrs.zp_dt)); + ib >> make_data(&attrs.output_storage_type, sizeof(attrs.output_storage_type)); + ib >> attrs.scales_zp_output_order; + ib >> attrs.group_sizes; } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp index f87041dcff66d6..1c8f095752aca2 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp @@ -3,10 +3,14 @@ // #pragma once + +#include "primitive.hpp" + #include "openvino/core/partial_shape.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/util/variable.hpp" -#include "primitive.hpp" +#include 
"ov_ops/dynamic_quantize.hpp" + #include namespace cldnn { @@ -14,6 +18,8 @@ namespace cldnn { struct kv_cache : public primitive_base { CLDNN_DECLARE_PRIMITIVE(kv_cache) + using QuantizationAttributes = ov::op::internal::DynamicQuantize::Attributes; + kv_cache() : primitive_base("", {}) {} kv_cache(const primitive_id& id, @@ -33,11 +39,23 @@ struct kv_cache : public primitive_base { int64_t gather_axis = 0; bool indirect = false; + bool compressed = false; + QuantizationAttributes quantization_attributes; + size_t hash() const override { size_t seed = primitive::hash(); seed = hash_combine(seed, concat_axis); seed = hash_combine(seed, gather_axis); seed = hash_combine(seed, indirect); + seed = hash_combine(seed, compressed); + seed = hash_range(seed, quantization_attributes.scales_zp_output_order.begin(), quantization_attributes.scales_zp_output_order.end()); + seed = hash_range(seed, quantization_attributes.group_sizes.begin(), quantization_attributes.group_sizes.end()); + seed = hash_combine(seed, quantization_attributes.quantization_type); + seed = hash_combine(seed, quantization_attributes.quantization_dt.hash()); + seed = hash_combine(seed, quantization_attributes.scale_dt.hash()); + seed = hash_combine(seed, quantization_attributes.zp_dt.hash()); + seed = hash_combine(seed, quantization_attributes.output_storage_type);; + return seed; } @@ -50,7 +68,15 @@ struct kv_cache : public primitive_base { return variable_info == rhs_casted.variable_info && concat_axis == rhs_casted.concat_axis && gather_axis == rhs_casted.gather_axis && - indirect == rhs_casted.indirect; + indirect == rhs_casted.indirect && + compressed == rhs_casted.compressed && + quantization_attributes.scales_zp_output_order == rhs_casted.quantization_attributes.scales_zp_output_order && + quantization_attributes.output_storage_type == rhs_casted.quantization_attributes.output_storage_type && + quantization_attributes.group_sizes == rhs_casted.quantization_attributes.group_sizes && + quantization_attributes.quantization_dt == rhs_casted.quantization_attributes.quantization_dt && + quantization_attributes.scale_dt == rhs_casted.quantization_attributes.scale_dt && + quantization_attributes.zp_dt == rhs_casted.quantization_attributes.zp_dt && + quantization_attributes.quantization_type == rhs_casted.quantization_attributes.quantization_type; } void save(BinaryOutputBuffer& ob) const override { @@ -62,6 +88,14 @@ struct kv_cache : public primitive_base { ob << concat_axis; ob << gather_axis; ob << indirect; + ob << compressed; + ob << make_data(&quantization_attributes.quantization_type, sizeof(quantization_attributes.quantization_type)); + ob << make_data(&quantization_attributes.quantization_dt, sizeof(quantization_attributes.quantization_dt)); + ob << make_data(&quantization_attributes.scale_dt, sizeof(quantization_attributes.scale_dt)); + ob << make_data(&quantization_attributes.zp_dt, sizeof(quantization_attributes.zp_dt)); + ob << make_data(&quantization_attributes.output_storage_type, sizeof(quantization_attributes.output_storage_type)); + ob << quantization_attributes.scales_zp_output_order; + ob << quantization_attributes.group_sizes; } void load(BinaryInputBuffer& ib) override { @@ -76,6 +110,32 @@ struct kv_cache : public primitive_base { ib >> concat_axis; ib >> gather_axis; ib >> indirect; + ib >> compressed; + ib >> make_data(&quantization_attributes.quantization_type, sizeof(quantization_attributes.quantization_type)); + ib >> make_data(&quantization_attributes.quantization_dt, 
sizeof(quantization_attributes.quantization_dt)); + ib >> make_data(&quantization_attributes.scale_dt, sizeof(quantization_attributes.scale_dt)); + ib >> make_data(&quantization_attributes.zp_dt, sizeof(quantization_attributes.zp_dt)); + ib >> make_data(&quantization_attributes.output_storage_type, sizeof(quantization_attributes.output_storage_type)); + ib >> quantization_attributes.scales_zp_output_order; + ib >> quantization_attributes.group_sizes; + } + + size_t get_compression_scales_inputs_num() const { + if (compressed) { + return 1; + } else { + return 0; + } + } + + size_t get_compression_zp_inputs_num() const { + if (compressed && + quantization_attributes.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && + quantization_attributes.output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::Planar) { + return 1; + } else { + return 0; + } } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp index 7d9e919f56cf13..26465692ef6352 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp @@ -22,19 +22,23 @@ struct read_value : public primitive_base { /// @param id This primitive id /// @param inputs Input parameters ids /// @param variable_id Variable id - /// @param output_layout Memory layout + /// @param output_layouts Memory layouts read_value(const primitive_id& id, const std::vector& inputs, const std::string& variable_id, - const layout& output_layout, + const std::vector& output_layouts, const ov::element::Type& user_specified_type = ov::element::undefined) - : primitive_base(id, inputs, 1, {optional_data_type{output_layout.data_type}}), + : primitive_base(id, inputs, output_layouts.size()), variable_id{variable_id}, - output_layout{output_layout}, - user_specified_type(user_specified_type) {} + output_layouts{output_layouts}, + user_specified_type(user_specified_type) { + for (size_t output_idx = 0; output_idx < output_layouts.size(); output_idx++) { + output_data_types[output_idx] = optional_data_type(output_layouts[output_idx].data_type); + } + } std::string variable_id; - layout output_layout; + std::vector output_layouts; ov::element::Type user_specified_type; bool operator==(const primitive& rhs) const override { @@ -51,7 +55,9 @@ struct read_value : public primitive_base { primitive_base::save(ob); ov::element::Type_t data_type = user_specified_type; ob << variable_id; - ob << output_layout; + ob << output_layouts.size(); + for (const auto& layout : output_layouts) + ob << layout; ob << make_data(&data_type, sizeof(ov::element::Type_t)); } @@ -59,7 +65,12 @@ struct read_value : public primitive_base { primitive_base::load(ib); ov::element::Type_t data_type = ov::element::Type_t::undefined; ib >> variable_id; - ib >> output_layout; + size_t output_layouts_size; + ib >> output_layouts_size; + output_layouts.resize(output_layouts_size); + for (size_t i = 0; i < output_layouts_size; i++) { + ib >> output_layouts[i]; + } ib >> make_data(&data_type, sizeof(ov::element::Type_t)); user_specified_type = data_type; } diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp index d66012bfac8889..1fd5b43824d0a7 100644 --- 
a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp @@ -5,16 +5,20 @@ #pragma once #include "primitive.hpp" +#include "ov_ops/dynamic_quantize.hpp" + namespace cldnn { struct scaled_dot_product_attention : public primitive_base { CLDNN_DECLARE_PRIMITIVE(scaled_dot_product_attention) + using QuantizationAttributes = ov::op::internal::DynamicQuantize::Attributes; + scaled_dot_product_attention() : primitive_base("", {}) {} /// @brief Constructs scaled_dot_product_attention primitive. /// @param id This primitive id. - /// @param inputs Input data primitives id (query, keys, values, [attention_mask], [scale]). + /// @param inputs Input data primitives id (query, keys, values, [attention_mask], [scale], [keys scales], [keys zp], [values scales], [values zp]). /// @param is_causal If true, assumes causal attention masking. In this case attention_mask input is ignored. scaled_dot_product_attention(const primitive_id& id, const std::vector inputs, @@ -23,18 +27,29 @@ struct scaled_dot_product_attention : public primitive_base& input_q_transpose_order = {}, const std::vector& input_k_transpose_order = {}, const std::vector& input_v_transpose_order = {}, - const std::vector& output_transpose_order = {}) + const std::vector& output_transpose_order = {}, + const QuantizationAttributes& quantization_attributes = {}, + bool is_kv_compressed = false) : primitive_base(id, inputs) , is_causal(is_causal) , indirect_axis(indirect_axis) + , is_kv_compressed(is_kv_compressed) + , quantization_attributes(quantization_attributes) , input_q_transpose_order(input_q_transpose_order) , input_k_transpose_order(input_k_transpose_order) , input_v_transpose_order(input_v_transpose_order) , output_transpose_order(output_transpose_order) { auto data_inputs_num = inputs.size(); - if (indirect_axis != -1) + if (indirect_axis != -1) { data_inputs_num--; + } + if (is_kv_compressed) { + data_inputs_num -= 2; // scales + if (quantization_attributes.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && + quantization_attributes.output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::Planar) + data_inputs_num -= 2; // zp + } has_attn_mask_input = data_inputs_num > 3; has_scale_input = data_inputs_num > 4; } @@ -44,6 +59,9 @@ struct scaled_dot_product_attention : public primitive_base input_q_transpose_order; std::vector input_k_transpose_order; std::vector input_v_transpose_order; @@ -59,6 +77,15 @@ struct scaled_dot_product_attention : public primitive_base::load(ib); ib >> is_causal; + ib >> is_kv_compressed; ib >> has_attn_mask_input; ib >> has_scale_input; ib >> indirect_axis; @@ -100,6 +144,31 @@ struct scaled_dot_product_attention : public primitive_base> input_k_transpose_order; ib >> input_v_transpose_order; ib >> output_transpose_order; + ib >> make_data(&quantization_attributes.quantization_type, sizeof(quantization_attributes.quantization_type)); + ib >> make_data(&quantization_attributes.quantization_dt, sizeof(quantization_attributes.quantization_dt)); + ib >> make_data(&quantization_attributes.scale_dt, sizeof(quantization_attributes.scale_dt)); + ib >> make_data(&quantization_attributes.zp_dt, sizeof(quantization_attributes.zp_dt)); + ib >> make_data(&quantization_attributes.output_storage_type, sizeof(quantization_attributes.output_storage_type)); + ib >> quantization_attributes.scales_zp_output_order; + ib >> 
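// A worked example of the input-count bookkeeping in the constructor above (a sketch only; the exact
// input set depends on how the plugin builds the primitive): with inputs
// [query, keys, values, attn_mask, scale, key_scales, value_scales, key_zp, value_zp, beam_table],
// indirect_axis != -1, and asymmetric quantization with Planar output storage:
//   data_inputs_num = 10 - 1 (beam_table) - 2 (scales) - 2 (zero points) = 5,
// so has_attn_mask_input (data_inputs_num > 3) and has_scale_input (data_inputs_num > 4) are both true.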
quantization_attributes.group_sizes; + } + + size_t get_compression_scales_inputs_num() const { + if (is_kv_compressed) { + return 2; + } else { + return 0; + } + } + + size_t get_compression_zp_inputs_num() const { + if (is_kv_compressed && + quantization_attributes.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && + quantization_attributes.output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::Planar) { + return 2; + } else { + return 0; + } } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index c65aa3e5894cb8..465ed898ecb7ec 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -143,6 +143,7 @@ class debug_configuration { int disable_primitive_fusing; // Disable primitive fusing int disable_fake_alignment; // Disable fake alignment std::vector dynamic_quantize_layers_without_onednn; // Specify Fully-connected layers which enable Dynamic quantization + int use_kv_cache_compression; // Enable KV-cache compression int dynamic_quantize_group_size; // Enable Dynamic quantization for fully connected primitive by specified group size int disable_horizontal_fc_fusion; // Disable fc horizontal fusion std::set dump_iteration; // Dump n-th execution of network. diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp index 62e4c08a90f004..ab5cb53454b768 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp @@ -454,6 +454,21 @@ inline ::std::ostream& operator<<(::std::ostream& os, const layout& p) { return os << p.to_string(); } +inline ::std::ostream& operator<<(::std::ostream& os, const std::vector& layouts) { + std::stringstream ss; + + ss << "["; + for (size_t i = 0; i < layouts.size(); i++) { + ss << layouts[i].to_short_string(); + + if (i + 1 != layouts.size()) + ss << ", "; + } + ss << "]"; + + return os << ss.str(); +} + using optional_data_type = optional_value; using optional_layout = optional_value; diff --git a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp index 5c945f4c2d389c..8e4957d5f52797 100644 --- a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp @@ -22,29 +22,39 @@ layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& no } template -std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, uint64_t group_size) { +std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, + const dynamic_quantize::Attributes& attrs) { ov::op::internal::DynamicQuantize op; + op.set_attrs(attrs); + auto output_format = act_layout.format; std::vector input_shapes = { act_layout.get(), }; - std::vector shape_group_size(act_layout.get().size(), 1); - shape_group_size.back() = group_size; + auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes); + + std::vector output_layouts = { layout(output_shapes[0], attrs.quantization_dt, output_format), + layout(output_shapes[1], attrs.scale_dt, output_format) }; - auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes, shape_group_size); + if 
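// Illustrative shapes only (assumed, not taken from the patch): for an activation layout
// [?, 32, ?, 128] quantized per token with group_sizes {1, 1, 1, UINT64_MAX}, quantization_dt = i8
// and scale_dt = f16, shape_infer() is expected to give output_shapes[0] = [?, 32, ?, 128] for the
// quantized data and output_shapes[1] = [?, 32, ?, 1] for the scales; the zero-point layout below is
// appended only for asymmetric quantization with Planar output storage.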
(attrs.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && + attrs.output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::Planar) { + output_layouts.emplace_back(layout(output_shapes[2], attrs.zp_dt, output_format)); + } - return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) }; + return output_layouts; } -template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, uint64_t group_size); +template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, + const dynamic_quantize::Attributes& config); template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) { auto desc = impl_param.typed_desc(); const auto& input_layout = impl_param.get_input_layout(); - return __calc_output_layouts(input_layout, UINT64_MAX /* TODO: handle group_size here */); + + return __calc_output_layouts(input_layout, desc->attrs); } template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& node, @@ -56,6 +66,15 @@ std::string dynamic_quantize_inst::to_string(dynamic_quantize_node const& node) std::stringstream primitive_description; + json_composite dynamic_quantize_info; + dynamic_quantize_info.add("output_storage_type", static_cast(desc->attrs.output_storage_type)); + dynamic_quantize_info.add("scales_zp_output_order", desc->attrs.scales_zp_output_order); + dynamic_quantize_info.add("group_sizes", desc->attrs.group_sizes); + dynamic_quantize_info.add("quantization_dt", desc->attrs.quantization_dt); + dynamic_quantize_info.add("scale_dt", desc->attrs.scale_dt); + dynamic_quantize_info.add("zp_dt", desc->attrs.zp_dt); + dynamic_quantize_info.add("quantization_type", static_cast(desc->attrs.quantization_type)); + node_info->add("dynamic_quantize info", dynamic_quantize_info); node_info->dump(primitive_description); return primitive_description.str(); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 7bdbc53ad54d16..6d7d609d232947 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -423,10 +423,11 @@ bool crop_in_place_optimization::can_crop_be_optimized_simple_data_format(const } static bool can_read_value_be_optimize(const read_value_node& node) { - if (node.get_users().size() == 1) + std::unordered_set unique_users(node.get_users().begin(), node.get_users().end()); + if (unique_users.size() == 1) return true; - const auto non_shape_of_users_count = std::count_if(node.get_users().begin(), node.get_users().end(), [](const program_node* user) { + const auto non_shape_of_users_count = std::count_if(unique_users.begin(), unique_users.end(), [](const program_node* user) { return !user->is_type(); }); if (non_shape_of_users_count <= 1) @@ -877,18 +878,39 @@ void prepare_buffer_fusing::run(program& p) { node.set_output_layout(kv_out_layout); node.can_share_buffer(false); - auto update_dep = [&info_dynamic_pad](program_node* dep) { - auto prev_layout = dep->get_output_layout(); + auto update_dep = [](program_node* dep, padding::DynamicDimsMask& info_dynamic_pad, size_t idx) { + auto prev_layout = dep->get_output_layout(true, idx); prev_layout.data_padding._dynamic_dims_mask = info_dynamic_pad; - 
dep->set_output_layout(prev_layout); + dep->set_output_layout(prev_layout, true, idx); dep->can_share_buffer(false); }; + auto update_scale_zp = [&](size_t kv_cache_output_idx, size_t read_value_output_idx) { + auto scales_out_layout = node.get_output_layout(false, kv_cache_output_idx); + + const size_t scales_zp_concat_axis = 2; + padding::DynamicDimsMask info_dynamic_pad_scales; + info_dynamic_pad_scales[scales_zp_concat_axis] = 1; + scales_out_layout.data_padding._dynamic_dims_mask = info_dynamic_pad_scales; + node.set_output_layout(scales_out_layout, true, kv_cache_output_idx); + + update_dep(rv_prim, info_dynamic_pad_scales, read_value_output_idx); + }; + if (rv_prim) { - update_dep(rv_prim); + update_dep(rv_prim, info_dynamic_pad, 0); } if (gather_prim) { - update_dep(gather_prim); + update_dep(gather_prim, info_dynamic_pad, 0); + } + + const auto& desc = node.get_primitive(); + if (desc->compressed) { + update_scale_zp(2, 1); + + if (desc->get_compression_zp_inputs_num() > 0) { + update_scale_zp(3, 2); + } } } }); @@ -922,7 +944,7 @@ void prepare_buffer_fusing::run(program& p) { // TODO: Allow optimizations for the case above too. Looks like it can be achieved by more careful // topological sort (i.e. if we ensure that all read_value users are completed before assign is run) node.can_be_optimized(can_read_value_be_optimize(node)); - GPU_DEBUG_TRACE_DETAIL << "[prepare_buffer_fusing] : " << node.id() << " can be optimized" << std::endl; + GPU_DEBUG_TRACE_DETAIL << "[prepare_buffer_fusing] : " << node.id() << " can be optimized = " << node.can_be_optimized() << std::endl; }); } } diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp index 5692b6037a09e0..da7e95c4ab74a5 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp @@ -6,6 +6,8 @@ #include "impls/registry/implementation_map.hpp" #include "register.hpp" +#include "intel_gpu/plugin/multi_tensor_variable_state.hpp" + namespace cldnn { namespace cpu { @@ -64,7 +66,21 @@ struct read_value_impl : public typed_primitive_impl { } if (!instance.can_be_optimized()) { - return instance.output_memory(0).copy_from(stream, *variable.get_memory(), false); + GPU_DEBUG_TRACE_DETAIL << "Copy variable's memory to new read_value's output buffer\n"; + std::vector res_events; + res_events.push_back(instance.output_memory(0).copy_from(stream, *variable.get_memory(), false)); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + auto scales_state = compressed_cache_variable->get_compression_scale_state(); + res_events.push_back(instance.output_memory(1).copy_from(stream, *scales_state->get_memory(), false)); + + if (compressed_cache_variable->has_zp_state()) { + auto zp_state = compressed_cache_variable->get_compression_zp_state(); + res_events.push_back(instance.output_memory(2).copy_from(stream, *zp_state->get_memory(), false)); + } + } + + return stream.aggregate_events(res_events, res_events.size() > 1); } return instance.get_network().get_stream().create_user_event(true); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp index c3d436eb9c9b8d..0c212882f9dbbb 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp @@ -34,7 +34,6 @@ struct dynamic_quantize_impl : typed_primitive_impl_ocl { } static 
kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { - /// TODO: handle group_size here auto params = get_default_params(impl_param, is_shape_agnostic); params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(1))); @@ -45,6 +44,16 @@ struct dynamic_quantize_impl : typed_primitive_impl_ocl { auto& fc_node = user_node->as(); params.fc_ifm_size = fc_node.weights().get_output_layout().feature(); } + + if (impl_param.output_layouts.size() > 2) + params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(2))); + + const auto& desc = impl_param.typed_desc(); + params.group_sizes = desc->attrs.group_sizes; + params.scales_output_order = desc->attrs.scales_zp_output_order; + params.use_asymmetric_quantization = desc->attrs.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric; + params.combine_scales_and_zp = desc->attrs.output_storage_type != ov::op::internal::DynamicQuantize::OutputStorageType::Planar; + return params; } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp index e4e4adfbb15452..d0fcace0b3f184 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp @@ -9,12 +9,17 @@ #include "multi_stage_primitive.hpp" #include "kv_cache_inst.h" +#include "dynamic_quantize_inst.h" #include "concatenation/concatenation_kernel_selector.h" #include "concatenation/concatenation_kernel_base.h" #include "beam_table_update/beam_table_update_kernel_selector.hpp" #include "beam_table_update/beam_table_update_kernel_ref.hpp" +#include "dynamic_quantize/dynamic_quantize_kernel_selector.h" +#include "dynamic_quantize/dynamic_quantize_kernel_kv_cache.h" #include "openvino/core/dimension.hpp" +#include + namespace cldnn { namespace ocl { @@ -57,6 +62,9 @@ struct kv_cache_impl : multi_stage_primitive { using bt_kernel_selector_t = kernel_selector::beam_table_update_kernel_selector; using bt_kernel_params_t = kernel_selector::beam_table_update_params; + using dq_kernel_selector_t = kernel_selector::dynamic_quantize_kernel_selector; + using dq_kernel_params_t = kernel_selector::dynamic_quantize_params; + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::kv_cache_impl) std::unique_ptr clone() const override { @@ -65,6 +73,9 @@ struct kv_cache_impl : multi_stage_primitive { const size_t concat_stage = 0; const size_t beam_table_stage = 1; + const size_t dq_stage = 2; + const size_t scale_concat_stage = 3; + const size_t zp_concat_stage = 4; cldnn::memory::ptr beam_table_prev = nullptr; cldnn::memory::ptr beam_table_new = nullptr; @@ -75,16 +86,30 @@ struct kv_cache_impl : multi_stage_primitive { auto& kernel_selector = kernel_selector_t::Instance(); auto kernel_impl = kernel_selector.GetImplementation(_kernels_data[concat_stage].kernelName); kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[concat_stage]); - if (_kernels_data.size() == 2) { + if (_kernels_data.size() >= 2) { auto& bt_kernel_selector = bt_kernel_selector_t::Instance(); auto bt_kernel_impl = bt_kernel_selector.GetImplementation(_kernels_data[beam_table_stage].kernelName); bt_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[beam_table_stage]); } + + if (_kernels_data.size() >= 3) { + auto& dq_kernel_selector = dq_kernel_selector_t::Instance(); + auto dq_kernel_impl = dq_kernel_selector.GetImplementation(_kernels_data[dq_stage].kernelName); + 
dq_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[dq_stage]); + } + + if (_kernels_data.size() >= 4) { + auto& scale_zp_concat_kernel_selector = kernel_selector_t::Instance(); + auto scale_zp_concat_kernel_impl = scale_zp_concat_kernel_selector.GetImplementation(_kernels_data[scale_concat_stage].kernelName); + scale_zp_concat_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[scale_concat_stage]); + } } } void set_arguments_impl(kv_cache_inst& instance) override {} kernel_arguments_data get_arguments(const kv_cache_inst& instance, size_t stage) const override { + // input buffers order: [past, new, (beam_table), (past_scale), (past_zp)] + // output buffers order: [current, (beam_table), (current_scale), (current_zp)] kernel_arguments_data args; args.shape_info = instance.shape_info_memory_ptr(); if (stage == concat_stage) { @@ -93,12 +118,27 @@ struct kv_cache_impl : multi_stage_primitive { } else if (stage == beam_table_stage) { args.inputs = { beam_table_prev, instance.input_memory_ptr(2) }; args.outputs = { beam_table_new }; + } else if (stage == dq_stage) { + args.inputs = { instance.input_memory_ptr(1) }; + args.outputs = { instance.output_memory_ptr(0) }; + for (size_t i = 2; i < instance.outputs_memory_count(); i++) { + args.outputs.push_back(instance.output_memory_ptr(i)); + } + } else if (stage == scale_concat_stage) { + args.inputs = { instance.input_memory_ptr(3) }; + args.outputs = { instance.output_memory_ptr(2) }; + } else if (stage == zp_concat_stage) { + args.inputs = { instance.input_memory_ptr(4) }; + args.outputs = { instance.output_memory_ptr(3) }; } - return args; } - void execute_stage(const std::vector& events, kv_cache_inst& instance, std::vector& all_events, size_t stage) { + void execute_stage(const std::vector& events, + kv_cache_inst& instance, + std::vector& all_events, + size_t stage, + size_t arguments_set = SIZE_MAX) { stream& stream = instance.get_network().get_stream(); std::vector tmp_events(events); size_t kernel_offset = 0; @@ -114,7 +154,8 @@ struct kv_cache_impl : multi_stage_primitive { bool needs_completion_event = instance.needs_completion_event(); auto& params = _kernels_data[stage].kernels[kd_idx].params; - auto args = get_arguments(instance, stage); + auto custom_arguments = arguments_set == SIZE_MAX ? 
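// Sketch of how the stage indices map onto the buffer order documented in get_arguments() above,
// assuming an indirect + compressed kv_cache with a separate (Planar) zero-point buffer:
//   concat_stage       : inputs {0: past, 1: new}              -> output 0 (present)
//   beam_table_stage   : inputs {beam_table_prev, 2: beam_idx} -> beam_table_new
//   dq_stage           : input  {1: new}                       -> outputs {0: present, 2: scales, 3: zp}
//   scale_concat_stage : input  {3: past scales}               -> output 2 (present scales)
//   zp_concat_stage    : input  {4: past zp}                   -> output 3 (present zp)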
stage : arguments_set; + auto args = get_arguments(instance, custom_arguments); args.scalars = ¶ms.scalars; for (const auto& m : instance.get_intermediates_memories()) { @@ -183,18 +224,58 @@ struct kv_cache_impl : multi_stage_primitive { beam_table_state->set(); } + if (desc->compressed) { + // Copy scales to the new buffer if needed + execute_stage(events, instance, res_events, scale_concat_stage, scale_concat_stage); + + if (desc->get_compression_zp_inputs_num() > 0) { + // Copy zero points to the new buffer if needed + execute_stage(events, instance, res_events, scale_concat_stage, zp_concat_stage); + } + + // Perform dynamic quantization of new token data and append result to the KV-cache + auto dq_params = get_dq_update_kernel_params(impl_param, impl_param.is_dynamic()); + (_kernels_data[dq_stage].update_dispatch_data_func)(dq_params, _kernels_data[dq_stage]); + execute_stage(events, instance, res_events, dq_stage); + + auto compressed_cache_variable = dynamic_cast(&variable); + compressed_cache_variable->get_compression_scale_state()->set(); + + if (desc->get_compression_zp_inputs_num() > 0) { + compressed_cache_variable->get_compression_zp_state()->set(); + } + } + variable.set(); if (can_be_optimized) { GPU_DEBUG_TRACE_DETAIL << desc->id << " : Output is same as variable memory! Skip copying " << std::endl; // When primitive is optimized, concat kernel writes directly to variable memory return stream.aggregate_events(res_events, res_events.size() > 1); } else { - // Othwerise, we need to copy result from out buffer to state memory - GPU_DEBUG_TRACE_DETAIL << desc->id << " : Copying output to variable meomry" << std::endl; + // Otherwise, we need to copy result from out buffer to state memory + GPU_DEBUG_TRACE_DETAIL << desc->id << " : Copying output to variable memory" << std::endl; stream.enqueue_barrier(); + + std::vector res_events; auto out = instance.get_network().get_engine().reinterpret_buffer(instance.output_memory(0), variable.get_memory()->get_layout()); - return variable.get_memory()->copy_from(stream, *out, false); + res_events.push_back(variable.get_memory()->copy_from(stream, *out, false)); + + if (desc->compressed) { + auto compressed_cache_variable = dynamic_cast(&variable); + + auto scale_state = compressed_cache_variable->get_compression_scale_state(); + auto out_scale_mem = instance.get_network().get_engine().reinterpret_buffer(instance.output_memory(2), scale_state->get_memory()->get_layout()); + res_events.push_back(scale_state->get_memory()->copy_from(stream, *out_scale_mem, false)); + + if (desc->get_compression_zp_inputs_num() > 0) { + auto zp_state = compressed_cache_variable->get_compression_zp_state(); + auto out_zp_mem = instance.get_network().get_engine().reinterpret_buffer(instance.output_memory(3), zp_state->get_memory()->get_layout()); + res_events.push_back(zp_state->get_memory()->copy_from(stream, *out_zp_mem, false)); + } + } + + return stream.aggregate_events(res_events, res_events.size() > 1); } } @@ -264,10 +345,14 @@ struct kv_cache_impl : multi_stage_primitive { params.is_state_set = is_state_set; params.indirect_axis = indirect_axis; - const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, beam_table_past]] - const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; // [kv_present, beam_table_present] + const auto& desc = impl_param.typed_desc(); + const auto compression_inputs = desc->get_compression_scales_inputs_num() + desc->get_compression_zp_inputs_num(); + const auto 
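// Worked example for the index computed just below, assuming a compressed kv_cache with a separate
// zero-point input: compression_inputs = 1 (scales) + 1 (zp), so beam_table_past_idx = 3 + 2 = 5 and
// the input ports are [0: kv_past, 1: kv_new_token, 2: beam_idx, 3: scale_past, 4: zp_past, 5: beam_table_past].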
beam_table_past_idx = 3 + compression_inputs; + + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, [scale_past], [zp_past], beam_table_past]] + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; // [kv_present, beam_table_present, compression_scale_present] std::map in_tensor_to_offset_map = { - {0, in_offsets_map.at(3)}, // beam_table_past + {0, in_offsets_map.at(beam_table_past_idx)}, // beam_table_past {1, in_offsets_map.at(2)}, // beam_idx }; std::map out_tensor_to_offset_map = { @@ -279,17 +364,110 @@ struct kv_cache_impl : multi_stage_primitive { return params; } + static dq_kernel_params_t get_dq_update_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + const auto& primitive = impl_param.typed_desc(); + auto params = get_default_params(impl_param, is_shape_agnostic); + + params.append_axis = primitive->concat_axis; + params.group_sizes = primitive->quantization_attributes.group_sizes; + params.scales_output_order = primitive->quantization_attributes.scales_zp_output_order; + params.use_asymmetric_quantization = + primitive->quantization_attributes.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric; + params.combine_scales_and_zp = + primitive->quantization_attributes.output_storage_type != ov::op::internal::DynamicQuantize::OutputStorageType::Planar; + + const auto& past_kv_cache_shape = impl_param.input_layouts[0].get_partial_shape(); + params.axis_offset = past_kv_cache_shape[primitive->concat_axis].is_static() ? past_kv_cache_shape[primitive->concat_axis].get_length() : 0; + + auto inputs_count = 1; + auto outputs_count = 2; + params.inputs.resize(inputs_count); + params.outputs.resize(outputs_count); + + auto current_token_layout = impl_param.input_layouts[1]; + auto present_layout = impl_param.output_layouts[0]; + auto present_scales_layout = impl_param.output_layouts[2]; + params.inputs[0] = convert_data_tensor(current_token_layout); + params.outputs[0] = convert_data_tensor(present_layout); + params.outputs[1] = convert_data_tensor(present_scales_layout); + + const bool has_zp_output_buffer = primitive->get_compression_zp_inputs_num() > 0; + if (has_zp_output_buffer) { + auto present_zp_layout = impl_param.output_layouts[3]; + params.outputs.resize(outputs_count + 1); + params.outputs[2] = convert_data_tensor(present_zp_layout); + } + + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; + + std::map in_tensor_to_offset_map = { + {0, in_offsets_map.at(1)}, // kv_new_token + }; + std::map out_tensor_to_offset_map = { + {0, out_offsets_map.at(0)}, // compressed_kv_present + {1, out_offsets_map.at(2)}, // compression_scale_present + }; + + if (has_zp_output_buffer) { + out_tensor_to_offset_map.emplace(2, out_offsets_map.at(3)); // compression_zp_present + } + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); + + return params; + } + + static kernel_params_t get_compression_scale_update_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + auto params = get_default_params(impl_param, is_shape_agnostic); + + const auto concat_axis = 2; + params.axis = convert_axis(concat_axis, impl_param.get_output_layout().get_rank()); + + auto inputs_count = 1; + auto comp_scale_past_layout = impl_param.input_layouts[3]; + auto comp_scale_present_layout = impl_param.output_layouts[2]; + + 
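// Context for the parameters set up below (a sketch of the assumed port layout for a compressed
// kv_cache): the previously accumulated scales arrive on input port 3 (comp_scale_past) and are
// concatenated along axis 2 (the fixed sequence axis used for scales and zero points) with the scales
// produced by the dynamic-quantization stage, ending up in output port 2 (comp_scale_present).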
params.inputs.resize(inputs_count); + params.inputs[0] = convert_data_tensor(comp_scale_past_layout); + params.outputs[0] = convert_data_tensor(comp_scale_present_layout); + + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; + + std::map in_tensor_to_offset_map = { + {0, in_offsets_map.at(3)}, // compression_scale_past + }; + std::map out_tensor_to_offset_map = { + {0, out_offsets_map.at(2)}, // compression_scale_present + }; + + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); + + return params; + } + static std::unique_ptr create(const typed_program_node& arg, const kernel_impl_params& impl_param) { std::vector kernels_data; auto concat_kernel_params = get_concat_kernel_params(impl_param, impl_param.is_dynamic()); auto& concat_kernel_selector = kernel_selector_t::Instance(); kernels_data.push_back(concat_kernel_selector.get_best_kernel(concat_kernel_params)); const bool indirect = impl_param.typed_desc()->indirect; + const bool compressed = impl_param.typed_desc()->compressed; if (indirect) { auto bt_update_kernel_params = get_bt_update_kernel_params(impl_param, false); auto& bt_update_kernel_selector = bt_kernel_selector_t::Instance(); kernels_data.push_back(bt_update_kernel_selector.get_best_kernel(bt_update_kernel_params)); } + + if (compressed) { + auto dq_kernel_params = get_dq_update_kernel_params(impl_param, impl_param.is_dynamic()); + auto& dq_kernel_selector = dq_kernel_selector_t::Instance(); + kernels_data.push_back(dq_kernel_selector.get_best_kernel(dq_kernel_params)); + + auto concat_scale_zp_kernel_params = get_compression_scale_update_kernel_params(impl_param, impl_param.is_dynamic()); + auto& concat_scale_zp_kernel_selector = kernel_selector_t::Instance(); + kernels_data.push_back(concat_scale_zp_kernel_selector.get_best_kernel(concat_scale_zp_kernel_params)); + } return cldnn::make_unique(kernels_data); } @@ -307,13 +485,26 @@ struct kv_cache_impl : multi_stage_primitive { (_kernels_data[concat_stage].update_dispatch_data_func)(params, _kernels_data[concat_stage]); _kernels_data[concat_stage].kernels[0].skip_execution = impl_param._can_be_optimized || impl_param.get_input_layout(0).count() == 0; + + if (impl_param.typed_desc()->compressed) { + // In case of KV-cache with compression enabled, skip second concat's kernel as new token data append will + // be handled by dynamic quantization kernel + // However, allow execution of the first token for the case if KV-cache can't be optimized (if optimization is disabled, or + // variables memory was reallocated and we have to copy past KV-cache to new memory) + _kernels_data[concat_stage].kernels[1].skip_execution = true; + + // Update dynamic quantization parameters + auto comp_scale_kernel_params = get_compression_scale_update_kernel_params(impl_param, impl_param.is_dynamic()); + (_kernels_data[scale_concat_stage].update_dispatch_data_func)(comp_scale_kernel_params, _kernels_data[scale_concat_stage]); + _kernels_data[scale_concat_stage].kernels[0].skip_execution = impl_param._can_be_optimized || impl_param.get_input_layout(3).count() == 0; + } } }; namespace detail { attach_kv_cache_impl::attach_kv_cache_impl() { - auto types = { data_types::f16, data_types::f32 }; + auto types = { data_types::i8, data_types::f16, data_types::f32 }; auto formats = { format::bfyx }; implementation_map::add(impl_types::ocl, shape_types::dynamic_shape, diff --git 
a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp index b33871110ec6b4..f4791d38f88742 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp @@ -217,6 +217,18 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveis_causal; + if (desc->is_kv_compressed) { + const auto& group_sizes = desc->quantization_attributes.group_sizes; + const auto non_compressed_dims = std::count(group_sizes.begin(), group_sizes.end(), 1); + + config.per_head_quantization = (group_sizes.size() - non_compressed_dims) == 1; + config.is_kv_compressed = desc->is_kv_compressed; + config.use_asymmetric_quantization = + desc->quantization_attributes.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric; + config.combine_scales_and_zp = + desc->quantization_attributes.output_storage_type != ov::op::internal::DynamicQuantize::OutputStorageType::Planar; + } + return config; } @@ -229,6 +241,14 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveis_kv_compressed) { + data_inputs_num -= 2; // key and value compression scales are handled separately + + if (desc->get_compression_zp_inputs_num() > 0) + data_inputs_num -= 2; // key and value compression zp are handled separately + } + params.inputs.resize(data_inputs_num); for (size_t i = 0; i < data_inputs_num; i++) { params.inputs[i] = convert_data_tensor(impl_param.get_input_layout(i)); @@ -246,15 +266,41 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveindirect_axis; } - params.set_dynamic_shape_offsets(); + if (desc->is_kv_compressed) { + params.key_cache_comp_scale = convert_data_tensor(impl_param.get_input_layout(data_inputs_num)); + params.value_cache_comp_scale = convert_data_tensor(impl_param.get_input_layout(data_inputs_num + 1)); - // Need to adjust sdpa kernel offset to consider beam table input - if (has_indirect_inputs(impl_param)) { - auto out_offset = params.outputs[0].get_dynamic_shape_offset(); - if (indirect) - params.beam_table.SetDynamicShapeOffset(out_offset); + if (has_zp_input_buffers) { + params.key_cache_comp_zp = convert_data_tensor(impl_param.get_input_layout(data_inputs_num + 2)); + params.value_cache_comp_zp = convert_data_tensor(impl_param.get_input_layout(data_inputs_num + 3)); + } + } - params.outputs[0].SetDynamicShapeOffset(out_offset + kernel_selector::DataTensor::max_rank()); + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; + std::map in_tensor_to_offset_map; + for (size_t i = 0; i < data_inputs_num; i++) { + in_tensor_to_offset_map[i] = in_offsets_map.at(i); + } + + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; + std::map out_tensor_to_offset_map = { + {0, out_offsets_map.at(0)}, + }; + + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); + + if (desc->is_kv_compressed) { + params.key_cache_comp_scale.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num)); + params.value_cache_comp_scale.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num + 1)); + + if (has_zp_input_buffers) { + params.key_cache_comp_zp.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num + 2)); + params.value_cache_comp_zp.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num + 3)); + } + } + + if (indirect && has_indirect_inputs(impl_param)) { + 
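// Assumed port ordering for a compressed SDPA, matching the offsets assigned above: the regular data
// inputs occupy ports [0, data_inputs_num), e.g. {query, keys, values, attn_mask, scale} for
// data_inputs_num = 5, followed by key_cache_comp_scale (5), value_cache_comp_scale (6) and, for
// asymmetric Planar storage, key_cache_comp_zp (7) and value_cache_comp_zp (8), with the beam table last.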
params.beam_table.SetDynamicShapeOffset(get_beam_table_id(desc)); } return params; @@ -300,6 +346,7 @@ attach_scaled_dot_product_attention_impl::attach_scaled_dot_product_attention_im auto types = { data_types::f32, data_types::f16, + data_types::i8, }; auto formats = { diff --git a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h index 49dd62c6332549..f96085094ae221 100644 --- a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h @@ -35,7 +35,8 @@ class typed_primitive_inst : public typed_primitive_inst_base< // Internal function to be used from fakealignment template - static std::vector __calc_output_layouts(const layout &act_layout, uint64_t group_size); + static std::vector __calc_output_layouts(const layout &act_layout, + const dynamic_quantize::Attributes& config); static std::string to_string(dynamic_quantize_node const& node); typed_primitive_inst(network& network, dynamic_quantize_node const& node); diff --git a/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h b/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h index f3aa4de5ec34e1..da0a9397433f89 100644 --- a/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h @@ -62,6 +62,12 @@ class typed_primitive_inst : public typed_primitive_inst_base= 0 ? sequence_axis : past_layout_rank + sequence_axis; } + static int64_t get_scale_zp_sequence_axis() { + // The order of scales and zero points is fixed, so use constant axis + const auto scale_zp_concat_axis = 2; + return scale_zp_concat_axis; + } + static int64_t get_max_pad(const layout& target_layout, size_t buffer_size, int64_t sequence_axis, std::string target_name = "") { if (buffer_size == 0) return 0; diff --git a/src/plugins/intel_gpu/src/graph/include/program_node.h b/src/plugins/intel_gpu/src/graph/include/program_node.h index ba5363f09f194b..8105a8bc07dec3 100644 --- a/src/plugins/intel_gpu/src/graph/include/program_node.h +++ b/src/plugins/intel_gpu/src/graph/include/program_node.h @@ -237,7 +237,7 @@ struct program_node { } void merge_output_padding(padding const& padd, size_t idx = 0) { - set_output_padding(padding::max(padd, output_layouts[idx].data_padding)); + set_output_padding(padding::max(padd, output_layouts[idx].data_padding), idx); } // only calculated output layout (for external usage), does not modify/use cached output layout nor invalidate users diff --git a/src/plugins/intel_gpu/src/graph/include/read_value_inst.h b/src/plugins/intel_gpu/src/graph/include/read_value_inst.h index 74f9ffff581b87..ea0c8b82bb21fa 100644 --- a/src/plugins/intel_gpu/src/graph/include/read_value_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/read_value_inst.h @@ -33,9 +33,14 @@ class typed_primitive_inst : public typed_primitive_inst_base static std::vector calc_output_layouts(read_value_node const& /*node*/, const kernel_impl_params& impl_param) { auto desc = impl_param.typed_desc(); - const auto& default_layout = desc->output_layout; + std::vector output_layouts; - return { impl_param.state_layout.value_or(default_layout) }; + for (size_t i = 0; i < desc->num_outputs; i++) { + const auto& default_layout = desc->output_layouts[i]; + output_layouts.push_back(impl_param.state_layouts.size() > i ? 
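// For a compressed KV-cache variable the read_value outputs follow the same order as the state
// components handled elsewhere in this patch (a sketch, assuming zero points are stored separately):
// port 0 carries the cache data, port 1 the compression scales and port 2 the compression zero points,
// which is why num_outputs and output_layouts are iterated together here.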
impl_param.state_layouts[i] : default_layout); + } + + return output_layouts; } static layout calc_output_layout(const read_value_node& node, kernel_impl_params const& impl_param); diff --git a/src/plugins/intel_gpu/src/graph/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/kv_cache.cpp index 66a874b9b153ec..808a593c601ad0 100644 --- a/src/plugins/intel_gpu/src/graph/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/kv_cache.cpp @@ -3,6 +3,7 @@ // #include "intel_gpu/op/kv_cache.hpp" +#include "intel_gpu/op/kv_cache_compressed.hpp" #include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/plugin/multi_tensor_variable_state.hpp" #include "intel_gpu/runtime/optionals.hpp" @@ -29,19 +30,39 @@ template std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& /*node*/, kernel_impl_params const& impl_param) { auto desc = impl_param.typed_desc(); - ov::intel_gpu::op::KVCache op; - op.set_output_size(desc->num_outputs); - op.set_concat_axis(desc->concat_axis); - op.set_gather_axis(desc->gather_axis); - std::vector input_shapes = {impl_param.get_input_layout(0).get(), impl_param.get_input_layout(1).get()}; - if (desc->num_outputs > 1) + if (desc->indirect) { input_shapes.push_back(impl_param.get_input_layout(2).get()); + } - std::vector output_shapes = shape_infer(&op, input_shapes); + if (desc->compressed) { + input_shapes.push_back(impl_param.get_input_layout(3).get()); + + if (desc->get_compression_zp_inputs_num() > 0) { + input_shapes.push_back(impl_param.get_input_layout(4).get()); + } + } + + std::vector output_shapes; + if (desc->compressed) { + ov::intel_gpu::op::KVCacheCompressed op; + op.set_output_size(desc->num_outputs); + op.set_concat_axis(desc->concat_axis); + op.set_gather_axis(desc->gather_axis); + op.set_quantization_attrs(desc->quantization_attributes); + + output_shapes = shape_infer(&op, input_shapes); + } else { + ov::intel_gpu::op::KVCache op; + op.set_output_size(desc->num_outputs); + op.set_concat_axis(desc->concat_axis); + op.set_gather_axis(desc->gather_axis); + + output_shapes = shape_infer(&op, input_shapes); + } - static const std::map ports_map = {{0, 0}, {1, 2}}; + static const std::map ports_map = {{0, 0}, {1, 2}, {2, 3}, {3, 4}}; std::vector out_layouts; for (size_t i = 0; i < desc->num_outputs; i++) { @@ -64,6 +85,9 @@ std::string kv_cache_inst::to_string(const kv_cache_node& node) { kv_cache_info.add("concat axis", node.get_primitive()->concat_axis); kv_cache_info.add("gather axis", node.get_primitive()->gather_axis); kv_cache_info.add("indirect", node.get_primitive()->indirect); + kv_cache_info.add("compressed", node.get_primitive()->compressed); + kv_cache_info.add("output_storage_type", static_cast(node.get_primitive()->quantization_attributes.output_storage_type)); + kv_cache_info.add("scales_zp_output_order", node.get_primitive()->quantization_attributes.scales_zp_output_order); node_info->add("kv_cache info", kv_cache_info); std::stringstream primitive_description; node_info->dump(primitive_description); diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index e5c3f8ca89ef9e..5798f2ca19205f 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -40,6 +40,7 @@ #include "graph_optimizer/prepare_buffer_fusing.h" #include "intel_gpu/plugin/common_utils.hpp" +#include "intel_gpu/plugin/multi_tensor_variable_state.hpp" #include "intel_gpu/graph/network.hpp" #include 
"intel_gpu/graph/serialization/set_serializer.hpp" #include "intel_gpu/runtime/engine.hpp" @@ -294,30 +295,54 @@ void primitive_inst::update_shape() { auto prim = get_node().as().get_primitive(); const auto& variable_id = prim->variable_id; auto& variable = get_network().get_variable(variable_id); - // Initial variable shape is taken from variable itself - auto new_layout = variable.get_layout(); - // If variable is not set and we have an initializer - use it's shape as shape of variable - if (!variable.is_set() && _impl_params->input_layouts.size() == 1) { - new_layout = _impl_params->get_input_layout(0); - } + auto update_state_layout = [&](ov::intel_gpu::VariableStateBase& variable, layout new_layout, size_t layout_idx) { + // If variable is not set and we have an initializer - use it's shape as shape of variable + if (!variable.is_set() && _impl_params->input_layouts.size() > layout_idx) { + new_layout = _impl_params->get_input_layout(layout_idx); + } - // If we still have a dynamic dimension, which basiclly means that we don't have an initializer, then replace dynamic dims with 0 - if (new_layout.is_dynamic()) { - auto pshape = new_layout.get_partial_shape(); - for (auto& d : pshape) { - if (d.is_dynamic()) { - d = 0; + // If we still have a dynamic dimension, which basiclly means that we don't have an initializer, then replace dynamic dims with 0 + if (new_layout.is_dynamic()) { + auto pshape = new_layout.get_partial_shape(); + for (auto& d : pshape) { + if (d.is_dynamic()) { + d = 0; + } } + new_layout.set_partial_shape(pshape); } - new_layout.set_partial_shape(pshape); - } - variable.set_layout(new_layout); + variable.set_layout(new_layout); - if (!_impl_params->state_layout.has_value() || _impl_params->state_layout.value() != new_layout) { - _impl_params->state_layout = new_layout; - input_shape_changed = true; + if (_impl_params->state_layouts[layout_idx] != new_layout) { + _impl_params->state_layouts[layout_idx] = new_layout; + GPU_DEBUG_TRACE_DETAIL << "Update " << layout_idx << " layout: " << new_layout.to_short_string() << "\n"; + input_shape_changed = true; + } + }; + + if (_impl_params->state_layouts.empty()) + _impl_params->state_layouts.resize(1); + + // Initial variable shape is taken from variable itself + auto new_layout = variable.get_layout(); + update_state_layout(variable, new_layout, 0); + + if (prim->num_outputs > 1) { + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + _impl_params->state_layouts.resize(compressed_cache_variable->has_zp_state() ? 
3 : 2); + + auto scales_state = compressed_cache_variable->get_compression_scale_state(); + auto new_scales_layout = compressed_cache_variable->get_compression_scale_state()->get_layout(); + update_state_layout(*scales_state, new_scales_layout, 1); + + if (compressed_cache_variable->has_zp_state()) { + auto scales_state = compressed_cache_variable->get_compression_zp_state(); + auto new_zp_layout = compressed_cache_variable->get_compression_zp_state()->get_layout(); + update_state_layout(*scales_state, new_zp_layout, 2); + } + } } } @@ -462,6 +487,14 @@ void primitive_inst::update_shape() { auto& variable = get_network().get_variable(desc->variable_id); // Custom output layout update as update_output_layout handles paddings incorrectly for optimized out read_value + kv_cache pattern _impl_params->output_layouts[0] = variable.get_layout(); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + _impl_params->output_layouts[1] = compressed_cache_variable->get_compression_scale_state()->get_layout(); + + if (compressed_cache_variable->has_zp_state()) { + _impl_params->output_layouts[2] = compressed_cache_variable->get_compression_zp_state()->get_layout(); + } + } } if (get_node().is_type()) { @@ -563,6 +596,15 @@ event::ptr primitive_inst::realloc_if_needed() { << ", variable layout " << variable.get_layout().to_short_string() << ")" << std::endl; _outputs[0] = variable.get_memory(); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + _outputs[2] = compressed_cache_variable->get_compression_scale_state()->get_memory(); + + if (compressed_cache_variable->has_zp_state()) { + _outputs[3] = compressed_cache_variable->get_compression_zp_state()->get_memory(); + } + } + // To record shape predictor for (size_t j = 0; j < _impl_params->output_layouts.size(); ++j) sp.predict_preallocation_shape(id(), _impl_params->output_layouts[j], true, j); @@ -582,11 +624,27 @@ event::ptr primitive_inst::realloc_if_needed() { GPU_DEBUG_TRACE_DETAIL << id() << ": Update variable (ptr: " << variable.get_memory()->buffer_ptr() << ", actual_size:" << variable.get_actual_mem_size() << " bytes" << ", variable layout:" << variable.get_layout().to_short_string() << ")" << std::endl; + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + compressed_cache_variable->get_compression_scale_state()->set_layout(_impl_params->output_layouts[1]); + + if (compressed_cache_variable->has_zp_state()) { + compressed_cache_variable->get_compression_zp_state()->set_layout(_impl_params->output_layouts[2]); + } + } } // For nodes that can be optimized, variable memory is used as output memory // so there is no need for output memory reallocation if (can_be_optimized()) { _max_output_layout_count[0] = variable.get_actual_mem_size() / dt_sizes_in_B[0]; + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + const size_t scale_idx = _node->is_type() ? 
1 : 2; // kv_cache or read_value + _max_output_layout_count[scale_idx] = compressed_cache_variable->get_compression_scale_state()->get_actual_mem_size() / dt_sizes_in_B[1]; + if (compressed_cache_variable->has_zp_state()) { + _max_output_layout_count[scale_idx + 1] = compressed_cache_variable->get_compression_zp_state()->get_actual_mem_size() / dt_sizes_in_B[2]; + } + } GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO("can_be_optimized"); return ev; } @@ -665,7 +723,10 @@ event::ptr primitive_inst::realloc_if_needed() { // dynamic quantization is only applied to activation of FC if (get_node().is_type()) { - auto dyn_quan_scale_layout = dynamic_quantize_inst::__calc_output_layouts(updated_layouts[dep_idx], 0); + const auto& desc = get_node().as().get_primitive(); + auto dyn_quan_scale_layout = + dynamic_quantize_inst::__calc_output_layouts(updated_layouts[dep_idx], + desc->attrs); GPU_DEBUG_TRACE_DETAIL << "update layout of dynamic quantize scale parameter layout " << dyn_quan_scale_layout[1].to_short_string() << std::endl; updated_params.output_layouts[1] = dyn_quan_scale_layout[1]; @@ -690,13 +751,24 @@ event::ptr primitive_inst::realloc_if_needed() { // update layout to ensure that it repsects paddings for correct allocation size if (_node_output_layout.data_padding.is_dynamic()) { - auto current_dims = updated_layouts[0].get_padded_dims(); + auto update_padding = [](layout& orig_layout) { + auto current_dims = orig_layout.get_padded_dims(); + + std::vector current_buf_shape; + current_buf_shape.reserve(current_dims.size()); + std::transform(current_dims.begin(), current_dims.end(), + std::back_inserter(current_buf_shape), [](const tensor::value_type& el) { return static_cast(el); }); + orig_layout = layout(ov::PartialShape(current_buf_shape), orig_layout.data_type, orig_layout.format); + }; + + update_padding(updated_layouts[0]); - std::vector current_buf_shape; - current_buf_shape.reserve(current_dims.size()); - std::transform(current_dims.begin(), current_dims.end(), - std::back_inserter(current_buf_shape), [](const tensor::value_type& el) { return static_cast(el); }); - updated_layouts[0] = layout(ov::PartialShape(current_buf_shape), updated_layouts[0].data_type, updated_layouts[0].format); + // Update scales and zero points buffers paddings, skipping beam_table + if (_node->is_type()) { + for (size_t i = 2; i < updated_layouts.size(); ++i) { + update_padding(updated_layouts[i]); + } + } } int32_t tmp_prealloc_count = get_prealloc_iter_num(); @@ -709,13 +781,14 @@ event::ptr primitive_inst::realloc_if_needed() { for (size_t i = 0; i < updated_layouts.size(); ++i) { bool reclaim = 0; size_t required_buffer_size = 0; - if (_node->is_type() && i == 0) { + if (_node->is_type() && i != 1) { // Relax reclaiming condition for kv cache const auto& desc = _node->as().get_primitive(); auto prealloc_shape = updated_layouts[i].get_shape(); const auto shape_rank = prealloc_shape.size(); - auto seq_axis = - static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + const auto seq_axis = i == 0 ? 
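// Worked example of the preallocation bookkeeping below (assumed sizes): for a present layout
// [1, 32, seq_len, 128] concatenated along sequence axis 2 with tmp_prealloc_count = 128, the
// preallocated shape becomes [1, 32, seq_len + 128, 128] and required_buffer_size is its element
// count, which then feeds the decision on whether the existing buffer can be reclaimed; for the
// scale/zp outputs the axis comes from get_scale_zp_sequence_axis() (also 2) instead of the
// primitive's concat_axis.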
kv_cache_inst::get_sequence_axis(desc->concat_axis, shape_rank) + : kv_cache_inst::get_scale_zp_sequence_axis(); + prealloc_shape[seq_axis] += tmp_prealloc_count; required_buffer_size = std::accumulate(prealloc_shape.begin(), prealloc_shape.end(), size_t(1), std::multiplies()); } else { @@ -742,11 +815,12 @@ event::ptr primitive_inst::realloc_if_needed() { for (size_t i = 0; i < actual_layouts.size(); ++i) { bool can_reuse_buffer = (_outputs[i] && updated_layouts[i].get_linear_size() <= _max_output_layout_count[i]); std::pair prealloc_info; - if (_node->is_type() && i == 0) { + if (_node->is_type() && i != 1) { const auto& desc = _node->as().get_primitive(); - auto shape_rank = updated_layouts[i].get_shape().size(); - auto seq_axis = - static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + const auto shape_rank = updated_layouts[i].get_shape().size(); + const auto seq_axis = i == 0 ? kv_cache_inst::get_sequence_axis(desc->concat_axis, shape_rank) + : kv_cache_inst::get_scale_zp_sequence_axis(); + prealloc_info = sp.predict_preallocation_shape(id(), updated_layouts[i], false, i, tmp_prealloc_count, seq_axis); } else { prealloc_info = sp.predict_preallocation_shape(id(), updated_layouts[i], can_reuse_buffer, i, tmp_prealloc_count); @@ -762,19 +836,21 @@ event::ptr primitive_inst::realloc_if_needed() { GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer[" << i << "] - " << actual_layouts[i].get_linear_size() << "/" << _max_output_layout_count[i] << std::endl; - if (_node->is_type() && (i == 0)) { + if (_node->is_type() && i != 1) { // kv_cache has already assigned memory. // No need to reinterpret output memory but need to update padding const auto& desc = _node->as().get_primitive(); auto& present_layout = _impl_params->output_layouts[i]; const auto present_layout_rank = present_layout.get_partial_shape().size(); - const auto sequence_axis = kv_cache_inst::get_sequence_axis(desc->concat_axis, present_layout_rank); + const auto sequence_axis = i == 0 ? kv_cache_inst::get_sequence_axis(desc->concat_axis, present_layout_rank) + : kv_cache_inst::get_scale_zp_sequence_axis();; + auto max_pad = kv_cache_inst::get_max_pad(present_layout, _max_output_layout_count[i], sequence_axis, - "present_layout"); + i == 0 ? "present_layout" : "present_scales_layout"); kv_cache_inst::update_pad(present_layout, max_pad, sequence_axis); - GPU_DEBUG_TRACE_DETAIL << _impl_params->output_layouts[i].to_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << i << ". 
" << _impl_params->output_layouts[i].to_string() << std::endl; set_shape_change(); } else { _outputs[i] = _network.get_engine().reinterpret_buffer(*_outputs[i], actual_layouts[i]); @@ -835,6 +911,26 @@ event::ptr primitive_inst::realloc_if_needed() { sequence_axis, "present_layout"); if (max_pad > 0) { + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + auto present_scales_layout = _impl_params->output_layouts[2]; + const auto sequence_axis = kv_cache_inst::get_scale_zp_sequence_axis();; + + // In case of compressed KV-cache, calling update_impl for each iteration + // because of scales layout [batch, num_heads, seq_len, head_size], which requires proper + // dynamic padding updates + axis_is_outer_most = false; + kv_cache_inst::update_pad(present_scales_layout, max_pad, sequence_axis); + + _impl_params->output_layouts[2] = present_scales_layout; + compressed_cache_variable->get_compression_scale_state()->set_memory(_outputs[2], present_scales_layout); + if (compressed_cache_variable->has_zp_state()) { + auto present_zp_layout = present_scales_layout; + + _impl_params->output_layouts[3] = present_scales_layout; + compressed_cache_variable->get_compression_zp_state()->set_memory(_outputs[3], present_zp_layout); + } + } + kv_cache_inst::update_pad(present_layout, max_pad, sequence_axis); if (!axis_is_outer_most) { GPU_DEBUG_TRACE_DETAIL << id() << ": Update impl with new output padding" << std::endl; @@ -855,12 +951,32 @@ event::ptr primitive_inst::realloc_if_needed() { << "'s layout with allocated kv cache output: " << present_layout.to_short_string() << " (is_set = " << variable.is_set() << ") " << std::endl; variable.set_memory(_outputs[0], present_layout); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + auto present_scales_layout = _impl_params->output_layouts[2]; + + compressed_cache_variable->get_compression_scale_state()->set_memory(_outputs[2], present_scales_layout); + if (compressed_cache_variable->has_zp_state()) { + auto present_zp_layout = present_scales_layout; + compressed_cache_variable->get_compression_zp_state()->set_memory(_outputs[3], present_zp_layout); + } + } } } else { GPU_DEBUG_TRACE_DETAIL << id() << ": Update variable " << variable.get_name() << "'s layout with allocated kv cache output: " << present_layout.to_short_string() << " (is_set = " << variable.is_set() << ") " << std::endl; variable.set_layout(present_layout); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + auto present_scales_layout = _impl_params->output_layouts[2]; + + compressed_cache_variable->get_compression_scale_state()->set_layout(present_scales_layout); + if (compressed_cache_variable->has_zp_state()) { + auto present_zp_layout = present_scales_layout; + compressed_cache_variable->get_compression_zp_state()->set_layout(present_zp_layout); + } + } } } @@ -1036,8 +1152,8 @@ bool primitive_inst::update_impl(bool use_async_compilation) { } void primitive_inst::update_paddings() { - auto reset_pad = [](kernel_impl_params& params, const program_node* node) { - params.output_layouts[0].data_padding = node->get_output_layout(0).data_padding; + auto reset_pad = [](kernel_impl_params& params, const program_node* node, size_t idx = 0) { + params.output_layouts[idx].data_padding = node->get_output_layout(idx).data_padding; }; if (_node->is_type() || _node->is_type()) { auto variable_id = _node->is_type() ? 
(_node->as().get_primitive()->variable_id) @@ -1049,6 +1165,15 @@ void primitive_inst::update_paddings() { primitive_inst* inst = this; while (inst) { reset_pad(*inst->_impl_params, inst->_node); + if (inst == this) { + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + const size_t scale_idx = _node->is_type() ? 1 : 2; + reset_pad(*inst->_impl_params, inst->_node, scale_idx); + if (compressed_cache_variable->has_zp_state()) { + reset_pad(*inst->_impl_params, inst->_node, scale_idx + 1); + } + } + } auto& users = inst->_node->get_users(); if (users.size() == 1 && users.front()->get_output_layout(0).data_padding.is_dynamic()) { inst = inst->get_user_insts().front(); @@ -1174,11 +1299,42 @@ void primitive_inst::do_runtime_in_place_kv_cache() { GPU_DEBUG_TRACE_DETAIL << "[do runtime_in_place_kv_cache] " << id() << " Updated present_layout's pad : " << present_layout.to_string() << std::endl; auto& variable = get_network().get_variable(desc->variable_info.variable_id); variable.set_layout(present_layout); + + if (desc->compressed) { + auto compressed_cache_variable = dynamic_cast(&variable); + auto& present_scales_layout = _impl_params->output_layouts[2]; + const auto sequence_axis = kv_cache_inst::get_scale_zp_sequence_axis(); + kv_cache_inst::update_pad(present_scales_layout, max_pad - new_seq_len, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "[do runtime_in_place_kv_cache] " << id() + << " Updated present_scale_layout's pad : " << present_scales_layout.to_string() << std::endl; + + compressed_cache_variable->get_compression_scale_state()->set_layout(present_scales_layout); + if (desc->get_compression_zp_inputs_num() > 0) { + auto& present_zp_layout = _impl_params->output_layouts[3]; + kv_cache_inst::update_pad(present_zp_layout, max_pad - new_seq_len, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "[do runtime_in_place_kv_cache] " << id() + << " Updated present_zp_layout's pad : " << present_scales_layout.to_string() << std::endl; + + compressed_cache_variable->get_compression_zp_state()->set_layout(present_scales_layout); + } + } + GPU_DEBUG_TRACE_DETAIL << "[do_runtime_in_place_kv_cache] " << id() << "Updated variable with present_layout" << variable.get_layout().to_string() << " is_set = " << variable.is_set() << std::endl; if (past_layout.data_padding._upper_size[sequence_axis] > 0 && variable.is_set()) { kv_cache_inst::update_pad(past_layout, max_pad, sequence_axis); _impl_params->_can_be_optimized = true; + + if (desc->compressed) { + auto& past_scale_layout = _impl_params->input_layouts[3]; + const auto sequence_axis = kv_cache_inst::get_scale_zp_sequence_axis(); + kv_cache_inst::update_pad(past_scale_layout, max_pad, sequence_axis); + + if (desc->get_compression_zp_inputs_num() > 0) { + auto& past_zp_layout = _impl_params->input_layouts[4]; + kv_cache_inst::update_pad(past_zp_layout, max_pad, sequence_axis); + } + } GPU_DEBUG_TRACE_DETAIL << "[do_runtime_in_place_kv_cache] " << id() << " Updated past layout's pad : " << past_layout.to_string() << std::endl; } } diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index fc9648b90e444c..a9bb7c665f177b 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -90,7 +90,8 @@ void program_node::replace_dependency(size_t idx, std::pair const program_node::get_input_layouts() const { std::vector layouts; for (size_t i = 0; i < dependencies.size(); i++) { - layouts.push_back(get_input_layout(i)); + auto 
input_layout = get_input_layout(i); + layouts.push_back(input_layout); } return layouts; } diff --git a/src/plugins/intel_gpu/src/graph/read_value.cpp b/src/plugins/intel_gpu/src/graph/read_value.cpp index bf6e730e8a808b..1d6657b9bf8ac4 100644 --- a/src/plugins/intel_gpu/src/graph/read_value.cpp +++ b/src/plugins/intel_gpu/src/graph/read_value.cpp @@ -2,8 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include "read_value_inst.h" #include "primitive_type_base.h" + +#include "intel_gpu/plugin/multi_tensor_variable_state.hpp" + #include #include @@ -16,7 +19,7 @@ read_value_inst::typed_primitive_inst(network& network, const read_value_node& n } layout read_value_inst::calc_output_layout(const read_value_node& node, kernel_impl_params const& impl_param) { - return impl_param.typed_desc()->output_layout; + return impl_param.typed_desc()->output_layouts[0]; } std::string read_value_inst::to_string(const read_value_node& node) { @@ -45,5 +48,25 @@ void read_value_inst::update_output_memory() { GPU_DEBUG_TRACE_DETAIL << " - layout " << variable.get_layout().to_string() << std::endl; GPU_DEBUG_TRACE_DETAIL << " - actual_size " << variable.get_actual_mem_size() << " bytes" << std::endl; set_output_memory(variable.get_memory(), false, 0); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + auto scales_state = compressed_cache_variable->get_compression_scale_state(); + set_output_memory(scales_state->get_memory(), false, 1); + + GPU_DEBUG_TRACE_DETAIL << id() << " Update output memory with variable " << scales_state->get_name() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - ptr : " << scales_state->get_memory()->buffer_ptr() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - layout " << scales_state->get_layout().to_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - actual_size " << scales_state->get_actual_mem_size() << " bytes" << std::endl; + + if (compressed_cache_variable->has_zp_state()) { + auto zp_state = compressed_cache_variable->get_compression_zp_state(); + set_output_memory(zp_state->get_memory(), false, 2); + + GPU_DEBUG_TRACE_DETAIL << id() << " Update output memory with variable " << zp_state->get_name() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - ptr : " << zp_state->get_memory()->buffer_ptr() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - layout " << zp_state->get_layout().to_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - actual_size " << zp_state->get_actual_mem_size() << " bytes" << std::endl; + } + } } } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp index e8e213ad97011a..e80cb62a534b52 100644 --- a/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp @@ -11,6 +11,7 @@ #include "scaled_dot_product_attention_shape_inference.hpp" #include "intel_gpu/op/sdpa.hpp" +#include "ov_ops/dynamic_quantize.hpp" namespace cldnn { GPU_DEFINE_PRIMITIVE_TYPE_ID(scaled_dot_product_attention) @@ -87,6 +88,14 @@ std::string scaled_dot_product_attention_inst::to_string(scaled_dot_product_atte json_composite scaled_dot_product_attention_info; scaled_dot_product_attention_info.add("input id", input.id()); scaled_dot_product_attention_info.add("is_causal", desc->is_causal); + scaled_dot_product_attention_info.add("is_kv_compressed", desc->is_kv_compressed); + scaled_dot_product_attention_info.add("output_storage_type", 
static_cast(node.get_primitive()->quantization_attributes.output_storage_type)); + scaled_dot_product_attention_info.add("group_size", desc->quantization_attributes.group_sizes); + scaled_dot_product_attention_info.add("quantization_type", static_cast(node.get_primitive()->quantization_attributes.quantization_type)); + scaled_dot_product_attention_info.add("quantization_dt", desc->quantization_attributes.quantization_dt); + scaled_dot_product_attention_info.add("scale_dt", desc->quantization_attributes.scale_dt); + scaled_dot_product_attention_info.add("zp_dt", desc->quantization_attributes.zp_dt); + scaled_dot_product_attention_info.add("indirect_axis", desc->indirect_axis); scaled_dot_product_attention_info.add("has_attn_mask_input", desc->has_attn_mask_input); scaled_dot_product_attention_info.add("has_scale_input", desc->has_scale_input); scaled_dot_product_attention_info.add("input_q_transpose_order", desc->input_q_transpose_order); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_kv_cache.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_kv_cache.cl new file mode 100644 index 00000000000000..22a2f03c94564a --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_kv_cache.cl @@ -0,0 +1,121 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" +#include "include/batch_headers/fetch_data.cl" +#include "include/batch_headers/common.cl" +#include "include/batch_headers/sub_group_block_read.cl" +#include "include/batch_headers/sub_group_block_write.cl" +#include "include/batch_headers/sub_group_shuffle.cl" + + +#if OUTPUT_DIMS != 4 +#error "dynamic_quantize_gpu_opt.cl: Unsupported output dimension" +#endif + +#define VLOAD_N CAT(vload, VEC_SIZE) +#define VSTORE_N CAT(vstore, VEC_SIZE) +#define CONVERT_CHAR_N CAT(convert_char, VEC_SIZE) +#define AS_TYPE_N_(type, n, x) as_##type##n(x) +#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x) +#define AS_INPUT_TYPE_N(x) AS_TYPE_N(INPUT0_TYPE, VEC_SIZE, x) + + +inline uint FUNC(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { + return OUTPUT1_GET_INDEX(b, f, y, x); +} + +inline uint FUNC(get_scales_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { +#ifdef SCALES_OUTPUT_ORDER + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR SCALES_OUTPUT_ORDER); +#else + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); +#endif +} + +#define SUBGROUP_SIZE 16 +#define INNERMOST_DIM_VALUE INPUT0_SIZE_X +#define INPUT_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, 1, ptr, offset) +#define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, 1, ptr, offset, val) + +__attribute__((reqd_work_group_size(SUBGROUP_SIZE, SUBGROUPS_NUMBER, 1))) +REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE) +KERNEL(dynamic_quantize_gpu_kv_cache)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global OUTPUT1_TYPE* output_scale +#if ASYMMETRIC_QUANTIZATION && !GROUP_SCALES_WITH_ZP + , __global OUTPUT2_TYPE* output_zp +#endif +#ifdef APPEND_MODE + , const uint axis_offset +#endif + ) +{ + const uint sglid = get_sub_group_local_id(); + const uint grouped_indexes = get_global_id(1); + const uint batch_indexes = get_global_id(2); + + DECLARE_BATCHED_DIMS_INDEXES(batch_indexes); + DECLARE_GROUPED_DIMS_INDEXES(grouped_indexes); + + // The innermost dimension is always 
processed in the loop inside the kernel + const uint x = 0; + + half max_value = INPUT0_VAL_MIN; + half min_value = INPUT0_VAL_MAX; + + half val[INNERMOST_DIM_VALUE / SUBGROUP_SIZE]; + + const uint input_offset = INPUT0_GET_INDEX(b, f, y, x); + unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) { + val[i] = INPUT_BLOCK_READ(input, input_offset + i * SUBGROUP_SIZE); +#if ASYMMETRIC_QUANTIZATION + max_value = fmax(max_value, val[i]); + min_value = fmin(min_value, val[i]); +#else + max_value = fmax(max_value, fabs(val[i])); +#endif + } + +#if ASYMMETRIC_QUANTIZATION + min_value = work_group_reduce_min(min_value); + max_value = work_group_reduce_max(max_value); + OUTPUT1_TYPE scale = (OUTPUT1_TYPE)((CHAR_MAX - CHAR_MIN) / (max_value - min_value)); + OUTPUT1_TYPE zp = (OUTPUT1_TYPE)(-min_value * scale) - CHAR_MAX; +#else + max_value = work_group_reduce_max(max_value); + OUTPUT1_TYPE scale = 127.0h / max_value; +#endif + +#ifdef APPEND_MODE + APPEND_AXIS_NAME += axis_offset; +#endif + + const uint output_offset = OUTPUT_GET_INDEX(b, f, y, x); + unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) { +#if ASYMMETRIC_QUANTIZATION + OUTPUT_TYPE res = convert_char_rte(val[i] * scale + zp); +#else + OUTPUT_TYPE res = convert_char_rte(val[i] * scale); +#endif + OUTPUT_BLOCK_WRITE(output, output_offset + i * SUBGROUP_SIZE, res); + } + + const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); + + if (grouped_indexes == 0 && sglid == 0) { +#if ASYMMETRIC_QUANTIZATION + output_scale[scale_idx] = 1.0h / scale; +#if GROUP_SCALES_WITH_ZP + output_scale[scale_idx + 1] = zp; +#else + output_zp[scale_idx] = zp; +#endif +#else + output_scale[scale_idx] = 1.0h / scale; +#endif + } +} diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl index 858571fea71914..62482b8b9b5047 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl @@ -8,48 +8,143 @@ #error "dynamic_quantize_gpu_ref.cl: Unsupported output dimension" #endif +inline uint FUNC(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { + return OUTPUT1_GET_INDEX(b, f, y, x); +} + +inline uint FUNC(get_scales_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { +#ifdef SCALES_OUTPUT_ORDER + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR SCALES_OUTPUT_ORDER); +#else + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); +#endif +} + KERNEL(dynamic_quantize_gpu_ref)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, - __global OUTPUT1_TYPE* output_scale) + __global OUTPUT1_TYPE* output_scale +#if ASYMMETRIC_QUANTIZATION && !GROUP_SCALES_WITH_ZP + , __global OUTPUT2_TYPE* output_zp +#endif +) { const uint bf = (uint)get_global_id(0); - const uint b = (uint)get_global_id(0) / INPUT0_FEATURE_NUM; - const uint f = (uint)get_global_id(0) % INPUT0_FEATURE_NUM; + const uint b = bf / INPUT0_FEATURE_NUM; + const uint f = bf % INPUT0_FEATURE_NUM; const uint y = (uint)get_global_id(1); - const uint scale_idx = OUTPUT1_GET_INDEX(b, f, y, 0); + const uint x = (uint)get_global_id(2); +#ifdef SCALES_OUTPUT_ORDER + const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); +#else + const uint scale_idx = 
OUTPUT1_GET_INDEX_SAFE(b, f, y, x); +#endif - half max_val = 0.0001h; - for (int y_off = 0; y_off < (get_global_size(1) == 1 ? INPUT0_SIZE_Y : 1); y_off++) { - const uint offset = INPUT0_GET_INDEX(b, f, y + y_off, 0); + half max_val = INPUT0_VAL_MIN; + half min_val = INPUT0_VAL_MAX; + for (int b_off = 0; b_off < (GROUP_SIZE_DIM0 == 1 ? 1 : INPUT0_BATCH_NUM); b_off++) { + for (int f_off = 0; f_off < (GROUP_SIZE_DIM1 == 1 ? 1 : INPUT0_FEATURE_NUM); f_off++) { + for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == 1 ? 1 : INPUT0_SIZE_Y); y_off++) { +#if GROUP_SIZE_DIM3 == 1 + const uint offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, x); + half val = input[offset]; +#if ASYMMETRIC_QUANTIZATION + max_val = fmax(max_value, val); + min_val = fmin(min_value, val); +#else + half abs_val = fabs(val); + max_val = fmax(max_val, abs_val); +#endif +#else + const uint offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); int x; for (x = 0; x < INPUT0_SIZE_X / 8; x++) { half8 val = as_half8(vload8(0, (ushort*)input + offset + x * 8)); half8 abs_val = fabs(val); - - for (int j = 0; j < 8; j++) + for (int j = 0; j < 8; j++) { +#if ASYMMETRIC_QUANTIZATION + max_val = fmax(max_val, val[j]); + min_val = fmin(min_val, val[j]); +#else max_val = fmax(max_val, abs_val[j]); +#endif + } } x *= 8; - for (; x < INPUT0_SIZE_X; x++) - max_val = fmax(max_val, fabs(input[offset + x])); + for (; x < INPUT0_SIZE_X; x++) { + half val = input[offset + x]; +#if ASYMMETRIC_QUANTIZATION + max_val = fmax(max_val, val); + min_val = fmin(min_val, val); +#else + max_val = fmax(max_val, fabs(val)); +#endif + } +#endif + } + } } - half scale = 127.0h / max_val; - for (int y_off = 0; y_off < (get_global_size(1) == 1 ? INPUT0_SIZE_Y : 1); y_off++) { - const uint in_offset = INPUT0_GET_INDEX(b, f, y + y_off, 0); - const uint out_offset = OUTPUT_GET_INDEX(b, f, y + y_off, 0); +#if ASYMMETRIC_QUANTIZATION + OUTPUT1_TYPE scale = (OUTPUT1_TYPE)((CHAR_MAX - CHAR_MIN) / (max_val - min_val)); + OUTPUT1_TYPE zp = (OUTPUT1_TYPE)(-min_val * scale) - CHAR_MAX; +#else + max_val = work_group_reduce_max(max_val); + OUTPUT1_TYPE scale = 127.0h / max_val; +#endif + + for (int b_off = 0; b_off < (GROUP_SIZE_DIM0 == 1 ? 1 : INPUT0_BATCH_NUM); b_off++) { + for (int f_off = 0; f_off < (GROUP_SIZE_DIM1 == 1 ? 1 : INPUT0_FEATURE_NUM); f_off++) { + for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == 1 ? 
1 : INPUT0_SIZE_Y); y_off++) { +#if GROUP_SIZE_DIM3 == 1 + const uint in_offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, x); + const uint out_offset = OUTPUT_GET_INDEX(b + b_off, f + f_off, y + y_off, x); + + half val = input[in_offset]; +#if ASYMMETRIC_QUANTIZATION + val *= scale; + val += zp; + output[out_offset] = convert_char_rte(val); +#else + val *= scale; + output[out_offset] = convert_char_rte(val); +#endif +#else + const uint in_offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); + const uint out_offset = OUTPUT_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); int x; for (x = 0; x < INPUT0_SIZE_X / 8; x++) { half8 val = as_half8(vload8(0, (ushort*)input + in_offset + x * 8)); +#if ASYMMETRIC_QUANTIZATION val *= scale; + val += zp; +#else + val *= scale; +#endif vstore8(convert_char8_rte(val), 0, output + out_offset + x * 8); } x *= 8; - for (; x < INPUT0_SIZE_X; x++) - output[out_offset + x] = convert_char(input[in_offset + x] * scale); + for (; x < INPUT0_SIZE_X; x++) { + half val = input[in_offset + x]; +#if ASYMMETRIC_QUANTIZATION + val *= scale; + val += zp; + output[out_offset + x] = convert_char_rte(val); +#else + val *= scale; + output[out_offset + x] = convert_char_rte(val); +#endif + } +#endif + } + } } output_scale[scale_idx] = 1.0h / scale; +#if ASYMMETRIC_QUANTIZATION && GROUP_SCALES_WITH_ZP + output_scale[scale_idx + 1] = zp; +#elif ASYMMETRIC_QUANTIZATION + output_zp[scale_idx] = zp; +#endif } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index 748f79115262e0..8e6be800f37cf0 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -118,12 +118,21 @@ inline uint FUNC(get_bt_index_value)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uin #define VALUE_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT2_TYPE, 1, ptr, offset) #define SUBGROUPS_PER_WG (HEAD_SIZE * SG_SCALE_FACTOR / SUBGROUP_SIZE) +#if IS_KV_COMPRESSED +#if COMPRESSED_PER_HEAD + #define GET_COMPRESSION_INDEX(INPUT, b, f, y, x) GET_DATA_INDEX(INPUT, (b), (f), (y), (0)); +#else + #define GET_COMPRESSION_INDEX(INPUT, b, f, y, x) GET_DATA_INDEX(INPUT, (b), (0), (y), (0)); +#endif +#endif + #ifdef SDPA_STAGE_0 #if TARGET_SEQ_LEN_BLOCK_SIZE == 1 /* This version is used for 2nd token */ REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE) +__attribute__((reqd_work_group_size(1, 1, HEAD_SIZE * SG_SCALE_FACTOR))) KERNEL(sdpa_opt)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* query_input, @@ -136,6 +145,10 @@ KERNEL(sdpa_opt)( const __global INPUT4_TYPE* scale, #endif __global OUTPUT_TYPE* output, +#if IS_KV_COMPRESSED + const __global KEY_COMPRESSION_SCALE_TYPE* key_scale, + const __global VALUE_COMPRESSION_SCALE_TYPE* val_scale, +#endif #ifdef BEAM_TABLE_TYPE const __global BEAM_TABLE_TYPE* beam_table, #endif @@ -149,7 +162,18 @@ KERNEL(sdpa_opt)( const uint b1_idx = batch_idx % NUM_HEADS; /* HEADS_NUM dim */ const uint target_seq_idx = get_global_id(1); const uint lid = get_local_id(2); + +#if SG_SCALE_FACTOR == 2 + const uint head_size_idx = lid % HEAD_SIZE; +#elif SG_SCALE_FACTOR == 1 const uint head_size_idx = lid; +#else + #error "sdpa_opt.cl: Unsupported scale factor" +#endif + +#if SUBGROUPS_PER_WG > SUBGROUP_SIZE + #error "sdpa_opt.cl: Number of subgroups per work group should be less than subgroup_size +#endif const uint sgid = get_sub_group_id(); const uint sglid = get_sub_group_local_id(); @@ -199,13 +223,19 @@ 
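// Editor's note (illustrative sketch, not part of the patch): the scales/zero-points consumed by the
// SDPA kernels below are produced by the dynamic_quantize kernels above. On the asymmetric int8 path
// they compute, per quantization group,
//   scale = (CHAR_MAX - CHAR_MIN) / (max_val - min_val);
//   zp    = (-min_val * scale) - CHAR_MAX;
//   q     = convert_char_rte(val * scale + zp);
// and store the *inverse* scale (1.0h / scale) alongside zp, so dequantization is a single fused op:
//   dequant = (q - zp) * stored_scale;   // == (val * scale + zp - zp) / scale ~= val, up to rounding
// The symmetric path drops zp and uses scale = 127.0h / max(|val|), dequantizing as q * stored_scale.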
KERNEL(sdpa_opt)( uint query_offset = INPUT0_GET_INDEX(b0_idx, b1_idx, target_seq_idx, (sgid * SUBGROUP_SIZE)); const uint query_pitch = QUERY_STEP_LOCAL; #endif - for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { - #define QUERY_BLOCK_SIZE 1 +#if SG_SCALE_FACTOR == 2 + if (sgid < HEAD_SIZE / SUBGROUP_SIZE) { +#else + { +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + #define QUERY_BLOCK_SIZE 1 - INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, QUERY_BLOCK_SIZE, query_input, query_offset); - query_local[query_local_offset] = val * scale_val; - query_local_offset += QUERY_STEP_LOCAL; - query_offset += query_pitch; + INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, QUERY_BLOCK_SIZE, query_input, query_offset); + query_local[query_local_offset] = val * scale_val; + query_local_offset += QUERY_STEP_LOCAL; + query_offset += query_pitch; + } } #undef QUERY_BLOCK_SIZE #undef QUERY_STEP @@ -216,28 +246,45 @@ KERNEL(sdpa_opt)( // Main Gemm1 calculation loop // Each SG performs element-wise multiplications of Q[HEAD_SIZE]xK[HEAD_SIZE] values // HEAD_SIZE / SUBGROUPS_PER_WG times in the loop and saves the result to the qk_local SLM buffer - for (uint seq_len = sgid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE / SUBGROUP_SIZE)) { -#ifdef INPUT1_DIMS_ORDER + for (uint seq_len = sgid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE / SUBGROUP_SIZE) * SG_SCALE_FACTOR) { #ifdef BEAM_TABLE_TYPE const uint b_idx = beam_table[FUNC_CALL(get_bt_index_key)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len, 0)]; #else const uint b_idx = b0_idx; #endif - const uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + seq_len, 0); + +#ifdef INPUT1_DIMS_ORDER + uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + seq_len, 0); #else - const uint key_offset = INPUT1_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len, 0); + uint key_offset = INPUT1_GET_INDEX(b_idx, b1_idx, start_partition_idx + seq_len, 0); #endif INPUT0_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {INPUT0_VAL_ZERO}; +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len, 0); + KEY_COMPRESSION_SCALE_TYPE comp_scale = key_scale[comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + KEY_COMPRESSION_SCALE_TYPE comp_zp = key_scale[comp_offset + 1]; +#endif +#endif uint head_idx_index = 0; #define KEY_BLOCK_SIZE 8 for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) - KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + KEY_BLOCK key_vec_packed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vec_packed) - comp_zp) * comp_scale; +#elif IS_KV_COMPRESSED + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vec_packed)) * comp_scale; +#else + 
KEY_BLOCK key_vals = key_vec_packed; +#endif uint query_offset = head_idx_index + sglid; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { @@ -258,9 +305,18 @@ KERNEL(sdpa_opt)( for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) - KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + KEY_BLOCK key_vec_packed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vec_packed) - comp_zp) * comp_scale; +#elif IS_KV_COMPRESSED + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vec_packed)) * comp_scale; +#else + KEY_BLOCK key_vals = key_vec_packed; +#endif uint query_offset = head_idx_index + sglid; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { @@ -281,9 +337,18 @@ KERNEL(sdpa_opt)( for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) - KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + KEY_BLOCK key_vec_packed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vec_packed) - comp_zp) * comp_scale; +#elif IS_KV_COMPRESSED + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vec_packed)) * comp_scale; +#else + KEY_BLOCK key_vals = key_vec_packed; +#endif uint query_offset = head_idx_index + sglid; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { @@ -304,9 +369,18 @@ KERNEL(sdpa_opt)( for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) - KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + KEY_BLOCK key_vec_packed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vec_packed) - comp_zp) * comp_scale; +#elif IS_KV_COMPRESSED + KEY_BLOCK_UNCOMPRESSED key_vals = 
(TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vec_packed)) * comp_scale; +#else + KEY_BLOCK key_vals = key_vec_packed; +#endif uint query_offset = head_idx_index + sglid; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { @@ -335,7 +409,7 @@ KERNEL(sdpa_opt)( const uint seq_idx_end = 1; for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { // Iterate over all values QK values in SLM and apply scale and attention mask - for (uint seq_len = sgid * SUBGROUP_SIZE + sglid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE)) { + for (uint seq_len = sgid * SUBGROUP_SIZE + sglid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE * SG_SCALE_FACTOR)) { // Read value from SLM and apply scale qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len]; @@ -388,7 +462,7 @@ KERNEL(sdpa_opt)( SOFTMAX_ACCUMULATOR_TYPE exp_sum[TARGET_SEQ_LEN_BLOCK_SIZE] = {SOFTMAX_ACCUMULATOR_VAL_ZERO}; const uint qk_num_per_wi = CEIL_DIV(partition_seq_len, SUBGROUPS_PER_WG * SUBGROUP_SIZE); for (uint qk_idx = 0; qk_idx < qk_num_per_wi; qk_idx++) { - const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + head_size_idx; + const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + lid; if (local_data_idx < partition_seq_len) { for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { SOFTMAX_ACCUMULATOR_TYPE qk_new = native_exp(TO_SOFTMAX_ACCUMULATOR_TYPE(qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + local_data_idx]) - qk_max[seq_idx]); @@ -420,7 +494,7 @@ KERNEL(sdpa_opt)( // const SOFTMAX_ACCUMULATOR_TYPE inv_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ONE / exp_sum[seq_idx]; for (uint qk_idx = 0; qk_idx < qk_num_per_wi; qk_idx++) { - const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + sgid * SUBGROUP_SIZE + sglid; + const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + lid; if (local_data_idx < partition_seq_len) { for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { SOFTMAX_ACCUMULATOR_TYPE qk_new = TO_SOFTMAX_ACCUMULATOR_TYPE(qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + local_data_idx]) / exp_sum[seq_idx]; @@ -434,7 +508,7 @@ KERNEL(sdpa_opt)( { // If the number of partitions is greater than 1, save exm_sums and max_logits to the temporary buffers // Use single WI in the WG, since all the WIs have the same value - if (num_of_partitions > 1 && head_size_idx == 0) { + if (num_of_partitions > 1 && lid == 0) { for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { const uint exp_sums_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) + b1_idx * (TARGET_SEQ_LEN * num_of_partitions) + @@ -463,15 +537,32 @@ KERNEL(sdpa_opt)( #endif #endif - for (uint seq_len = 0; seq_len < partition_seq_len / SUBGROUP_SIZE; seq_len++) { +#if SG_SCALE_FACTOR > 1 + const uint seq_len_start = (sgid / (HEAD_SIZE / SUBGROUP_SIZE)) * (SEQ_LEN_PARTITION_SIZE / SG_SCALE_FACTOR / SUBGROUP_SIZE); + const uint seq_len_end = min(seq_len_start + (SEQ_LEN_PARTITION_SIZE / SG_SCALE_FACTOR / SUBGROUP_SIZE), partition_seq_len / SUBGROUP_SIZE); +#else + const uint seq_len_start = 0; + const uint seq_len_end = partition_seq_len / SUBGROUP_SIZE; +#endif + + for (uint seq_len = seq_len_start; seq_len < seq_len_end; seq_len++) { #ifdef BEAM_TABLE_TYPE - uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, sgid * SUBGROUP_SIZE)]; + const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 
0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, sgid * SUBGROUP_SIZE)]; uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, sgid * SUBGROUP_SIZE); #else + const uint b_idx = b0_idx; #ifdef INPUT2_DIMS_ORDER - uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); #else - uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); + uint value_offset = INPUT2_GET_INDEX(b_idx, b1_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); +#endif +#endif + +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); + VALUE_COMPRESSION_SCALE_TYPE comp_scale = val_scale[comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE comp_zp = val_scale[comp_offset + 1]; #endif #endif @@ -482,9 +573,17 @@ KERNEL(sdpa_opt)( unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { #ifdef BEAM_TABLE_TYPE - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); #else - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, value_offset); +#endif + +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed - sub_group_broadcast(comp_zp, i)) * sub_group_broadcast(comp_scale, i); +#elif IS_KV_COMPRESSED + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed * sub_group_broadcast(comp_scale, i)); +#else + INPUT2_TYPE value_val = value_packed; #endif unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc[seq_idx]); @@ -496,17 +595,30 @@ KERNEL(sdpa_opt)( } } - const uint seq_len_leftovers_start = (partition_seq_len / SUBGROUP_SIZE) * SUBGROUP_SIZE; - for (uint seq_len = seq_len_leftovers_start; seq_len < partition_seq_len; seq_len++) { -#ifdef INPUT2_DIMS_ORDER + +#if SG_SCALE_FACTOR > 1 + if (sgid >= HEAD_SIZE / SUBGROUP_SIZE) { +#endif + + for (uint seq_len = (partition_seq_len / SUBGROUP_SIZE) * SUBGROUP_SIZE; seq_len < partition_seq_len; seq_len++) { #ifdef BEAM_TABLE_TYPE const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len, head_size_idx)]; #else const uint b_idx = b0_idx; #endif + +#ifdef INPUT2_DIMS_ORDER const uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + seq_len, head_size_idx); #else - const uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len, head_size_idx); + const uint value_offset = INPUT2_GET_INDEX(b_idx, b1_idx, start_partition_idx + seq_len, head_size_idx); +#endif + +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len, 0); + 
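// Editor's note (illustrative, not part of the patch): GET_COMPRESSION_INDEX picks where the per-token
// scale lives (and, when scales and zero points are stored together, the zp at comp_offset + 1):
//   COMPRESSED_PER_HEAD == 1  ->  one scale per (batch, head, token):            GET_DATA_INDEX(INPUT, b, f, y, 0)
//   COMPRESSED_PER_HEAD == 0  ->  one scale per (batch, token), shared by heads: GET_DATA_INDEX(INPUT, b, 0, y, 0)
// where f is passed as b1_idx / BROADCAST_GROUP_SIZE and y is the token position within the cache.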
VALUE_COMPRESSION_SCALE_TYPE comp_scale = val_scale[comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE comp_zp = val_scale[comp_offset + 1]; +#endif #endif OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; @@ -514,15 +626,42 @@ KERNEL(sdpa_opt)( qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len]; } - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, value_offset); +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + const VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed - comp_zp) * comp_scale; +#elif IS_KV_COMPRESSED + const VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed * comp_scale); +#else + const INPUT2_TYPE value_val = value_packed; +#endif unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc[seq_idx] = mad(qk_val[seq_idx], value_val, acc[seq_idx]); } } +#if SG_SCALE_FACTOR > 1 + } // if (sgid >= HEAD_SIZE / SUBGROUP_SIZE) +#endif + +#if SG_SCALE_FACTOR > 1 + if ((partition_seq_len > (SEQ_LEN_PARTITION_SIZE / SG_SCALE_FACTOR)) || (partition_seq_len % SUBGROUP_SIZE != 0)) { + if (sgid >= HEAD_SIZE / SUBGROUP_SIZE) { + // Reuse query_local SLM to sum-up results between two groups of subgroups + query_local[head_size_idx] = acc[0]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (sgid < HEAD_SIZE / SUBGROUP_SIZE) { + acc[0] += query_local[head_size_idx]; + } + } +#endif + // If the number of partitions is greater than 1, save results to the temporary buffer; // otherwise, save results directly to the main output. +#if SG_SCALE_FACTOR > 1 + if (sgid < HEAD_SIZE / SUBGROUP_SIZE) { +#endif if (num_of_partitions > 1) { const uint seq_idx_end = 1; for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { @@ -542,6 +681,9 @@ KERNEL(sdpa_opt)( output[output_offset] = acc[seq_idx]; } } +#if SG_SCALE_FACTOR > 1 + } // if (sgid < HEAD_SIZE / SUBGROUP_SIZE) { +#endif } // Gemm2 calculation end } @@ -582,6 +724,12 @@ KERNEL(sdpa_opt)( #define ATTN_SCALE_BUFFER_ARG #endif +// Applying scales to query input improves the accuracy, but leads to performance drop for FP16 KV-cache case, +// so use it only for compressed version +#if IS_KV_COMPRESSED +#define APPLY_SCALES_TO_QUERY 1 +#endif + #define MASK_VECTOR_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) inline MASK_VECTOR_TYPE FUNC(load_attn_mask)(OPTIONAL_SHAPE_INFO_ARG @@ -683,6 +831,10 @@ KERNEL(sdpa_opt)( const __global ALIBI_TYPE* alibi_slopes, #endif __global OUTPUT_TYPE* output, +#if IS_KV_COMPRESSED + const __global KEY_COMPRESSION_SCALE_TYPE* key_scale, + const __global VALUE_COMPRESSION_SCALE_TYPE* val_scale, +#endif #ifdef BEAM_TABLE_TYPE const __global BEAM_TABLE_TYPE* beam_table, #endif @@ -751,12 +903,22 @@ KERNEL(sdpa_opt)( #endif uint query_local_offset = head_size_idx * TARGET_SEQ_LEN_BLOCK_SIZE; +#if APPLY_SCALES_TO_QUERY +#if HAS_SCALE_INPUT + const INPUT0_TYPE scale_val = *scale; +#else + const INPUT0_TYPE scale_val = TO_INPUT0_TYPE(STATIC_SCALE_VALUE); +#endif +#else + const INPUT0_TYPE scale_val = INPUT0_VAL_ONE; +#endif + if (cur_target_seq_len_size != TARGET_SEQ_LEN_BLOCK_SIZE) { if (sgid * SUBGROUP_SIZE < HEAD_SIZE) { for (uint seq_idx = 0; seq_idx < cur_target_seq_len_size; seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -767,7 +929,7 @@ KERNEL(sdpa_opt)( 
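// Editor's note (illustrative sketch, not part of the patch): with APPLY_SCALES_TO_QUERY the softmax
// scale is folded into the query once, while it is being copied into SLM, instead of scaling every
// QK accumulator later:
//   slm_query[i] = q[i] * scale_val;          // applied here, once per query element
//   qk_acc[j]    = dot(slm_query_row, key_j); // the later "qk_acc[i] *= scale_val" branch is skipped
// For the uncompressed FP16 path scale_val stays INPUT0_VAL_ONE and the multiply is kept on qk_acc,
// matching the "#if !APPLY_SCALES_TO_QUERY" branch further down.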
unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -777,7 +939,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -788,7 +950,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -796,7 +958,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -841,6 +1003,7 @@ KERNEL(sdpa_opt)( const uint b_idx = beam_table[FUNC_CALL(get_bt_index_key)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, seq_len + sglid, 0)]; const uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, seq_len + sglid, 0); #else + const uint b_idx = b0_idx; #ifdef INPUT1_DIMS_ORDER uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, seq_len, 0); uint key_offset_next_seq = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, seq_len + 1, 0); @@ -870,10 +1033,17 @@ KERNEL(sdpa_opt)( PA_BUFFERS); if (seq_len_calc_size >= SUBGROUP_SIZE) { +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, seq_len + sglid, 0); + KEY_COMPRESSION_SCALE_TYPE comp_scale = key_scale[comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + KEY_COMPRESSION_SCALE_TYPE comp_zp = key_scale[comp_offset + 1]; +#endif +#endif __attribute__((opencl_unroll_hint(1))) for (uint head_idx_index = 0; head_idx_index < HEAD_SIZE; head_idx_index += SUBGROUP_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, 1, ptr, offset); - #define QUERY_VEC MAKE_VECTOR_TYPE(INPUT1_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) + #define QUERY_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) QUERY_VEC queries_vec; uint query_local_offset = (head_idx_index * TARGET_SEQ_LEN_BLOCK_SIZE) + sglid; @@ -884,9 +1054,17 @@ KERNEL(sdpa_opt)( unroll_for (uint key_row_idx = 0; key_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; key_row_idx++) { #ifdef BEAM_TABLE_TYPE - INPUT1_TYPE key_vals = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); + const INPUT1_TYPE key_packed = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); #else - INPUT1_TYPE key_vals = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); + const INPUT1_TYPE key_packed = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); +#endif + +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + KEY_COMPRESSION_SCALE_TYPE 
key_vals = (TO_KEY_COMPRESSION_SCALE_TYPE(key_packed) - sub_group_broadcast(comp_zp, key_row_idx)) * sub_group_broadcast(comp_scale, key_row_idx); +#elif IS_KV_COMPRESSED + KEY_COMPRESSION_SCALE_TYPE key_vals = (TO_KEY_COMPRESSION_SCALE_TYPE(key_packed) * sub_group_broadcast(comp_scale, key_row_idx)); +#else + INPUT1_TYPE key_vals = key_packed; #endif unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { @@ -895,12 +1073,29 @@ KERNEL(sdpa_opt)( } } } else if (seq_len_calc_size > 0) { +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, seq_len + min(sglid, (uint)seq_len_calc_size - 1), 0); + // const uint comp_offset = GET_COMPRESSION_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, seq_len + sglid, 0); + KEY_COMPRESSION_SCALE_TYPE comp_scale = key_scale[comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + KEY_COMPRESSION_SCALE_TYPE comp_zp = key_scale[comp_offset + 1]; +#endif +#endif __attribute__((opencl_unroll_hint(1))) for (uint head_idx_index = 0; head_idx_index < HEAD_SIZE; head_idx_index += SUBGROUP_SIZE) { - #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, 1, ptr, offset); - #define QUERY_VEC MAKE_VECTOR_TYPE(INPUT1_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) + #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, 1, ptr, offset) + #define QUERY_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) +#if IS_KV_COMPRESSED + #define KEY_UNPACKED_TYPE KEY_COMPRESSION_SCALE_TYPE + #define KEY_UNPACKED_VEC_TYPE MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) + #define TO_KEY_UNPACKED_TYPE(val) TO_KEY_COMPRESSION_SCALE_TYPE(val) +#else + #define KEY_UNPACKED_TYPE INPUT1_TYPE + #define KEY_UNPACKED_VEC_TYPE MAKE_VECTOR_TYPE(INPUT1_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) + #define TO_KEY_UNPACKED_TYPE(val) TO_INPUT1_TYPE(val) +#endif - QUERY_VEC queries_vec; + QUERY_VEC_TYPE queries_vec; uint query_local_offset = (head_idx_index * TARGET_SEQ_LEN_BLOCK_SIZE) + sglid; unroll_for (uint q_row_idx = 0; q_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; q_row_idx++) { queries_vec[q_row_idx] = slm_query[query_local_offset]; @@ -908,27 +1103,37 @@ KERNEL(sdpa_opt)( } #ifndef LOAD_KEY_LEFTOVERS_IN_CALC_LOOP - QUERY_VEC key_vec = 0; + KEY_UNPACKED_VEC_TYPE key_vec = 0; unroll_for (uint key_row_idx = 0; key_row_idx < seq_len_calc_size; key_row_idx++) { - #ifdef BEAM_TABLE_TYPE - key_vec[key_row_idx] = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); - #else - key_vec[key_row_idx] = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); - #endif +#ifdef BEAM_TABLE_TYPE + key_vec[key_row_idx] = TO_KEY_UNPACKED_TYPE(KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index)); +#else + key_vec[key_row_idx] = TO_KEY_UNPACKED_TYPE(KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index)); +#endif + +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + key_vec[key_row_idx] = (key_vec[key_row_idx] - sub_group_broadcast(comp_zp, key_row_idx)) * sub_group_broadcast(comp_scale, key_row_idx); +#elif IS_KV_COMPRESSED + key_vec[key_row_idx] *= sub_group_broadcast(comp_scale, key_row_idx); +#endif } #endif unroll_for (uint key_row_idx = 0; key_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; key_row_idx++) { #ifdef LOAD_KEY_LEFTOVERS_IN_CALC_LOOP - #ifdef BEAM_TABLE_TYPE - INPUT1_TYPE key_vals = 0; - if (key_row_idx < seq_len_calc_size) - key_vals = KEY_BLOCK_READ(key_input, 
sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); - #else - INPUT1_TYPE key_vals = 0; - if (key_row_idx < seq_len_calc_size) - key_vals = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); - #endif + KEY_UNPACKED_TYPE key_vals = 0; + if (key_row_idx < seq_len_calc_size) { +#ifdef BEAM_TABLE_TYPE + key_vals = TO_KEY_UNPACKED_TYPE(KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index)); +#else + key_vals = TO_KEY_UNPACKED_TYPE(KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index)); +#endif + } +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + key_vals = (key_vals - sub_group_broadcast(comp_zp, key_row_idx)) * sub_group_broadcast(comp_scale, key_row_idx); +#elif IS_KV_COMPRESSED + key_vals *= sub_group_broadcast(comp_scale, key_row_idx); +#endif #else #define key_vals key_vec[key_row_idx] #endif @@ -941,12 +1146,14 @@ KERNEL(sdpa_opt)( { unroll_for (uint i = 0; i < TARGET_SEQ_LEN_BLOCK_SIZE; i++) { +#if !APPLY_SCALES_TO_QUERY #if HAS_SCALE_INPUT const OUTPUT_TYPE scale_val = *scale; #else const OUTPUT_TYPE scale_val = TO_OUTPUT_TYPE(STATIC_SCALE_VALUE); #endif qk_acc[i] *= scale_val; +#endif #ifdef HAS_ALIBI const int alibi_val = (1 - SOURCE_SEQ_LEN) + seq_len + i; @@ -1045,6 +1252,7 @@ KERNEL(sdpa_opt)( const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len) + sglid, sgid * SUBGROUP_SIZE)]; const uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + (seq_len) + sglid, sgid * SUBGROUP_SIZE); #else + const uint b_idx = b0_idx; #ifdef INPUT2_DIMS_ORDER uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len), head_size_idx); #else @@ -1058,12 +1266,28 @@ KERNEL(sdpa_opt)( qk_val[seq_idx] = slm_qk_vals[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len + sglid]; } +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len + sglid, 0); + VALUE_COMPRESSION_SCALE_TYPE comp_scale = val_scale[comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE comp_zp = val_scale[comp_offset + 1]; +#endif +#endif unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { #ifdef BEAM_TABLE_TYPE - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); +#else + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, value_offset); +#endif + +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed - sub_group_broadcast(comp_zp, i)) * sub_group_broadcast(comp_scale, i); +#elif IS_KV_COMPRESSED + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed * sub_group_broadcast(comp_scale, i)); #else - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + INPUT2_TYPE value_val = value_packed; #endif + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc_output_res[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc_output_res[seq_idx]); } @@ -1093,12 +1317,21 @@ KERNEL(sdpa_opt)( const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * 
SUBGROUP_SIZE) + sglid, sgid * SUBGROUP_SIZE)]; uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, sgid * SUBGROUP_SIZE); #else + const uint b_idx = b0_idx; #ifdef INPUT2_DIMS_ORDER uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); #else uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); #endif #endif +#endif + +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); + VALUE_COMPRESSION_SCALE_TYPE comp_scale = val_scale[comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE comp_zp = val_scale[comp_offset + 1]; +#endif #endif MAKE_VECTOR_TYPE(OUTPUT_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) qk_val; @@ -1108,9 +1341,17 @@ KERNEL(sdpa_opt)( unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { #ifdef BEAM_TABLE_TYPE - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); +#else + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, value_offset); +#endif + +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed - sub_group_broadcast(comp_zp, i)) * sub_group_broadcast(comp_scale, i); +#elif IS_KV_COMPRESSED + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed * sub_group_broadcast(comp_scale, i)); #else - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + INPUT2_TYPE value_val = value_packed; #endif unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc_output_res[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc_output_res[seq_idx]); @@ -1144,19 +1385,37 @@ KERNEL(sdpa_opt)( const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len_leftovers_start + sglid, sgid * SUBGROUP_SIZE)]; const uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + seq_len_leftovers_start + sglid, sgid * SUBGROUP_SIZE); #else + const uint b_idx = b0_idx; #ifdef INPUT2_DIMS_ORDER uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len_leftovers_start, head_size_idx); #else uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len_leftovers_start, head_size_idx); #endif #endif +#endif + +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + min(seq_len_leftovers_start + sglid, seq_len_end - 1), 0); + // const uint comp_offset = GET_COMPRESSION_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len_leftovers_start + sglid, 0); + VALUE_COMPRESSION_SCALE_TYPE comp_scale = val_scale[comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE comp_zp = val_scale[comp_offset + 1]; +#endif #endif for (uint seq_len_idx = 0; seq_len_idx < partition_seq_len - seq_len_leftovers_start; seq_len_idx++) { #ifdef BEAM_TABLE_TYPE - INPUT2_TYPE 
value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, seq_len_idx)); + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, seq_len_idx)); +#else + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, value_offset); +#endif + +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed - sub_group_broadcast(comp_zp, seq_len_idx)) * sub_group_broadcast(comp_scale, seq_len_idx); +#elif IS_KV_COMPRESSED + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed * sub_group_broadcast(comp_scale, seq_len_idx)); #else - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + INPUT2_TYPE value_val = value_packed; #endif for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl index 83e3c7c7e9fef1..682af11777012f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl @@ -112,6 +112,15 @@ inline uint FUNC(get_bt_index_value)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uin #endif #define APPLY_SCALE_TO_QUERY 1 +#define HAS_KV_CACHE_ZP_INPUT USE_ASYMMETRIC_QUANTIZATION && !COMBINE_SCALES_AND_ZP + +#if IS_KV_COMPRESSED +#if COMPRESSED_PER_HEAD + #define GET_COMPRESSION_INDEX(INPUT, b, f, y, x) GET_DATA_INDEX(INPUT, (b), (f), (y), (0)); +#else + #define GET_COMPRESSION_INDEX(INPUT, b, f, y, x) GET_DATA_INDEX(INPUT, (b), (0), (y), (0)); +#endif +#endif KERNEL(sdpa_ref)( OPTIONAL_SHAPE_INFO_ARG @@ -125,6 +134,14 @@ KERNEL(sdpa_ref)( const __global INPUT4_TYPE* scale, #endif __global OUTPUT_TYPE* output, +#if IS_KV_COMPRESSED + const __global KEY_COMPRESSION_SCALE_TYPE* key_scale, + const __global VALUE_COMPRESSION_SCALE_TYPE* val_scale, +#if HAS_KV_CACHE_ZP_INPUT + const __global KEY_COMPRESSION_ZP_TYPE* key_zp, + const __global VALUE_COMPRESSION_ZP_TYPE* val_zp, +#endif +#endif #ifdef BEAM_TABLE_TYPE const __global BEAM_TABLE_TYPE* beam_table, #endif @@ -162,7 +179,24 @@ KERNEL(sdpa_ref)( #else INPUT0_TYPE q_val = query_input[query_offset]; #endif - INPUT1_TYPE k_val = key_input[key_offset]; + + INPUT1_TYPE k_val_packed = key_input[key_offset]; +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1 / BROADCAST_GROUP_SIZE, s, 0); + KEY_COMPRESSION_SCALE_TYPE comp_scale = key_scale[comp_offset]; + +#if USE_ASYMMETRIC_QUANTIZATION && HAS_KV_CACHE_ZP_INPUT + KEY_COMPRESSION_SCALE_TYPE comp_zp = key_zp[comp_offset]; +#elif USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE comp_zp = key_scale[comp_offset + 1]; +#else + KEY_COMPRESSION_SCALE_TYPE comp_zp = 0; +#endif + KEY_COMPRESSION_SCALE_TYPE k_val = ((k_val_packed - comp_zp) * comp_scale); + +#else + INPUT1_TYPE k_val = k_val_packed; +#endif acc += q_val * k_val; } @@ -236,7 +270,24 @@ KERNEL(sdpa_ref)( #endif uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1, 0, 0, s, head_size_idx); - acc += tmp_buf[tmp_buf_offset] * value_input[value_offset]; + const INPUT2_TYPE value_packed = value_input[value_offset]; +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1 / BROADCAST_GROUP_SIZE, s, 0); + VALUE_COMPRESSION_SCALE_TYPE comp_scale = val_scale[comp_offset]; + +#if USE_ASYMMETRIC_QUANTIZATION && HAS_KV_CACHE_ZP_INPUT + 
VALUE_COMPRESSION_SCALE_TYPE comp_zp = val_zp[comp_offset]; +#elif USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE comp_zp = val_scale[comp_offset + 1]; +#else + VALUE_COMPRESSION_SCALE_TYPE comp_zp = 0; +#endif + VALUE_COMPRESSION_SCALE_TYPE value = ((value_packed - comp_zp) * comp_scale); +#else + INPUT2_TYPE value = value_packed; +#endif + + acc += tmp_buf[tmp_buf_offset] * value; } uint output_offset = OUTPUT_GET_INDEX(b0, b1, target_seq_idx, head_size_idx); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_kv_cache.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_kv_cache.h new file mode 100644 index 00000000000000..ac6870a37a1728 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_kv_cache.h @@ -0,0 +1,30 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_base_opencl.h" +#include "dynamic_quantize_kernel_ref.h" + +namespace kernel_selector { +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// dynamic_quantize_params +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +class DynamicQuantizeKernelKVCache : public KernelBaseOpenCL { +public: + DynamicQuantizeKernelKVCache() : KernelBaseOpenCL("dynamic_quantize_gpu_kv_cache") {} + virtual ~DynamicQuantizeKernelKVCache() {} + + virtual JitConstants GetJitConstants(const dynamic_quantize_params& params) const; + virtual CommonDispatchData SetDefault(const dynamic_quantize_params& params) const; + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + Datatype GetAccumulatorType(const dynamic_quantize_params& params) const; + ParamsKey GetSupportedKey() const override; + +protected: + bool Validate(const Params&) const override; + void GetUpdateDispatchDataFunc(KernelData& kd) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp index 6a678770e85d72..b610ac2076def4 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp @@ -154,6 +154,15 @@ bool DynamicQuantizeKernelOpt::Validate(const Params& params) const { if (dq_params.inputs[0].GetPaddedVal() != 0 || dq_params.outputs[0].GetPaddedVal() != 0) return false; + if (dq_params.append_axis != -1) + return false; + + if (dq_params.group_sizes.back() != UINT64_MAX) + return false; + + if (!dq_params.scales_output_order.empty()) + return false; + return true; } } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_kv_cache.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_kv_cache.cpp new file mode 100644 index 00000000000000..d0c99484e3f52e --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_kv_cache.cpp @@ -0,0 +1,285 @@ +// Copyright (C) 2024 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 +// + +#include "dynamic_quantize_kernel_kv_cache.h" +#include "kernel_selector_utils.h" +#include + + +static constexpr size_t subgroup_size = 16; + +namespace kernel_selector { +static Tensor::NDims get_normalized_dims(const DataTensor& tensor) { + auto dims = tensor.GetDims(); + std::reverse(dims.begin(), dims.end()); + + return dims; +} + +static size_t get_elements_number_per_batch(const dynamic_quantize_params& params) { + const auto& group_sizes = params.group_sizes; + const auto& input_dims = get_normalized_dims(params.inputs[0]); + + size_t total_elements_number = 1; + for (size_t i = 0; i < group_sizes.size(); i++) { + if (group_sizes[i] != UINT64_MAX) { + total_elements_number *= input_dims[i].v; + } + } + + return total_elements_number; +} + +static size_t get_elements_number_per_group(const dynamic_quantize_params& params) { + const auto& group_sizes = params.group_sizes; + const auto& input_dims = get_normalized_dims(params.inputs[0]); + + size_t total_elements_number = 1; + for (size_t i = 0; i < group_sizes.size(); i++) { + if (group_sizes[i] == UINT64_MAX) { + total_elements_number *= input_dims[i].v; + } else { + total_elements_number *= group_sizes[i]; + } + } + + return total_elements_number; +} + +static std::string generate_dims_indexes_calculation(std::vector> dims) { + std::reverse(dims.begin(), dims.end()); // reorder dims in order from innermost to outermost dimensions + + auto generate_calc_function = [&](std::string data_type, std::string index_var, size_t dim_idx) { + std::string index_calc_str; + index_calc_str += "" + data_type + " " + dims[dim_idx].first + " = "; + index_calc_str += "(" + index_var + " / "; + index_calc_str += "(1"; + for (size_t i = 0; i < dim_idx; i++) { + index_calc_str += " * " + dims[i].second; + } + index_calc_str += ")) % " + dims[dim_idx].second + ";"; + + return index_calc_str; + }; + + std::stringstream indexes_calc_str; + for (size_t i = 0; i < dims.size(); i++) { + indexes_calc_str << generate_calc_function("uint", "data_idx", i); + } + + return indexes_calc_str.str(); +} + +static size_t get_per_iter_elements_number(const dynamic_quantize_params& params) { + const auto maxWorkGroupSize = params.engineInfo.maxWorkGroupSize; + const auto total_grouped_elements = get_elements_number_per_group(params); + + if (total_grouped_elements % maxWorkGroupSize == 0) + return maxWorkGroupSize; + + if (total_grouped_elements < maxWorkGroupSize) + return total_grouped_elements; + + return 0; +} + +ParamsKey DynamicQuantizeKernelKVCache::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::INT8); + k.EnableDifferentTypes(); + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDynamicShapesSupport(); + return k; +} + +JitConstants DynamicQuantizeKernelKVCache::GetJitConstants(const dynamic_quantize_params& params) const { + JitConstants jit = MakeBaseParamsJitConstants(params); + + const std::vector> default_dims = {{"b", "INPUT0_BATCH_NUM"}, + {"f", "INPUT0_FEATURE_NUM"}, + {"y", "INPUT0_SIZE_Y"}, + {"x", "INPUT0_SIZE_X"}}; + + const auto& group_sizes = params.group_sizes; + std::vector> batch_dims, grouped_dims; + for (size_t i = 0; i < group_sizes.size(); i++) { + if (group_sizes[i] == 1) { + batch_dims.push_back(default_dims[i]); + } else { + grouped_dims.push_back(default_dims[i]); + } + } + + const auto& input_dims = get_normalized_dims(params.inputs[0]); 
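// Editor's note (illustrative example, not part of the patch): assuming the batch and feature dims both
// have group size 1, batch_dims == {{"b", "INPUT0_BATCH_NUM"}, {"f", "INPUT0_FEATURE_NUM"}} and
// generate_dims_indexes_calculation() reverses the list (innermost dimension first) and emits
//   uint f = (data_idx / (1)) % INPUT0_FEATURE_NUM;uint b = (data_idx / (1 * INPUT0_FEATURE_NUM)) % INPUT0_BATCH_NUM;
// which is what DECLARE_BATCHED_DIMS_INDEXES(batch_indexes) expands to in dynamic_quantize_gpu_kv_cache.cl.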
+ const auto total_grouped_elements = get_elements_number_per_group(params); + const auto per_iter_elements_number = get_per_iter_elements_number(params); + const auto total_subgroups_number = total_grouped_elements / input_dims.back().v; + + // Drop the last dimensions, since it will be processed in the kernel's loop + grouped_dims.pop_back(); + + const bool append_mode = params.append_axis != -1; + std::pair append_axis_info = {}; + if (append_mode) { + jit.AddConstant(MakeJitConstant("APPEND_MODE", append_mode)); + jit.AddConstant(MakeJitConstant("APPEND_AXIS_NAME", default_dims[params.append_axis].first)); + } + + jit.AddConstant(MakeJitConstant("DECLARE_BATCHED_DIMS_INDEXES(data_idx)", generate_dims_indexes_calculation(batch_dims))); + jit.AddConstant(MakeJitConstant("DECLARE_GROUPED_DIMS_INDEXES(data_idx)", generate_dims_indexes_calculation(grouped_dims))); + jit.AddConstant(MakeJitConstant("SUBGROUPS_NUMBER", total_subgroups_number)); + + const auto iterations_number = total_grouped_elements / per_iter_elements_number; + + jit.AddConstant(MakeJitConstant("ITERATIONS_NUMBER", iterations_number)); + jit.AddConstant(MakeJitConstant("ASYMMETRIC_QUANTIZATION", params.use_asymmetric_quantization)); + jit.AddConstant(MakeJitConstant("GROUP_SCALES_WITH_ZP", params.combine_scales_and_zp)); + + bool rearrange_scales_order = false; + const auto& scales_output_order = params.scales_output_order; + if (!scales_output_order.empty()) { + for (size_t i = 0; i < scales_output_order.size(); i++) { + if (i != scales_output_order[i]) { + rearrange_scales_order = true; + break; + } + } + } + + if (rearrange_scales_order) { + const std::array default_dim_order = {'b', 'f', 'y', 'x'}; + std::stringstream ss; + for (size_t i = 0; i < scales_output_order.size(); i++) { + ss << default_dim_order[scales_output_order[i]]; + + if (i + 1 != scales_output_order.size()) + ss << ", "; + } + + jit.AddConstant(MakeJitConstant("SCALES_OUTPUT_ORDER", ss.str())); + } + + for (size_t i = 0; i < group_sizes.size(); i++) { + jit.AddConstant(MakeJitConstant("GROUP_SIZE_DIM" + std::to_string(i), group_sizes[i])); + } + + return jit; +} + +CommonDispatchData DynamicQuantizeKernelKVCache::SetDefault(const dynamic_quantize_params& params) const { + CommonDispatchData dispatchData; + + const auto& input_dims = get_normalized_dims(params.inputs[0]); + const auto total_batched_elements = get_elements_number_per_batch(params); + const auto total_grouped_elements = get_elements_number_per_group(params); + const auto total_subgroups_number = total_grouped_elements / input_dims.back().v; + + dispatchData.gws = {subgroup_size, total_subgroups_number, total_batched_elements}; + dispatchData.lws = {subgroup_size, total_subgroups_number, 1}; + + return dispatchData; +} + +void DynamicQuantizeKernelKVCache::GetUpdateDispatchDataFunc(KernelData& kd) const { + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) { + const auto& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func"); + kd.kernels[0].params.workGroups.global = dispatchData.gws; + kd.kernels[0].params.workGroups.local = dispatchData.lws; + kd.kernels[0].skip_execution = false; + + if (prim_params.append_axis != -1) { + kd.kernels[0].params.scalars.clear(); + + ScalarDescriptor axis_offset; + axis_offset.t = ScalarDescriptor::Types::UINT32; + axis_offset.v.u32 = static_cast(prim_params.axis_offset); + 
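+            // In append mode the current offset along the append axis is passed as a runtime scalar,
+            // so the kernel can write the newly quantized tokens at the correct position inside the
+            // pre-allocated KV-cache buffer on each enqueue.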
kd.kernels[0].params.scalars.push_back(axis_offset); + } + }; +} + +KernelsData DynamicQuantizeKernelKVCache::GetKernelsData(const Params& params) const { + assert(params.GetType() == KernelType::DYNAMIC_QUANTIZE); + + if (!Validate(params)) + return {}; + + const dynamic_quantize_params& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + + KernelData kd = KernelData::Default(params); + + auto cldnn_jit = GetJitConstants(prim_params); + auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, params); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); + + GetUpdateDispatchDataFunc(kd); + + auto& kernel = kd.kernels[0]; + FillCLKernelData(kernel, + dispatchData, + params.engineInfo, + kernelName, + jit, + entry_point, + EXE_MODE_DEFAULT, + false, + false, + 1, + GetFusedPrimitiveInputsCount(params), + static_cast(prim_params.outputs.size()), + prim_params.is_shape_agnostic); + + if (prim_params.append_axis != -1) + kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0}); + + return {kd}; +} + +KernelsPriority DynamicQuantizeKernelKVCache::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_3; +} + +bool DynamicQuantizeKernelKVCache::Validate(const Params& params) const { + if (!KernelBaseOpenCL::Validate(params)) + return false; + + const auto& dq_params = static_cast(params); + + const auto& group_sizes = dq_params.group_sizes; + const auto& input_dims = get_normalized_dims(dq_params.inputs[0]); + const size_t non_compressed_dims_number = std::count(group_sizes.begin(), group_sizes.end(), 1); + + if (non_compressed_dims_number == group_sizes.size()) + return false; + + for (size_t i = 0; i < group_sizes.size(); i++) { + if (group_sizes[i] != 1 && input_dims[i].is_dynamic) { + return false; + } + } + + // Last dimension should be static, reduced by group_sizes configuration and divisible by 16 + if (group_sizes.back() == 1 || input_dims.back().is_dynamic || input_dims.back().v % subgroup_size != 0) + return false; + + // Limit the size of the innermost dimension + if (input_dims.back().v > 256) + return false; + + // In case of HEADS_NUM * HEAD_SIZE group size, check that it fits into the supported workgroup size limit + if (get_elements_number_per_group(dq_params) / input_dims.back().v >= params.engineInfo.maxWorkGroupSize / subgroup_size) + return false; + + return true; +} +} // namespace kernel_selector + diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp index 3b214848e2f8ad..b7a9b40191da4e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp @@ -26,6 +26,39 @@ JitConstants DynamicQuantizeKernelRef::GetJitConstants(const dynamic_quantize_pa jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0])); + bool rearrange_scales = false; + const auto& scales_output_order = params.scales_output_order; + if (!scales_output_order.empty()) { + for (size_t i = 0; i < scales_output_order.size(); i++) { + if (i != scales_output_order[i]) { + rearrange_scales = true; + break; + } + } + } + + if (rearrange_scales) { + const std::array default_dim_order = {'b', 'f', 'y', 'x'}; + + std::stringstream ss; + for (size_t i = 0; i < scales_output_order.size(); i++) { + ss << 
default_dim_order[scales_output_order[i]]; + + if (i + 1 != scales_output_order.size()) + ss << ", "; + } + + jit.AddConstant(MakeJitConstant("SCALES_OUTPUT_ORDER", ss.str())); + } + + jit.AddConstant(MakeJitConstant("ASYMMETRIC_QUANTIZATION", params.use_asymmetric_quantization)); + jit.AddConstant(MakeJitConstant("GROUP_SCALES_WITH_ZP", params.combine_scales_and_zp)); + + const auto& group_sizes = params.group_sizes; + for (size_t i = 0; i < group_sizes.size(); i++) { + jit.AddConstant(MakeJitConstant("GROUP_SIZE_DIM" + std::to_string(i), group_sizes[i])); + } + return jit; } @@ -34,7 +67,14 @@ CommonDispatchData DynamicQuantizeKernelRef::SetDefault(const dynamic_quantize_p CommonDispatchData dispatchData; OPENVINO_ASSERT(params.outputs[0].GetLayout() == DataLayout::bfyx, "It supports only 4d tensor"); - dispatchData.gws = {params.outputs[0].Batch().v * params.outputs[0].Feature().v, 1, 1}; + + const auto& group_sizes = params.group_sizes; + auto batch_size = group_sizes[0] == 1 ? params.outputs[0].Batch().v : 1; + auto feature_size = group_sizes[1] == 1 ? params.outputs[0].Feature().v : 1; + auto y_size = group_sizes[2] == 1 ? params.outputs[0].Y().v : 1; + auto x_size = group_sizes[3] == 1 ? params.outputs[0].X().v : 1; + + dispatchData.gws = {batch_size * feature_size, y_size, x_size}; dispatchData.lws = {1, 1, 1}; return dispatchData; @@ -94,6 +134,10 @@ bool DynamicQuantizeKernelRef::Validate(const Params& params) const { if (!KernelBaseOpenCL::Validate(params)) return false; + const auto& prim_params = static_cast(params); + if (prim_params.group_sizes.size() != 4) + return false; + return true; } } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h index c46b6b2685a940..d437d6ab6eb1f6 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h @@ -13,6 +13,13 @@ namespace kernel_selector { struct dynamic_quantize_params : public base_params { dynamic_quantize_params() : base_params(KernelType::DYNAMIC_QUANTIZE) {} size_t fc_ifm_size = 0; + + int64_t append_axis = -1; + int64_t axis_offset = -1; + std::vector group_sizes; + std::vector scales_output_order; + bool use_asymmetric_quantization = false; + bool combine_scales_and_zp = false; }; class DynamicQuantizeKernelRef : public KernelBaseOpenCL { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp index 6ca9fbd2f5bd76..d38cf6ad2b4e52 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp @@ -5,11 +5,13 @@ #include "dynamic_quantize_kernel_selector.h" #include "dynamic_quantize_kernel_ref.h" #include "dynamic_quantize_kernel_opt.h" +#include "dynamic_quantize_kernel_kv_cache.h" namespace kernel_selector { dynamic_quantize_kernel_selector::dynamic_quantize_kernel_selector() { Attach(); Attach(); + Attach(); } KernelsData dynamic_quantize_kernel_selector::GetBestKernels(const Params& params) const { diff --git 
a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp index 7556debd29df00..e2a538750d1615 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp @@ -4,6 +4,7 @@ #include "sdpa_kernel_base.h" #include "kernel_selector_utils.h" +#include "intel_gpu/runtime/debug_configuration.hpp" namespace kernel_selector { @@ -73,6 +74,8 @@ JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const { jit.AddConstant(MakeJitConstant("DO_BROADCAST_KEY_VALUE", GetBroadcastInputStr(params.inputs[0].GetDims().size(), params.conf.broadcast_axis, params.conf.group_size))); + } else { + jit.AddConstant(MakeJitConstant("BROADCAST_GROUP_SIZE", 1)); } jit.AddConstant(MakeJitConstant("IS_CAUSAL", params.conf.is_causal)); @@ -81,6 +84,21 @@ JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const { jit.AddConstant(MakeJitConstant("HAS_SCALE_INPUT", params.inputs.size() > 4)); } + jit.AddConstant(MakeJitConstant("IS_KV_COMPRESSED", params.conf.is_kv_compressed)); + + if (params.conf.is_kv_compressed) { + jit.AddConstant(MakeJitConstant("USE_ASYMMETRIC_QUANTIZATION", params.conf.use_asymmetric_quantization)); + jit.AddConstant(MakeJitConstant("COMBINE_SCALES_AND_ZP", params.conf.combine_scales_and_zp)); + jit.AddConstant(MakeJitConstant("COMPRESSED_PER_HEAD", params.conf.per_head_quantization)); + jit.AddConstant(MakeJitConstant("KEY_COMPRESSION_SCALE", params.key_cache_comp_scale)); + jit.AddConstant(MakeJitConstant("VALUE_COMPRESSION_SCALE", params.value_cache_comp_scale)); + + if (params.conf.use_asymmetric_quantization && !params.conf.combine_scales_and_zp) { + jit.AddConstant(MakeJitConstant("KEY_COMPRESSION_ZP", params.key_cache_comp_zp)); + jit.AddConstant(MakeJitConstant("VALUE_COMPRESSION_ZP", params.value_cache_comp_zp)); + } + } + auto is_default_order = [](const std::vector& order) { for (size_t i = 0; i < order.size(); i++) if (order[i] != static_cast(i)) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h index 492e86ebcce5cc..493bd0acedea32 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h @@ -88,6 +88,10 @@ struct sdpa_configuration { bool is_causal = false; bool has_alibi_input = false; + bool is_kv_compressed = false; + bool use_asymmetric_quantization = false; + bool combine_scales_and_zp = false; + bool per_head_quantization = false; // Paged Attention configuration bool is_paged_attention = false; @@ -110,6 +114,10 @@ struct sdpa_params : public base_params { int64_t indirect_axis = -1; DataTensor beam_table; + DataTensor key_cache_comp_scale; + DataTensor key_cache_comp_zp; + DataTensor value_cache_comp_scale; + DataTensor value_cache_comp_zp; sdpa_configuration conf; }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp index 974d4532c84e60..a6fa66f4799d3f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp @@ -344,6 +344,9 @@ bool SDPAKernelMicro::Validate(const Params& p) const { if 
(params.conf.head_size > 256) return false; + if (params.conf.is_kv_compressed) + return false; + return true; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp index 6942e5f8ea4357..4e71064efbc895 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp @@ -4,6 +4,7 @@ #include "sdpa_kernel_opt.h" #include "kernel_selector_utils.h" +#include "common_types.h" #include #include @@ -21,11 +22,16 @@ constexpr size_t subgroup_size = 16; } // namespace static size_t get_sg_number_scale_factor(const sdpa_params& sdpa_params, size_t kernel_type) { + const size_t optimal_scale_factor = 2; if (kernel_type == KernelsTypes::MULTI_TOKENS) { - const size_t optimal_scale_factor = 2; if (sdpa_params.conf.head_size * optimal_scale_factor <= sdpa_params.engineInfo.maxWorkGroupSize) { return optimal_scale_factor; } + } else if (kernel_type == KernelsTypes::SINGLE_TOKEN) { + if (sdpa_params.conf.head_size * optimal_scale_factor <= sdpa_params.engineInfo.maxWorkGroupSize && + sdpa_params.conf.head_size * optimal_scale_factor / subgroup_size <= subgroup_size) { + return optimal_scale_factor; + } } return 1; @@ -126,6 +132,7 @@ static std::string GetKernelName(std::string base_name, KernelsTypes type, const ParamsKey SDPAKernelOpt::GetSupportedKey() const { ParamsKey k; + k.EnableInputDataType(Datatype::INT8); k.EnableInputDataType(Datatype::F16); k.EnableInputDataType(Datatype::F32); k.EnableInputDataType(Datatype::INT32); @@ -154,6 +161,9 @@ bool SDPAKernelOpt::Validate(const Params& p) const { if (params.conf.head_size < 1 || params.conf.head_size % subgroup_size != 0) return false; + if (params.conf.use_asymmetric_quantization && !params.conf.combine_scales_and_zp) + return false; + return true; } @@ -233,10 +243,11 @@ CommonDispatchData SDPAKernelOpt::SetDefault(const sdpa_params& params, size_t k const size_t target_seq_len_block_size = kernel_idx == 1 ? 
get_target_seq_len_block_size() : 1; if (kernel_idx == KernelsTypes::SINGLE_TOKEN) { + const size_t sg_num_scale = get_sg_number_scale_factor(params, kernel_idx); dispatch_data.gws = { batch_size * heads_num, CeilDiv(target_seq_len, target_seq_len_block_size), - head_size * num_of_partitions }; - dispatch_data.lws = { 1, 1, head_size }; + head_size * num_of_partitions * sg_num_scale }; + dispatch_data.lws = { 1, 1, head_size * sg_num_scale }; } else if (kernel_idx == KernelsTypes::MULTI_TOKENS) { const size_t sg_num_scale = get_sg_number_scale_factor(params, kernel_idx); dispatch_data.gws = { batch_size * heads_num, @@ -309,8 +320,20 @@ KernelsData SDPAKernelOpt::GetKernelsData(const Params& params) const { static_cast(prim_params.outputs.size()), prim_params.is_shape_agnostic); - if (prim_params.indirect_axis != -1 && kernel_idx != KernelsTypes::FINALIZATION) - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(prim_params.inputs.size())}); + auto beam_table_idx = prim_params.inputs.size(); + if (prim_params.conf.is_kv_compressed && kernel_idx != KernelsTypes::FINALIZATION) { + auto key_cache_compression_scale_idx = static_cast(prim_params.inputs.size()); + auto value_cache_compression_scale_idx = static_cast(prim_params.inputs.size() + 1); + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, key_cache_compression_scale_idx}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, value_cache_compression_scale_idx}); + + beam_table_idx += 2; + } + + if (prim_params.indirect_axis != -1 && kernel_idx != KernelsTypes::FINALIZATION) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(beam_table_idx)}); + } kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp index 579c4bc06c17e2..0d551883b6c385 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp @@ -12,6 +12,7 @@ namespace kernel_selector { ParamsKey SDPAKernelRef::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::INT8); k.EnableInputDataType(Datatype::F32); // beam table input k.EnableInputDataType(Datatype::INT32); @@ -74,8 +75,26 @@ KernelsData SDPAKernelRef::GetKernelsData(const Params& params) const { "", false, false, static_cast(prim_params.inputs.size()), GetFusedPrimitiveInputsCount(params), 1, prim_params.is_shape_agnostic); - if (prim_params.indirect_axis != -1) - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(prim_params.inputs.size())}); + auto beam_table_idx = prim_params.inputs.size(); + if (prim_params.conf.is_kv_compressed) { + auto key_cache_compression_scale_idx = static_cast(prim_params.inputs.size()); + auto value_cache_compression_scale_idx = static_cast(prim_params.inputs.size() + 1); + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, key_cache_compression_scale_idx}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, value_cache_compression_scale_idx}); + + if (prim_params.conf.use_asymmetric_quantization && !prim_params.conf.combine_scales_and_zp) { + 
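+            // With asymmetric quantization and separate scale/zero-point buffers, two more runtime
+            // inputs (key and value zero points) follow the compression scales, and the beam table
+            // input index is shifted by two additional positions.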
kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, key_cache_compression_scale_idx + 2}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, value_cache_compression_scale_idx + 2}); + beam_table_idx += 2; + } + + beam_table_idx += 2; + } + + if (prim_params.indirect_axis != -1) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(beam_table_idx)}); + } kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); diff --git a/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp b/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp index 7574b664b6b4b7..8173a29c1b35f8 100644 --- a/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp +++ b/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp @@ -152,5 +152,64 @@ VariableState::Ptr VariableStateIndirectKVCache::get_beam_table_state() const { return m_hidden_states[1]; } +VariableStateIndirectKVCacheCompressed::VariableStateIndirectKVCacheCompressed( + const VariableStateInfo& info, + std::shared_ptr context, + std::shared_ptr shape_predictor, + const std::vector& output_layouts, + size_t beam_idx, + size_t concat_idx, + bool has_zp_state = false) + : VariableStateIndirectKVCache(info, context, shape_predictor, beam_idx, concat_idx), + m_has_zp_state(has_zp_state) { + OPENVINO_ASSERT((has_zp_state && output_layouts.size() == 3) || + (!has_zp_state && output_layouts.size() == 2), + "[GPU] Unexpected number of output layouts for VariableStateIndirectKVCacheCompressed"); + + const auto compression_scale_layout = output_layouts[1]; + VariableStateInfo compression_scale_state_info(info.m_id + "/comp_scale", compression_scale_layout); + m_hidden_states.push_back(std::make_shared(compression_scale_state_info, context, shape_predictor)); + + if (has_zp_state) { + const auto compression_zp_layout = output_layouts[2]; + VariableStateInfo compression_zp_state_info(info.m_id + "/comp_zp", compression_zp_layout); + m_hidden_states.push_back(std::make_shared(compression_zp_state_info, context, shape_predictor)); + } + + OPENVINO_ASSERT((!m_has_zp_state && m_hidden_states.size() == 3) || (m_has_zp_state && m_hidden_states.size() == 4), + "[GPU] VariableStateIndirectKVCacheCompressed expects 3 or 4 internal states to be initialized, " + "actual number is ", m_hidden_states.size()); +} + +VariableState::Ptr VariableStateIndirectKVCacheCompressed::get_compression_scale_state() const { + return m_hidden_states[2]; +} + +void VariableStateIndirectKVCacheCompressed::set_compression_scale_layout(const cldnn::layout& new_layout) { + m_hidden_states[2]->set_layout(new_layout); +} + +VariableState::Ptr VariableStateIndirectKVCacheCompressed::get_compression_zp_state() const { + OPENVINO_ASSERT(m_has_zp_state); + return m_hidden_states[3]; +} + +void VariableStateIndirectKVCacheCompressed::set_compression_zp_layout(const cldnn::layout& new_layout) { + OPENVINO_ASSERT(m_has_zp_state); + m_hidden_states[3]->set_layout(new_layout); +} + +bool VariableStateIndirectKVCacheCompressed::has_zp_state() const { + return m_has_zp_state; +} + +void VariableStateIndirectKVCacheCompressed::set_state(const ov::SoPtr& state) { + OPENVINO_THROW("[GPU] set_state API is supported only when KV-cache compression is disabled"); +} + +ov::SoPtr VariableStateIndirectKVCacheCompressed::get_state() const { + OPENVINO_THROW("[GPU] get_state API is supported only when KV-cache compression is disabled"); +} + } // namespace intel_gpu } // namespace ov diff 
--git a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp index 0373251e45c051..85f28cbd711678 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp @@ -7,6 +7,7 @@ #include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/primitives/dynamic_quantize.hpp" + namespace ov { namespace intel_gpu { @@ -15,15 +16,12 @@ static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptrget_group_sizes(); - for (size_t i = 0; i < group_sizes.size() - 1; i++) - OPENVINO_ASSERT(group_sizes[i] == 1, "Not supported group size at ", i, ": ", group_sizes[i]); - - OPENVINO_ASSERT(group_sizes.back() == UINT64_MAX, "Not supported group size: ", group_sizes.back()); auto prim = cldnn::dynamic_quantize(primitive_name, - inputs[0], - op->get_group_sizes().back(), - get_output_data_types(op)); + inputs[0], + op->get_attrs()); + + prim.num_outputs = op->get_output_size(); + p.add_primitive(*op, prim); } diff --git a/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp index c2ee336e48bf06..251c7346db9209 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp @@ -3,6 +3,7 @@ // #include "intel_gpu/op/kv_cache.hpp" +#include "intel_gpu/op/kv_cache_compressed.hpp" #include "intel_gpu/plugin/program_builder.hpp" #include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/primitives/kv_cache.hpp" @@ -12,6 +13,7 @@ namespace ov { namespace op { namespace internal { using KVCache = ov::intel_gpu::op::KVCache; +using KVCacheCompressed = ov::intel_gpu::op::KVCacheCompressed; } // namespace internal } // namespace op } // namespace ov @@ -26,11 +28,31 @@ void CreateKVCacheOp(ProgramBuilder& p, const std::shared_ptrget_input_partial_shape(0).size(); auto prim = cldnn::kv_cache(layer_type_name_ID(op), - inputs, - op->get_variable()->get_info(), - ov::util::normalize(op->get_concat_axis(), rank), - ov::util::normalize(op->get_gather_axis(), rank), - op->get_indirect()); + inputs, + op->get_variable()->get_info(), + ov::util::normalize(op->get_concat_axis(), rank), + ov::util::normalize(op->get_gather_axis(), rank), + op->get_indirect()); + + prim.num_outputs = op->get_output_size(); + prim.output_data_types = get_output_data_types(op); + + p.add_primitive(*op, prim); +} + +void CreateKVCacheCompressedOp(ProgramBuilder& p, const std::shared_ptr& op) { + validate_inputs_count(op, {4, 5}); + auto inputs = p.GetInputInfo(op); + int64_t rank = op->get_input_partial_shape(0).size(); + auto prim = cldnn::kv_cache(layer_type_name_ID(op), + inputs, + op->get_variable()->get_info(), + ov::util::normalize(op->get_concat_axis(), rank), + ov::util::normalize(op->get_gather_axis(), rank), + op->get_indirect()); + + prim.compressed = true; + prim.quantization_attributes = op->get_quantization_attrs(); prim.num_outputs = op->get_output_size(); prim.output_data_types = get_output_data_types(op); @@ -41,6 +63,7 @@ void CreateKVCacheOp(ProgramBuilder& p, const std::shared_ptr& op) { - validate_inputs_count(op, {4, 5, 6}); auto inputs = p.GetInputInfo(op); auto layerName = layer_type_name_ID(op); bool is_causal = op->get_causal(); + const auto compression_inputs = op->get_compression_inputs_num(); + validate_inputs_count(op, {4 + compression_inputs, 5 + compression_inputs, 6 + compression_inputs}); + int64_t indirect_axis = op->get_indirect_axis(); auto sdpa_prim = 
cldnn::scaled_dot_product_attention(layerName, inputs, @@ -75,7 +77,9 @@ static void CreateIndirectSDPAOp(ProgramBuilder& p, const std::shared_ptrget_input0_transpose_order(), op->get_input1_transpose_order(), op->get_input2_transpose_order(), - op->get_output_transpose_order()); + op->get_output_transpose_order(), + op->get_quantization_attrs(), + op->get_kv_compressed()); p.add_primitive(*op, sdpa_prim); } diff --git a/src/plugins/intel_gpu/src/plugin/ops/variable.cpp b/src/plugins/intel_gpu/src/plugin/ops/variable.cpp index d655e297e4a2c6..a4354c51092ac8 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/variable.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/variable.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // + #include "intel_gpu/plugin/program_builder.hpp" #include "intel_gpu/plugin/common_utils.hpp" #include "openvino/core/type/element_type.hpp" @@ -9,6 +10,7 @@ #include "openvino/op/read_value.hpp" #include "transformations/rt_info/original_precision_attribute.hpp" #include "intel_gpu/op/read_value.hpp" +#include "intel_gpu/op/read_values.hpp" #include "intel_gpu/primitives/assign.hpp" #include "intel_gpu/primitives/read_value.hpp" @@ -16,6 +18,7 @@ namespace ov { namespace op { namespace internal { using ReadValue = ov::intel_gpu::op::ReadValue; +using ReadValues = ov::intel_gpu::op::ReadValues; } // namespace internal } // namespace op } // namespace ov @@ -39,7 +42,7 @@ void CreateVariableAccessPrimitive(ProgramBuilder &p, const std::shared_ptr CreateVariableAccessPrimitive(p, op, op->get_variable_id()); } +void CreateReadValuesOp(ProgramBuilder& p, const std::shared_ptr& op) { + std::vector variable_layouts; + for (size_t i = 0; i < op->get_output_size(); i++) { + const auto output_pshape = op->get_output_partial_shape(i); + const auto output_dtype = cldnn::element_type_to_data_type(op->get_output_element_type(i)); + const auto output_format = cldnn::format::get_default_format(output_pshape.size()); + variable_layouts.emplace_back(output_pshape, output_dtype, output_format); + } + + auto inputs = p.GetInputInfo(op); + auto user_specified_type = get_original_precision(op); + auto prim = cldnn::read_value{layer_type_name_ID(op), + inputs, + op->get_variable_id(), + variable_layouts, + user_specified_type}; + + p.add_primitive(*op, prim); +} + } // namespace REGISTER_FACTORY_IMPL(v3, Assign); @@ -89,6 +112,7 @@ REGISTER_FACTORY_IMPL(v6, Assign); REGISTER_FACTORY_IMPL(v3, ReadValue); REGISTER_FACTORY_IMPL(v6, ReadValue); REGISTER_FACTORY_IMPL(internal, ReadValue); +REGISTER_FACTORY_IMPL(internal, ReadValues); } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp index 58e99e037fb931..26771117e2e786 100644 --- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp @@ -8,6 +8,7 @@ #include "openvino/core/validation_util.hpp" #include "intel_gpu/primitives/kv_cache.hpp" +#include "intel_gpu/primitives/read_value.hpp" #include "intel_gpu/plugin/usm_host_tensor.hpp" #include "intel_gpu/plugin/sync_infer_request.hpp" #include "intel_gpu/plugin/remote_context.hpp" @@ -646,19 +647,40 @@ void SyncInferRequest::allocate_states() { bool indirect_kv_cache = false; int64_t beam_axis = 0; int64_t concat_axis = 0; + bool compressed = false; + bool has_zp_state = false; auto kv_cache_shape = vi.second.m_layout.get_partial_shape(); + std::vector states_layouts; for (auto& p : state_prims) { if (auto 
kv_cache_prim = dynamic_cast(p)) { indirect_kv_cache = kv_cache_prim->indirect; beam_axis = ov::util::normalize(kv_cache_prim->gather_axis, kv_cache_shape.size()); concat_axis = ov::util::normalize(kv_cache_prim->concat_axis, kv_cache_shape.size()); + compressed = kv_cache_prim->compressed; + has_zp_state = kv_cache_prim->get_compression_zp_inputs_num() > 0; + } else if (auto read_value = dynamic_cast(p)) { + states_layouts = read_value->output_layouts; } } - if (indirect_kv_cache) { - m_variables.emplace(vi.first, std::make_shared(vi.second, m_context, m_shape_predictor, beam_axis, concat_axis)); + if (compressed) { + m_variables.emplace(vi.first, std::make_shared(vi.second, + m_context, + m_shape_predictor, + states_layouts, + beam_axis, + concat_axis, + has_zp_state)); + } else if (indirect_kv_cache) { + m_variables.emplace(vi.first, std::make_shared(vi.second, + m_context, + m_shape_predictor, + beam_axis, + concat_axis)); } else { - m_variables.emplace(vi.first, std::make_shared(vi.second, m_context, m_shape_predictor)); + m_variables.emplace(vi.first, std::make_shared(vi.second, + m_context, + m_shape_predictor)); } } } diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index eb16213bcb936c..68328160a98f82 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -61,7 +61,14 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(uint64_t group_size auto rank = m_fc->get_input_partial_shape(0).size(); std::vector shape_group_size(rank, 1); shape_group_size.back() = group_size; - auto dyn_quan = std::make_shared(m_data, shape_group_size, element::f16); + + ov::op::internal::DynamicQuantize::Attributes config; + config.quantization_dt = element::i8; + config.quantization_type = ov::op::internal::DynamicQuantize::QuantizationType::Symmetric; + config.scale_dt = element::f16; + config.group_sizes = shape_group_size; + + auto dyn_quan = std::make_shared(m_data, config); auto optional_w_zp = m_fc->get_input_size() > 4 ? 
m_fc->get_input_node_shared_ptr(4) : std::make_shared(); auto output_type = m_fc->get_output_type(); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp new file mode 100644 index 00000000000000..561822f9661109 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp @@ -0,0 +1,292 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "kv_cache_compression.hpp" + +#include "intel_gpu/op/kv_cache.hpp" +#include "intel_gpu/op/kv_cache_compressed.hpp" +#include "intel_gpu/op/indirect_sdpa.hpp" +#include "intel_gpu/op/read_value.hpp" +#include "intel_gpu/op/read_values.hpp" +#include "intel_gpu/plugin/common_utils.hpp" +#include "intel_gpu/runtime/debug_configuration.hpp" +#include "ov_ops/dynamic_quantize.hpp" + +#include "openvino/core/node_vector.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/sink.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pattern/op/label.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "openvino/pass/visualize_tree.hpp" +#include "transformations/utils/utils.hpp" + +#include + +namespace ov { +namespace intel_gpu { + +namespace { +std::vector get_variable_infos(const ov::op::util::VariableInfo& data_variable_info, + const ov::op::internal::DynamicQuantize::Attributes& quantization_attrs) { + std::vector infos; + + // Add initial data variable info + infos.push_back(data_variable_info); + + // Infer DQ shapes + ov::op::internal::DynamicQuantize dq; + dq.set_attrs(quantization_attrs); + + auto dq_shapes = ov::op::internal::DynamicQuantize::shape_infer(&dq, {data_variable_info.data_shape}); + + const auto variable_id = data_variable_info.variable_id; + const auto scale_shape = dq_shapes[1]; + const auto scale_dt = quantization_attrs.scale_dt; + + // Add scales variable info + infos.push_back(ov::op::util::VariableInfo{scale_shape, scale_dt, variable_id}); + + if (quantization_attrs.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && + quantization_attrs.output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::Planar) { + // Add zero points variable info + const auto zp_dt = quantization_attrs.zp_dt; + infos.push_back(ov::op::util::VariableInfo{scale_shape, zp_dt, variable_id}); + } + + return infos; +} + +std::shared_ptr + update_past_read_value(std::shared_ptr past_rv_node, + const ov::op::internal::DynamicQuantize::Attributes& quantization_attrs) { + auto variable = past_rv_node->get_variable(); + variable->update_data_type(quantization_attrs.quantization_dt); + + auto variable_infos = get_variable_infos(past_rv_node->get_variable()->get_info(), quantization_attrs); + auto new_past_rv_node = std::make_shared(); + + if (past_rv_node->get_input_size() == 0) { + new_past_rv_node = std::make_shared(past_rv_node->get_variable(), variable_infos); + } else { + auto initializer_dq = std::make_shared(past_rv_node->get_input_node_shared_ptr(0), + quantization_attrs); + 
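+        // If the ReadValue has an initializing subgraph, its initializer is quantized with the same
+        // attributes, so the variable's initial state is produced in the compressed layout (data,
+        // scales and, for the planar asymmetric case, zero points) used by the rest of the
+        // compressed KV-cache path.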
initializer_dq->set_friendly_name(past_rv_node->get_input_node_shared_ptr(0)->get_friendly_name() + "_dyn_quan"); + ov::copy_runtime_info(past_rv_node->get_input_node_shared_ptr(0), initializer_dq); + + OutputVector initializer_outputs = { initializer_dq->output(0), initializer_dq->output(1) }; + + if (quantization_attrs.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && + quantization_attrs.output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::Planar) + initializer_outputs.push_back(initializer_dq->output(2)); + + new_past_rv_node = std::make_shared(initializer_outputs, past_rv_node->get_variable(), variable_infos); + } + + ov::copy_runtime_info(past_rv_node, new_past_rv_node); + past_rv_node->output(0).replace(new_past_rv_node->output(0)); + + return new_past_rv_node; +} + +std::shared_ptr + update_kv_cache(std::shared_ptr past_rv_node, + std::shared_ptr kv_cache_node, + const ov::op::internal::DynamicQuantize::Attributes& quantization_attrs) { + OutputVector kv_cache_inputs = { past_rv_node->output(0), + kv_cache_node->get_input_node_shared_ptr(1), + kv_cache_node->get_input_node_shared_ptr(2), + past_rv_node->output(1) }; + + if (quantization_attrs.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && + quantization_attrs.output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::Planar) + kv_cache_inputs.push_back(past_rv_node->output(2)); + + auto new_kv_cache = std::make_shared(kv_cache_inputs, + kv_cache_node->get_variable(), + kv_cache_node->get_concat_axis(), + kv_cache_node->get_gather_axis(), + quantization_attrs); + + new_kv_cache->set_friendly_name(kv_cache_node->get_friendly_name()); + ov::copy_runtime_info(kv_cache_node, new_kv_cache); + + return new_kv_cache; +} +} // namespace + +class KVCacheCompressionMatcher : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("KVCacheCompressionMatcher", "0"); + KVCacheCompressionMatcher(ov::element::Type compression_dt); +}; + +KVCacheCompressionMatcher::KVCacheCompressionMatcher(ov::element::Type compression_dt) { + using namespace ov::pass::pattern; + + if (compression_dt != element::i8) + return; + + const auto quantization_type = ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric; + const auto output_storage_type = ov::op::internal::DynamicQuantize::OutputStorageType::InterleavedScalesZP; + + bool combine_scales_and_zp = output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::InterleavedScalesZP; + GPU_DEBUG_LOG << "KV-cache compression configuration: " + << "dt=" << compression_dt << ", " + << "asym=" << (quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric) << ", " + << "single_buffer_for_scales_and_zp=" << combine_scales_and_zp << "\n"; + + auto query = any_input(); + + auto key_past = wrap_type(); + auto key_new_token = any_input(); + auto key_beam_idx = any_input(); + auto key_cache = wrap_type({key_past, key_new_token, key_beam_idx}); + + auto value_past = wrap_type(); + auto value_new_token = any_input(); + auto value_beam_idx = any_input(); + auto value_cache = wrap_type({value_past, value_new_token, value_beam_idx}); + + auto input_attn_mask = any_input(); + auto input_scale = any_input(); + auto input_beam_table = any_input(); + + auto sdpa_without_attn_mask_m = wrap_type({ query, key_cache, value_cache, input_beam_table }); + auto sdpa_with_attn_mask_m = wrap_type({ query, key_cache, value_cache, input_attn_mask, input_beam_table }); + 
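+    // Three IndirectSDPA variants are matched: without an attention mask, with an attention mask,
+    // and with both an attention mask and a scale input; the beam table is always the last data input.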
auto sdpa_with_attn_mask_and_scale_m = + wrap_type({ query, key_cache, value_cache, input_attn_mask, input_scale, input_beam_table }); + + auto sdpa = std::make_shared(OutputVector{sdpa_without_attn_mask_m, sdpa_with_attn_mask_m, sdpa_with_attn_mask_and_scale_m}); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { + if (transformation_callback(m.get_match_root())) { + return false; + } + + const auto& pattern_map = m.get_pattern_value_map(); + + auto query_node = pattern_map.at(query).get_node_shared_ptr(); + + auto key_new_token_node = pattern_map.at(key_new_token).get_node_shared_ptr(); + auto key_cache_node = std::dynamic_pointer_cast(pattern_map.at(key_cache).get_node_shared_ptr()); + auto value_cache_node = std::dynamic_pointer_cast(pattern_map.at(value_cache).get_node_shared_ptr()); + auto sdpa_node = std::dynamic_pointer_cast(m.get_match_root()); + + auto key_past_rv_node = std::dynamic_pointer_cast(pattern_map.at(key_past).get_node_shared_ptr()); + auto value_past_rv_node = std::dynamic_pointer_cast(pattern_map.at(value_past).get_node_shared_ptr()); + + auto data_rank = key_cache_node->get_input_partial_shape(0).size(); + auto get_shape_group_sizes = [&](const std::vector& transposed_order) { + std::vector group_sizes(data_rank, 1); + std::vector order = transposed_order; + if (transposed_order.size() != data_rank) { + order.resize(data_rank); + std::iota(order.begin(), order.end(), 0); + } + + group_sizes[order[data_rank - 1]] = UINT64_MAX; + + return group_sizes; + }; + + // Reorder scales in static order: [batch, num_heads, seq_len, head_size] + auto get_scales_output_order = [&](const std::vector& transposed_order) { + std::vector scales_zp_output_order(data_rank); + scales_zp_output_order[0] = transposed_order[0]; + scales_zp_output_order[1] = transposed_order[1]; + scales_zp_output_order[2] = transposed_order[2]; + scales_zp_output_order[3] = transposed_order[3]; + + return scales_zp_output_order; + }; + + ov::op::internal::DynamicQuantize::Attributes config; + config.quantization_type = quantization_type; + config.group_sizes = get_shape_group_sizes(sdpa_node->get_input1_transpose_order()); + config.quantization_dt = element::i8; + config.scale_dt = query_node->get_output_element_type(0); + config.scales_zp_output_order = get_scales_output_order(sdpa_node->get_input1_transpose_order()); + config.output_storage_type = output_storage_type; + + if (config.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric) + config.zp_dt = query_node->get_output_element_type(0); + + key_past_rv_node = update_past_read_value(key_past_rv_node, config); + value_past_rv_node = update_past_read_value(value_past_rv_node, config); + + auto new_key_cache = update_kv_cache(key_past_rv_node, key_cache_node, config); + auto new_value_cache = update_kv_cache(value_past_rv_node, value_cache_node, config); + + OutputVector sdpa_inputs; + // Add Query, Key, Value, attention_mask, scale inputs + for (size_t i = 0; i < sdpa_node->get_input_size() - 1; i++) + sdpa_inputs.push_back(sdpa_node->get_input_node_shared_ptr(i)); + + // Replace Key and Value inputs with compressed ones + sdpa_inputs[1] = new_key_cache->output(0); + sdpa_inputs[2] = new_value_cache->output(0); + + // Add Key and Value compression scales + sdpa_inputs.push_back(new_key_cache->output(2)); + sdpa_inputs.push_back(new_value_cache->output(2)); + + // Add Key and Value compression zero points + if (config.quantization_type == 
ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && + config.output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::Planar) { + sdpa_inputs.push_back(new_key_cache->output(3)); + sdpa_inputs.push_back(new_value_cache->output(3)); + } + + auto input0_transpose_order = sdpa_node->get_input0_transpose_order(); + auto input1_transpose_order = sdpa_node->get_input1_transpose_order(); + auto input2_transpose_order = sdpa_node->get_input2_transpose_order(); + auto output_transpose_order = sdpa_node->get_output_transpose_order(); + + auto new_sdpa = std::make_shared(sdpa_inputs, + new_key_cache->output(1), + sdpa_node->get_causal(), + sdpa_node->get_indirect_axis(), + input0_transpose_order, + input1_transpose_order, + input2_transpose_order, + output_transpose_order, + config, + sdpa_node->get_output_type()); + + new_key_cache->set_friendly_name(key_cache_node->get_friendly_name()); + ov::copy_runtime_info(key_cache_node, new_key_cache); + + new_value_cache->set_friendly_name(value_cache_node->get_friendly_name()); + ov::copy_runtime_info(value_cache_node, new_value_cache); + + new_sdpa->set_friendly_name(sdpa_node->get_friendly_name()); + ov::copy_runtime_info(sdpa_node, new_sdpa); + + ov::replace_node(sdpa_node, new_sdpa); + return true; + }; + + auto m = std::make_shared(sdpa, "KVCacheCompressionMatcher"); + this->register_matcher(m, callback); +} + +bool KVCacheCompression::run_on_model(const std::shared_ptr& m) { + return pass::GraphRewrite::run_on_model(m); +} + +KVCacheCompression::KVCacheCompression(ov::element::Type compression_dt) { + add_matcher(compression_dt); +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp new file mode 100644 index 00000000000000..1587021a03ed36 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + + +/// Add dynamic quantization node and fuse it with KV cache operation +/// +/// ┌───────────┐ ┌─────────────┐ ┌───────────┐ ┌─────────────┐ +/// │ New Key │ │ New Value │ │ New Key │ │ New Value │ +/// └──────┬────┘ └──────┬──────┘ └──────┬────┘ └──────┬──────┘ +/// │ │ │ │ +/// │ f16 │ f16 │ f16 │ f16 +/// │ │ ==> │ │ +/// ┌─────────┐ ┌────────┴─────────┐ ┌────────┴───────────┐ ┌─────────┐ ┌────────┴─────────┐ ┌────────┴───────────┐ +/// │ Query │ │ KV cache │ │ KV cache │ │ Query │ │ KV cache + DQ │ │ KV cache + DQ │ +/// | | | (Key) | (Value) | | | | (Key) | | (Value) | +/// └───┬─────┘ └────────┬─────────┘ └────────┬───────────┘ └────┬────┘ └────────┬─────────┘ └────────┬───────────┘ +/// │ │ │ │ │ │ +/// │ f16 │ f16 │ f16 │ f16 i8:data │ f16:scale i8:data │ f16:scale +/// │ │ │ │ │ │ +/// │ │ │ │ │ │ +/// │ ┌────┴───┐ │ │ ┌────┴───┐ │ +/// └─────────────┤ SDPA ├─────────────────┘ └─────────────┤ SDPA ├────────────────────┘ +/// └────────┘ └────────┘ + +class KVCacheCompression : public ov::pass::GraphRewrite { +public: + OPENVINO_RTTI("KVCacheCompression", "0"); + KVCacheCompression(ov::element::Type compression_dt); + + bool run_on_model(const std::shared_ptr& m) override; +}; + + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp 
b/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp index 681c88119efd95..73e916064a0c1c 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp @@ -25,30 +25,63 @@ IndirectSDPA::IndirectSDPA(const OutputVector& data_inputs, validate_and_infer_types(); } +IndirectSDPA::IndirectSDPA(const OutputVector& data_inputs, + const ov::Output& beam_table, + const bool is_causal, + const int64_t indirect_axis, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const QuantizationAttribute& quantization_attribute, + const ov::element::Type output_type) + : ov::intel_gpu::op::SDPA(data_inputs, is_causal, order_q, order_k, order_v, order_out, quantization_attribute, output_type) + , m_indirect_axis(indirect_axis) { + auto beam_table_idx = data_inputs.size(); + set_argument(beam_table_idx, beam_table); + validate_and_infer_types(); +} + std::shared_ptr IndirectSDPA::clone_with_new_inputs(const ov::OutputVector& new_args) const { check_new_args_count(this, new_args); // Exclude beam_table input OutputVector data_inputs(new_args.begin(), new_args.end() - 1); - return std::make_shared(data_inputs, - new_args.back(), - m_is_causal, - m_indirect_axis, - m_order_q, - m_order_k, - m_order_v, - m_order_out, - m_output_type); + if (m_compressed) { + return std::make_shared(data_inputs, + new_args.back(), + m_is_causal, + m_indirect_axis, + m_order_q, + m_order_k, + m_order_v, + m_order_out, + m_output_type); + } else { + return std::make_shared(data_inputs, + new_args.back(), + m_is_causal, + m_indirect_axis, + m_order_q, + m_order_k, + m_order_v, + m_order_out, + m_quantization_attrs, + m_output_type); + } } void IndirectSDPA::validate_and_infer_types() { const auto input_size = get_input_size(); + + const auto compression_inputs = get_compression_inputs_num(); NODE_VALIDATION_CHECK(this, - input_size == 4 || input_size == 5 || input_size == 6, + input_size >= 4 + compression_inputs && input_size <= 6 + compression_inputs, "Number of inputs is incorrect. 
Current value is: ", input_size, - ", expected 4, 5 or 6."); + ", expected 4, 5 or 6 data inputs and ", compression_inputs, " KV-cache compression related inputs"); + std::vector input_shapes; for (size_t i = 0; i < input_size - 1; i++) { diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp index a598e556a8f05d..12d961be6d337a 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp @@ -3,6 +3,7 @@ // #include "intel_gpu/op/kv_cache.hpp" +#include "intel_gpu/op/kv_cache_compressed.hpp" #include "gather_shape_inference.hpp" #include "concat_shape_inference.hpp" #include "openvino/core/partial_shape.hpp" @@ -13,19 +14,28 @@ namespace ov { namespace intel_gpu { namespace op { -KVCache::KVCache(const Output& past, - const Output& new_token_data, - const Output& beam_idx, +KVCache::KVCache(const OutputVector& inputs, const std::shared_ptr& past_variable, + bool indirect, int64_t concat_axis, int64_t gather_axis, const ov::element::Type output_type) - : Op({past, new_token_data, beam_idx}) + : Op(inputs) , m_concat_axis(concat_axis) , m_gather_axis(gather_axis) - , m_indirect(true) + , m_indirect(indirect) , m_output_type(output_type) { m_variable = past_variable; +} + +KVCache::KVCache(const Output& past, + const Output& new_token_data, + const Output& beam_idx, + const std::shared_ptr& past_variable, + int64_t concat_axis, + int64_t gather_axis, + const ov::element::Type output_type) + : KVCache({past, new_token_data, beam_idx}, past_variable, true, concat_axis, gather_axis, output_type) { if (m_indirect) set_output_size(2); validate_and_infer_types(); @@ -36,11 +46,7 @@ KVCache::KVCache(const Output& past, const std::shared_ptr& past_variable, int64_t concat_axis, const ov::element::Type output_type) - : Op({past, new_token_data}) - , m_concat_axis(concat_axis) - , m_gather_axis(0) - , m_indirect(false) - , m_output_type(output_type) { + : KVCache({past, new_token_data}, past_variable, false, concat_axis, 0, output_type) { m_variable = past_variable; validate_and_infer_types(); } @@ -54,14 +60,23 @@ bool KVCache::visit_attributes(ov::AttributeVisitor& visitor) { } void KVCache::validate_and_infer_types() { - auto output_type = m_output_type == ov::element::undefined ? 
get_input_element_type(0) : m_output_type; + auto output_type = m_output_type; + if (m_output_type == ov::element::undefined) { + output_type = get_input_element_type(0); + } + std::vector input_shapes = {m_variable->get_info().data_shape, get_input_partial_shape(1)}; - if (get_output_size() == 2) + if (m_indirect) { input_shapes.push_back(get_input_partial_shape(2)); + } + auto shapes = shape_infer(this, input_shapes); - set_output_type(0, output_type, shapes[0]); + + size_t out_ports = 0; + set_output_type(out_ports++, output_type, shapes[0]); + if (m_indirect) { - set_output_type(1, get_input_element_type(2), shapes[1]); + set_output_type(out_ports++, get_input_element_type(2), shapes[1]); } } @@ -85,13 +100,13 @@ std::shared_ptr KVCache::clone_with_new_inputs(const ov::OutputVector& new } } -std::vector shape_infer(const KVCache* op, std::vector input_shapes) { +std::vector shape_infer(const KVCache* op, const std::vector& input_shapes) { std::vector out_shapes; out_shapes.resize(op->get_output_size()); const auto& gather_axis = op->get_gather_axis(); const auto& concat_axis = ov::util::normalize(op->get_concat_axis(), input_shapes[0].size()); - if (op->get_output_size() == 2) { + if (op->get_output_size() >= 2) { out_shapes[0] = input_shapes[0]; out_shapes[0][gather_axis] = input_shapes[2][0]; out_shapes[0][concat_axis] += input_shapes[1][concat_axis]; @@ -108,6 +123,87 @@ std::vector shape_infer(const KVCache* op, std::vector& past_variable, + int64_t concat_axis, + int64_t gather_axis, + const QuantizationAttrs& quantization_attrs, + const ov::element::Type output_type) + : KVCache(inputs, past_variable, true, concat_axis, gather_axis, output_type) + , m_compressed(true) + , m_quantization_attrs(quantization_attrs) { + OPENVINO_ASSERT(quantization_attrs.quantization_dt == ov::element::i8, + "[GPU] Only I8 data type is currently supported for KV-cache compression"); + + m_variable = past_variable; + size_t output_size = 3; + if (quantization_attrs.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && + quantization_attrs.output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::Planar) + output_size++; // add zp output + + set_output_size(output_size); + validate_and_infer_types(); +} + +void KVCacheCompressed::validate_and_infer_types() { + std::vector input_shapes = {m_variable->get_info().data_shape, get_input_partial_shape(1)}; + input_shapes.push_back(get_input_partial_shape(2)); + input_shapes.push_back(get_input_partial_shape(3)); + + if (m_quantization_attrs.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && + m_quantization_attrs.output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::Planar) + input_shapes.push_back(get_input_partial_shape(4)); + + auto shapes = shape_infer(this, input_shapes); + + size_t out_ports = 0; + set_output_type(out_ports++, m_quantization_attrs.quantization_dt, shapes[0]); + set_output_type(out_ports++, get_input_element_type(2), shapes[1]); + set_output_type(out_ports++, m_quantization_attrs.scale_dt, shapes[2]); + + if (m_quantization_attrs.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && + m_quantization_attrs.output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::Planar) { + set_output_type(out_ports++, m_quantization_attrs.zp_dt, shapes[3]); + } +} + +std::shared_ptr KVCacheCompressed::clone_with_new_inputs(const ov::OutputVector& new_args) const { + 
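+    // Cloning keeps the original variable, concat/gather axes and quantization attributes, so the
+    // compressed cache node is reproduced unchanged when the graph is copied.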
check_new_args_count(this, new_args); + return std::make_shared(new_args, + m_variable, + m_concat_axis, + m_gather_axis, + m_quantization_attrs, + m_output_type); +} + +std::vector shape_infer(const KVCacheCompressed* op, + const std::vector& input_shapes) { + std::vector out_shapes = shape_infer(static_cast(op), input_shapes); + + if (op->get_output_size() >= 3) { + ov::op::internal::DynamicQuantize dq_op; + dq_op.set_attrs(op->get_quantization_attrs()); + + auto quantized_data_shapes = + ov::op::internal::DynamicQuantize::shape_infer(&dq_op, { input_shapes[1] }); + + const auto scales_concat_axis = 2; + ov::PartialShape compression_scale_shape = input_shapes[3]; + compression_scale_shape[scales_concat_axis] += quantized_data_shapes[1][scales_concat_axis]; + out_shapes[2] = compression_scale_shape; + + // add zp output + if (quantized_data_shapes.size() == 3) { + ov::PartialShape compression_zp_shape = input_shapes[4]; + compression_zp_shape[scales_concat_axis] += quantized_data_shapes[2][scales_concat_axis]; + out_shapes[3] = compression_zp_shape; + } + } + + return out_shapes; +} + } // namespace op } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp index 5438a6e2e695b5..6cd7f778c71b3b 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp @@ -3,6 +3,7 @@ // #include "intel_gpu/op/read_value.hpp" +#include "intel_gpu/op/read_values.hpp" #include "openvino/core/partial_shape.hpp" namespace ov { @@ -28,16 +29,14 @@ bool ReadValue::visit_attributes(ov::AttributeVisitor& visitor) { return true; } -void ReadValue::validate_and_infer_types() { - OPENVINO_ASSERT(m_variable, "Variable is not initialized."); - const auto& variable_info = m_variable->get_info(); +void ReadValue::validate_and_infer_types(size_t output_idx, const ov::op::util::VariableInfo& variable_info) { const auto& variable_type = variable_info.data_type; const auto& variable_shape = variable_info.data_shape; // If no inputs provided, it means this ReadValue doesn't have initial subgraph. This is valid. - if (get_input_size() > 0) { - const auto& initial_type = get_input_element_type(0); - const auto& initial_shape = get_input_partial_shape(0); + if (get_input_size() > output_idx) { + const auto& initial_type = get_input_element_type(output_idx); + const auto& initial_shape = get_input_partial_shape(output_idx); // Variable's shape/type determine a permissible range of values for shape/type inferred from initial_subgraph. 
// If initial_subgraph is set, then we need to check that shape/type inferred from initial_subgraph @@ -64,19 +63,25 @@ void ReadValue::validate_and_infer_types() { // dynamic rank/type can be derived from the IRs generated via the prev versions of OV, // but dynamic rank/type are not supported in plugins, // so we are trying to fix them here using the rank/type of ReadValue 1st input, if it exists - if (get_input_size() > 0 && variable_info.data_shape.rank().is_dynamic() && - variable_info.data_type.is_dynamic()) { - set_output_type(0, initial_type, initial_shape); + if (variable_info.data_shape.rank().is_dynamic() && variable_info.data_type.is_dynamic()) { + set_output_type(output_idx, initial_type, initial_shape); return; } } - set_output_type(0, variable_type, variable_shape); + set_output_type(output_idx, variable_type, variable_shape); +} + +void ReadValue::validate_and_infer_types() { + OPENVINO_ASSERT(m_variable, "Variable is not initialized."); + const auto& variable_info = m_variable->get_info(); + + validate_and_infer_types(0, variable_info); } std::shared_ptr ReadValue::clone_with_new_inputs(const ov::OutputVector& new_args) const { check_new_args_count(this, new_args); - switch (new_args.size()) { + switch (new_args.size()) { case 0: return std::make_shared(m_variable); case 1: @@ -89,6 +94,62 @@ std::shared_ptr ReadValue::clone_with_new_inputs(const ov::OutputVector& n } } +ReadValues::ReadValues(const std::shared_ptr& variable, + const std::vector& internal_states_infos) + : ReadValue(variable) + , m_internal_states_infos(internal_states_infos) { + OPENVINO_ASSERT(!internal_states_infos.empty()); + set_output_size(internal_states_infos.size()); + validate_and_infer_types(); +} + +ReadValues::ReadValues(const std::vector>& variable_initializers, + const std::shared_ptr& variable, + const std::vector& internal_states_infos) + : ReadValue(variable_initializers, variable) + , m_internal_states_infos(internal_states_infos) { + OPENVINO_ASSERT(!internal_states_infos.empty()); + set_output_size(internal_states_infos.size()); + validate_and_infer_types(); +} + +bool ReadValues::visit_attributes(ov::AttributeVisitor& visitor) { + visitor.on_attribute("variable_id", m_variable); + + auto variable_info = m_variable->get_info(); + visitor.on_attribute("variable_type", variable_info.data_type); + visitor.on_attribute("variable_shape", variable_info.data_shape); + m_variable->update(variable_info); + return true; +} + +void ReadValues::validate_and_infer_types() { + OPENVINO_ASSERT(m_variable, "Variable is not initialized."); + + for (size_t i = 0; i < get_output_size(); i++) { + ReadValue::validate_and_infer_types(i, m_internal_states_infos[i]); + } +} + +std::shared_ptr ReadValues::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + OPENVINO_ASSERT(new_args.empty() || new_args.size() == m_internal_states_infos.size(), + "Unable to clone ReadValues op (name=", this->get_friendly_name(), "). ", + "Incorrect number of inputs. Expected: 0 or ", m_internal_states_infos.size(), ". 
", + "Actual: ", new_args.size(), "."); + + if (new_args.size() > 0) { + return std::make_shared(new_args, m_variable, m_internal_states_infos); + } else { + return std::make_shared(m_variable, m_internal_states_infos); + } +} + +std::vector ReadValues::get_all_internal_states_info() const { + return m_internal_states_infos; +} + } // namespace op } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp index 57d2899e2b2e77..09513d99153a1f 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp @@ -26,7 +26,29 @@ SDPA::SDPA(const OutputVector& inputs, , m_order_k(order_k) , m_order_v(order_v) , m_order_out(order_out) - , m_output_type(output_type) { + , m_output_type(output_type) + , m_compressed(false) { + set_arguments(inputs); + set_causal(is_causal); + validate_and_infer_types(); +} + +SDPA::SDPA(const OutputVector& inputs, + const bool is_causal, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const QuantizationAttribute& quantization_attrs, + const ov::element::Type output_type) + : m_is_causal(is_causal) + , m_order_q(order_q) + , m_order_k(order_k) + , m_order_v(order_v) + , m_order_out(order_out) + , m_output_type(output_type) + , m_compressed(true) + , m_quantization_attrs(quantization_attrs) { set_arguments(inputs); set_causal(is_causal); validate_and_infer_types(); @@ -46,11 +68,13 @@ std::shared_ptr SDPA::clone_with_new_inputs(const ov::OutputVector& ne void SDPA::validate_and_infer_types() { const auto input_size = get_input_size(); + + const auto compression_inputs = get_compression_inputs_num(); NODE_VALIDATION_CHECK(this, - input_size == 3 || input_size == 4 || input_size == 5, + input_size >= 3 + compression_inputs && input_size <= 5 + compression_inputs, "Number of inputs is incorrect. 
Current value is: ", input_size, - ", expected 3, 4 or 5."); + ", expected 3, 4 or 5 data inputs and ", compression_inputs, " KV-cache compression related inputs"); std::vector input_shapes; for (size_t i = 0; i < input_size; i++) { @@ -77,6 +101,19 @@ bool SDPA::visit_attributes(ov::AttributeVisitor &visitor) { return true; } +size_t SDPA::get_compression_inputs_num() const { + size_t compression_inputs = 0; + if (m_compressed) { + compression_inputs += 2; // 2 * scales + + if (m_quantization_attrs.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && + m_quantization_attrs.output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::Planar) + compression_inputs += 2; // 2 * zp + } + + return compression_inputs; +} + std::vector shape_infer(const SDPA* op, std::vector input_shapes, const std::vector& order_q, diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 4b72385663bf9d..770aa387da8a60 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -74,6 +74,7 @@ #include "plugin/transformations/swiglu_fusion.hpp" #include "plugin/transformations/transpose_fusion.hpp" #include "plugin/transformations/indirect_kv_cache.hpp" +#include "plugin/transformations/kv_cache_compression.hpp" #include "plugin/transformations/convert_convolution.hpp" #include "plugin/transformations/unsqueeze_broadcast_reshape_matmul_fusion.hpp" #include "plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.hpp" @@ -886,6 +887,10 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); manager.register_pass(); + + auto kv_cache_compression_dt = config.get_property(ov::hint::kv_cache_precision); + manager.register_pass(kv_cache_compression_dt); + manager.register_pass(); // This pass should be done after asymmetric quantization matching as it can move zp subtraction upper in the graph diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp index 5f943564d6f50e..5c3b3ee0c970f9 100644 --- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp +++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp @@ -183,6 +183,7 @@ static void print_help_messages() { message_list.emplace_back("OV_GPU_DisableRuntimeSkipReorder", "Disable runtime skip reorder."); message_list.emplace_back("OV_GPU_DisablePrimitiveFusing", "Disable primitive fusing"); message_list.emplace_back("OV_GPU_DisableFakeAlignment", "Disable fake alignment"); + message_list.emplace_back("OV_GPU_KVCacheCompression", "Enable/Disable KV-cache compression"); message_list.emplace_back("OV_GPU_DynamicQuantizeLayersWithoutOnednn", "Enable Dynamic quantization for specified Fully connected layers only, " "separated by space. Support case-insensitive and regular expression. 
For example .*fully_connected.*"); message_list.emplace_back("OV_GPU_DynamicQuantizeGroupSize", "Specify a group size of dynamic quantization to enable " @@ -253,6 +254,7 @@ debug_configuration::debug_configuration() , disable_runtime_skip_reorder(0) , disable_primitive_fusing(0) , disable_fake_alignment(0) + , use_kv_cache_compression(-1) , dynamic_quantize_group_size(DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET) , disable_horizontal_fc_fusion(0) { #ifdef GPU_DEBUG_CONFIG @@ -305,6 +307,7 @@ debug_configuration::debug_configuration() get_gpu_debug_env_var("DisableRuntimeSkipReorder", disable_runtime_skip_reorder); get_gpu_debug_env_var("DisablePrimitiveFusing", disable_primitive_fusing); get_gpu_debug_env_var("DisableFakeAlignment", disable_fake_alignment); + get_gpu_debug_env_var("KVCacheCompression", use_kv_cache_compression); get_gpu_debug_env_var("DynamicQuantizeGroupSize", dynamic_quantize_group_size); get_gpu_debug_env_var("DisableHorizontalFCFusion", disable_horizontal_fc_fusion); std::string dump_iteration_str; diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index 7661444cc4fd7b..c48f3f02fa9f6a 100644 --- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -58,6 +58,7 @@ void ExecutionConfig::set_default() { std::make_tuple(ov::cache_mode, ov::CacheMode::OPTIMIZE_SPEED), std::make_tuple(ov::cache_encryption_callbacks, EncryptionCallbacks{}), std::make_tuple(ov::hint::dynamic_quantization_group_size, 32), + std::make_tuple(ov::hint::kv_cache_precision, ov::element::undefined), std::make_tuple(ov::intel_gpu::hint::enable_kernels_reuse, false), std::make_tuple(ov::weights_path, ""), @@ -209,6 +210,24 @@ void ExecutionConfig::apply_debug_options(const cldnn::device_info& info) { else set_property(ov::hint::dynamic_quantization_group_size(debug_config->dynamic_quantize_group_size)); } + + int KVCacheCompression = 0; + if (const auto env_var = std::getenv("KVCacheCompression")) { + std::istringstream ss(env_var); + ss >> KVCacheCompression; + } + + if (KVCacheCompression == 1) { + set_property(ov::hint::kv_cache_precision(ov::element::i8)); + } + + GPU_DEBUG_IF(debug_config->use_kv_cache_compression != -1) { + GPU_DEBUG_IF(debug_config->use_kv_cache_compression == 1) { + set_property(ov::hint::kv_cache_precision(ov::element::i8)); + } else { + set_property(ov::hint::kv_cache_precision(ov::element::undefined)); + } + } } void ExecutionConfig::apply_hints(const cldnn::device_info& info) { diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp index 68098937bb9c0a..16db9d89c28b4d 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp @@ -26,6 +26,7 @@ struct Params { bool with_mask; bool with_scale; bool causal; + bool compressed; size_t batch; ov::element::Type model_element_type; size_t num_iter; @@ -46,6 +47,9 @@ class SDPAWithKVCacheTest : public ::testing::Test, public ::testing::WithParamI ov::AnyMap properties = {ov::hint::inference_precision(ov::element::f16), ov::intel_gpu::hint::enable_sdpa_optimization(true)}; + if (p.compressed) + properties.emplace(ov::hint::kv_cache_precision(ov::element::i8)); + const size_t n_heads = 16; const size_t n_features = 64; const size_t context_size = 7; @@ -58,6 +62,7 @@ 
class SDPAWithKVCacheTest : public ::testing::Test, public ::testing::WithParamI const bool causal = p.causal; const bool with_mask = p.with_mask; const bool with_scale = p.with_scale; + const bool compressed = p.compressed; auto model = tests::make_llm_kv_cache_sdpa_pattern(ov::Dimension::dynamic(), n_heads, @@ -284,14 +289,17 @@ class SDPAWithKVCacheTest : public ::testing::Test, public ::testing::WithParamI compare_tensors({ref_results[0]}, {sdpa_out}); } - auto variables = infer_request.query_state(); - std::vector states; - for (auto& variable : variables) { - auto state = variable.get_state(); - ASSERT_EQ(state.get_element_type(), element_type); - states.push_back(state); + if (!compressed) { + auto variables = infer_request.query_state(); + std::vector states; + for (auto& variable : variables) { + auto state = variable.get_state(); + ASSERT_EQ(state.get_element_type(), element_type); + states.push_back(state); + } + compare_tensors({ref_k_cache, ref_v_cache}, states); } - compare_tensors({ref_k_cache, ref_v_cache}, states); + infer_request.reset_state(); } } @@ -310,6 +318,7 @@ class SDPAWithKVCacheTest : public ::testing::Test, public ::testing::WithParamI result << "mask=" << p.with_mask << "_"; result << "scale=" << p.with_scale << "_"; result << "causal=" << p.causal << "_"; + result << "compressed=" << p.compressed << ""; return result.str(); } }; @@ -324,11 +333,17 @@ std::vector get_test_params() { const bool with_mask = true; const bool with_scale = true; const bool causal = true; + const bool compressed = true; + + p.push_back({with_rearrange, !with_mask, !with_scale, !causal, !compressed, 1, ov::element::Type_t::f16, 10, 1, 1, {0, 1, 2, 3}}); + p.push_back({with_rearrange, with_mask, !with_scale, !causal, !compressed, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 1, 2, 3}}); + p.push_back({with_rearrange, with_mask, !with_scale, !causal, !compressed, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 2, 1, 3}}); + p.push_back({!with_rearrange, with_mask, !with_scale, !causal, !compressed, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 2, 1, 3}}); - p.push_back({with_rearrange, !with_mask, !with_scale, !causal, 1, ov::element::Type_t::f16, 10, 1, 1, {0, 1, 2, 3}}); - p.push_back({with_rearrange, with_mask, !with_scale, !causal, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 1, 2, 3}}); - p.push_back({with_rearrange, with_mask, !with_scale, !causal, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 2, 1, 3}}); - p.push_back({!with_rearrange, with_mask, !with_scale, !causal, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 2, 1, 3}}); + // Compressed + p.push_back({with_rearrange, with_mask, !with_scale, !causal, compressed, 1, ov::element::Type_t::f16, 10, 1, 1, {0, 1, 2, 3}}); + p.push_back({with_rearrange, with_mask, !with_scale, !causal, compressed, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 2, 1, 3}}); + p.push_back({with_rearrange, with_mask, !with_scale, !causal, compressed, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 1, 2, 3}}); return p; } diff --git a/src/plugins/intel_gpu/tests/unit/dynamic_execution/stateful_model.cpp b/src/plugins/intel_gpu/tests/unit/dynamic_execution/stateful_model.cpp index 4b24fb996b3f3f..105963d1b09d73 100644 --- a/src/plugins/intel_gpu/tests/unit/dynamic_execution/stateful_model.cpp +++ b/src/plugins/intel_gpu/tests/unit/dynamic_execution/stateful_model.cpp @@ -197,7 +197,7 @@ TEST(stateful_model, check_dynamic_pad_for_kv_cache) { auto input_kv_lay = layout{info.data_shape, info.data_type, format::bfyx}; topology topology(input_layout("beam_idx", input_beam_idx_lay), 
input_layout("present", input_present_lay), - read_value("kv_cache", std::vector{}, info.variable_id, input_kv_lay), + read_value("kv_cache", std::vector{}, info.variable_id, {input_kv_lay}), gather("gather", input_info("kv_cache"), input_info("beam_idx"), @@ -224,7 +224,7 @@ TEST(stateful_model, check_dynamic_pad_for_kv_cache) { auto pad = tensor(0); pad.batch[0] = 1; - + { std::vector dynamic_pad_mask; const auto& dynamic_pad_dims = read_value_inst->get_output_layout(0).data_padding._dynamic_dims_mask; diff --git a/src/plugins/intel_gpu/tests/unit/shape_infer/read_value_si_test.cpp b/src/plugins/intel_gpu/tests/unit/shape_infer/read_value_si_test.cpp index 194bc0244f86f0..2000d826ddfad6 100644 --- a/src/plugins/intel_gpu/tests/unit/shape_infer/read_value_si_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/shape_infer/read_value_si_test.cpp @@ -32,7 +32,7 @@ TEST_P(read_value_test, shape_infer) { auto& engine = get_test_engine(); - const auto variable_layout = p.input_layout; + const std::vector variable_layout = {p.input_layout}; auto input_layout_prim = std::make_shared("input", p.input_layout); auto inputs = std::vector{ input_info("input") }; diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp index c1686e359e91a0..5a78360eb1f6d8 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp @@ -22,35 +22,53 @@ using namespace cldnn; using namespace ::tests; +using QuantizationType = ov::op::internal::DynamicQuantize::QuantizationType; class dynamic_quantization_gpu_tests: public ::testing::Test { public: - void test_dynamic_quantization(bool is_caching_test, bool is_dynamic, int batch = 1, int ifm = 1024) { + void test_dynamic_quantization(bool is_caching_test, + const ov::PartialShape& input_shape, + const ov::Shape& data_shape, + const QuantizationType quantization_type = QuantizationType::Symmetric, + const std::string& impl_name = "") { tests::random_generator rg(GET_SUITE_NAME); auto& engine = get_test_engine(); - long int batch_num = batch; - long int ifm_num = ifm; - - bool is_4d = true; - - auto input_ps = is_4d ? ov::PartialShape{ batch_num, 1, 1, ifm_num } : ov::PartialShape{ batch_num, ifm_num}; - auto dyn_input_ps = is_4d ? ov::PartialShape{ -1, 1, 1, ifm_num } : ov::PartialShape{ -1, ifm_num}; + auto input_ps = data_shape; + auto dyn_input_ps = input_shape; + auto scales_ps = ov::PartialShape::dynamic(dyn_input_ps.size()); auto input_mem = engine.allocate_memory({ input_ps, data_types::f32, format::bfyx }); + auto group_sizes = std::vector(dyn_input_ps.size(), 1); + group_sizes.back() = UINT64_MAX; - auto input_data = rg.generate_random_1d(batch_num * ifm_num, -16.0f, 16.0f); + auto input_data = rg.generate_random_1d(ov::shape_size(data_shape), -16.0f, 16.0f); set_values(input_mem, input_data); - auto in_layout_f32 = is_dynamic ? layout{ dyn_input_ps, data_types::f32, format::bfyx } - : layout{ input_ps, data_types::f32, format::bfyx }; + auto in_layout_f32 = input_shape.is_dynamic() ? layout{ dyn_input_ps, data_types::f32, format::bfyx } + : layout{ input_ps, data_types::f32, format::bfyx }; + + auto in_layout = input_shape.is_dynamic() ? layout{ dyn_input_ps, data_types::f16, format::bfyx } + : layout{ input_ps, data_types::f16, format::bfyx }; - auto in_layout = is_dynamic ? 
layout{ dyn_input_ps, data_types::f16, format::bfyx } - : layout{ input_ps, data_types::f16, format::bfyx }; + dynamic_quantize::Attributes dq_config; + dq_config.quantization_type = quantization_type; + dq_config.quantization_dt = data_types::i8; + dq_config.scale_dt = data_types::f16; + dq_config.zp_dt = data_types::undefined; + dq_config.group_sizes = group_sizes; + dq_config.scales_zp_output_order = { 0, 1, 2, 3 }; + dq_config.output_storage_type = ov::op::internal::DynamicQuantize::OutputStorageType::Planar; + + if (quantization_type == QuantizationType::Asymmetric) { + dq_config.zp_dt = data_types::f16; + dq_config.output_storage_type = ov::op::internal::DynamicQuantize::OutputStorageType::InterleavedScalesZP; + } auto reorder_1 = reorder("reorder_1", input_info("input"), layout{ input_ps, data_types::f16, format::bfyx }); - auto dyn_quan_prim = dynamic_quantize("dyn_quan_prim", input_info("reorder_1"), 32, {data_types::f16, data_types::i8}); - auto reorder_2 = reorder("reorder_2", input_info("dyn_quan_prim"), layout{ input_ps, data_types::f16, format::bfyx }); + auto dyn_quan_prim = dynamic_quantize("dyn_quan_prim", input_info("reorder_1"), dq_config); + auto reorder_data = reorder("reorder_data", input_info("dyn_quan_prim", 0), layout{ input_ps, data_types::f16, format::bfyx }); + auto reorder_scale = reorder("reorder_scale", input_info("dyn_quan_prim", 1), layout{ scales_ps, data_types::f16, format::bfyx }); // Implemented dynamic quantize kernel auto get_ref_results = [&]() { @@ -58,7 +76,8 @@ class dynamic_quantization_gpu_tests: public ::testing::Test { input_layout("input", in_layout_f32), reorder_1, dyn_quan_prim, - reorder_2 + reorder_data, + reorder_scale ); auto config = get_test_default_config(engine); @@ -83,13 +102,18 @@ class dynamic_quantization_gpu_tests: public ::testing::Test { input_layout("input", in_layout_f32), reorder_1, dyn_quan_prim, - reorder_2 + reorder_data ); auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); + if (impl_name != "") { + ov::intel_gpu::ImplementationDesc dyn_quan_impl_desc = { format::bfyx, impl_name, impl_types::ocl }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"dyn_quan_prim", dyn_quan_impl_desc} })); + } + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); network->set_input_data("input", input_mem); @@ -118,37 +142,69 @@ class dynamic_quantization_gpu_tests: public ::testing::Test { }; TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_large_size) { - this->test_dynamic_quantization(false, false, 2048, 4096); + this->test_dynamic_quantization(false, {11, 1, 1, 4096}, {2048, 1, 1, 4096}); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_large_size_dynamic) { - this->test_dynamic_quantization(false, true, 2048, 4096); + this->test_dynamic_quantization(false, {-1, 1, 1, 4096}, {2048, 1, 1, 4096}); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_small_size) { - this->test_dynamic_quantization(false, false, 64, 4096); + this->test_dynamic_quantization(false, {1, 1, 1, 4096}, {64, 1, 1, 4096}); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_single_batch) { - this->test_dynamic_quantization(false, false, 1, 4096); + this->test_dynamic_quantization(false, {-1, 1, 1, 4096}, {1, 1, 1, 4096}); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_ref_only) { - 
this->test_dynamic_quantization(false, false, 16, 33); + this->test_dynamic_quantization(false, {-1, 1, 1, 33}, {16, 1, 1, 33}); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_ref_only_dynamic) { - this->test_dynamic_quantization(false, true, 16, 33); + this->test_dynamic_quantization(false, {1, 1, 1, 33}, {16, 1, 1, 33}); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_invalid) { - this->test_dynamic_quantization(false, false, 16, 7); + this->test_dynamic_quantization(false, {-1, 1, 1, 7}, {16, 1, 1, 7}); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_unaligned) { - this->test_dynamic_quantization(false, false, 16, 32); + this->test_dynamic_quantization(false, {-1, 1, 1, 32}, {16, 1, 1, 32}); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_unaligned_dynamic) { - this->test_dynamic_quantization(false, true, 16, 32); + this->test_dynamic_quantization(false, {1, 1, 1, 32}, {16, 1, 1, 32}); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache) { + this->test_dynamic_quantization(false, {-1, 8, -1, 96}, {1, 8, 1, 96}, QuantizationType::Symmetric, "dynamic_quantize_gpu_kv_cache"); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched) { + this->test_dynamic_quantization(false, {-1, 4, -1, 64}, {1, 4, 35, 64}, QuantizationType::Symmetric, "dynamic_quantize_gpu_kv_cache"); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_reordered) { + this->test_dynamic_quantization(false, {-1, -1, 8, 96}, {1, 1, 8, 96}, QuantizationType::Symmetric, "dynamic_quantize_gpu_kv_cache"); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_reordered) { + this->test_dynamic_quantization(false, {-1, -1, 4, 64}, {1, 35, 4, 64}, QuantizationType::Symmetric, "dynamic_quantize_gpu_kv_cache"); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_asym) { + this->test_dynamic_quantization(false, {-1, 8, -1, 96}, {1, 8, 1, 96}, QuantizationType::Asymmetric, "dynamic_quantize_gpu_kv_cache"); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_asym) { + this->test_dynamic_quantization(false, {-1, 4, -1, 64}, {1, 4, 35, 64}, QuantizationType::Asymmetric, "dynamic_quantize_gpu_kv_cache"); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_reordered_asym) { + this->test_dynamic_quantization(false, {-1, -1, 8, 96}, {1, 1, 8, 96}, QuantizationType::Asymmetric, "dynamic_quantize_gpu_kv_cache"); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_reordered_asym) { + this->test_dynamic_quantization(false, {-1, -1, 4, 64}, {1, 35, 4, 64}, QuantizationType::Asymmetric, "dynamic_quantize_gpu_kv_cache"); } diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/variable.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/variable.cpp index 3bf1a771512ae8..59e31547602252 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/variable.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/variable.cpp @@ -35,7 +35,7 @@ struct variable_test : public ::testing::TestWithParam> { topology topology; topology.add(input_layout("input", input_data->get_layout())); - topology.add(read_value{"read_value", { input_info("input") }, "v0", variable_layout}); + topology.add(read_value{"read_value", { input_info("input") }, "v0", { variable_layout }}); topology.add(eltwise{"sum", { input_info("input"), input_info("read_value") }, eltwise_mode::sum, {}, variable_layout.data_type}); topology.add(assign{"assign", { 
input_info("sum") }, "v0", variable_layout}); @@ -129,7 +129,7 @@ void test_exception_on_wrong_layout(bool is_caching_test) { topology topology; topology.add(input_layout("input", input_data->get_layout())); - topology.add(read_value{"read_value", { input_info("input") }, "v0", variable_layout}); + topology.add(read_value{"read_value", { input_info("input") }, "v0", { variable_layout }}); topology.add(input_layout("wrong_input", wrong_input_data->get_layout())); topology.add(assign{"assign", { input_info("wrong_input") }, "v0", wrong_layout}); @@ -218,14 +218,14 @@ void test_variables_are_preserved_across_inferences(bool is_caching_test) { topology.add(assign{"assign_2", { input_info("input_2") }, "v2", variable_layout}); topology.add(data("dummy1", dummy1)); - topology.add(read_value{"read_value_1", { input_info("dummy1") }, "v1", variable_layout}); - topology.add(read_value{"read_value_2", { input_info("dummy1") }, "v2", variable_layout}); + topology.add(read_value{"read_value_1", { input_info("dummy1") }, "v1", { variable_layout }}); + topology.add(read_value{"read_value_2", { input_info("dummy1") }, "v2", { variable_layout }}); topology.add(eltwise{"sum", { input_info("read_value_1"), input_info("read_value_2") }, eltwise_mode::sum, {}, variable_layout.data_type}); topology.add(assign{"assign_result", { input_info("sum") }, "v_result", variable_layout}); topology.add(data("dummy2", dummy2)); - topology.add(read_value{"read_result", { input_info("dummy2") }, "v_result", variable_layout}); + topology.add(read_value{"read_result", { input_info("dummy2") }, "v_result", { variable_layout }}); cldnn::network::ptr network = get_network(engine, topology, get_test_default_config(engine), get_test_stream_ptr(), is_caching_test); diff --git a/src/plugins/intel_gpu/tests/unit/transformations/kv_cache_compression.cpp b/src/plugins/intel_gpu/tests/unit/transformations/kv_cache_compression.cpp new file mode 100644 index 00000000000000..67123f1d84cfe7 --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/transformations/kv_cache_compression.cpp @@ -0,0 +1,344 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/ov_test_utils.hpp" + +#include "openvino/core/model.hpp" +#include "openvino/pass/manager.hpp" +#include "openvino/op/abs.hpp" +#include "openvino/op/broadcast.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/unsqueeze.hpp" + +#include "intel_gpu/op/read_value.hpp" +#include "intel_gpu/op/read_values.hpp" +#include "intel_gpu/op/kv_cache.hpp" +#include "intel_gpu/op/kv_cache_compressed.hpp" +#include "intel_gpu/op/indirect_sdpa.hpp" + +#include "plugin/transformations/kv_cache_compression.hpp" + +using namespace testing; +using namespace ov::intel_gpu; + +namespace ov { +namespace test { +namespace intel_gpu { + +TEST_F(TransformationTestsF, KVCacheCompression) { + bool causal = false; + bool with_mask = true; + bool with_scale = true; + size_t concat_axis = 2; + size_t gather_axis = 0; + ov::element::Type_t element_type = ov::element::f16; + std::vector qkv_order = {0, 1, 2, 3}; + std::shared_ptr mask = nullptr; + std::shared_ptr scale = nullptr; + ov::PartialShape input_shape = ov::PartialShape{1, 32, -1, 80}; + + { + auto query = std::make_shared(element_type, input_shape); + auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape{1}); + + auto key_variable = std::make_shared(ov::op::util::VariableInfo{{1, 32, -1, 80}, 
ov::element::f16, "v0"}); + auto key_current = std::make_shared(ov::element::f16, input_shape); + auto key_past = std::make_shared(key_variable); + auto key_cache = std::make_shared(key_past, key_current, beam_idx, key_variable, concat_axis, gather_axis); + + auto value_variable = std::make_shared(ov::op::util::VariableInfo{{1, 32, -1, 80}, ov::element::f16, "v1"}); + auto value_current = std::make_shared(ov::element::f16, input_shape); + auto value_past = std::make_shared(value_variable); + auto value_cache = std::make_shared(value_past, value_current, beam_idx, value_variable, concat_axis, gather_axis); + + ov::ParameterVector params{ beam_idx, query, key_current, value_current }; + + if (with_mask) { + auto attn_mask = std::make_shared(element_type, ov::PartialShape::dynamic(4)); + mask = attn_mask; + params.push_back(attn_mask); + } + + if (with_mask && with_scale) { + auto scale_input = std::make_shared(element_type, ov::PartialShape{1}); + scale = scale_input; + params.push_back(scale_input); + } + + ov::OutputVector sdpa_inputs = { query, key_cache->output(0), value_cache->output(0) }; + + if (mask) { + sdpa_inputs.push_back(mask); + } + + if (scale) { + sdpa_inputs.push_back(scale); + } + + std::shared_ptr sdpa = nullptr; + sdpa = std::make_shared(sdpa_inputs, + key_cache->output(1), + causal, + gather_axis, + qkv_order, + qkv_order, + qkv_order, + ov::intel_gpu::op::SDPA::default_order(4)); + + auto result = std::make_shared(sdpa); + + ov::ResultVector results{ result }; + + model = std::make_shared(results, params); + manager.register_pass(ov::element::i8); + } + { + ov::op::internal::DynamicQuantize::Attributes dq_config; + dq_config.quantization_type = ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric; + dq_config.quantization_dt = ov::element::i8; + dq_config.scale_dt = ov::element::f16; + dq_config.zp_dt = ov::element::f16; + dq_config.group_sizes = { 1, 1, 1, UINT64_MAX }; + dq_config.scales_zp_output_order = { 0, 1, 2, 3 }; + dq_config.output_storage_type = ov::op::internal::DynamicQuantize::OutputStorageType::InterleavedScalesZP; + + auto query = std::make_shared(element_type, input_shape); + auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape{1}); + + auto key_variable = std::make_shared(ov::op::util::VariableInfo{{1, 32, -1, 80}, ov::element::f16, "v0"}); + auto key_current = std::make_shared(ov::element::f16, input_shape); + auto key_past_variable_infos = { ov::op::util::VariableInfo{{1, 32, -1, 80}, ov::element::i8, "v0"}, + ov::op::util::VariableInfo{{1, 32, -1, 2}, ov::element::f16, "v0"} }; + auto key_past_compressed = std::make_shared(key_variable, key_past_variable_infos); + auto key_cache_inputs = ov::OutputVector{ key_past_compressed->output(0), key_current, beam_idx, key_past_compressed->output(1) }; + auto key_cache = std::make_shared(key_cache_inputs, + key_variable, + concat_axis, + gather_axis, + dq_config); + + auto value_variable = std::make_shared(ov::op::util::VariableInfo{{1, 32, -1, 80}, ov::element::f16, "v1"}); + auto value_current = std::make_shared(ov::element::f16, input_shape); + auto value_past_variable_infos = { ov::op::util::VariableInfo{{1, 32, -1, 80}, ov::element::i8, "v1"}, + ov::op::util::VariableInfo{{1, 32, -1, 2}, ov::element::f16, "v1"} }; + auto value_past_compressed = std::make_shared(value_variable, value_past_variable_infos); + auto value_cache_inputs = ov::OutputVector{ value_past_compressed->output(0), value_current, beam_idx, value_past_compressed->output(1) }; + auto value_cache = 
std::make_shared(value_cache_inputs, + value_variable, + concat_axis, + gather_axis, + dq_config); + + ov::ParameterVector params{ beam_idx, query, key_current, value_current }; + + if (with_mask) { + auto attn_input = std::make_shared(element_type, ov::PartialShape::dynamic(4)); + mask = attn_input; + params.push_back(attn_input); + } + + if (with_mask && with_scale) { + auto scale_input = std::make_shared(element_type, ov::PartialShape{1}); + scale = scale_input; + params.push_back(scale_input); + } + + ov::OutputVector sdpa_inputs = { query, key_cache->output(0), value_cache->output(0) }; + if (mask) { + sdpa_inputs.push_back(mask); + } + + if (scale) { + sdpa_inputs.push_back(scale); + } + + sdpa_inputs.push_back(key_cache->output(2)); + sdpa_inputs.push_back(value_cache->output(2)); + + std::shared_ptr sdpa = nullptr; + sdpa = std::make_shared(sdpa_inputs, + key_cache->output(1), + causal, + gather_axis, + qkv_order, + qkv_order, + qkv_order, + ov::intel_gpu::op::SDPA::default_order(4), + dq_config); + + auto result = std::make_shared(sdpa); + + ov::ResultVector results{ result }; + + model_ref = std::make_shared(results, params); + } +} + +TEST_F(TransformationTestsF, KVCacheCompressionWithInitializers) { + bool causal = false; + bool with_mask = true; + bool with_scale = true; + size_t concat_axis = 2; + size_t gather_axis = 0; + ov::element::Type_t element_type = ov::element::f16; + std::vector qkv_order = {0, 1, 2, 3}; + std::shared_ptr mask = nullptr; + std::shared_ptr scale = nullptr; + ov::PartialShape input_shape = ov::PartialShape{1, 32, -1, 80}; + + { + auto query = std::make_shared(element_type, input_shape); + auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape{1}); + + auto key_variable_initializer = std::make_shared(ov::element::f16, input_shape); + auto key_current = std::make_shared(ov::element::f16, input_shape); + auto key_variable = std::make_shared(ov::op::util::VariableInfo{{1, 32, -1, 80}, ov::element::f16, "v0"}); + auto key_past = std::make_shared(key_variable_initializer, key_variable); + auto key_cache = std::make_shared(key_past, key_current, beam_idx, key_variable, concat_axis, gather_axis); + + auto value_variable_initializer = std::make_shared(ov::element::f16, input_shape); + auto value_variable = std::make_shared(ov::op::util::VariableInfo{{1, 32, -1, 80}, ov::element::f16, "v1"}); + auto value_current = std::make_shared(ov::element::f16, input_shape); + auto value_past = std::make_shared(value_variable_initializer, value_variable); + auto value_cache = std::make_shared(value_past, value_current, beam_idx, value_variable, concat_axis, gather_axis); + + ov::ParameterVector params{ beam_idx, query, key_current, value_current, key_variable_initializer, value_variable_initializer }; + + if (with_mask) { + auto attn_mask = std::make_shared(element_type, ov::PartialShape::dynamic(4)); + mask = attn_mask; + params.push_back(attn_mask); + } + + if (with_mask && with_scale) { + auto scale_input = std::make_shared(element_type, ov::PartialShape{1}); + scale = scale_input; + params.push_back(scale_input); + } + + ov::OutputVector sdpa_inputs = { query, key_cache->output(0), value_cache->output(0) }; + + if (mask) { + sdpa_inputs.push_back(mask); + } + + if (scale) { + sdpa_inputs.push_back(scale); + } + + std::shared_ptr sdpa = nullptr; + sdpa = std::make_shared(sdpa_inputs, + key_cache->output(1), + causal, + gather_axis, + qkv_order, + qkv_order, + qkv_order, + ov::intel_gpu::op::SDPA::default_order(4)); + + auto result = std::make_shared(sdpa); 
+ + ov::ResultVector results{ result }; + + model = std::make_shared(results, params); + manager.register_pass(ov::element::i8); + } + { + ov::op::internal::DynamicQuantize::Attributes dq_config; + dq_config.quantization_type = ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric; + dq_config.quantization_dt = ov::element::i8; + dq_config.scale_dt = ov::element::f16; + dq_config.zp_dt = ov::element::f16; + dq_config.group_sizes = { 1, 1, 1, UINT64_MAX }; + dq_config.scales_zp_output_order = { 0, 1, 2, 3 }; + dq_config.output_storage_type = ov::op::internal::DynamicQuantize::OutputStorageType::InterleavedScalesZP; + + auto query = std::make_shared(element_type, input_shape); + auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape{1}); + + auto key_past_variable_infos = { ov::op::util::VariableInfo{{1, 32, -1, 80}, ov::element::i8, "v0"}, + ov::op::util::VariableInfo{{1, 32, -1, 2}, ov::element::f16, "v0"} }; + auto key_current = std::make_shared(ov::element::f16, input_shape); + auto key_variable_initializer = std::make_shared(ov::element::f16, input_shape); + auto key_variable = std::make_shared(ov::op::util::VariableInfo{{1, 32, -1, 80}, ov::element::f16, "v0"}); + + auto key_initializer_dq = + std::make_shared(key_variable_initializer, dq_config); + auto key_past_initializers = ov::OutputVector{ key_initializer_dq->output(0), key_initializer_dq->output(1) }; + auto key_past_compressed = std::make_shared(key_past_initializers, key_variable, key_past_variable_infos); + auto key_cache_inputs = ov::OutputVector{ key_past_compressed->output(0), key_current, beam_idx, key_past_compressed->output(1) }; + auto key_cache = std::make_shared(key_cache_inputs, + key_variable, + concat_axis, + gather_axis, + dq_config); + + auto value_past_variable_infos = { ov::op::util::VariableInfo{{1, 32, -1, 80}, ov::element::i8, "v1"}, + ov::op::util::VariableInfo{{1, 32, -1, 2}, ov::element::f16, "v1"} }; + + auto value_current = std::make_shared(ov::element::f16, input_shape); + auto value_variable_initializer = std::make_shared(ov::element::f16, input_shape); + auto value_variable = std::make_shared(ov::op::util::VariableInfo{{1, 32, -1, 80}, ov::element::f16, "v1"}); + + auto value_initializer_dq = + std::make_shared(value_variable_initializer, dq_config); + auto value_past_initializers = ov::OutputVector{ value_initializer_dq->output(0), value_initializer_dq->output(1) }; + auto value_past_compressed = std::make_shared(value_past_initializers, value_variable, value_past_variable_infos); + auto value_cache_inputs = ov::OutputVector{ value_past_compressed->output(0), value_current, beam_idx, value_past_compressed->output(1) }; + auto value_cache = std::make_shared(value_cache_inputs, + value_variable, + concat_axis, + gather_axis, + dq_config); + + ov::ParameterVector params{ beam_idx, query, key_current, value_current, key_variable_initializer, value_variable_initializer }; + + if (with_mask) { + auto attn_input = std::make_shared(element_type, ov::PartialShape::dynamic(4)); + mask = attn_input; + params.push_back(attn_input); + } + + if (with_mask && with_scale) { + auto scale_input = std::make_shared(element_type, ov::PartialShape{1}); + scale = scale_input; + params.push_back(scale_input); + } + + ov::OutputVector sdpa_inputs = { query, key_cache->output(0), value_cache->output(0) }; + if (mask) { + sdpa_inputs.push_back(mask); + } + + if (scale) { + sdpa_inputs.push_back(scale); + } + + sdpa_inputs.push_back(key_cache->output(2)); + sdpa_inputs.push_back(value_cache->output(2)); 
+ + std::shared_ptr sdpa = nullptr; + sdpa = std::make_shared(sdpa_inputs, + key_cache->output(1), + causal, + gather_axis, + qkv_order, + qkv_order, + qkv_order, + ov::intel_gpu::op::SDPA::default_order(4), + dq_config); + + auto result = std::make_shared(sdpa); + + ov::ResultVector results{ result }; + + model_ref = std::make_shared(results, params); + } +} + +} // namespace intel_gpu +} // namespace test +} // namespace ov From 1baf261e72a35959bfdbc3c8d67bf95f4f4c6dae Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Mon, 28 Oct 2024 17:17:06 +0100 Subject: [PATCH 070/233] [PT FE] Support aten::lerp and aten::lerp_ (#27272) ### Details: - *Support `aten::lerp` and `aten::lerp_`* ### Tickets: - *CVS-156191* --- src/frontends/pytorch/src/op/lerp.cpp | 36 +++++++++++++ src/frontends/pytorch/src/op_table.cpp | 2 + tests/layer_tests/pytorch_tests/test_lerp.py | 56 ++++++++++++++++++++ 3 files changed, 94 insertions(+) create mode 100644 src/frontends/pytorch/src/op/lerp.cpp create mode 100644 tests/layer_tests/pytorch_tests/test_lerp.py diff --git a/src/frontends/pytorch/src/op/lerp.cpp b/src/frontends/pytorch/src/op/lerp.cpp new file mode 100644 index 00000000000000..67922da3e4578d --- /dev/null +++ b/src/frontends/pytorch/src/op/lerp.cpp @@ -0,0 +1,36 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/frontend/pytorch/node_context.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/convert_like.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/subtract.hpp" +#include "utils.hpp" + +namespace ov { +namespace frontend { +namespace pytorch { +namespace op { + +using namespace ov::op; + +OutputVector translate_lerp(const NodeContext& context) { + // Tensor = aten::lerp(%lhs.1, %rhs.1, %self.weight) + num_inputs_check(context, 3, 3); + Output start; + Output end; + std::tie(start, end) = get_inputs_with_promoted_types(context, 0, 1); + + Output weight = context.get_input(2); + auto scale = context.mark_node(std::make_shared(end, start)); + weight = context.mark_node(std::make_shared(weight, scale)); + auto delta = context.mark_node(std::make_shared(scale, weight)); + return {context.mark_node(std::make_shared(start, delta))}; +}; + +} // namespace op +} // namespace pytorch +} // namespace frontend +} // namespace ov diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index 8e490a60ffa580..d0e388b5d08cf1 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -121,6 +121,7 @@ OP_CONVERTER(translate_inverse); OP_CONVERTER(translate_is_nonzero); OP_CONVERTER(translate_layer_norm); OP_CONVERTER(translate_len); +OP_CONVERTER(translate_lerp); OP_CONVERTER(translate_linalg_cross); OP_CONVERTER(translate_linalg_norm); OP_CONVERTER(translate_linalg_matrix_norm); @@ -509,6 +510,7 @@ const std::unordered_map get_supported_ops_ts() { {"aten::le", op::translate_1to1_match_2_inputs_align_types}, {"aten::leaky_relu", op::translate_1to1_match_2_inputs}, {"aten::len", op::translate_len}, + {"aten::lerp", op::translate_lerp}, // lift op is torchscript specific op responsible for tensors coping with guarantee of new memory allocation {"aten::lift", op::skip_node}, {"aten::lift_fresh", op::skip_node}, diff --git a/tests/layer_tests/pytorch_tests/test_lerp.py b/tests/layer_tests/pytorch_tests/test_lerp.py new file mode 100644 index 00000000000000..0f85fac8569c95 --- /dev/null +++ b/tests/layer_tests/pytorch_tests/test_lerp.py @@ -0,0 +1,56 @@ +# 
Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import pytest +import torch + +from pytorch_layer_test_class import PytorchLayerTest, skip_if_export + + +class TestLerp(PytorchLayerTest): + def _prepare_input(self): + return (np.random.randn(2, 5, 3, 4).astype(np.float32), self.input_rhs) + + def create_model(self, weight, op_type): + class aten_lerp(torch.nn.Module): + def __init__(self, weight, op) -> None: + super().__init__() + self.weight = weight + self.forward = self.forward1 if op == "lerp" else self.forward2 + + def forward1(self, lhs, rhs): + return torch.lerp(lhs, rhs, weight=self.weight) + + def forward2(self, lhs, rhs): + return lhs.lerp_(rhs, weight=self.weight) + + return aten_lerp(weight, op_type), None, f"aten::{op_type}" + + @pytest.mark.parametrize("weight", (-0.5, + 0, + 0.5, + 1, + 2, + skip_if_export([1, 5, 3, 4])) + ) + @pytest.mark.parametrize("input_shape_rhs", [[2, 5, 3, 4], + [1, 5, 3, 4], + [1]]) + @pytest.mark.parametrize("op_type", ["lerp", "lerp_"]) + @pytest.mark.nightly + @pytest.mark.precommit + @pytest.mark.precommit_torch_export + @pytest.mark.precommit_fx_backend + def test_lerp(self, ie_device, precision, ir_version, + weight, input_shape_rhs, op_type): + self.input_rhs = np.random.randn(*input_shape_rhs).astype(np.float32) + if isinstance(weight, list): + weight = torch.rand(weight) + self._test( + *self.create_model(weight, op_type), + ie_device, + precision, + ir_version, + use_convert_model=True, + ) From 0c558f7ad319fc1e9d7ed0b6b6d22f34f3292046 Mon Sep 17 00:00:00 2001 From: Katarzyna Mitrus Date: Mon, 28 Oct 2024 17:29:34 +0100 Subject: [PATCH 071/233] [STFT][Op][Spec] Update STFT-15 spec to describe 1D signal input support (#27278) ### Details: - Update STFT-15 spec to describe 1D signal input support - Add more representative examples - Fix "transpose_frames" attr name typo ### Tickets: - 155996 Related PR for implementation: - https://github.com/openvinotoolkit/openvino/pull/27274 --- .../operation-specs/signals/stft-15.rst | 112 +++++++++++++++--- 1 file changed, 97 insertions(+), 15 deletions(-) diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/signals/stft-15.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/signals/stft-15.rst index 4a41df7214317c..581c5062f67520 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/signals/stft-15.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/signals/stft-15.rst @@ -14,14 +14,14 @@ Short Time Fourier Transformation for real-valued input (STFT) **Short description**: *STFT* operation performs Short-Time Fourier Transform (real-to-complex). -**Detailed description**: *STFT* performs Short-Time Fourier Transform of real-valued batched input tensor of shape ``[batch, signal_size]``, and produces complex result represented by separate values for real and imaginary part. +**Detailed description**: *STFT* performs Short-Time Fourier Transform of real-valued input tensor of shape ``[signal_size]`` or ``[batch, signal_size]``, and produces complex result represented by separate values for real and imaginary part. **Attributes**: -* *transform_frames* +* *transpose_frames* - * **Description**: Flag to set output shape layout. If true the ``frames`` dimension is at out_shape[2], otherwise it is at out_shape[1]. + * **Description**: Flag to set output shape layout. 
If true the ``frames`` dimension is at out_shape[-2], otherwise it is at out_shape[-3]. * **Range of values**: * ``false`` - do not transpose output shape @@ -31,7 +31,7 @@ Short Time Fourier Transformation for real-valued input (STFT) **Inputs** -* **1**: ``signal`` - Tensor of type *T* and 2D shape [batch, signal_size] with signal data for the STFT. **Required.** +* **1**: ``signal`` - Tensor of type *T* and 1D shape [signal_size] or 2D shape [batch, signal_size] with signal data for the STFT. **Required.** * **2**: ``window`` - Tensor of type *T* and 1D shape [window_length], specifying the window values for the signal slice multiplication. **Required.** * **3**: ``frame_size`` - Scalar tensor of type *T_INT* describing the size of a single frame of the signal to be provided as input to FFT. **Required.** * **4**: ``frame_step`` - Scalar tensor of type *T_INT* describing The distance (number of samples) between successive frames. **Required.** @@ -41,13 +41,13 @@ Short Time Fourier Transformation for real-valued input (STFT) * **1**: The result of STFT operation, tensor of the same type as input ``signal`` tensor and shape: - + When ``transform_frames == false`` the output shape is ``[batch, frames, fft_results, 2]`` - + When ``transform_frames == true`` the output shape is ``[batch, fft_results, frames, 2]`` + + When ``transpose_frames == false`` the output shape is ``[frames, fft_results, 2]`` for 1D signal input or [batch, frames, fft_results, 2] for 2D signal input. + + When ``transpose_frames == true`` the output shape is [fft_results, frames, 2] for 1D signal input or [batch, fft_results, frames, 2]`` for 2D signal input. where: + ``batch`` is a batch size dimension - + ``frames`` is a number calculated as ``(signal_shape[1] - frame_size) / frame_step) + 1`` + + ``frames`` is a number calculated as ``(signal_shape[-1] - frame_size) / frame_step) + 1`` + ``fft_results`` is a number calculated as ``(frame_size / 2) + 1`` + ``2`` is the last dimension is for complex value real and imaginary part @@ -59,27 +59,109 @@ Short Time Fourier Transformation for real-valued input (STFT) * *T_INT*: ``int64`` or ``int32``. -**Example**: +**Examples**: + +*Example 1D signal, transpose_frames=false: * .. code-block:: xml :force: + + 56 + + + 7 + + + + + + 16 + 6 2 - 48 + + + + + +*Example 1D signal, transpose_frames=true: * + +.. code-block:: xml + :force: + + + + + + 56 - 8 + 7 - - + + - + + 6 + 16 2 - 9 - 9 + + + + +*Example 2D signal, transpose_frames=false: * + +.. code-block:: xml + :force: + + + + + + 3 + 56 + + + 7 + + + + + + 3 + 16 + 6 + 2 + + + + + +*Example 2D signal, transpose_frames=true: * + +.. code-block:: xml + :force: + + + + + + 3 + 56 + + + 7 + + + + + + 3 + 6 + 16 2 From b2a9527bac12563e49b01a745daaa27e007c133b Mon Sep 17 00:00:00 2001 From: Alina Kladieva Date: Mon, 28 Oct 2024 17:49:42 +0100 Subject: [PATCH 072/233] Add commit signoff policy readme (#27282) --- docs/dev/ci/commit_signoff_policy.md | 72 ++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 docs/dev/ci/commit_signoff_policy.md diff --git a/docs/dev/ci/commit_signoff_policy.md b/docs/dev/ci/commit_signoff_policy.md new file mode 100644 index 00000000000000..ec6ec446286f58 --- /dev/null +++ b/docs/dev/ci/commit_signoff_policy.md @@ -0,0 +1,72 @@ +# How to sign-off commits + +We require a sign-off commit message in the following format on each commit in pull request. + +``` +This is a commit message. 
+ +Signed-off-by: Author Name +``` + +## How to sign-off new commits + +In a local Git environment, the sign-off message can be added to a commit either manually (as a text) +or via the **-s** flag used with the “git commit” command, for example: + +`git commit -s -m "My commit message"` + +To avoid manually adding the flag for each commit, we recommend setting up a Git hook with the following steps: + +1. Navigate to the `/.git/hooks` folder. +2. Open the `prepare-commit-msg.sample` file and paste the following content: + +``` +COMMIT_MSG_FILE=$1 +COMMIT_SOURCE=$2 +SHA1=$3 + +NAME=$(git config user.name) +EMAIL=$(git config user.email) + +if [ -z "$NAME" ]; then + echo "empty git config user.name" + exit 1 +fi + +if [ -z "$EMAIL" ]; then + echo "empty git config user.email" + exit 1 +fi + +git interpret-trailers --if-exists doNothing --trailer \ + "Signed-off-by: $NAME <$EMAIL>" \ + --in-place "$1" +``` + +3. Save the file with the name `prepare-commit-msg` (remove the .sample extension). +4. Make the file executable (on Linux: `chmod +x /.git/hooks/prepare-commit-msg`). + +**Note**: For both sign-off approaches, ensure your user name and email address are configured in Git first: + +``` +git config user.name 'FIRST_NAME LAST_NAME' +git config user.email 'MY_EMAIL@example.com' +``` + +### Sign-off web-based commits + +To enable automatic sign-off of commits made via GitHub web interface, make sure that +[Require contributors to sign off on web-based commits](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/managing-repository-settings/managing-the-commit-signoff-policy-for-your-repository#enabling-or-disabling-compulsory-commit-signoffs-for-your-repository) +setting is selected in the Settings menu of your OpenVINO repository fork. + +## How to sign-off older commits in the history + +If you forget to add the sign-off to your last commit, you can amend it and force-push to GitHub: + +``` +git commit --amend --signoff +``` + +To sign off on even older commits, use an interactive rebase, edit unsigned commits, and execute +`git commit --amend --signoff` for each. However, please note that if others have already started working based on +the commits in this branch, this will rewrite history and may cause issues for collaborators. From 65313fcff4c19c8d8abb095f8300c3e568800c5f Mon Sep 17 00:00:00 2001 From: Alina Kladieva Date: Mon, 28 Oct 2024 19:25:59 +0100 Subject: [PATCH 073/233] Minor updates for commit_signoff_policy.md --- docs/dev/ci/commit_signoff_policy.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/dev/ci/commit_signoff_policy.md b/docs/dev/ci/commit_signoff_policy.md index ec6ec446286f58..0328d3c3ec308c 100644 --- a/docs/dev/ci/commit_signoff_policy.md +++ b/docs/dev/ci/commit_signoff_policy.md @@ -21,6 +21,8 @@ To avoid manually adding the flag for each commit, we recommend setting up a Git 2. Open the `prepare-commit-msg.sample` file and paste the following content: ``` +#!/bin/sh + COMMIT_MSG_FILE=$1 COMMIT_SOURCE=$2 SHA1=$3 @@ -44,7 +46,7 @@ git interpret-trailers --if-exists doNothing --trailer \ ``` 3. Save the file with the name `prepare-commit-msg` (remove the .sample extension). -4. Make the file executable (on Linux: `chmod +x /.git/hooks/prepare-commit-msg`). +4. Make the file executable (on Linux / Git Bash: `chmod +x /.git/hooks/prepare-commit-msg`). 
**Note**: For both sign-off approaches, ensure your user name and email address are configured in Git first: From c158480ae3a94f9a1f4a711d06224bce934876ed Mon Sep 17 00:00:00 2001 From: Alina Kladieva Date: Mon, 28 Oct 2024 20:45:48 +0100 Subject: [PATCH 074/233] [GHA] Use action_path in default smart ci config path (#27287) For easier usage in 3rd-party repos Signed-off-by: Alina Kladieva --- .github/actions/smart-ci/action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/smart-ci/action.yml b/.github/actions/smart-ci/action.yml index cd111d617ddc1b..4d772c8f0eeb03 100644 --- a/.github/actions/smart-ci/action.yml +++ b/.github/actions/smart-ci/action.yml @@ -30,7 +30,6 @@ inputs: components_config_schema: description: "Path to the schema file for components configuration" required: false - default: ".github/actions/smart-ci/components_schema.yml" labeler_config: description: "Path to labeler configuration file" required: false @@ -101,7 +100,7 @@ runs: -f "${{ inputs.ref_name }}" \ -p "${{ inputs.component_pattern }}" \ -c "${{ inputs.components_config }}" \ - -m "${{ inputs.components_config_schema }}" \ + -m "${{ inputs.components_config_schema || env.DEFAULT_CONFIG_SCHEMA }}" \ -l "${{ inputs.labeler_config }}" \ --enable_for_org "${{ inputs.enable_for_org }}" \ --skip-when-only-listed-labels-set "${{ inputs.skip_when_only_listed_labels_set }}" \ @@ -109,3 +108,4 @@ runs: shell: bash env: GITHUB_TOKEN: ${{ inputs.repo_token }} + DEFAULT_CONFIG_SCHEMA: "${{ github.action_path }}/components_schema.yml" From e07546d6414768c7fea3769eef40bcb544e9352a Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Tue, 29 Oct 2024 12:29:00 +0400 Subject: [PATCH 075/233] [TF FE] Deduce Switch-Merge predicate shape (#27277) **Details:** It helps to convert some TF models out-of-the-box with static rank tensors that are required by plugins for inference. 
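For context, a minimal sketch of the Switch/Merge pattern this targets (it mirrors the new layer test added in this patch; shapes, constants, and node names are illustrative only, not taken from any specific model):

```python
import tensorflow as tf

# Switch/Merge pair driven by a bool placeholder: after the frontend fuses the
# pair into an If operation, the predicate shape is unknown, and reverse shape
# inference can now deduce that it must be a scalar.
with tf.Graph().as_default():
    x = tf.compat.v1.placeholder(tf.float32, [3, 2], 'x')
    cond = tf.compat.v1.placeholder(tf.bool, None, 'cond')  # predicate with no declared shape
    switch_false, switch_true = tf.raw_ops.Switch(data=x, pred=cond)
    add = tf.raw_ops.AddV2(x=switch_false, y=tf.constant(3.0))
    sub = tf.raw_ops.Sub(x=switch_true, y=tf.constant(1.0))
    merge = tf.raw_ops.Merge(inputs=[add, sub])
```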
**Ticket:** 156204 --------- Signed-off-by: Kazantsev, Roman --- .../reverse_shape_and_type_infer.cpp | 9 ++++ .../transformations/switch_merge_resolve.cpp | 3 ++ .../tensorflow_tests/test_tf_SwitchMerge.py | 45 +++++++++++++++++++ 3 files changed, 57 insertions(+) diff --git a/src/common/transformations/src/transformations/common_optimizations/reverse_shape_and_type_infer.cpp b/src/common/transformations/src/transformations/common_optimizations/reverse_shape_and_type_infer.cpp index 211f351da34024..9a06201f688675 100644 --- a/src/common/transformations/src/transformations/common_optimizations/reverse_shape_and_type_infer.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/reverse_shape_and_type_infer.cpp @@ -282,6 +282,15 @@ bool ov::pass::ReverseShapeAndTypeInfer::run_on_model(const std::shared_ptrget_input_tensor(0).m_element_type = element::boolean; is_changed = true; } + + // in case TensorFlow models, we can deduce predicate shape that must be a scalar + // If operations created by fusing Switch-Merge sub-graph contain tf_switch_merge_if rt-info + if (if_op->get_rt_info().count("tf_switch_merge_if") && + if_op->get_rt_info()["tf_switch_merge_if"].as() && + if_op->input_value(0).get_partial_shape().rank().is_dynamic()) { + if_op->get_input_tensor(0).m_partial_shape = ov::PartialShape({}); + is_changed = true; + } } else if (ov::as_type_ptr(op)) { is_changed |= inherit_output_shape(op, {0}); is_changed |= inherit_output_type(op, {1}); diff --git a/src/frontends/tensorflow/src/transformations/switch_merge_resolve.cpp b/src/frontends/tensorflow/src/transformations/switch_merge_resolve.cpp index 34b2a82152ccfc..cbdc506671aa67 100644 --- a/src/frontends/tensorflow/src/transformations/switch_merge_resolve.cpp +++ b/src/frontends/tensorflow/src/transformations/switch_merge_resolve.cpp @@ -235,6 +235,9 @@ bool pass::SwitchMergeResolver::run_on_model(const shared_ptr& m) { auto else_body = make_shared(else_results, else_params); auto if_op = make_shared(cond); + // in case TensorFlow models, we can deduce predicate shape that must be a scalar + if_op->get_rt_info()["tf_switch_merge_if"] = true; + set_cf_marker(if_cf_marker, if_op); if_op->set_then_body(then_body); if_op->set_else_body(else_body); diff --git a/tests/layer_tests/tensorflow_tests/test_tf_SwitchMerge.py b/tests/layer_tests/tensorflow_tests/test_tf_SwitchMerge.py index 96b73dd2134575..3747ab7a726aec 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_SwitchMerge.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_SwitchMerge.py @@ -63,3 +63,48 @@ def test_merge_eliminating_several_cond_flows(self, params, cond_value, x_type, self._test(*self.merge_eliminating_several_cond_flows_net(**params, cond_value=cond_value, x_type=x_type), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) + + +class TestSwitchMergeWithVariablePredicate(CommonTFLayerTest): + def _prepare_input(self, inputs_info): + assert 'x:0' in inputs_info + x_shape = inputs_info['x:0'] + inputs_data = {} + rng = np.random.default_rng() + inputs_data['x:0'] = rng.integers(-10, 10, x_shape).astype(np.float32) + inputs_data['cond:0'] = np.array(self.cond_value, dtype=bool) + return inputs_data + + def switch_merge_with_variable_predicate_net(self, x_shape, cond_shape, cond_value): + self.cond_value = cond_value + tf.compat.v1.reset_default_graph() + # Create the graph and model + with tf.compat.v1.Session() as sess: + x = tf.compat.v1.placeholder(tf.float32, x_shape, 'x') + cond = 
tf.compat.v1.placeholder(tf.bool, cond_shape, 'cond') + const_add = tf.constant(3, dtype=tf.float32) + const_sub = tf.constant(1, dtype=tf.float32) + switch_false, switch_true = tf.raw_ops.Switch(data=x, pred=cond) + add = tf.raw_ops.AddV2(x=switch_false, y=const_add) + sub = tf.raw_ops.Sub(x=switch_true, y=const_sub) + merge = tf.raw_ops.Merge(inputs=[add, sub]) + const_main = tf.constant(1, dtype=tf.float32) + tf.raw_ops.AddV2(x=merge[0], y=const_main, name='add_res') + tf.compat.v1.global_variables_initializer() + tf_net = sess.graph_def + + return tf_net, None + + @pytest.mark.parametrize('x_shape', [[], [2], [3, 2]]) + @pytest.mark.parametrize('cond_shape', [None, []]) + @pytest.mark.parametrize('cond_value', [True, False]) + @pytest.mark.precommit + @pytest.mark.nightly + def test_switch_merge_with_variable_predicate(self, x_shape, cond_shape, cond_value, + ie_device, precision, ir_version, temp_dir, + use_legacy_frontend): + if ie_device == 'GPU': + pytest.skip("156244: accuracy error on GPU") + self._test(*self.switch_merge_with_variable_predicate_net(x_shape, cond_shape, cond_value), + ie_device, precision, ir_version, temp_dir=temp_dir, + use_legacy_frontend=use_legacy_frontend) From 744475b46127c74531d0271218ecaf1c5ff5b702 Mon Sep 17 00:00:00 2001 From: Mateusz Mikolajczyk Date: Tue, 29 Oct 2024 09:51:57 +0100 Subject: [PATCH 076/233] [SPEC][Op] Align SearchSorted specification with core (#27275) ### Details: - *Align SearchSorted tensor/attribute names with ones used in core* - *Minor improvements for descriptions* - *Fix SliceScatter missing opset number* ### Tickets: - *ticket-id* --- .../operation-sets/operation-specs.rst | 2 +- .../operation-specs/sort/search-sorted-15.rst | 21 +++++++++++-------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst index 6ecbf2695699f9..8eccea47c31dd0 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst @@ -215,7 +215,7 @@ Operation Specifications Sin-1 Sinh-1 Slice-8 - SliceScatter + SliceScatter-15 SoftMax-1 SoftMax-8 SoftPlus-4 diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sort/search-sorted-15.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sort/search-sorted-15.rst index 81c592d3341a35..7a623a1e16739c 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sort/search-sorted-15.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sort/search-sorted-15.rst @@ -17,29 +17,32 @@ SearchSorted **Attributes** -* *right* +* *right_mode* - * **Description**: If False, set the first suitable index. If True, return the last suitable index for given value. Default is False. - * **Range of values**: true or false - * **Type**: boolean + * **Description**: flag to control whether output would contain leftmost or rightmost indices for given values. + * **Range of values**: + + * *true* - return the rightmost (last) suitable index for given value. + * *false* - return the leftmost (first) suitable index for given value. 
+ * **Type**: ``boolean`` * **Default value**: false * **Required**: *no* **Inputs**: -* **1**: ``sorted`` - ND input tensor of type *T* - cannot be a scalar, containing monotonically increasing sequence on the innermost dimension. **Required.** +* **1**: ``sorted_sequence`` - ND input tensor of type *T* - cannot be a scalar, containing monotonically increasing sequence on the innermost dimension. **Required.** * **2**: ``values`` - ND input tensor of type *T*, containing the search values. If sorted sequence is 1D, then the values can have any shape, otherwise the rank should be equal to the rank of sorted input. **Required.** **Outputs**: -* **1**: Tensor of type *TOut*, with the same shape as second input tensor, containing the indices. +* **1**: Tensor of type *T_IND*, with the same shape as second input tensor ``values``, containing the indices. **Types** * *T*: any supported floating-point and integer type. -* *TOut*: int64. +* *T_IND*: ``int64``. **Example** @@ -47,7 +50,7 @@ SearchSorted :force: - + 7 @@ -63,7 +66,7 @@ SearchSorted - + 7 256 200 From dd7967cf19b0241d5504f02bbf504533f0fc8326 Mon Sep 17 00:00:00 2001 From: Nikolay Shchegolev Date: Tue, 29 Oct 2024 15:44:40 +0400 Subject: [PATCH 077/233] [CPU][OMP] Safe usage of threads num with buffers (#27237) ### Details: - *Function `parallel_get_max_threads()` may returns ether core number or threads number. That can affect buffers access which size depends on threads number.* - *...* ### Tickets: - *152606* --------- Co-authored-by: Ilya Lavrenov --- src/plugins/intel_cpu/src/nodes/ctc_loss.cpp | 5 ++- src/plugins/intel_cpu/src/nodes/eltwise.cpp | 17 +++++--- src/plugins/intel_cpu/src/nodes/gather.cpp | 12 +++--- src/plugins/intel_cpu/src/nodes/gather.h | 1 + .../intel_cpu/src/nodes/grid_sample.cpp | 12 +++--- .../intel_cpu/src/nodes/grid_sample.hpp | 2 +- .../kernels/scaled_attn/mha_single_token.cpp | 6 ++- src/plugins/intel_cpu/src/nodes/llm_mlp.cpp | 37 ++++++++++------ src/plugins/intel_cpu/src/nodes/mha.cpp | 24 ++++++----- src/plugins/intel_cpu/src/nodes/mha.h | 2 + src/plugins/intel_cpu/src/nodes/mvn.cpp | 43 +++++++++++++------ src/plugins/intel_cpu/src/nodes/qkv_proj.cpp | 11 ++--- src/plugins/intel_cpu/src/nodes/reduce.cpp | 8 +++- src/plugins/intel_cpu/src/nodes/roi_align.cpp | 6 ++- .../intel_cpu/src/nodes/scaled_attn.cpp | 27 ++++++++---- .../intel_cpu/src/nodes/strided_slice.cpp | 7 ++- .../intel_cpu/src/nodes/strided_slice.h | 1 + 17 files changed, 142 insertions(+), 79 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/ctc_loss.cpp b/src/plugins/intel_cpu/src/nodes/ctc_loss.cpp index 78bb6fc0563e60..3161c9a0e87a84 100644 --- a/src/plugins/intel_cpu/src/nodes/ctc_loss.cpp +++ b/src/plugins/intel_cpu/src/nodes/ctc_loss.cpp @@ -84,7 +84,8 @@ void CTCLoss::execute(dnnl::stream strm) { std::vector decodedTargetLenB(batchNum, 0); std::vector> targetDB(batchNum); std::vector>> logProbabilitiesB(batchNum); - std::vector errorMsgB(parallel_get_max_threads()); + const auto threads_num = parallel_get_max_threads(); + std::vector errorMsgB(threads_num); auto threadBody_1 = [&](const int ithr, const int nthr) { size_t start(0lu), end(0lu); @@ -153,7 +154,7 @@ void CTCLoss::execute(dnnl::stream strm) { } // for batch }; // threadBody_1 - parallel_nt(0, threadBody_1); + parallel_nt(threads_num, threadBody_1); if (returnCode != 0) { std::string resErr(""); for (auto& err : errorMsgB) { diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/eltwise.cpp index 
5c3a358dff9d38..c2d23bf9adc89e 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/eltwise.cpp @@ -1503,7 +1503,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { fullWorkAmount *= jep.dims[i]; } - size_t minimalConcurrency = parallel_get_max_threads(); + m_threads_num = static_cast(parallel_get_max_threads()); size_t minimalJitWorkAmount = 256; size_t currentJitWorkAmount = jep.dims[jep.dims.size() - 1]; int collapsedDims = 0; @@ -1516,6 +1516,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { for (size_t j = 1; j < inpDims.size(); j++) { if (inpDims[j].back() != inpDims[0].back()) { hasDifferentDims = true; + break; } } @@ -1538,7 +1539,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { } size_t nextJitWorkAmount = currentJitWorkAmount * jep.dims[jep.dims.size() - 2]; - if (fullWorkAmount / nextJitWorkAmount >= minimalConcurrency) { + if (fullWorkAmount / nextJitWorkAmount >= m_threads_num) { currentJitWorkAmount = nextJitWorkAmount; collapsedDims++; @@ -1622,8 +1623,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { if (_pKernel->jep_.input_size == optimalTensorRank) { // execute Optimized 6D - parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], - [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + auto d6_loop = [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { auto args = jit_eltwise_call_args_indexes(); args.indexes[0] = i0; args.indexes[1] = i1; @@ -1632,7 +1632,11 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { args.indexes[4] = i4; (*_pKernel)(&args_ptrs, &args); - }); + }; + + parallel_nt_static(m_threads_num, [&](const int ithr, const int nthr) { + for_5d(ithr, nthr, dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], d6_loop); + }); } else { // execute Optimized Generic if (_pKernel->jep_.use_runtime_ptrs) { @@ -1642,7 +1646,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { _schedulerWorkAmount *= dims_out[i]; } } - parallel_nt(0, [&](const int ithr, const int nthr) { + parallel_nt(m_threads_num, [&](const int ithr, const int nthr) { size_t start = 0, end = 0; splitter(_schedulerWorkAmount, nthr, ithr, start, end); @@ -1676,6 +1680,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { std::unique_ptr _pKernel; size_t _schedulerWorkAmount = 0; size_t _batchDimIdx = 0; + size_t m_threads_num = 0lu; public: static const int optimalTensorRank = 6; diff --git a/src/plugins/intel_cpu/src/nodes/gather.cpp b/src/plugins/intel_cpu/src/nodes/gather.cpp index 81f6f36b84dd89..d2629fe8fe6811 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.cpp +++ b/src/plugins/intel_cpu/src/nodes/gather.cpp @@ -253,6 +253,7 @@ void Gather::createPrimitive() { if (isInPlace()) { return; } + m_threads_num = parallel_get_max_threads(); #if defined(OPENVINO_ARCH_X86_64) uint64_t idxElPerVec = 1; if (!isDynamicNode()) { @@ -294,11 +295,10 @@ void Gather::createPrimitive() { if (!isDynamicNode()) { const uint64_t dataElPerVec = jitKernel->getDataElPerVec(); - const uint64_t nthr = parallel_get_max_threads(); - const uint64_t wpt = ((totalWork / dataElPerVec) / nthr + 1) * dataElPerVec; - execParamsPerThread.resize(nthr); + const uint64_t wpt = ((totalWork / dataElPerVec) / m_threads_num + 1) * dataElPerVec; + execParamsPerThread.resize(m_threads_num); - parallel_nt(nthr, [&](const int ithr, const int nthr) { + parallel_nt(m_threads_num, [&](const int ithr, const int nthr) { const uint64_t 
dstStart = std::min(wpt * ithr, totalWork); const uint64_t dstEnd = std::min(wpt * (ithr + 1), totalWork); @@ -469,7 +469,7 @@ void Gather::execute(dnnl::stream strm) { (*jitKernel)(&arg); }; - parallel_nt(0, threadBody); + parallel_nt(m_threads_num, threadBody); return; } @@ -543,7 +543,7 @@ void Gather::executeDynamicImpl(dnnl::stream strm) { (*jitKernel)(&arg); }; - parallel_nt(0, threadBody); + parallel_nt(m_threads_num, threadBody); return; } diff --git a/src/plugins/intel_cpu/src/nodes/gather.h b/src/plugins/intel_cpu/src/nodes/gather.h index 96dad228f65b59..6ee097e9a1fbab 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.h +++ b/src/plugins/intel_cpu/src/nodes/gather.h @@ -110,6 +110,7 @@ class Gather : public Node { bool have_scalar_scale = false; size_t zp_group_size = 1u; size_t scale_group_size = 1u; + size_t m_threads_num = 0lu; std::shared_ptr jitKernel; }; diff --git a/src/plugins/intel_cpu/src/nodes/grid_sample.cpp b/src/plugins/intel_cpu/src/nodes/grid_sample.cpp index 618d6b39105689..c8eed21bb312f5 100644 --- a/src/plugins/intel_cpu/src/nodes/grid_sample.cpp +++ b/src/plugins/intel_cpu/src/nodes/grid_sample.cpp @@ -149,11 +149,11 @@ void GridSample::createPrimitive() { } jitKernel->create_ker(); - nthr = parallel_get_max_threads(); - execParamsPerThread.resize(nthr); + m_threads_num = parallel_get_max_threads(); + execParamsPerThread.resize(m_threads_num); if (!x64::mayiuse(x64::avx512_core)) { const auto dataElPerVec = jitKernel->getDataElPerVec(); - parallel_nt(nthr, [&](const int ithr, const int nthr) { + parallel_nt(m_threads_num, [&](const int ithr, const int nthr) { auto& p = execParamsPerThread[ithr]; p.srcHeightF.resize(dataElPerVec); @@ -197,9 +197,9 @@ void GridSample::prepareParams() { const auto& srcDataShape = dataMemPtr->getStaticDims(); const auto& dstShape = dstMemPtr->getStaticDims(); const uint64_t totalWork = dstShape[2] * dstShape[3]; - const uint64_t wpt = ((totalWork / dataElPerVec) / nthr + 1) * dataElPerVec; + const uint64_t wpt = ((totalWork / dataElPerVec) / m_threads_num + 1) * dataElPerVec; - parallel_nt(nthr, [&](const int ithr, const int nthr) { + parallel_nt(m_threads_num, [&](const int ithr, const int nthr) { const uint64_t dstStart = std::min(wpt * ithr, totalWork); const uint64_t dstEnd = std::min(wpt * (ithr + 1), totalWork); @@ -303,7 +303,7 @@ void GridSample::execute(dnnl::stream strm) { (*jitKernel)(&arg); }; - parallel_nt(nthr, threadBody); + parallel_nt(m_threads_num, threadBody); } void GridSample::executeDynamicImpl(dnnl::stream strm) { diff --git a/src/plugins/intel_cpu/src/nodes/grid_sample.hpp b/src/plugins/intel_cpu/src/nodes/grid_sample.hpp index 0d172bd5c3e055..b4468d58be9b52 100644 --- a/src/plugins/intel_cpu/src/nodes/grid_sample.hpp +++ b/src/plugins/intel_cpu/src/nodes/grid_sample.hpp @@ -62,7 +62,7 @@ class GridSample : public Node { ov::element::Type dataPrecision; ov::element::Type gridPrecision = ov::element::f32; - int nthr = 1; + size_t m_threads_num = 0lu; std::vector execParamsPerThread; static constexpr size_t IN_DATA = 0; diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp index 1543c168403382..6b6df3c3181ee0 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp @@ -1068,11 +1068,15 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, } }); - parallel_for3d(B, H, 
q_len, [&](size_t b, size_t h, size_t pq) { + auto bhl_loop = [&](size_t b, size_t h, size_t pq) { auto* temp = buf_attn_score.ptr(0, b, pq, h); size_t temp_stride = buf_attn_score.stride(0); auto* dst = has_out_transpose ? output_emb.ptr(b, pq, h * SV) : output_emb.ptr(b, h, pq); attn_reduce(dst, temp, nthr, SV, temp_stride); + }; + + parallel_nt_static(nthr, [&](const int ithr, const int nthr) { + for_3d(ithr, nthr, B, H, q_len, bhl_loop); }); } diff --git a/src/plugins/intel_cpu/src/nodes/llm_mlp.cpp b/src/plugins/intel_cpu/src/nodes/llm_mlp.cpp index 13c46a7c976cfd..8df1f5498da384 100644 --- a/src/plugins/intel_cpu/src/nodes/llm_mlp.cpp +++ b/src/plugins/intel_cpu/src/nodes/llm_mlp.cpp @@ -53,19 +53,19 @@ class LinearKsplit2 { OPENVINO_ASSERT((N % REG_BLK_N_SIZE) == 0); OPENVINO_ASSERT((K % reg_blk_K_size) == 0); - auto nthr = parallel_get_max_threads(); + m_threads_num = parallel_get_max_threads(); auto num_blk_N = N / REG_BLK_N_SIZE; - works.resize(nthr); + works.resize(m_threads_num); auto K_splits = 2; // split task on more cores is better on TBB - auto valid_nthr = nthr / 2; + auto valid_nthr = m_threads_num / 2; auto blkN_per_thread = (num_blk_N) / valid_nthr; auto blkN_leftover = num_blk_N - (blkN_per_thread * valid_nthr); auto start_blkN = 0; used_nthr = 0; - for (int ithr = 0; ithr < nthr; ithr += K_splits) { + for (int ithr = 0; ithr < m_threads_num; ithr += K_splits) { auto blkN = std::min(num_blk_N - start_blkN, blkN_per_thread); if (blkN_leftover > 0) { blkN_leftover--; @@ -106,7 +106,7 @@ class LinearKsplit2 { wbuffer.alloc(works, weight_element_size); - ov::parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { + ov::parallel_nt_static(m_threads_num, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; if (work) { if (is_quantized) { @@ -125,7 +125,7 @@ class LinearKsplit2 { float * w_scale) { static ReduceAdd2bh jit_reduce2cvt(true, std::is_same::value); - ov::parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { + ov::parallel_nt_static(m_threads_num, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; auto& workC = work.m_C; if (work) { @@ -165,6 +165,9 @@ class LinearKsplit2 { } }); } + +private: + int m_threads_num = 0; }; template @@ -205,18 +208,18 @@ class LinearGateUp { // in unit of 32 OPENVINO_ASSERT((N % REG_BLK_N_SIZE) == 0); OPENVINO_ASSERT((K % reg_blk_K_size) == 0); - auto nthr = parallel_get_max_threads(); + m_threads_num = parallel_get_max_threads(); auto num_blk_N = N / REG_BLK_N_SIZE; - works.resize(nthr); + works.resize(m_threads_num); // split task on more cores is better on TBB - auto valid_nthr = nthr; + auto valid_nthr = m_threads_num; auto blkN_per_thread = (num_blk_N) / valid_nthr; auto blkN_leftover = num_blk_N - (blkN_per_thread * valid_nthr); auto start_blkN = 0; used_nthr = 0; - for (int ithr = 0; ithr < nthr; ithr ++) { + for (int ithr = 0; ithr < m_threads_num; ithr ++) { auto blkN = std::min(num_blk_N - start_blkN, blkN_per_thread); if (blkN_leftover > 0) { blkN_leftover--; @@ -243,7 +246,7 @@ class LinearGateUp { wbuffer.alloc(works, weight_element_size); DEBUG_LOG("Linear N,K=", N, ",", K, " used_nthr=", used_nthr); - ov::parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { + ov::parallel_nt_static(m_threads_num, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; if (work) { if (quantized_int8) @@ -267,7 +270,7 @@ class LinearGateUp { const LLMMLPNode::Config& config, MatrixDynQuantPerRow& src_dq, float * w_scale) { - ov::parallel_nt_static(0, 
[&](const size_t ithr, const size_t nthr) { + ov::parallel_nt_static(m_threads_num, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; if (work) { work.run(M, pA, strideA_in_bytes); @@ -303,6 +306,9 @@ class LinearGateUp { } }); } + +private: + int m_threads_num = 0; }; template @@ -384,8 +390,8 @@ struct LLMMLP::Executor : public LLMMLP::ExecutorBase { reinterpret_cast(ptr)); }); - auto nthr = parallel_get_max_threads(); - for (int ithr = 0; ithr < nthr; ithr++) { + m_threads_num = parallel_get_max_threads(); + for (size_t ithr = 0lu; ithr < m_threads_num; ithr++) { auto C1_size = gate_up.works[ithr].set_C(M, reinterpret_cast(cur_scratch_base)); auto C2_size = down.works[ithr].set_C(M, reinterpret_cast(cur_scratch_base)); auto max_C_size = std::max(C1_size, C2_size); @@ -482,6 +488,9 @@ struct LLMMLP::Executor : public LLMMLP::ExecutorBase { dstC += BM * strideC / sizeof(T); } } + +private: + size_t m_threads_num = 0lu; }; #else template diff --git a/src/plugins/intel_cpu/src/nodes/mha.cpp b/src/plugins/intel_cpu/src/nodes/mha.cpp index 7d082e99fa4f6a..9364058c5d19a2 100644 --- a/src/plugins/intel_cpu/src/nodes/mha.cpp +++ b/src/plugins/intel_cpu/src/nodes/mha.cpp @@ -934,7 +934,7 @@ void MHA::prepareParams() { bool isAMXSupported = mayiuse(avx512_core_amx); - size_t numThreads = parallel_get_max_threads(); + m_threads_num = parallel_get_max_threads(); size_t matmulOptimalM = 32; @@ -1072,21 +1072,21 @@ void MHA::prepareParams() { bufferCompensation1Size = rnd_up(N1, N1_blk); if (brgCopyAKernel0) { - bufferMatMul0In0.resize(numThreads * bufferMatMul0In0Size); + bufferMatMul0In0.resize(m_threads_num * bufferMatMul0In0Size); } - bufferMatMul0In1.resize(numThreads * bufferMatMul0In1Size); - bufferMatMul0Out.resize(numThreads * bufferMatMul0OutSize); - bufferMatMul1In1.resize(numThreads * bufferMatMul1In1Size); - bufferMatMul1Out.resize(numThreads * bufferMatMul1OutSize); + bufferMatMul0In1.resize(m_threads_num * bufferMatMul0In1Size); + bufferMatMul0Out.resize(m_threads_num * bufferMatMul0OutSize); + bufferMatMul1In1.resize(m_threads_num * bufferMatMul1In1Size); + bufferMatMul1Out.resize(m_threads_num * bufferMatMul1OutSize); if (brgemmCtx0.is_with_comp) { - bufferCompensation0.resize(numThreads * bufferCompensation0Size); + bufferCompensation0.resize(m_threads_num * bufferCompensation0Size); } if (brgemmCtx1.is_with_comp) { - bufferCompensation1.resize(numThreads * bufferCompensation1Size); + bufferCompensation1.resize(m_threads_num * bufferCompensation1Size); } if (brgemmCtx0.is_with_amx || brgemmCtx1.is_with_amx) { - wsp.resize(numThreads * wsp_size_per_thread); + wsp.resize(m_threads_num * wsp_size_per_thread); } { @@ -1224,7 +1224,7 @@ void MHA::mhaImpl() { auto outPrcSize = outputPrecision.size(); - parallel_for2d(dimsMatMul0Out[0], dimsMatMul0Out[1], [&](size_t i0, size_t i1) { + auto spatial_loop = [&](size_t i0, size_t i1) { size_t threadNum = parallel_get_thread_num(); auto pTranspose0In0_aux = pTranspose0In0 + (i0 * strTranspose0In0[0] + i1 * strTranspose0In0[2]) * inputPrecisions[0].size(); // order 0213 @@ -1417,6 +1417,10 @@ void MHA::mhaImpl() { (*convertReorderKernel)(&call_args); } } + }; + + parallel_nt_static(m_threads_num, [&](const int ithr, const int nthr) { + for_2d(ithr, nthr, dimsMatMul0Out[0], dimsMatMul0Out[1], spatial_loop); }); } diff --git a/src/plugins/intel_cpu/src/nodes/mha.h b/src/plugins/intel_cpu/src/nodes/mha.h index cd272c086e2190..36afe20224299a 100644 --- a/src/plugins/intel_cpu/src/nodes/mha.h +++ 
b/src/plugins/intel_cpu/src/nodes/mha.h @@ -238,6 +238,8 @@ class MHA : public Node { std::unique_ptr mulAddSoftmaxKernel; std::unique_ptr convertReorderKernel; std::unique_ptr convertTransposeKernel; + + size_t m_threads_num = 0lu; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/mvn.cpp b/src/plugins/intel_cpu/src/nodes/mvn.cpp index 61aa4738b8f81f..76471b0cca741d 100644 --- a/src/plugins/intel_cpu/src/nodes/mvn.cpp +++ b/src/plugins/intel_cpu/src/nodes/mvn.cpp @@ -2417,9 +2417,9 @@ void MVN::MVNJitExecutor::mvn_nspc(const uint8_t* src_data, uint8_t* dst_data, c const size_t H = shape5d[3]; const size_t W = shape5d[4]; - size_t threads_num = parallel_get_max_threads(); + const size_t threads_num = parallel_get_max_threads(); size_t aux_buffer_size = mvnAttrs.execAcrossChannels_ ? 1 : rnd_up(C, blk_size) + blk_size; - parallel_for(N, [&](size_t b) { + auto b_loop = [&](size_t b) { std::vector mean_buffer(aux_buffer_size * threads_num, 0.f); std::vector variance_buffer; if (mvnAttrs.normalizeVariance_) { @@ -2429,7 +2429,7 @@ void MVN::MVNJitExecutor::mvn_nspc(const uint8_t* src_data, uint8_t* dst_data, c // kernel_type: 0 for mean, 1 for variance, 2 for normalization auto worker = [&](const bool across_channel, const int kernel_type) { - parallel_nt(0, [&](const int ithr, const int nthr) { + parallel_nt(threads_num, [&](const int ithr, const int nthr) { size_t start = 0, end = 0; splitter(D * H * W, nthr, ithr, start, end); @@ -2512,6 +2512,10 @@ void MVN::MVNJitExecutor::mvn_nspc(const uint8_t* src_data, uint8_t* dst_data, c } worker(false, 2); } + }; + + parallel_nt_static(threads_num, [&](const int ithr, const int nthr) { + for_1d(ithr, nthr, N, b_loop); }); } @@ -2529,15 +2533,15 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co const size_t H = shape5d[3]; const size_t W = shape5d[4]; - size_t CB = div_up(C, blk_size); + const size_t CB = div_up(C, blk_size); - size_t C0 = W * blk_size; - size_t C1 = C0 * H; - size_t C2 = C1 * D; - size_t C3 = C2 * CB; - size_t C5 = C * D * H * W; + const size_t C0 = W * blk_size; + const size_t C1 = C0 * H; + const size_t C2 = C1 * D; + const size_t C3 = C2 * CB; + const size_t C5 = C * D * H * W; - size_t threads_num = parallel_get_max_threads(); + const size_t threads_num = parallel_get_max_threads(); size_t aux_buffer_size = mvnAttrs.execAcrossChannels_ ? 
blk_size : rnd_up(C, blk_size); aux_buffer_size += blk_size; std::vector mean_buffer(aux_buffer_size * threads_num); @@ -2562,7 +2566,11 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co // // | // // \|/ ///////////////////////////////// - auto mean_buffer_ptr = &mean_buffer[aux_buffer_size * static_cast(parallel_get_thread_num())]; + auto thread_idx = static_cast(parallel_get_thread_num()); + if (thread_idx >= threads_num) { + return mean_internal; + } + auto mean_buffer_ptr = &mean_buffer[aux_buffer_size * thread_idx]; for (size_t i = 0; i < blk_size; i++) mean_buffer_ptr[i] = 0.f; @@ -2651,7 +2659,7 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co // one thread for one C*W size(the same H) to get C size result for the same H, added to last group result // keep the compute order the same as planar - parallel_for2d(D, H, [&](size_t thr_idx, size_t d, size_t h) { + auto dh_loop = [&](size_t thr_idx, size_t d, size_t h) { for (size_t cb = 0; cb < CB; cb++) { size_t src_offset = b_offset + cb * C2 + d * C1 + h * C0; auto mean_buffer_ptr = &mean_buffer[blk_size * cb + aux_buffer_size * thr_idx]; @@ -2665,6 +2673,10 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co arg.post_op_data = post_ops_data_; (*mvn_mean_kernel)(&arg); } + }; + + parallel_nt_static(threads_num, [&](const int ithr, const int nthr) { + for_2d(ithr, nthr, D, H, dh_loop); }); for (size_t i = 1; i < threads_num; i++) { @@ -2678,7 +2690,7 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co for (size_t i = 0; i < variance_buffer.size(); i++) variance_buffer[i] = 0.f; - parallel_for2d(D, H, [&](size_t thr_idx, size_t d, size_t h) { + auto dh_loop = [&](size_t thr_idx, size_t d, size_t h) { for (size_t cb = 0; cb < CB; cb++) { size_t src_offset = b_offset + cb * C2 + d * C1 + h * C0; auto mean_buffer_ptr = &mean_buffer[blk_size * cb]; @@ -2694,7 +2706,12 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co arg.post_op_data = post_ops_data_; (*mvn_variance_kernel)(&arg); } + }; + + parallel_nt_static(threads_num, [&](const int ithr, const int nthr) { + for_2d(ithr, nthr, D, H, dh_loop); }); + for (size_t i = 1; i < threads_num; i++) { for (size_t c = 0; c < C; c++) variance_buffer[c] += variance_buffer[c + aux_buffer_size * i]; diff --git a/src/plugins/intel_cpu/src/nodes/qkv_proj.cpp b/src/plugins/intel_cpu/src/nodes/qkv_proj.cpp index 3260b12f1b5b4b..00c8b6f9b17c0b 100644 --- a/src/plugins/intel_cpu/src/nodes/qkv_proj.cpp +++ b/src/plugins/intel_cpu/src/nodes/qkv_proj.cpp @@ -60,6 +60,7 @@ struct QKVProjection::Executor : public QKVProjection::ExecutorBase { MemoryPtr m_scratchMem; uint8_t* m_scratch_base = nullptr; int m_M = 0; + size_t m_threads_num = 0lu; MatrixDynQuantPerRow m_quant_act; @@ -79,11 +80,11 @@ struct QKVProjection::Executor : public QKVProjection::ExecutorBase { auto K = w0.size(1); OPENVINO_ASSERT((K % cache_blk_k_size) == 0); - auto nthr = parallel_get_max_threads(); + m_threads_num = parallel_get_max_threads(); auto num_blk_K = K / cache_blk_k_size; int stride_in_bytes = K * weight_element_size; - works.resize(nthr); + works.resize(m_threads_num); int cur_work_id = 0; auto create_works = [&](void* pw, int output_id, int N, int valid_nthr) { @@ -119,7 +120,7 @@ struct QKVProjection::Executor : public QKVProjection::ExecutorBase { auto proj_size0 = m_node->m_config.proj_size0; auto proj_size1 = m_node->m_config.proj_size1; auto proj_size2 = 
m_node->m_config.proj_size2; - auto n_group_workers = allocate_workers({proj_size0, proj_size1, proj_size2}, nthr); + auto n_group_workers = allocate_workers({proj_size0, proj_size1, proj_size2}, m_threads_num); if (m_node->m_config.weights_combined) { auto* ptr_weights = reinterpret_cast(w0.ptr_v()); @@ -140,7 +141,7 @@ struct QKVProjection::Executor : public QKVProjection::ExecutorBase { wbuffer.alloc(works, weight_element_size); - ov::parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { + ov::parallel_nt_static(m_threads_num, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; if (work) { if (quantized_int8) @@ -237,7 +238,7 @@ struct QKVProjection::Executor : public QKVProjection::ExecutorBase { strideA = m_quant_act.K; } - ov::parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { + ov::parallel_nt_static(m_threads_num, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; if (work) { work.run(BM, pA, strideA); diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp index b40c50f957514f..6cfc94a02b9f3b 100644 --- a/src/plugins/intel_cpu/src/nodes/reduce.cpp +++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp @@ -2742,12 +2742,12 @@ inline void Reduce::reduce_kernel_post_process(uint8_t *out_ptr) { (*reduce_post_kernel)(&arg); }); } else if (layout == ReduceLayoutType::reduce_nspc) { - size_t num_threads = static_cast(parallel_get_max_threads()); + const size_t num_threads = static_cast(parallel_get_max_threads()); size_t OP = OB * OC >= num_threads ? OB * OC : OB * OC * OD; if (OP < num_threads && OW > blk_size) OP *= OH; size_t work_amount = OB * OC * OD * OH * OW / OP; - parallel_for(OP, [&](size_t op) { + auto op_loop = [&](size_t op) { const uint8_t *in_p = in_ptr + op * work_amount * intermediate_data_size; uint8_t *out_p = out_ptr + op * work_amount * dst_data_size; auto arg = jit_reduce_post_call_args(); @@ -2759,6 +2759,10 @@ inline void Reduce::reduce_kernel_post_process(uint8_t *out_ptr) { arg.divisor = &divisor; arg.post_op_data = static_cast(postOpsDataPtrs.data()); (*reduce_post_kernel)(&arg); + }; + + parallel_nt_static(num_threads, [&](const int ithr, const int nthr) { + for_1d(ithr, nthr, OP, op_loop); }); } else { size_t OCB = div_up(OC, blk_size); diff --git a/src/plugins/intel_cpu/src/nodes/roi_align.cpp b/src/plugins/intel_cpu/src/nodes/roi_align.cpp index eb1797279e1415..27f9426dca6af9 100644 --- a/src/plugins/intel_cpu/src/nodes/roi_align.cpp +++ b/src/plugins/intel_cpu/src/nodes/roi_align.cpp @@ -1076,7 +1076,7 @@ void ROIAlign::executeSpecified() { int bufSize = rnd_up(C, 16); size_t threadsNum = parallel_get_max_threads(); workingBuf.resize(bufSize * threadsNum, 0.f); - parallel_for3d(realRois, pooledH, pooledW, [&](int n, int yBinInd, int xBinInd) { + auto rhw_loop = [&](int n, int yBinInd, int xBinInd) { int numSamplesROI = numSamples[n]; // each sample have 4 values for srcAddressList and weight size_t binOffset = numSamplesROI * BLIParamsNum * pooledW * yBinInd + numSamplesROI * BLIParamsNum * xBinInd; @@ -1095,6 +1095,10 @@ void ROIAlign::executeSpecified() { arg.dst = static_cast(&dst[dstOffset]); arg.src_stride = lastBlockDim * W * H; // only valid for blk, nspc generate inside (*roi_align_kernel)(&arg); + }; + + parallel_nt_static(threadsNum, [&](const int ithr, const int nthr) { + for_3d(ithr, nthr, realRois, pooledH, pooledW, rhw_loop); }); } else { // one lane for one sample generation, then pooling all samples. 
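The same fix recurs across the remaining kernels in this patch. A minimal sketch of the pattern in isolation (the function name `scaled_per_thread_buffers` is illustrative and not part of the change): capture the thread count once, size every per-thread buffer with that value, and dispatch `parallel_nt` with the same value, so a worker's thread index can never address past the end of its buffer slot.

```cpp
#include "openvino/core/parallel.hpp"

#include <cstddef>
#include <vector>

// Illustrative only: one buffer slot per worker, sized and dispatched with the
// same thread count, instead of sizing with parallel_get_max_threads() and then
// dispatching with parallel_nt(0, ...), which may run with a different number
// of threads than the buffer was sized for.
void scaled_per_thread_buffers(size_t elements_per_thread) {
    const auto threads_num = static_cast<size_t>(ov::parallel_get_max_threads());
    std::vector<float> buffer(threads_num * elements_per_thread, 0.f);

    ov::parallel_nt(static_cast<int>(threads_num), [&](const int ithr, const int nthr) {
        float* local = buffer.data() + static_cast<size_t>(ithr) * elements_per_thread;
        // ithr stays below threads_num here, so `local` is always in bounds
        for (size_t i = 0; i < elements_per_thread; ++i)
            local[i] = static_cast<float>(ithr + nthr);
    });
}
```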
diff --git a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp index e229ff4bb72c57..f9f853230c4dd6 100644 --- a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp +++ b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp @@ -217,6 +217,7 @@ struct MHAKernel { size_t wsp_size_per_thread = 0; using tag = dnnl::memory::format_tag; using dt = dnnl::memory::data_type; + size_t m_threads_num = 0lu; struct brgemmKey { size_t M; size_t N; @@ -315,21 +316,21 @@ struct MHAKernel { wv_gemm_ptr = wv_result.first; - size_t nthr = static_cast(parallel_get_max_threads()); + m_threads_num = static_cast(parallel_get_max_threads()); // wsp is used to compute beta when K is blocked wsp_size_per_thread = wv_gemm_ptr->get_wsp_size(); - wsp.resize(nthr * wsp_size_per_thread); + wsp.resize(m_threads_num * wsp_size_per_thread); // allocate scratch a/b, notice get_scratch_a_size/get_scratch_b_size returns in bytes size_t data_size = sizeof(T); - qk_scratch_a.resize({nthr, qk_gemm_ptr->get_scratch_a_size() / data_size}); - wv_scratch_a.resize({nthr, wv_gemm_ptr->get_scratch_a_size() / data_size}); + qk_scratch_a.resize({m_threads_num, qk_gemm_ptr->get_scratch_a_size() / data_size}); + wv_scratch_a.resize({m_threads_num, wv_gemm_ptr->get_scratch_a_size() / data_size}); qk_scratch_b.resize({B, Hk, qk_gemm_ptr->get_scratch_b_size() / data_size}); wv_scratch_b.resize({B, Hk, wv_gemm_ptr->get_scratch_b_size() / data_size}); const size_t m_block_size = qk_gemm_ptr->get_mblk_size(); - weight_score.resize({static_cast(parallel_get_max_threads()), H, m_block_size, kv_len}); + weight_score.resize({m_threads_num, H, m_block_size, kv_len}); if (has_out_transpose) { fp32_out.resize({B, q_len, H, head_size_v}); } else { @@ -367,7 +368,7 @@ struct MHAKernel { }); // attention - parallel_for3d(B, H, m_blocks, [&](size_t ithr, size_t b, size_t h, size_t m_blk) { + auto bhb_loop = [&](size_t ithr, size_t b, size_t h, size_t m_blk) { auto m_start = m_blk * m_block_size; auto m_end = std::min(m_start + m_block_size, q_len); auto m_cnt = m_end - m_start; @@ -456,6 +457,10 @@ struct MHAKernel { 1); } } + }; + + parallel_nt_static(m_threads_num, [&](const int ithr, const int nthr) { + for_3d(ithr, nthr, B, H, m_blocks, bhb_loop); }); } @@ -652,12 +657,14 @@ struct MHAKernel { size_t m_block_size; // buffer to hold qk temp std::vector qk_buffers; + size_t m_threads_num = 0lu; MHAKernel() = delete; explicit MHAKernel(GraphContext::CPtr ctx): context(ctx) { m_block_size = 4; select_nfltmax_at_0 = false; - qk_buffers.resize(parallel_get_max_threads()); + m_threads_num = parallel_get_max_threads(); + qk_buffers.resize(m_threads_num); } PlainTensor causal_mask; @@ -699,7 +706,7 @@ struct MHAKernel { auto m_blocks = (q_len + m_block_size - 1) / m_block_size; - parallel_for3d(B, H, m_blocks, [&](size_t b, size_t h, size_t m_blk) { + auto bhb_loop = [&](size_t b, size_t h, size_t m_blk) { auto thread_id = parallel_get_thread_num(); if (thread_id < 0) OPENVINO_THROW("The calling thread isn't initialized!"); @@ -801,6 +808,10 @@ struct MHAKernel { has_out_transpose ? &output_emb.at({b, m_start, h * head_size_v}) : &output_emb.at({b, h, m_start}), has_out_transpose ? 
output_emb.stride(1) : output_emb.stride(2), 1); + }; + + parallel_nt_static(m_threads_num, [&](const int ithr, const int nthr) { + for_3d(ithr, nthr, B, H, m_blocks, bhb_loop); }); } }; diff --git a/src/plugins/intel_cpu/src/nodes/strided_slice.cpp b/src/plugins/intel_cpu/src/nodes/strided_slice.cpp index 4f974cfe5e9748..13671c22d102ae 100644 --- a/src/plugins/intel_cpu/src/nodes/strided_slice.cpp +++ b/src/plugins/intel_cpu/src/nodes/strided_slice.cpp @@ -348,6 +348,7 @@ StridedSlice::StridedSliceCommonExecutor::StridedSliceCommonExecutor(const Strid dimsNormalization(); dimsGluing(); indicesCalculation(); + m_threads_num = parallel_get_max_threads(); } void StridedSlice::StridedSliceCommonExecutor::orderParametersByLayouts(const BlockedMemoryDescCPtr& blockedMemoryDesc) { @@ -642,8 +643,7 @@ void StridedSlice::StridedSliceCommonExecutor::dimsGluing() { for (size_t idx = secondDim.first + 1; idx < secondDim.second; idx++) params.attrs.begin[1] /= dstBlockedDimsBefore[idx]; - const size_t maxThreads = parallel_get_max_threads(); - if (params.dstBlockedDims[0] < maxThreads) { + if (params.dstBlockedDims[0] < m_threads_num) { params.dstBlockedDims[1] /= realDstDim; params.srcBlockedDims[1] /= realSrcDim; params.dstBlockedDims.insert(params.dstBlockedDims.begin() + 1, realDstDim); @@ -682,8 +682,7 @@ void StridedSlice::StridedSliceCommonExecutor::indicesCalculation() { dstIndices.resize(workAmount, 0); // should choose more optimal thread count - const size_t nthr = parallel_get_max_threads(); - nThreads = nthr > workAmount ? workAmount : nthr; + nThreads = m_threads_num > workAmount ? workAmount : m_threads_num; if (params.isOptimized) { indicesCalculationForOptimized(); diff --git a/src/plugins/intel_cpu/src/nodes/strided_slice.h b/src/plugins/intel_cpu/src/nodes/strided_slice.h index 5c5950520bda7d..bf698643271d7a 100644 --- a/src/plugins/intel_cpu/src/nodes/strided_slice.h +++ b/src/plugins/intel_cpu/src/nodes/strided_slice.h @@ -122,6 +122,7 @@ class StridedSlice : public Node { size_t workAmount = 0lu; size_t lastDstDim = 0lu; size_t srcShift = 0lu; + size_t m_threads_num = 0lu; }; using executorPtr = std::shared_ptr; executorPtr execPtr = nullptr; From c7d8e038f2662dcbe8a81adc6691c594811e5e90 Mon Sep 17 00:00:00 2001 From: Maksim Kutakov Date: Tue, 29 Oct 2024 12:45:52 +0100 Subject: [PATCH 078/233] [CPU] Drop redundant MemoryOutput nodes (#27189) ### Details: In direct ReadValue->Assign pairs the Assign node is practically useless as there are no other layers that might modify data in between. Therefore, it does make sense to remove corresponding MemoryOutput nodes to eliminate additional overheads on their processing. 
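A minimal sketch of the graph pattern this targets (it mirrors the functional test added in this patch and is not part of the change itself): the `ReadValue` output feeds the `Assign` directly, so nothing can modify the state in between, and the `MemoryOutput` node that would normally back the `Assign` is redundant.

```cpp
#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/op/add.hpp"
#include "openvino/op/assign.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/read_value.hpp"
#include "openvino/op/result.hpp"
#include "openvino/op/util/variable.hpp"

// State is read and immediately written back unchanged; only the read result is
// consumed by the rest of the graph. After this change the CPU plugin serves the
// pattern with a single MemoryInput node instead of a MemoryInput/MemoryOutput pair.
std::shared_ptr<ov::Model> make_direct_read_value_assign_model() {
    auto data = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{-1, 32});
    auto variable = std::make_shared<ov::op::util::Variable>(
        ov::op::util::VariableInfo{ov::PartialShape{-1, 32}, ov::element::f32, "state0"});

    auto read = std::make_shared<ov::op::v6::ReadValue>(data, variable);  // init subgraph is just `data`
    auto assign = std::make_shared<ov::op::v6::Assign>(read, variable);   // direct ReadValue->Assign pair

    auto add = std::make_shared<ov::op::v1::Add>(read, data);
    auto result = std::make_shared<ov::op::v0::Result>(add);

    return std::make_shared<ov::Model>(ov::ResultVector{result},
                                       ov::SinkVector{assign},
                                       ov::ParameterVector{data},
                                       "direct_read_value_assign");
}
```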
### Tickets: - CVS-153035 - CVS-155112 --- src/plugins/intel_cpu/src/graph_optimizer.cpp | 116 +++++++++++ src/plugins/intel_cpu/src/graph_optimizer.h | 1 + src/plugins/intel_cpu/src/memory_state.cpp | 43 +++++ src/plugins/intel_cpu/src/memory_state.h | 21 ++ src/plugins/intel_cpu/src/nodes/memory.cpp | 119 +++++++++++- src/plugins/intel_cpu/src/nodes/memory.hpp | 59 +++++- .../src/common/read_value_assign.cpp | 182 ++++++++++++++++++ .../template/src/sync_infer_request.cpp | 2 +- src/plugins/template/src/variable_state.hpp | 32 ++- .../include/subgraph_tests/lora_pattern.hpp | 10 + .../subgraph/lora_pattern.hpp | 3 +- .../src/subgraph/lora_pattern.cpp | 74 +++++++ 12 files changed, 640 insertions(+), 22 deletions(-) create mode 100644 src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/read_value_assign.cpp diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp index 6b3175e24d9dcb..ab7eb223ba17ce 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.cpp +++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp @@ -183,6 +183,10 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph &graph) { MatchSdpaKvCache(graph); graph.RemoveDroppedNodes(); + OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "DropRedundantMemoryOutput"); + DropRedundantMemoryOutput(graph); + graph.RemoveDroppedNodes(); + OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "RemoveDroppedEdges"); graph.RemoveDroppedEdges(); } @@ -3186,5 +3190,117 @@ void GraphOptimizer::MatchSdpaKvCache(Graph &graph) { } } +void GraphOptimizer::DropRedundantMemoryOutput(Graph &graph) { + // When we have a MemoryInput->MemoryOutput pair, that means that the state is immediately populated with the init + // subgraph values when the init subgraph exists. In all the other cases the state is simply a read only object. 
+ // We can optimize such a case removing the MemoryOutput node and transferring the state values update + // responsibility to a special type of the MemoryInput node - MemoryInputSingle + auto& graphNodes = graph.GetNodes(); + + auto isSuitableMemInput = [](const NodePtr& node) -> bool { + if (Type::MemoryInput != node->getType()) { + return false; + } + + CPU_GRAPH_OPTIMIZER_SCOPE(DropRedundantMemoryOutput_isSuitableMemInput); + + auto memInputBase = std::dynamic_pointer_cast(node); + OPENVINO_ASSERT(memInputBase, + "Unexpectedly wrong dynamic type of node: ", + node->getName(), + " of type: ", + node->getTypeStr()); + + auto id = memInputBase->getId(); + + NodePtr MemoryOutput = nullptr; + auto&& childEdges = node->getChildEdgesAtPort(0); + for (auto&& item : childEdges) { + auto childNode = item->getChild(); + + if (Type::MemoryOutput == childNode->getType()) { + auto memOutputBase = std::dynamic_pointer_cast(childNode); + OPENVINO_ASSERT(memInputBase, + "Unexpectedly wrong dynamic type of node: ", + node->getName(), + " of type: ", + node->getTypeStr()); + + if (memOutputBase->getId() != id) { + return false; // an Assign node from different Variable is attached + } + + if (MemoryOutput && MemoryOutput != childNode) { + //only one child MemoryOutput is expected + return false; + } + MemoryOutput = childNode; + } + } + return nullptr != MemoryOutput; + }; + + for (size_t i = 0; i < graphNodes.size(); i++) { + auto node = graphNodes[i]; + if (!isSuitableMemInput(node)) { + continue; + } + + CPU_GRAPH_OPTIMIZER_SCOPE(DropRedundantMemoryOutput_Node); + + auto memInputNode = std::dynamic_pointer_cast(node); + OPENVINO_ASSERT(memInputNode, "MemoryInput node ", node->getName(), " has unexpected dynamic type"); + + ov::optional inputShape; + ov::optional inputPrc; + + if (!node->getParentEdges().empty()) { + inputShape = ov::optional(node->getInputShapeAtPort(0)); + inputPrc = ov::optional(node->getOriginalInputPrecisionAtPort(0)); + } + + //search for the MemoryOutputNode + NodePtr memoryOutputNode; + for (auto&& edge : node->getChildEdgesAtPort(0)) { + auto child = edge->getChild(); + if (Type::MemoryOutput == child->getType()) { + memoryOutputNode = child; + break; + } + } + OPENVINO_ASSERT(memoryOutputNode, "Corresponding MemoryOutput has not been found"); + + graph.RemoveEdge(memoryOutputNode->getParentEdgeAt(0)); + // there are no output edges from MemoryOutput nodes + + // now replace the existing MemoryInput with a special type that works without the corresponding MemoryOutput + auto memInputSingle = std::make_shared(memInputNode->getId(), + memInputNode->getName(), + memInputNode->getTypeStr(), + memInputNode->getOutputShapeAtPort(0), + memInputNode->getOriginalOutputPrecisionAtPort(0), + graph.getGraphContext(), + inputShape, + inputPrc); + + graph.AddNode(memInputSingle); + + if (!memInputNode->getParentEdges().empty()) { + auto parentEdge = memInputNode->getParentEdgeAt(0); + auto parent = parentEdge->getParent(); + const auto inputNum = parentEdge->getInputNum(); + graph.RemoveEdge(parentEdge); + graph.CreateEdge(parent, memInputSingle, inputNum, 0); + } + + for (auto&& edge : memInputNode->getChildEdgesAtPort(0)) { + auto child = edge->getChild(); + const auto outputNum = edge->getOutputNum(); + graph.RemoveEdge(edge); + graph.CreateEdge(memInputSingle, child, 0, outputNum); + } + } +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/graph_optimizer.h b/src/plugins/intel_cpu/src/graph_optimizer.h index 0a85a253ba8d66..886296a7c0053b 100644 --- 
a/src/plugins/intel_cpu/src/graph_optimizer.h +++ b/src/plugins/intel_cpu/src/graph_optimizer.h @@ -52,6 +52,7 @@ class GraphOptimizer { void RemoveMemoryInputConvert(Graph &graph); void RemoveConvertMemoryOutput(Graph &graph); void MatchSdpaKvCache(Graph &graph); + void DropRedundantMemoryOutput(Graph &graph); bool canBeInplaced(const NodePtr& parentNode, const NodePtr& childNode); // Method checks that after the sequential execution of Transpose and Reorder nodes, diff --git a/src/plugins/intel_cpu/src/memory_state.cpp b/src/plugins/intel_cpu/src/memory_state.cpp index f5f76fe42feb48..bf77917497de77 100644 --- a/src/plugins/intel_cpu/src/memory_state.cpp +++ b/src/plugins/intel_cpu/src/memory_state.cpp @@ -156,6 +156,49 @@ MemoryPtr VariableStateDoubleBuffer::internal_state_mem() const { return prime_mem(); } +VariableStateSingleBuffer::VariableStateSingleBuffer(const std::string& name, + const MemoryPtr& external_buffer, + const MemoryDescPtr& external_desc) + : VariableStateBase(name, external_desc) { + OPENVINO_ASSERT(external_buffer); + m_internal_mem = external_buffer; + m_internal_desc = m_internal_mem->getDescPtr(); + auto&& shape = m_internal_desc->getShape(); + + if (shape.isStatic()) { + m_internal_mem->nullify(); + } else { + // in the case of the original desc has dynamic shape we create an empty tensor + auto new_desc = to_static(m_internal_desc); + m_internal_mem->redefineDesc(new_desc); + } +} +MemoryPtr VariableStateSingleBuffer::input_mem() { + return m_internal_mem; +} +MemoryPtr VariableStateSingleBuffer::output_mem() { + return m_internal_mem; +} +MemoryDescPtr VariableStateSingleBuffer::internal_desc() const { + return m_internal_desc; +} + +void VariableStateSingleBuffer::reset_impl() { + auto new_desc = to_static(m_internal_desc); + if (m_internal_mem) { + m_internal_mem->redefineDesc(new_desc); + m_internal_mem->nullify(); + } +} + +MemoryPtr VariableStateSingleBuffer::internal_state_mem() const { + return m_internal_mem; +} + +void VariableStateSingleBuffer::commit_impl() { + // nothing to do +} + VariableStateKVcache::VariableStateKVcache( const std::string& name, const MemoryDescPtr& external_desc, diff --git a/src/plugins/intel_cpu/src/memory_state.h b/src/plugins/intel_cpu/src/memory_state.h index b4c52903d12f31..e7493f327e93fa 100644 --- a/src/plugins/intel_cpu/src/memory_state.h +++ b/src/plugins/intel_cpu/src/memory_state.h @@ -94,6 +94,27 @@ class VariableStateDoubleBuffer : public VariableStateBase { size_t buffer_num = 0; }; +class VariableStateSingleBuffer : public VariableStateBase { +public: + VariableStateSingleBuffer(const std::string& name, + const MemoryPtr& external_buffer, + const MemoryDescPtr& external_desc); + + MemoryPtr input_mem() override; + MemoryPtr output_mem() override; + MemoryDescPtr internal_desc() const override; + +private: + void reset_impl() override; + void commit_impl() override; + + MemoryPtr internal_state_mem() const override; + +private: + MemoryDescPtr m_internal_desc; //mem desc required by the graph internal tensor + MemoryPtr m_internal_mem; +}; + class VariableStateKVcache : public VariableStateBase { public: VariableStateKVcache(const std::string& name, diff --git a/src/plugins/intel_cpu/src/nodes/memory.cpp b/src/plugins/intel_cpu/src/nodes/memory.cpp index 756fbc5b578f61..565597bdcc2a9e 100644 --- a/src/plugins/intel_cpu/src/nodes/memory.cpp +++ b/src/plugins/intel_cpu/src/nodes/memory.cpp @@ -377,7 +377,8 @@ bool MemoryInputBase::isSupportedOperation(const std::shared_ptr } MemoryInputBase::MemoryInputBase(const 
std::shared_ptr& op, const GraphContext::CPtr ctx) - : Input(op, ctx), MemoryStateNode(op) { + : Input(op, ctx), + MemoryStateNode(op) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); @@ -385,6 +386,7 @@ MemoryInputBase::MemoryInputBase(const std::shared_ptr& op, const Grap if (created()) { context->getMemoryStatesRegister()->registerInput(this); } + executeHook = &MemoryInputBase::assignState; } MemoryInputBase::MemoryInputBase(const std::string id, @@ -394,8 +396,10 @@ MemoryInputBase::MemoryInputBase(const std::string id, const ov::element::Type& output_prc, const GraphContext::CPtr context, const ov::optional& input_shape, - const ov::optional& input_prc) : - Input(output_shape, output_prc, name, type, context), MemoryStateNode(id) { + const ov::optional& input_prc, + MemoryInputBase::mode mode) + : Input(output_shape, output_prc, name, type, context), + MemoryStateNode(id) { outputShapes.emplace_back(output_shape); addOriginalOutputPrecision(output_prc); if (input_shape) { @@ -411,6 +415,17 @@ MemoryInputBase::MemoryInputBase(const std::string id, if (created()) { context->getMemoryStatesRegister()->registerInput(this); } + + // this important to prevent identifying it as a const when it's on a const path + constant = ConstantType::StrictNoConst; + + if (mode::read_value_assign == mode) { + executeHook = &MemoryInputBase::assignState; + } else if (mode::single_read_value == mode) { + executeHook = &MemoryInputBase::bypassAssignState; + } else { + THROW_CPU_NODE_ERR("Unexpected MemoryInput mode"); + } } MemoryInputBase::~MemoryInputBase() { @@ -513,15 +528,26 @@ void MemoryInputBase::assignState(MemStatePtr newState) { } void MemoryInputBase::execute(dnnl::stream strm) { - getOutputNode().assignState(getAssignedState()); + assert(executeHook && "executeHook is not initialized!"); + (this->*executeHook)(); runStatic(strm); } void MemoryInputBase::executeDynamicImpl(dnnl::stream strm) { - getOutputNode().assignState(getAssignedState()); + assert(executeHook && "executeHook is not initialized!"); + (this->*executeHook)(); runDynamic(strm); } +void MemoryInputBase::assignState() { + getOutputNode().assignState(getAssignedState()); +} + +void MemoryInputBase::bypassAssignState() { + // nothing to do + return; +} + bool MemoryInput::needInitGraphProcessing() const { return !getParentEdges().empty() && getAssignedState()->is_reset_state(); } @@ -828,6 +854,89 @@ void MemoryInputSDPA::resolveInPlaceEdges(Edge::LOOK look) { } } +MemoryInputSingle::MemoryInputSingle(const std::string id, + const std::string& name, + const std::string& type, + const Shape& output_shape, + const ov::element::Type& output_prc, + const GraphContext::CPtr context, + const ov::optional& input_shape, + const ov::optional& input_prc) + : MemoryInput(id, + name, + type, + output_shape, + output_prc, + context, + input_shape, + input_prc, + MemoryInputBase::mode::single_read_value) {} + +MemStatePtr MemoryInputSingle::makeState() const { + // assume ov::Tensor is always dense + auto original_desc = + std::make_shared(getOriginalOutputPrecisionAtPort(0), outputShapes.at(0)); + + auto mem_desc = getBaseMemDescAtOutputPort(0); + const auto& eng = getEngine(); + + auto state_name = getId(); + + // Remove suffix with pair ID. Internal information. 
+ auto suffix_idx = state_name.find("/id="); + if (suffix_idx != std::string::npos) { + state_name = state_name.substr(0, suffix_idx); + } + + return std::make_shared(state_name, + std::make_shared(eng, mem_desc), + original_desc); +} + +void MemoryInputSingle::runStatic(dnnl::stream strm) { + MemoryInput::runStatic(strm); + if (needInitGraphProcessing()) { + // since there is no corresponding MemoryOutput node, we need to update the state here + auto result = getDstMemoryAtPort(0); // only one output port + auto stateMem = getAssignedState()->output_mem(); + CPU_NODE_ASSERT(stateMem, " state memory has nullptr"); + if (result->getData() != stateMem->getData()) { + stateMem->load(*result); + } + } + getAssignedState()->commit(); // since we don't use MemoryOutput, commit must be called to change the reset state +} + +void MemoryInputSingle::runDynamic(dnnl::stream strm) { + MemoryInput::runDynamic(strm); + if (needInitGraphProcessing()) { + // since there is no corresponding MemoryOutput node, we need to update the state here + auto result = getDstMemoryAtPort(0); // only one output port + auto state = getAssignedState(); + auto stateMem = state->output_mem(); + CPU_NODE_ASSERT(stateMem, " state memory has nullptr"); + + const auto& newShape = result->getShape(); + const auto& stateShape = stateMem->getShape(); + + if (stateShape.isDynamic() || stateShape.getStaticDims() != newShape.getStaticDims()) { + auto extMemDesc = state->internal_desc(); + auto newExternDesc = extMemDesc->cloneWithNewDims(newShape.getStaticDims()); + stateMem->redefineDesc(newExternDesc); + } + + if (result->getData() != stateMem->getData()) { + stateMem->load(*result); + } + } + getAssignedState()->commit(); // since we don't use MemoryOutput, commit must be called to change the reset state +} + +bool MemoryInputSingle::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { + return MemoryInput::isSupportedOperation(op, errorMessage); +} + } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/memory.hpp b/src/plugins/intel_cpu/src/nodes/memory.hpp index c158d738a36148..f503a8d58386a5 100644 --- a/src/plugins/intel_cpu/src/nodes/memory.hpp +++ b/src/plugins/intel_cpu/src/nodes/memory.hpp @@ -120,16 +120,14 @@ class MemoryOutputStub : public MemoryOutputBase { }; class MemoryInputBase : public Input, public MemoryStateNode { +public: + enum class mode { + read_value_assign, + single_read_value + }; + public: MemoryInputBase(const std::shared_ptr& op, const GraphContext::CPtr context); - MemoryInputBase(const std::string id, - const std::string& name, - const std::string& type, - const Shape& output_shape, - const ov::element::Type& output_prc, - const GraphContext::CPtr context, - const ov::optional& input_shape, - const ov::optional& input_prc); ~MemoryInputBase() override; @@ -152,6 +150,17 @@ class MemoryInputBase : public Input, public MemoryStateNode { MemoryOutputBase& getOutputNode(); void assignState(MemStatePtr newState) override final; // NOLINT +protected: + MemoryInputBase(const std::string id, + const std::string& name, + const std::string& type, + const Shape& output_shape, + const ov::element::Type& output_prc, + const GraphContext::CPtr context, + const ov::optional& input_shape, + const ov::optional& input_prc, + mode mode = mode::read_value_assign); + protected: virtual void runStatic(dnnl::stream strm) = 0; virtual void runDynamic(dnnl::stream strm) = 0; @@ -160,12 +169,20 @@ class MemoryInputBase : public Input, 
public MemoryStateNode { return state; } +private: + using executeHookPtr = void (MemoryInputBase::*)(void); + +private: + void assignState(); + void bypassAssignState(); + private: /** * @brief keeps reference to output sibling node */ MemoryOutputBase* outputNode = nullptr; MemStatePtr state = nullptr; + executeHookPtr executeHook; }; class MemoryInput : public MemoryInputBase { @@ -179,16 +196,38 @@ class MemoryInput : public MemoryInputBase { MemStatePtr makeState() const override; -private: +protected: + bool needInitGraphProcessing() const; void runStatic(dnnl::stream strm) override; void runDynamic(dnnl::stream strm) override; + +private: void assignStateHook() override {/*pass*/} - bool needInitGraphProcessing() const; private: ProxyMemoryBlockPtr memBlock = nullptr; }; +class MemoryInputSingle : public MemoryInput { +public: + MemoryInputSingle(const std::string id, + const std::string& name, + const std::string& type, + const Shape& output_shape, + const ov::element::Type& output_prc, + const GraphContext::CPtr context, + const ov::optional& input_shape, + const ov::optional& input_prc); + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + + MemStatePtr makeState() const override; + +private: + void runStatic(dnnl::stream strm) override; + void runDynamic(dnnl::stream strm) override; +}; + class MemoryInputSDPA : public MemoryInputBase { public: MemoryInputSDPA(const std::string id, diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/read_value_assign.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/read_value_assign.cpp new file mode 100644 index 00000000000000..c6e976b321f703 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/read_value_assign.cpp @@ -0,0 +1,182 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/node_builders/eltwise.hpp" +#include "common_test_utils/node_builders/constant.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "utils/cpu_test_utils.hpp" + +/* The main purpose of this test set is to test ReadValue->Assign direct connection optimizations, i.e. + dropping the MemoryOutput node. 
+*/ + +namespace ov { +namespace test { + +using namespace CPUTestUtils; + +// ┌────────┐ ┌────────┐ +// │ Param2 │ │ Param1 │ |---------------| +// └───┬────┘ └────┬───┘ | | +// │ |-----------|┌─────────┐ | +// │ | │ │Constant │ | +// │ | │ └───┬─────┘ | +// │ | ┌───┴────┐ │ | +// │ | │Multiply├─────┘ | +// │ | └───┬────┘ | <- Optional Init Subgraph +// │ | │ ┌─────────┐ | +// │ | │ │Constant │ | +// │ | │ └───┬─────┘ | +// │ | ┌───┴────┐ │ | +// │ | │ Add ├─────┘ | +// │ | └───┬────┘ | +// │ | │ | +// │ |---------------------------| +// │ │ +// │ │ +// │ │ +// │ ┌─────┴─────┐ +// │ │ ReadValue │ +// │ └─────┬─────┘ +// │ │ \ +// │ ┌──┴──┐ \ +// └────────┤ Add │ \┌────────┐ +// └──┬──┘ │ Assign │ +// │ └────────┘ +// │ +// ┌────┴────┐ +// │ Result1 │ +// └─────────┘ + +typedef std::tuple< + bool, // include init subgraph + CPUSpecificParams +> ReadValueAssignTestParams; + +class ReadValueAssignTest : public testing::WithParamInterface, + virtual public SubgraphBaseTest, + public CPUTestsBase { +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj) { + bool use_init_subgraph = false; + CPUSpecificParams cpu_params; + std::tie(use_init_subgraph, cpu_params) = obj.param; + + std::ostringstream results; + results << "Init_Graph=" << (use_init_subgraph ? "True" : "False") << "_"; + results << CPUTestsBase::getTestCaseName(cpu_params); + return results.str(); + } + + void SetUp() override { + targetDevice = ov::test::utils::DEVICE_CPU; + + const ov::Shape tensor_shape = {3, 32, 7, 7}; + + InputShape param1_shape = {{-1, 32, -1, -1}, {tensor_shape}}; + InputShape param2_shape = {{-1, -1, -1, -1}, {tensor_shape}}; + + bool use_init_subgraph = false; + CPUSpecificParams cpu_params; + std::tie(use_init_subgraph, cpu_params) = this->GetParam(); + std::tie(inFmts, outFmts, priority, selectedType) = cpu_params; + selectedType = makeSelectedTypeStr(selectedType, net_prc); + + init_input_shapes({param1_shape, param2_shape}); + + ov::ParameterVector params; + params.push_back(std::make_shared(net_prc, inputDynamicShapes[0])); + params.push_back(std::make_shared(net_prc, inputDynamicShapes[1])); + std::shared_ptr last_node = params.front(); + + if (use_init_subgraph) { + //build init subgraph + auto const1 = utils::make_constant(net_prc, tensor_shape); + auto const2 = utils::make_constant(net_prc, tensor_shape); + auto multiply = utils::make_eltwise(last_node, const1, utils::EltwiseTypes::MULTIPLY); + auto add = utils::make_eltwise(multiply, const2, utils::EltwiseTypes::ADD); + last_node = add; + } + + const std::string variable_name("variable0"); + auto variable = std::make_shared( + ov::op::util::VariableInfo{inputDynamicShapes[0], net_prc, variable_name}); + + auto read = std::make_shared(last_node, variable); + auto assign = std::make_shared(read, variable); + auto add = utils::make_eltwise(params[1], read, utils::EltwiseTypes::ADD); + + add->get_rt_info() = getCPUInfo(); + auto res = std::make_shared(add); + + function = + std::make_shared(ov::ResultVector({res}), ov::SinkVector({assign}), params, "ReadValueAssign"); + } + +protected: + const ov::Shape tensor_shape = {3, 32, 7, 7}; + const ElementType net_prc = element::f32; +}; + +TEST_P(ReadValueAssignTest, CompareWithRefs) { + compile_model(); + inferRequest = compiledModel.create_infer_request(); + ASSERT_TRUE(inferRequest); + + // use the Template plugin as a reference + + auto compiledReferenceModel = core->compile_model(function, ov::test::utils::DEVICE_TEMPLATE); + auto inferRequestRef = 
compiledReferenceModel.create_infer_request(); + ASSERT_TRUE(inferRequestRef); + + generate_inputs(targetStaticShapes.front()); + for (const auto& input : inputs) { + inferRequest.set_tensor(input.first, input.second); + inferRequestRef.set_tensor(input.first, input.second); + } + + constexpr int infer_count = 3lu; + + auto&& states = inferRequest.query_state(); + auto&& refStates = inferRequestRef.query_state(); + + for (int i = 0; i < infer_count; ++i) { + // set states + + if (i & 0x1) { + //reset every odd iteration + states.front().reset(); + refStates.front().reset(); + } else { + // generate and set state tensors every even iteration + using ov::test::utils::InputGenerateData; + + auto tensor = + ov::test::utils::create_and_fill_tensor(net_prc, tensor_shape, InputGenerateData{0, 10, 1, i}); + states.front().set_state(tensor); + refStates.front().set_state(tensor); + } + + inferRequest.infer(); + inferRequestRef.infer(); + auto outputs = function->outputs(); + + auto result = inferRequest.get_tensor(outputs[0]); + + auto result_ref = inferRequestRef.get_tensor(outputs[0]); + + ov::test::utils::compare(result, result_ref, 1e-4, 1e-4); + } + CheckNumberOfNodesWithTypes(compiledModel, {"MemoryOutput", "Assign"}, 0); +} + +INSTANTIATE_TEST_SUITE_P( + smoke_ReadValue_Assign, + ReadValueAssignTest, + ::testing::Combine(::testing::Values(true, false), + ::testing::Values(CPUSpecificParams{{nchw, nchw}, {nchw}, {""}, "any_type"}, + CPUSpecificParams{{nhwc, nhwc}, {nhwc}, {""}, "any_type"})), + ReadValueAssignTest::getTestCaseName); +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/template/src/sync_infer_request.cpp b/src/plugins/template/src/sync_infer_request.cpp index 41881e9839adaf..418f8f1b717a99 100644 --- a/src/plugins/template/src/sync_infer_request.cpp +++ b/src/plugins/template/src/sync_infer_request.cpp @@ -59,7 +59,7 @@ void collect_variables(const std::shared_ptr& ov_model, ov::Tensor tensor = ov::Tensor(variable->get_info().data_type, shape); variable_context.set_variable_value(variable, std::make_shared(tensor)); auto state = - std::make_shared(variable->get_info().variable_id, + std::make_shared(variable->get_info(), variable_context.get_variable_value(variable)); list_of_variables.emplace_back(state); } diff --git a/src/plugins/template/src/variable_state.hpp b/src/plugins/template/src/variable_state.hpp index 8227a22c0fe93c..d6f0972f8675f3 100644 --- a/src/plugins/template/src/variable_state.hpp +++ b/src/plugins/template/src/variable_state.hpp @@ -4,6 +4,7 @@ #pragma once +#include "openvino/op/util/variable.hpp" #include "openvino/runtime/itensor.hpp" #include "openvino/runtime/ivariable_state.hpp" #include "openvino/runtime/so_ptr.hpp" @@ -13,16 +14,35 @@ namespace template_plugin { class VariableState : public ov::IVariableState { public: - VariableState(const std::string& name, const std::shared_ptr& variable_value) - : ov::IVariableState(name), + VariableState(const ov::op::util::VariableInfo& variable_info, + const std::shared_ptr& variable_value) + : ov::IVariableState(variable_info.variable_id), + m_data_shape(variable_info.data_shape), + m_data_type(variable_info.data_type), m_variable_value(variable_value) { m_state = get_tensor_impl(variable_value->get_state()); } void set_state(const ov::SoPtr& state) override { - OPENVINO_ASSERT(state->get_shape() == m_state->get_shape(), "Wrong tensor shape."); - OPENVINO_ASSERT(state->get_element_type() == m_state->get_element_type(), "Wrong tensor type."); - 
OPENVINO_ASSERT(state->get_byte_size() == m_state->get_byte_size(), "Blob size of tensors are not equal."); + OPENVINO_ASSERT(m_data_shape.compatible(state->get_shape()), + "Wrong tensor shape: ", + state->get_shape(), + " is not compatible with expected: ", + m_data_shape, + " in a variable with ID: ", + this->get_name()); + OPENVINO_ASSERT(m_data_type.compatible(state->get_element_type()), + "Wrong tensor type: ", + state->get_element_type(), + " expected: ", + m_data_type, + " in a variable with ID: ", + this->get_name()); + m_state->set_shape(state->get_shape()); + OPENVINO_ASSERT(state->get_byte_size() == m_state->get_byte_size(), + "Blob size of tensors are not equal. Variable with ID: ", + this->get_name()); std::memcpy(m_state->data(), state->data(), state->get_byte_size()); + m_variable_value->set_reset(false); } void reset() override { @@ -33,6 +53,8 @@ class VariableState : public ov::IVariableState { ~VariableState() override = default; private: + PartialShape m_data_shape; // original shape + element::Type m_data_type; // original type std::shared_ptr m_variable_value; }; } // namespace template_plugin diff --git a/src/tests/functional/plugin/shared/include/subgraph_tests/lora_pattern.hpp b/src/tests/functional/plugin/shared/include/subgraph_tests/lora_pattern.hpp index 8f9687b7b93b2a..42f70aa92474a3 100644 --- a/src/tests/functional/plugin/shared/include/subgraph_tests/lora_pattern.hpp +++ b/src/tests/functional/plugin/shared/include/subgraph_tests/lora_pattern.hpp @@ -19,5 +19,15 @@ TEST_P(LoraPatternConvolution, empty_tensors) { run_test_empty_tensors(); } +TEST_P(LoraPatternMatmul, random_tensors) { + targetStaticShapes = {{{{1, 20, K}}, {{N, K}}}}; + run_test_random_tensors(); +} + +TEST_P(LoraPatternConvolution, random_tensors) { + targetStaticShapes = {{{1, num_channels, 64, 64}}}; + run_test_random_tensors(); +} + } // namespace test } // namespace ov \ No newline at end of file diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/lora_pattern.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/lora_pattern.hpp index 16764d37dcf688..9b38ca059f1aba 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/lora_pattern.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/lora_pattern.hpp @@ -15,6 +15,7 @@ class LoraPatternBase : public SubgraphBaseTest { protected: void run_test_empty_tensors(); + void run_test_random_tensors(); protected: static constexpr auto t4_name = "lora/MatMul.B"; @@ -37,7 +38,7 @@ class LoraPatternConvolution : public LoraPatternBase, public testing::WithParam void SetUp() override; protected: - static constexpr size_t num_channels = 320ul; + static constexpr size_t num_channels = 64ul; }; } // namespace test diff --git a/src/tests/functional/shared_test_classes/src/subgraph/lora_pattern.cpp b/src/tests/functional/shared_test_classes/src/subgraph/lora_pattern.cpp index 6f74fd09b022a6..d40872f0756d6e 100644 --- a/src/tests/functional/shared_test_classes/src/subgraph/lora_pattern.cpp +++ b/src/tests/functional/shared_test_classes/src/subgraph/lora_pattern.cpp @@ -8,6 +8,7 @@ #include "common_test_utils/node_builders/convolution.hpp" #include "common_test_utils/ov_tensor_utils.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" +#include "template/properties.hpp" namespace ov { namespace test { @@ -37,6 +38,79 @@ void LoraPatternBase::run_test_empty_tensors() { ov::test::utils::compare(tx_result, tz_result, 
1e-4, 1e-4); } +void LoraPatternBase::run_test_random_tensors() { + compile_model(); + inferRequest = compiledModel.create_infer_request(); + ASSERT_TRUE(inferRequest); + + // use the Template plugin as a reference + + auto compiledReferenceModel = core->compile_model(function, + ov::test::utils::DEVICE_TEMPLATE, + {{ov::template_plugin::disable_transformations(true)}}); + auto inferRequestRef = compiledReferenceModel.create_infer_request(); + ASSERT_TRUE(inferRequestRef); + + generate_inputs(targetStaticShapes.front()); + for (const auto& input : inputs) { + inferRequest.set_tensor(input.first, input.second); + inferRequestRef.set_tensor(input.first, input.second); + } + + constexpr size_t lora_order = 25lu; + constexpr int infer_count = 6lu; + + std::unordered_map stateShapes; + + auto&& vars = function->get_variables(); + + for (auto&& var : vars) { + auto var_info = var->get_info(); + auto var_shape = var_info.data_shape; + + std::for_each(var_shape.begin(), var_shape.end(), [=](ov::PartialShape::value_type& x) { + if (x.is_dynamic()) { + x = lora_order; + } + }); + stateShapes.insert({var_info.variable_id, var_shape.to_shape()}); + } + + for (int i = 0; i < infer_count; ++i) { + // set states + + auto&& states = inferRequest.query_state(); + if (!(i & 0x1)) { // every even call + // generate and set state tensors + for (auto&& item : states) { + auto&& refStates = inferRequestRef.query_state(); + using ov::test::utils::InputGenerateData; + const auto& shape = stateShapes.at(item.get_name()); + auto tensor = ov::test::utils::create_and_fill_tensor(netType, shape, InputGenerateData{0, 10, 1, i}); + item.set_state(tensor); + auto itr = std::find_if(refStates.begin(), refStates.end(), [&](const ov::VariableState& state) { + return state.get_name() == item.get_name(); + }); + ASSERT_FALSE(itr == refStates.end()); + itr->set_state(tensor); + } + } + + inferRequest.infer(); + inferRequestRef.infer(); + auto outputs = function->outputs(); + + auto tx_result = inferRequest.get_tensor(outputs[0]); + auto tz_result = inferRequest.get_tensor(outputs[1]); + + auto tx_result_ref = inferRequestRef.get_tensor(outputs[0]); + auto tz_result_ref = inferRequestRef.get_tensor(outputs[1]); + + ov::test::utils::compare(tx_result, tx_result_ref, 1e-4, 1e-4); + ov::test::utils::compare(tz_result, tz_result_ref, 1e-4, 1e-4); + } +} + void LoraPatternMatmul::SetUp() { targetDevice = this->GetParam(); From af18322643b2df57345a8e312bcf8d70bb185dbf Mon Sep 17 00:00:00 2001 From: Sungeun Kim Date: Tue, 29 Oct 2024 20:50:34 +0900 Subject: [PATCH 079/233] [GPU] update onednn_3.7pc: 32ad05ab (#27264) --- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index e99a84e4914a81..32ad05ab263b78 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit e99a84e4914a818c64165a4b52785f606e405c2b +Subproject commit 32ad05ab263b782d4a4455ea85f5de009cf607c4 From 015de6d6de046a49a1c4f421eff0e3039d9a8a45 Mon Sep 17 00:00:00 2001 From: darksapien23151 <141660450+darksapien23151@users.noreply.github.com> Date: Tue, 29 Oct 2024 17:36:48 +0530 Subject: [PATCH 080/233] Update android_x64.yml (#27257) ### Details: - Enabling test building by setting {ENABLE_TEST=ON} ### Tickets: - 149906 --------- Co-authored-by: Ilya Lavrenov --- .github/workflows/android_x64.yml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/.github/workflows/android_x64.yml b/.github/workflows/android_x64.yml index a667a07da5bd3e..1cdb2023784979 100644 --- a/.github/workflows/android_x64.yml +++ b/.github/workflows/android_x64.yml @@ -135,6 +135,7 @@ jobs: -DCMAKE_C_COMPILER_LAUNCHER=${{ env.CMAKE_C_COMPILER_LAUNCHER }} \ -DENABLE_LTO=ON \ -DENABLE_PYTHON=OFF \ + -DENABLE_TESTS=ON \ -DOPENVINO_EXTRA_MODULES=${{ env.OPENVINO_GENAI_REPO }} \ -S ${OPENVINO_REPO} \ -B ${BUILD_DIR} From a7ccc5e0efcc55455e4f2988489a64d70e6be0f7 Mon Sep 17 00:00:00 2001 From: Attila Csok Date: Tue, 29 Oct 2024 14:07:58 +0200 Subject: [PATCH 081/233] [intel-npu] Bugfix for total allocable memory property (#27270) ### Details: - Bugfix in zero_device to return correct maximum allocable memory size in NPU_DEVICE_TOTAL_MEM_SIZE property. - For old drivers we return hardcoded 2GB value (compiler limitation) - For graph_ext 1.8 windows drivers we just convert KB to B - For graph_ext >1.9 drivers we return values from driver as is ### Tickets: - *EISW-143246* --- .../intel_npu/src/backend/src/zero_device.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/backend/src/zero_device.cpp b/src/plugins/intel_npu/src/backend/src/zero_device.cpp index ac60e4741947bd..439b5fbd59f4f9 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_device.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_device.cpp @@ -162,12 +162,22 @@ uint64_t ZeroDevice::getAllocMemSize() const { } uint64_t ZeroDevice::getTotalMemSize() const { +#define LEGACY_MAX_MEM_ALLOC_SIZE_BYTES (2147483648) // 2GB in base-2 + ze_graph_memory_query_t query{}; ze_result_t result = _graph_ddi_table_ext.pfnQueryContextMemory(_initStructs->getContext(), ZE_GRAPH_QUERY_MEMORY_DDR, &query); THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnQueryContextMemory", result, _graph_ddi_table_ext); - return query.total; + // For drivers with graph_extension < 1.9 we report fixed 2GB max allocation size (old drivers don't support more) + // For drivers with graph_extension > 1.9 we report the value they return + if (_initStructs->isExtensionSupported(std::string(ZE_GRAPH_EXT_NAME), ZE_MAKE_VERSION(1, 9))) { + // we are safe here, can return the value directly from driver + return query.total; + } + + // Default for older drivers: return 2GB + return LEGACY_MAX_MEM_ALLOC_SIZE_BYTES; } ov::device::PCIInfo ZeroDevice::getPciInfo() const { From fc5f897442bdf8a43e08301e938e8cec56b0b1f2 Mon Sep 17 00:00:00 2001 From: Maksim Kutakov Date: Tue, 29 Oct 2024 13:18:51 +0100 Subject: [PATCH 082/233] [CPU] Disable parallel UpdateShapes/PrepairParams processing if there are many sync nodes (#27280) ### Details: If there are too many sync nodes in a model, it's not beneficial to run UpdateShapes and PrepairParams stages in parallel as the parallel tasks spawning/synchronization overheads outweigh the performance gain of the parallel execution. As a quantitative characteristic that defines the boundary between parallel and sequential node processing strategy, we use the ratio between the total number of nodes and the number of synchronous nodes. 
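In short, the selection rule amounts to the following check (a condensed restatement of the `graph.cpp` hunk below, reusing its names `m_executableGraphNodes`, `m_executableSyncNodesInds` and `Status`; nothing here is new beyond the patch itself):

```cpp
// Sequential dynamic-shape processing pays off when sync points are dense
// (fewer than ~10 executable nodes per sync node) or when only one thread is available.
const auto exec2sync = m_executableGraphNodes.size() / m_executableSyncNodesInds.size();
if (exec2sync < 10 || parallel_get_max_threads() < 2) {
    status = Status::ReadyDynamicSeq;  // run UpdateShapes/PrepareParams sequentially
} else {
    status = Status::ReadyDynamic;     // keep the parallel processing pipeline
}
```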
### Tickets: - CVS-153035 - CVS-155112 --- src/plugins/intel_cpu/src/graph.cpp | 17 ++++++++++++++--- src/plugins/intel_cpu/src/graph.h | 18 ++++++++++++++---- src/plugins/intel_cpu/src/infer_request.cpp | 12 ++++++------ .../intel_cpu/src/nodes/tensoriterator.cpp | 2 +- 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index 45118763a3eaf9..f9bfa9334eae8f 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -371,9 +371,20 @@ void Graph::Activate(const std::vector& externalInputMemory, std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); - status = hasDynNodes ? (parallel_get_max_threads() > 1 ? Status::ReadyDynamic : Status::ReadyDynamicSeq) - : Status::ReadyStatic; - + if (hasDynNodes) { + status = Status::ReadyDynamic; + // Here we use the following heuristic: if the number of sync nodes is less than 10 times of the number of exec + // nodes, it does make sense to use Sequential dynamic shapes processing due to the high overheads on context + // switching when the dynamic shapes are being processed in parallel and there are a lot of sync points. Also + // this rule works for short graphs (usually subgraphs) when the amount of nodes is to low to process them in + // parallel. + const auto exec2sync = m_executableGraphNodes.size() / m_executableSyncNodesInds.size(); + if (exec2sync < 10 || parallel_get_max_threads() < 2) { + status = Status::ReadyDynamicSeq; + } + } else { + status = Status::ReadyStatic; + } CPU_DEBUG_CAP_ENABLE(serialize(*this)); } diff --git a/src/plugins/intel_cpu/src/graph.h b/src/plugins/intel_cpu/src/graph.h index b3634800fb2e05..d50ccc152c9186 100644 --- a/src/plugins/intel_cpu/src/graph.h +++ b/src/plugins/intel_cpu/src/graph.h @@ -49,8 +49,16 @@ class Graph { ~Graph(); - bool IsReady() { - return one_of(status, Status::ReadyStatic, Status::ReadyDynamic, Status::ReadyDynamicSeq); + bool IsStatic() const { + return Status::ReadyStatic == status; + } + + bool IsDynamic() const { + return one_of(status, Status::ReadyDynamic, Status::ReadyDynamicSeq); + } + + bool IsReady() const { + return IsStatic() || IsDynamic(); } const Config & getConfig() const { @@ -193,7 +201,6 @@ class Graph { return graphHasDynamicInput; } - Status getStatus() const {return status;} const std::unordered_map& getInternalStateNodes() const; /** @@ -210,6 +217,10 @@ class Graph { void Activate(const std::vector& externalInputMemory = {}, const std::vector& externalOutputMemory = {}); + const std::unordered_map& getOutputNodesMemBlocksMap() const { + return outputNodesMemBlocksMap; + } + protected: void ForgetGraphData() { status = Status::NotReady; @@ -273,7 +284,6 @@ class Graph { template void InferDynamic(SyncInferRequest* request, int numaId, UpdateStrategy&& update); - friend class intel_cpu::SyncInferRequest; friend std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph); private: diff --git a/src/plugins/intel_cpu/src/infer_request.cpp b/src/plugins/intel_cpu/src/infer_request.cpp index f255a46efe7d0a..f0b817dcda859c 100644 --- a/src/plugins/intel_cpu/src/infer_request.cpp +++ b/src/plugins/intel_cpu/src/infer_request.cpp @@ -140,7 +140,7 @@ void SyncInferRequest::infer() { throw_if_canceled(); // update output control blocks, if any, in order to refresh internal buffers - if (Graph::Status::ReadyDynamic == m_graph->getStatus()) { + if (m_graph->IsDynamic()) { for (auto&& item : 
m_outputControlBlocks) { item.second.update(); } @@ -178,7 +178,7 @@ void SyncInferRequest::change_default_ptr() { std::unordered_set inputPtrs; std::function& tensor)> changeInpPtr; - if (Graph::Status::ReadyDynamic == m_graph->getStatus()) { + if (m_graph->IsDynamic()) { changeInpPtr = [&inputPtrs](const EdgePtr &edge, ov::SoPtr& tensor) { change_edge_ptr(edge, tensor); inputPtrs.insert(tensor->data()); @@ -278,8 +278,8 @@ void SyncInferRequest::change_default_ptr() { change_edge_ptr(parentEdge, it.second); } - if (Graph::Status::ReadyDynamic == m_graph->getStatus()) { - const auto &outMemBlocksMap = m_graph->outputNodesMemBlocksMap; + if (m_graph->IsDynamic()) { + const auto &outMemBlocksMap = m_graph->getOutputNodesMemBlocksMap(); for (auto&& item : outMemBlocksMap) { const auto& name = item.first; @@ -476,7 +476,7 @@ void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyn ov::SoPtr tensor; if (type == ov::ISyncInferRequest::FoundPort::Type::INPUT) { - OPENVINO_ASSERT(m_graph->inputNodesMap.find(port_index) != m_graph->inputNodesMap.end(), + OPENVINO_ASSERT(m_graph->GetInputNodesMap().find(port_index) != m_graph->GetInputNodesMap().end(), "Tensor with index: ", port_index, " exists in CPU plugin graph, but absents in model inputs"); @@ -509,7 +509,7 @@ void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyn } if (type == ov::ISyncInferRequest::FoundPort::Type::OUTPUT) { - const auto& outMap = m_graph->outputNodesMap; + const auto& outMap = m_graph->GetOutputNodesMap(); auto output = outMap.find(port_index); OPENVINO_ASSERT(output != outMap.end(), "Tensor with index: ", diff --git a/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp b/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp index 9a3b9788b838d2..dcf2b0f8ffd5ee 100644 --- a/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp +++ b/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp @@ -938,7 +938,7 @@ int TensorIterator::getNumIteration(const std::vector& inputPortMap, co } bool TensorIterator::runAsDynamic() const { - return isDynamicNode() || Graph::Status::ReadyDynamic == sub_graph.getStatus(); + return isDynamicNode() || sub_graph.IsDynamic(); } bool TensorIterator::created() const { From cde0429991ef8746a7c73c36dd6afbd1bf9b2951 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 29 Oct 2024 20:21:02 +0800 Subject: [PATCH 083/233] [CPU] enable channel first format support for rank=3 Deconv to use amx fp16 kernel (#27085) ### Details: - *enable channel first format support for rank=3 Deconv to use amx fp16 kernel to benefit model performance on GNR, e.g. 
hifigan* - *fix acl_convert UNKNOWN DataLayout accuracy issues* ### Tickets: - *153089* --- src/plugins/intel_cpu/src/nodes/deconv.cpp | 6 ++++-- .../intel_cpu/src/nodes/executors/acl/acl_convert.cpp | 11 +++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/deconv.cpp b/src/plugins/intel_cpu/src/nodes/deconv.cpp index 8a7f95268b4f3a..cb340afc029304 100644 --- a/src/plugins/intel_cpu/src/nodes/deconv.cpp +++ b/src/plugins/intel_cpu/src/nodes/deconv.cpp @@ -426,8 +426,10 @@ std::vector Deconvolution::getAvailableFormatsForDims(const else if (dims.getRank() == 2) return {memory::format_tag::nc}; else if (dims.getRank() == 3) - return {memory::format_tag::tnc, memory::format_tag::ntc, - memory::format_tag::ncw, memory::format_tag::nCw8c, memory::format_tag::nCw16c }; + return {memory::format_tag::ncw, + memory::format_tag::nCw8c, + memory::format_tag::nCw16c, + memory::format_tag::nwc}; else if (dims.getRank() == 4) return {memory::format_tag::nchw, memory::format_tag::nChw8c, memory::format_tag::nChw16c, memory::format_tag::nhwc }; diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp index 440af52749bc9c..1bc0585930387f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp @@ -27,12 +27,11 @@ bool ACLConvertExecutor::init(const ConvertParams& convertParams, if (!isCopyOp && dstPrecision == DataType::S8) { dstPrecision = DataType::QASYMM8_SIGNED; } - auto srcDims = srcDesc->getShape().getStaticDims(); - auto dstDims = dstDesc->getShape().getStaticDims(); - auto srcDataLayout = getAclDataLayoutByMemoryDesc(srcDesc); - auto dstDataLayout = getAclDataLayoutByMemoryDesc(dstDesc); - auto srcTensorInfo = TensorInfo(shapeCast(collapse_dims_to_max_rank(srcDims)), 1, srcPrecision, srcDataLayout); - auto dstTensorInfo = TensorInfo(shapeCast(collapse_dims_to_max_rank(dstDims)), 1, dstPrecision, dstDataLayout); + // Use 1D TensorInfo, since UNKNOWN DataLayout may have accuracy issues + auto srcDims1D = convertParams.size; + auto dstDims1D = convertParams.size; + auto srcTensorInfo = TensorInfo(TensorShape(srcDims1D), 1, srcPrecision); + auto dstTensorInfo = TensorInfo(TensorShape(dstDims1D), 1, dstPrecision); if (isCopyOp) { Status s = NECopy::validate(&srcTensorInfo, &dstTensorInfo); if (!s) { From 583925c5de910fe4e0e2729b215e513424b84f06 Mon Sep 17 00:00:00 2001 From: M Date: Tue, 29 Oct 2024 05:55:36 -0700 Subject: [PATCH 084/233] [CPU][ARM] Fix ARM tests failing because of overflow (#27074) ### Details: - Fixes ARM test overflow for Multiple Query SDP. 
--- .../src/nodes/kernels/scaled_attn/mha_single_token.cpp | 2 +- .../subgraph_tests/src/common/concat_multiple_query_sdp.cpp | 2 +- .../functional/shared_tests_instances/skip_tests_config.cpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp index 6b6df3c3181ee0..25ddbb1b4246b1 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp @@ -1148,7 +1148,7 @@ void mha_single_token(const ov::intel_cpu::PlainTensor& query, past_v_scale_zp, head_sum); } else { - OPENVINO_THROW("Unsupported precision: ", query.get_precision()); + OPENVINO_THROW("Unsupported precision: ", present_key.get_precision()); } #else if (present_key.get_precision() == ov::element::u8) { diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp index d05e7840562191..d74ab99fb3d5ab 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp @@ -238,7 +238,7 @@ class ConcatMultiQuerySDPTest : public testing::WithParamInterfaceget_element_type() == element::f16) { ov::Tensor t{ov::element::f16, shape}; - strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); + strided_iota(static_cast(t.data()), t.get_size(), val, 0.0f); inputs.insert({param, t}); } else { ov::Tensor t{ov::element::bf16, shape}; diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index e7c006ab97427f..6edc4f062536d0 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -358,8 +358,7 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(smoke_VariableState/OVInferRequestVariableStateTest.*)"); // Issue: 141705 retVector.emplace_back(R"(.*smoke_arm_Deconv_2D_Planar_FP16/DeconvolutionLayerCPUTest.*INFERENCE_PRECISION_HINT=f16.*)"); - // Issue: 154882 - retVector.emplace_back(R"(.*ConcatMultiQuerySDPTest.*f16.*)"); + retVector.emplace_back(R"(.*ConcatMultiQuerySDPTest.*u8.*)"); #endif #if defined(OPENVINO_ARCH_ARM) @@ -539,6 +538,7 @@ std::vector disabledTestPatterns() { // Skip fp16 tests for paltforms that don't support fp16 precision retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)"); retVector.emplace_back(R"(.*Prc=f16.*)"); + retVector.emplace_back(R"(.*ConcatMultiQuerySDPTest.*f16.*HasShapeOf=1.*)"); } else { // Issue 117407 retVector.emplace_back( From 08c6672eda563aa737672487e605a3e55ff60143 Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Tue, 29 Oct 2024 14:02:38 +0100 Subject: [PATCH 085/233] [TESTS] Disable lerp test for torch.export on older versions (#27302) ### Details: - *Disable `lerp_` test for `torch.export` on older versions of `torch`* ### Tickets: - *CVS-156278* Signed-off-by: Maxim Vafin --- tests/layer_tests/pytorch_tests/test_lerp.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/layer_tests/pytorch_tests/test_lerp.py 
b/tests/layer_tests/pytorch_tests/test_lerp.py index 0f85fac8569c95..d689efb3c77252 100644 --- a/tests/layer_tests/pytorch_tests/test_lerp.py +++ b/tests/layer_tests/pytorch_tests/test_lerp.py @@ -4,6 +4,7 @@ import numpy as np import pytest import torch +from packaging import version from pytorch_layer_test_class import PytorchLayerTest, skip_if_export @@ -44,6 +45,9 @@ def forward2(self, lhs, rhs): @pytest.mark.precommit_fx_backend def test_lerp(self, ie_device, precision, ir_version, weight, input_shape_rhs, op_type): + if (op_type == "lerp_" and PytorchLayerTest.use_torch_export() and + version.parse(torch.__version__) < version.parse("2.5")): + pytest.skip("Not supported in PyTorch versions earlier than 2.5.") self.input_rhs = np.random.randn(*input_shape_rhs).astype(np.float32) if isinstance(weight, list): weight = torch.rand(weight) From 60e348b85c15f3e3a260555377b13aa6e8d5085c Mon Sep 17 00:00:00 2001 From: Maksim Kutakov Date: Tue, 29 Oct 2024 14:25:07 +0100 Subject: [PATCH 086/233] [CPU] Introduce LoRA macro operation (#27110) ### Details: To minimize the overheads on the LoRA subgraph operation processing (shape update, memory allocation, etc.) it does make sense to merge such subgraphs into a specific LoRA macro operation, which may exploit some LoRA properties to optimize performance. ### Tickets: - CVS-153035 - CVS-155112 --- src/plugins/intel_cpu/src/cpu_types.cpp | 4 +- src/plugins/intel_cpu/src/cpu_types.h | 1 + src/plugins/intel_cpu/src/nodes/composite.cpp | 23 +- src/plugins/intel_cpu/src/nodes/input.cpp | 1 + src/plugins/intel_cpu/src/nodes/input.h | 12 +- src/plugins/intel_cpu/src/nodes/lora.cpp | 110 ++++++++ src/plugins/intel_cpu/src/nodes/lora.h | 41 +++ src/plugins/intel_cpu/src/nodes/reference.cpp | 2 +- src/plugins/intel_cpu/src/nodes_factory.cpp | 2 + .../transformation_pipeline.cpp | 2 + .../src/common/lora_pattern.cpp | 266 ++++++++++++++++++ 11 files changed, 453 insertions(+), 11 deletions(-) create mode 100644 src/plugins/intel_cpu/src/nodes/lora.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/lora.h create mode 100644 src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/lora_pattern.cpp diff --git a/src/plugins/intel_cpu/src/cpu_types.cpp b/src/plugins/intel_cpu/src/cpu_types.cpp index 953f94cb3d5776..e20369c9cca215 100644 --- a/src/plugins/intel_cpu/src/cpu_types.cpp +++ b/src/plugins/intel_cpu/src/cpu_types.cpp @@ -256,7 +256,8 @@ static const TypeToNameMap& get_type_to_name_tbl() { {"LLMMLP", Type::LLMMLP}, {"QKVProjection", Type::QKVProjection}, {"RMS", Type::RMS}, - {"SearchSorted", Type::SearchSorted} + {"SearchSorted", Type::SearchSorted}, + {"LoraSubgraph", Type::LoRA} }; return type_to_name_tbl; } @@ -389,6 +390,7 @@ std::string NameFromType(const Type type) { CASE(QKVProjection); CASE(RMS); CASE(SearchSorted); + CASE(LoRA); CASE(Unknown); } #undef CASE diff --git a/src/plugins/intel_cpu/src/cpu_types.h b/src/plugins/intel_cpu/src/cpu_types.h index c0a2acc3329a9c..d6ac9947a8fb5d 100644 --- a/src/plugins/intel_cpu/src/cpu_types.h +++ b/src/plugins/intel_cpu/src/cpu_types.h @@ -134,6 +134,7 @@ enum class Type { QKVProjection, RMS, SearchSorted, + LoRA }; enum class Algorithm { diff --git a/src/plugins/intel_cpu/src/nodes/composite.cpp b/src/plugins/intel_cpu/src/nodes/composite.cpp index b38a56649bd60a..a1ceabd6942db1 100644 --- a/src/plugins/intel_cpu/src/nodes/composite.cpp +++ b/src/plugins/intel_cpu/src/nodes/composite.cpp @@ -15,11 +15,23 @@ namespace intel_cpu { namespace node { bool 
Composite::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - return ov::is_type(op); + try { + if (!ov::is_type(op)) { + errorMessage = "Unknown SubGraph operation : " + std::string(op->get_type_info().name) + " with name '" + + op->get_friendly_name() + "'"; + } + } catch (...) { + return false; + } + return true; } Composite::Composite(const std::shared_ptr& op, const GraphContext::CPtr& context) : Node(op, context, InternalDynShapeInferFactory()) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); + } const auto& subModel = ov::as_type_ptr(op); OPENVINO_ASSERT(subModel, "Attempt to create SubGraph node from an invalid op type: ", op); @@ -27,7 +39,7 @@ Composite::Composite(const std::shared_ptr& op, const GraphContext::CP } void Composite::selectOptimalPrimitiveDescriptor() { - // for the input configution, just always use the parent configuration + // for the input configuration, just always use the parent configuration std::vector inConfs; std::vector graphInputConfig; @@ -38,14 +50,14 @@ void Composite::selectOptimalPrimitiveDescriptor() { } std::vector graphOutputConfig; - for (size_t i = 0; i < getParentEdges().size(); i++) { + for (size_t i = 0; i < outputShapes.size(); i++) { graphOutputConfig.emplace_back(node::Input::OutputConfig{true, true}); } // configure the inner graph to get the information about output memory descriptors m_graph.Init(m_body, context, graphInputConfig, graphOutputConfig); - // for the output decriptors, use the configuration of the graph's output nodes + // for the output descriptors, use the configuration of the graph's output nodes auto outputDescriptors = m_graph.getOutputMemoryDescriptors(); std::vector outConfs; @@ -89,9 +101,6 @@ void Composite::execute(dnnl::stream) { void Composite::executeDynamicImpl(dnnl::stream strm) { execute(strm); - if (!inputShapesModified()) - return; - // since the shape inference is not performed for the composite node // a memory of the extra child edges, attached to the output ports // has to be updated after an inference of the inner graph finished diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index 4ee5707e0a9e76..1f650bd8c5de17 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -430,6 +430,7 @@ Input::Input(const std::shared_ptr& op, const GraphContext::CPtr context, OutputConfig config) : Input(op, context) { + extMemDesc = config.desc; m_useParentMemoryDescForOutput = config.useParentMemoryDescForOutput; m_isInPlace = config.inPlace; } diff --git a/src/plugins/intel_cpu/src/nodes/input.h b/src/plugins/intel_cpu/src/nodes/input.h index a954ce56665d61..4d7febb17ad4b7 100644 --- a/src/plugins/intel_cpu/src/nodes/input.h +++ b/src/plugins/intel_cpu/src/nodes/input.h @@ -19,9 +19,17 @@ class Input : public Node { }; struct OutputConfig { + OutputConfig() = default; + OutputConfig(bool useParentMemoryDesc_, bool inPlace_) + : useParentMemoryDescForOutput(useParentMemoryDesc_), + inPlace(inPlace_) {} + + OutputConfig(MemoryDescPtr desc_, bool inPlace_) : desc(std::move(desc_)), inPlace(inPlace_) {} + // @todo better to use memory desc with any layout and undefined precision - bool useParentMemoryDescForOutput; - bool inPlace; + MemoryDescPtr desc = nullptr; + bool useParentMemoryDescForOutput = false; + bool inPlace = false; }; Input(const std::shared_ptr& op, const GraphContext::CPtr context); diff 
--git a/src/plugins/intel_cpu/src/nodes/lora.cpp b/src/plugins/intel_cpu/src/nodes/lora.cpp new file mode 100644 index 00000000000000..2c69bc347b6139 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/lora.cpp @@ -0,0 +1,110 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "lora.h" + +#include "nodes/input.h" +#include "cpu_memory.h" +#include "ov_ops/lora_subgraph.hpp" +#include "utils/debug_capabilities.h" +#include "shape_inference/shape_inference_pass_through.hpp" + +namespace ov { +namespace intel_cpu { +namespace node { + +bool LoRA::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + if (!ov::is_type(op)) { + errorMessage = "Unknown LoRA operation : " + std::string(op->get_type_info().name) + " with name '" + + op->get_friendly_name() + "'"; + } + } catch (...) { + return false; + } + return true; +} + +LoRA::LoRA(const std::shared_ptr& op, const GraphContext::CPtr& context) + : Node(op, context, PassThroughShapeInferFactory()) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); + } + const auto& loraModel = ov::as_type_ptr(op); + OPENVINO_ASSERT(loraModel, + "Attempt to create LoRA node from an invalid op type: ", + op, + " with name ", + op->get_friendly_name()); + + m_body = loraModel->get_function(); +} + +void LoRA::selectOptimalPrimitiveDescriptor() { + // for the input configuration, just always use the parent configuration + std::vector inConfs; + std::vector graphInputConfig; + + for (size_t i = 0; i < getParentEdges().size(); i++) { + auto desc = getParentOutputMemDesc(getParentEdgeAt(i)); + inConfs.emplace_back(desc); + graphInputConfig.emplace_back(node::Input::InputConfig{desc, true}); + } + + std::vector graphOutputConfig; + // enforce the same memory descriptor on the output as on the input to allow inPlace memory + graphOutputConfig.emplace_back(node::Input::OutputConfig{inConfs.front().getMemDesc(), true}); + + // configure the inner graph to get the information about output memory descriptors + m_graph.Init(m_body, context, graphInputConfig, graphOutputConfig); + + // for the output descriptors, use the configuration of the graph's output nodes + auto outputDescriptors = m_graph.getOutputMemoryDescriptors(); + + const auto& desc = outputDescriptors.front(); + + // just a sanity check + CPU_NODE_ASSERT(desc->isCompatible(*(inConfs.front().getMemDesc())), "Unexpected input/output descriptor mismatch"); + + std::vector outConfs; + + outConfs.emplace_back(desc, BlockedMemoryDesc::FULL_MASK, 0); // use the memory from the first input inPlace + + const NodeConfig config(inConfs, outConfs); + + supportedPrimitiveDescriptors.clear(); + supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::undef); + + selectPrimitiveDescriptorByIndex(0); +} + +// @todo add ascii diagram for memory mapping / reuse +void LoRA::createPrimitive() { + CPU_NODE_ASSERT(getOriginalInputsNumber() == m_graph.GetInputNodesMap().size(), + "Number of node inputs must be equal the number of inner graph's inputs"); + + std::vector inputMemory; + for (size_t i = 0; i < getOriginalInputsNumber(); i++) { + inputMemory.emplace_back(getSrcMemoryAtPort(i)); + } + + CPU_NODE_ASSERT(getOriginalOutputsNumber() == m_graph.GetOutputNodesMap().size(), + "Number of node outputs must be equal the number of inner graph's outputs"); + + std::vector outputMemory{getDstMemoryAtPort(0)}; + m_graph.Activate(inputMemory, outputMemory); +} + 
+void LoRA::execute(dnnl::stream) { + m_graph.Infer(); +} + +void LoRA::executeDynamicImpl(dnnl::stream strm) { + execute(strm); +} + +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/lora.h b/src/plugins/intel_cpu/src/nodes/lora.h new file mode 100644 index 00000000000000..89a1bc15c2bf17 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/lora.h @@ -0,0 +1,41 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "graph.h" +#include "node.h" + +namespace ov { +namespace intel_cpu { +namespace node { + +class LoRA : public Node { +public: + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + + LoRA(const std::shared_ptr& op, const GraphContext::CPtr& context); + + bool created() const override { + return getType() == Type::LoRA; + } + + bool needPrepareParams() const override { + return false; + } + + void getSupportedDescriptors() override{}; + void selectOptimalPrimitiveDescriptor() override; + void createPrimitive() override; + void execute(dnnl::stream) override; + void executeDynamicImpl(dnnl::stream strm) override; + +private: + std::shared_ptr m_body; + Graph m_graph; +}; + +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/reference.cpp b/src/plugins/intel_cpu/src/nodes/reference.cpp index 43b8f041184a70..185815acd8c294 100644 --- a/src/plugins/intel_cpu/src/nodes/reference.cpp +++ b/src/plugins/intel_cpu/src/nodes/reference.cpp @@ -14,7 +14,7 @@ Reference::Reference(const std::shared_ptr& op, const GraphContext::CP Node(op, context, NgraphShapeInferFactory(op, FULL_PORT_MASK)), ovCoreNode(op), additionalErrorMessage(errorMessage) { if (!op->has_evaluate()) { OPENVINO_THROW_NOT_IMPLEMENTED( - "Cannot fallback on ngraph reference implementation (Ngraph::Node::evaluate() is not implemented"); + "Cannot fallback on ngraph reference implementation (Ngraph::Node::evaluate() is not implemented)"); } setType(Type::Reference); diff --git a/src/plugins/intel_cpu/src/nodes_factory.cpp b/src/plugins/intel_cpu/src/nodes_factory.cpp index 16cf1b974d8561..4a8e8205510fcf 100644 --- a/src/plugins/intel_cpu/src/nodes_factory.cpp +++ b/src/plugins/intel_cpu/src/nodes_factory.cpp @@ -108,6 +108,7 @@ #include "nodes/transpose.h" #include "nodes/unique.hpp" #include "nodes/causal_mask_preprocess.h" +#include "nodes/lora.h" namespace ov { namespace intel_cpu { @@ -221,6 +222,7 @@ Node::NodesFactory::NodesFactory() : Factory("NodesFactory") { INTEL_CPU_NODE(Composite, Type::SubModel); INTEL_CPU_NODE(ScaledDotProductAttention, Type::ScaledDotProductAttention); INTEL_CPU_NODE(SearchSorted, Type::SearchSorted); + INTEL_CPU_NODE(LoRA, Type::LoRA); #if defined(OPENVINO_ARCH_X86_64) INTEL_CPU_NODE(FakeQuantize, Type::FakeQuantize); INTEL_CPU_NODE(GridSample, Type::GridSample); diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index fcf38440b8aa4b..9dd1da2d471e5a 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -37,6 +37,7 @@ #include "transformations/common_optimizations/move_eltwise_up_data_movement.hpp" #include "transformations/common_optimizations/mark_rope_input_to_keep_in_mixed_precision.hpp" #include "transformations/common_optimizations/rms_fusion.hpp" +#include 
"transformations/common_optimizations/lora_subgraph_fusion.hpp" #include "transformations/control_flow/unroll_tensor_iterator.hpp" #include "transformations/fp16_compression/mark_decompression_convert_constant_folding.hpp" #include "transformations/fp16_compression/mark_floatpoint_range.hpp" @@ -693,6 +694,7 @@ void Transformations::PreLpt(const std::vector& defaultPrecis CPU_REGISTER_PASS_COMMON(manager, ov::pass::EnableDecompressionConvertConstantFolding); CPU_REGISTER_PASS_COMMON(manager, ov::pass::KeepConstAndDecompression); CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConstantFolding); + CPU_REGISTER_PASS_COMMON(manager, ov::pass::LoraSubgraphFusion); manager.run_passes(model); } diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/lora_pattern.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/lora_pattern.cpp new file mode 100644 index 00000000000000..4f4b05ef56750c --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/lora_pattern.cpp @@ -0,0 +1,266 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/node_builders/convolution.hpp" +#include "common_test_utils/node_builders/eltwise.hpp" +#include "common_test_utils/ov_tensor_utils.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "utils/cpu_test_utils.hpp" + +namespace ov { +namespace test { + +namespace { +constexpr auto t4_name = "lora/MatMul.B"; +constexpr auto t5_name = "lora/MatMul.alpha"; +constexpr auto t6_name = "lora/MatMul.A"; +constexpr auto netType = ov::element::f32; +} // namespace + +class LoraPatternBaseCPUTest : public SubgraphBaseTest { +protected: + void run_test_empty_tensors() { + compile_model(); + inferRequest = compiledModel.create_infer_request(); + ASSERT_TRUE(inferRequest); + generate_inputs(targetStaticShapes.front()); + for (const auto& input : inputs) { + inferRequest.set_tensor(input.first, input.second); + } + + inferRequest.infer(); + auto outputs = function->outputs(); + + auto tx_result = inferRequest.get_tensor(outputs[0]); + auto tz_result = inferRequest.get_tensor(outputs[1]); + ov::test::utils::compare(tx_result, tz_result, 1e-4, 1e-4); + } + + void run_test_random_tensors() { + compile_model(); + inferRequest = compiledModel.create_infer_request(); + ASSERT_TRUE(inferRequest); + + // use the Template plugin as a reference + + auto compiledReferenceModel = core->compile_model(function, ov::test::utils::DEVICE_TEMPLATE); + auto inferRequestRef = compiledReferenceModel.create_infer_request(); + ASSERT_TRUE(inferRequestRef); + + generate_inputs(targetStaticShapes.front()); + for (const auto& input : inputs) { + inferRequest.set_tensor(input.first, input.second); + inferRequestRef.set_tensor(input.first, input.second); + } + + constexpr size_t lora_order = 25lu; + constexpr int infer_count = 6lu; + + std::unordered_map stateShapes; + std::unordered_map initStateShapes; + + auto&& states = inferRequest.query_state(); + for (auto&& state : states) { + auto shape = state.get_state().get_shape(); + initStateShapes.insert({state.get_name(), shape}); + std::for_each(shape.begin(), shape.end(), [=](ov::Shape::value_type& x) { + if (0 == x) { + x = lora_order; + } + }); + stateShapes.insert({state.get_name(), std::move(shape)}); + } + + for (int i = 0; i < infer_count; ++i) { + // set states + + if (i == 3) { + // reset states on the 3rd iteration + for (auto&& item : states) { + item.reset(); + } + + for (auto&& item : 
inferRequestRef.query_state()) { + // Template plugin doesn't support reset state for dynamic shape states + item.get_state().set_shape(initStateShapes.at(item.get_name())); + } + } else if (!(i & 0x1)) { // every even call + // generate and set state tensors + for (auto&& item : states) { + auto&& refStates = inferRequestRef.query_state(); + using ov::test::utils::InputGenerateData; + const auto& shape = stateShapes.at(item.get_name()); + auto tensor = + ov::test::utils::create_and_fill_tensor(netType, shape, InputGenerateData{0, 10, 1, i}); + item.set_state(tensor); + auto itr = std::find_if(refStates.begin(), refStates.end(), [&](const ov::VariableState& state) { + return state.get_name() == item.get_name(); + }); + ASSERT_FALSE(itr == refStates.end()); + itr->set_state(tensor); + } + } + + inferRequest.infer(); + inferRequestRef.infer(); + auto outputs = function->outputs(); + + auto tx_result = inferRequest.get_tensor(outputs[0]); + auto tz_result = inferRequest.get_tensor(outputs[1]); + + auto tx_result_ref = inferRequestRef.get_tensor(outputs[0]); + auto tz_result_ref = inferRequestRef.get_tensor(outputs[1]); + + ov::test::utils::compare(tx_result, tx_result_ref, 1e-4, 1e-4); + ov::test::utils::compare(tz_result, tz_result_ref, 1e-4, 1e-4); + } + } +}; + +class LoraPatternMatmulCPUTest : public LoraPatternBaseCPUTest { +public: + void SetUp() override { + targetDevice = ov::test::utils::DEVICE_CPU; + + ov::PartialShape shape_x = {-1, -1, K}; + ov::PartialShape shape_w = {N, K}; + + auto param_y = std::make_shared(netType, shape_x); + auto param_w = std::make_shared(netType, shape_w); + + // "Main" matrix multiplication from the original transformer model + auto tx = std::make_shared(param_y, param_w, false, true); + + // LoRA parameters from states + auto variable_t4 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({N, -1}), netType, t4_name}); + auto t4 = std::make_shared(variable_t4); + auto t4_assign = std::make_shared(t4, variable_t4); + + auto variable_t5 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), netType, t5_name}); + auto t5 = std::make_shared(variable_t5); + auto t5_assign = std::make_shared(t5, variable_t5); + + auto variable_t6 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, K}), netType, t6_name}); + auto t6 = std::make_shared(variable_t6); + auto t6_assign = std::make_shared(t6, variable_t6); + + // Apply LoRA parameters to the current activations + auto t5810 = std::make_shared(param_y, t6, false, true); + auto t5811 = std::make_shared(t5810, t5); + auto t5812 = std::make_shared(t5811, t4, false, true); + + // Mix LoRA part into normally computed activations after the "main" MatMul + auto tz = std::make_shared(tx, t5812); + + auto result_x = std::make_shared(tx); + auto result_z = std::make_shared(tz); + + function = std::make_shared(ov::ResultVector({result_x, result_z}), + ov::SinkVector({t4_assign, t5_assign, t6_assign}), + ov::ParameterVector({param_y, param_w})); + } + +protected: + static constexpr size_t K = 563ul; // Weights matrix K dimension + static constexpr size_t N = 2048ul; // Weights matrix N dimension +}; + +class LoraPatternConvolutionCPUTest : public LoraPatternBaseCPUTest { +public: + void SetUp() override { + targetDevice = ov::test::utils::DEVICE_CPU; + ov::PartialShape shape_x = {-1, num_channels, -1, -1}; + + auto param_y = std::make_shared(netType, shape_x); + + // Original Convolution that is modified by LoRA adapter later + auto tx = 
ov::test::utils::make_convolution(param_y, + netType, + {1, 1}, + {1, 1}, + {0, 0}, + {0, 0}, + {1, 1}, + ov::op::PadType::EXPLICIT, + num_channels); + + // LoRA parameters from states + auto variable_t4 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({num_channels, -1}), netType, t4_name}); + auto t4 = std::make_shared(variable_t4); + auto t4_assign = std::make_shared(t4, variable_t4); + + auto variable_t5 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), netType, t5_name}); + auto t5 = std::make_shared(variable_t5); + auto t5_assign = std::make_shared(t5, variable_t5); + + auto variable_t6 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, num_channels}), netType, t6_name}); + auto t6 = std::make_shared(variable_t6); + auto t6_assign = std::make_shared(t6, variable_t6); + + // LoRA pattern with additional Transposes to move channel dimensions into positions where MatMul can be applied + auto t4940 = + std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{2, 3, 0, 1}); + + auto t4941 = std::make_shared(param_y, t4940); + auto t4942 = std::make_shared(t4941, t6, false, true); + auto t4943 = std::make_shared(t4942, t5); + auto t4944 = std::make_shared(t4943, t4, false, true); + + auto t4945 = + std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{2, 3, 0, 1}); + auto t4946 = std::make_shared(t4944, t4945); + + // Mix LoRA part into normally computed activations after the "main" MatMul + auto tz = std::make_shared(tx, t4946); + + auto result_x = std::make_shared(tx); + auto result_z = std::make_shared(tz); + + function = std::make_shared(ov::ResultVector({result_x, result_z}), + ov::SinkVector({t4_assign, t5_assign, t6_assign}), + ov::ParameterVector({param_y})); + } + +protected: + static constexpr size_t num_channels = 64ul; +}; + +TEST_F(LoraPatternMatmulCPUTest, smoke_LoRA_CPU_MatMul_empty) { + targetStaticShapes = {{{{1, 20, K}}, {{N, K}}}}; + run_test_empty_tensors(); + CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "LoRA", 1); + CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "MatMul", 1); +} + +TEST_F(LoraPatternConvolutionCPUTest, smoke_LoRA_CPU_Conv_empty) { + targetStaticShapes = {{{1, num_channels, 10, 15}}}; + run_test_empty_tensors(); + CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "LoRA", 1); + CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "MatMul", 0); +} + +TEST_F(LoraPatternMatmulCPUTest, smoke_LoRA_CPU_MatMul_random) { + GTEST_SKIP(); + targetStaticShapes = {{{{1, 20, K}}, {{N, K}}}}; + run_test_random_tensors(); + CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "LoRA", 1); + CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "MatMul", 1); +} + +TEST_F(LoraPatternConvolutionCPUTest, smoke_LoRA_CPU_Conv_random) { + GTEST_SKIP(); + targetStaticShapes = {{{1, num_channels, 10, 15}}}; + run_test_random_tensors(); + CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "LoRA", 1); + CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "MatMul", 0); +} + +} // namespace test +} // namespace ov \ No newline at end of file From 3ced1c18a365d3f7ed6232b457d66076c8536dda Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Tue, 29 Oct 2024 18:15:57 +0400 Subject: [PATCH 087/233] [TF FE] Update tensorflow-text version and fix jax version for MacOS x86 (#27295) **Details:** Update tensorflow-text version and fix jax version for MacOS x86 **Ticket:** 156277 --------- Signed-off-by: Kazantsev, Roman --- tests/requirements_tensorflow | 9 +++++---- 1 file 
changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/requirements_tensorflow b/tests/requirements_tensorflow index 3ae47d81ee2c50..954bba7944245f 100644 --- a/tests/requirements_tensorflow +++ b/tests/requirements_tensorflow @@ -9,13 +9,14 @@ pytest-html==4.1.1 transformers==4.45.1 # install exact keras version since tensorflow depends and has no upper bound for it keras==3.6.0 -tensorflow==2.18.0; python_version >= "3.12" and (platform_system != "Darwin" or platform_machine != "x86_64") -tensorflow==2.17.0; python_version < "3.12" and (platform_system != "Darwin" or platform_machine != "x86_64") +tensorflow==2.18.0; platform_system != "Darwin" or platform_machine != "x86_64" tensorflow==2.16.2; platform_system == "Darwin" and platform_machine == "x86_64" # install explicit version of wrapt to avoid "this __dict__ descriptor does not support '_DictWrapper' objects" error from TensorFlow 2.18 wrapt==1.15.0; python_version >= "3.12" # tensorflow-text is not available for both Windows and ARM platforms -tensorflow-text==2.17.0; python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64" +tensorflow-text==2.18.0; python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64" tensorflow-hub==0.16.1 -jax==0.4.35 +jax==0.4.35; platform_system != "Darwin" or platform_machine != "x86_64" +# tensorflow 2.16.2 depends on ml-dtypes~=0.3.1 and jax 0.4.35 depends on ml-dtypes>=0.4.0 +jax==0.4.33; platform_system == "Darwin" and platform_machine == "x86_64" defusedxml==0.7.1 From 669537a21ed0809d75725d2e770c8218b9b6d308 Mon Sep 17 00:00:00 2001 From: Surya Siddharth Pemmaraju Date: Tue, 29 Oct 2024 09:21:50 -0700 Subject: [PATCH 088/233] Disabled regional compilation (#27289) ### Details: - Torch 2.5.0 enabled regional compilation by default which degrades the performance of openvino backend ### Tickets: - (https://jira.devtools.intel.com/browse/CVS-156251) --------- Co-authored-by: Maxim Vafin --- .../frontend/pytorch/torchdynamo/backend.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py index 8294927a079c7e..9f2ef019769875 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py @@ -49,6 +49,9 @@ openvino_options = {} +# Disable regional compilation which was enabled by default from Torch 2.5.0 +if hasattr(torch._dynamo.config, "inline_inbuilt_nn_modules"): + torch._dynamo.config.inline_inbuilt_nn_modules=False @fake_tensor_unsupported def openvino(subgraph, example_inputs, options=None): @@ -59,15 +62,8 @@ def openvino(subgraph, example_inputs, options=None): return aot_autograd(fw_compiler=fx_openvino, bw_compiler=fx_openvino, decompositions=get_decompositions(decompositions))(subgraph, example_inputs) return fx_openvino(subgraph, example_inputs, options) - -try: - from packaging import version - - if version.parse(torch.__version__) < version.parse("2.5.0"): - register_backend(compiler_fn=openvino, name="openvino") -except ImportError: - logger.warning("The 'packaging' module is required but not installed") - +if "openvino" not in torch.compiler.list_backends(): + register_backend(compiler_fn=openvino, name="openvino") def fx_openvino(subgraph, example_inputs, options=None): try: From 9235543beb6f214cbb2857e99ddccb1eb2970451 Mon Sep 17 00:00:00 2001 From: 
Piotr Kowalczyk Date: Tue, 29 Oct 2024 18:40:00 +0100 Subject: [PATCH 089/233] [def/transformations]: Fix for failing roblox model at ConvertPrecision transformation (#27298) ### Details: - Fix for failing roblox model on ConvertPrecision transformation. ### Tickets: - CVS-156058 --------- Co-authored-by: Andrii Staikov Co-authored-by: Michal Lukaszewski --- .../src/transformations/convert_precision.cpp | 17 +++++++++++++- .../tests/utils/convert_precision.cpp | 23 +++++++++++++++++++ .../include/openvino/op/search_sorted.hpp | 19 ++++++++++++++- src/core/src/op/search_sorted.cpp | 17 ++++++++++---- src/core/tests/visitors/op/sorted_search.cpp | 2 +- 5 files changed, 71 insertions(+), 7 deletions(-) diff --git a/src/common/transformations/src/transformations/convert_precision.cpp b/src/common/transformations/src/transformations/convert_precision.cpp index c34e91f835301a..3ab2c694be40ef 100644 --- a/src/common/transformations/src/transformations/convert_precision.cpp +++ b/src/common/transformations/src/transformations/convert_precision.cpp @@ -62,6 +62,8 @@ bool fuse_type_to_ctc_greedy_decoder_seq_len(const std::shared_ptr& no bool fuse_type_to_random_uniform_v8(const std::shared_ptr& node, const precisions_map& precisions); +bool fuse_type_to_search_sorted_v15(const std::shared_ptr& node, const precisions_map& precisions); + bool extend_select_type(const std::shared_ptr& node, const precisions_map& precisions); bool extend_reverse_type(const std::shared_ptr& node, const precisions_map& precisions); @@ -468,7 +470,8 @@ bool ov::pass::ConvertPrecision::run_on_model(const std::shared_ptr& {ov::op::v13::Multinomial::get_type_info_static(), fuse_type_to_multinomial_v13}, {ov::op::v0::PriorBox::get_type_info_static(), fuse_type_to_prior_box}, {ov::op::v8::PriorBox::get_type_info_static(), fuse_type_to_prior_box}, - {ov::op::v0::PriorBoxClustered::get_type_info_static(), fuse_type_to_prior_box}}; + {ov::op::v0::PriorBoxClustered::get_type_info_static(), fuse_type_to_prior_box}, + {ov::op::v15::SearchSorted::get_type_info_static(), fuse_type_to_search_sorted_v15}}; for (const auto& it : m_additional_type_to_fuse_map) { type_to_fuse[it.first] = it.second; @@ -553,6 +556,18 @@ bool fuse_type_to_unique_v10(const std::shared_ptr& node, const precisions return res; } +bool fuse_type_to_search_sorted_v15(const std::shared_ptr& node, const precisions_map& precisions) { + bool res = false; + if (auto op = ov::as_type_ptr(node)) { + auto it = precisions.find(node->get_output_element_type(0)); + if (it != precisions.end()) { + op->set_output_type_attr(it->second); + res = true; + } + } + return res; +} + bool fuse_type_to_range_v4(const std::shared_ptr& node, const precisions_map& precisions) { auto it = precisions.find(node->get_output_element_type(0)); if (it == precisions.end()) diff --git a/src/common/transformations/tests/utils/convert_precision.cpp b/src/common/transformations/tests/utils/convert_precision.cpp index 9554cf09162d45..2aa4d4d2fac9e9 100644 --- a/src/common/transformations/tests/utils/convert_precision.cpp +++ b/src/common/transformations/tests/utils/convert_precision.cpp @@ -15,6 +15,7 @@ #include "openvino/core/model.hpp" #include "openvino/opsets/opset1.hpp" #include "openvino/opsets/opset10.hpp" +#include "openvino/opsets/opset15.hpp" #include "openvino/opsets/opset3.hpp" #include "openvino/opsets/opset4.hpp" #include "openvino/opsets/opset5.hpp" @@ -1036,6 +1037,28 @@ TEST(TransformationTests, ConvertPrecision_TypeRelaxed) { } } +TEST(TransformationTests, 
ConvertPrecision_SearchSorted) { + std::shared_ptr f(nullptr); + { + auto search_sorted_input = opset15::Constant::create(ov::element::i64, {5}, {1, 2, 3, 4, 5}); + auto indices = std::make_shared(ov::element::i64, Shape{3}); + auto search_sorted = std::make_shared(search_sorted_input, indices); + + auto less_input = opset15::Constant::create(ov::element::i64, {3}, {4, 5, 6}); + auto less = std::make_shared(search_sorted, less_input); + + f = std::make_shared(OutputVector{less}, ParameterVector{indices}); + + pass::Manager manager; + manager.register_pass(); + manager.register_pass(precisions_map{{element::i64, element::i32}}); + manager.run_passes(f); + } + OV_ASSERT_NO_THROW(check_rt_info(f)); + ASSERT_FALSE(has_type(f)); + ASSERT_TRUE(has_type(f)); +} + TEST(TransformationTests, ConvertPrecision_Variables) { std::shared_ptr f(nullptr); { diff --git a/src/core/include/openvino/op/search_sorted.hpp b/src/core/include/openvino/op/search_sorted.hpp index c370ba46b2f182..efb1f8491e0882 100644 --- a/src/core/include/openvino/op/search_sorted.hpp +++ b/src/core/include/openvino/op/search_sorted.hpp @@ -22,7 +22,15 @@ class OPENVINO_API SearchSorted : public Op { /// \param values Values to search indexs for. /// \param right_mode If False, return the first suitable index that is found for given value. If True, return /// the last such index. - SearchSorted(const Output& sorted_sequence, const Output& values, bool right_mode = false); + /// \param output_type The element type of the output tensor. This is purely an implementation flag, which + /// is used to convert the output type for CPU plugin in ConvertPrecision transformation (and potentially other + /// plugins as well). Setting this flag to element::i32 will result in the output tensor of i32 element type. + /// Setting this flag to element::i64 will generally not give any effect, since it will be converted to i32 anyway, + /// at least for CPU plugin. 
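    /// A minimal usage sketch of the new flag (illustrative only, not introduced by this commit; it assumes
    /// existing ov::Output<ov::Node> handles `sorted` and `values`), mirroring what fuse_type_to_search_sorted_v15
    /// above does when ConvertPrecision maps i64 to i32:
    ///
    ///   auto op = std::make_shared<ov::op::v15::SearchSorted>(sorted, values, /*right_mode=*/false);
    ///   op->set_output_type_attr(ov::element::i32);  // requested output element type
    ///   op->validate_and_infer_types();              // output 0 is now reported as element::i32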
+ SearchSorted(const Output& sorted_sequence, + const Output& values, + bool right_mode = false, + const element::Type& output_type = element::i64); void validate_and_infer_types() override; bool visit_attributes(AttributeVisitor& visitor) override; @@ -36,8 +44,17 @@ class OPENVINO_API SearchSorted : public Op { m_right_mode = right_mode; } + void set_output_type_attr(const element::Type& output_type) { + m_output_type = output_type; + } + + element::Type get_output_type_attr() const { + return m_output_type; + } + private: bool m_right_mode{}; + element::Type m_output_type = element::i64; }; } // namespace v15 } // namespace op diff --git a/src/core/src/op/search_sorted.cpp b/src/core/src/op/search_sorted.cpp index 8b9bb012b27106..65b5ff31861d8e 100644 --- a/src/core/src/op/search_sorted.cpp +++ b/src/core/src/op/search_sorted.cpp @@ -12,9 +12,13 @@ namespace ov { namespace op { namespace v15 { -SearchSorted::SearchSorted(const Output& sorted_sequence, const Output& values, bool right_mode) +SearchSorted::SearchSorted(const Output& sorted_sequence, + const Output& values, + bool right_mode, + const element::Type& output_type) : Op({sorted_sequence, values}), - m_right_mode(right_mode) { + m_right_mode(right_mode), + m_output_type(output_type) { constructor_validate_and_infer_types(); } @@ -23,20 +27,25 @@ void SearchSorted::validate_and_infer_types() { NODE_VALIDATION_CHECK(this, get_input_element_type(0).compatible(get_input_element_type(1)), "Sorted sequence and values must have the same element type."); + NODE_VALIDATION_CHECK(this, + m_output_type == element::i32 || m_output_type == element::i64, + "The element type of the last output can only be set to i32 or i64."); + const auto& output_shapes = shape_infer(this, ov::util::get_node_input_partial_shapes(*this)); - set_output_type(0, ov::element::i64, output_shapes[0]); + set_output_type(0, m_output_type, output_shapes[0]); } bool SearchSorted::visit_attributes(AttributeVisitor& visitor) { OV_OP_SCOPE(v15_SearchSorted_visit_attributes); visitor.on_attribute("right_mode", m_right_mode); + visitor.on_attribute("output_type", m_output_type); return true; } std::shared_ptr SearchSorted::clone_with_new_inputs(const OutputVector& new_args) const { OV_OP_SCOPE(v15_SearchSorted_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), new_args.at(1), get_right_mode()); + return std::make_shared(new_args.at(0), new_args.at(1), get_right_mode(), get_output_type_attr()); } } // namespace v15 } // namespace op diff --git a/src/core/tests/visitors/op/sorted_search.cpp b/src/core/tests/visitors/op/sorted_search.cpp index 860c9528d0e9aa..10d544527f3714 100644 --- a/src/core/tests/visitors/op/sorted_search.cpp +++ b/src/core/tests/visitors/op/sorted_search.cpp @@ -22,7 +22,7 @@ TEST(attributes, search_sorted_op) { auto g_op = ov::as_type_ptr(builder.create()); // attribute count - const auto expected_attr_count = 1; + const auto expected_attr_count = 2; EXPECT_EQ(builder.get_value_map_size(), expected_attr_count); // space_to_depth attributes From 99f3a91f2ad4347c660a3103a2aca0748323a58f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 29 Oct 2024 21:50:32 +0400 Subject: [PATCH 090/233] [Wheel] return back JAX FE (#27309) ### Details: - Regression after https://github.com/openvinotoolkit/openvino/pull/26610 ### Tickets: - CVS-156317 --- cmake/developer_package/frontends/frontends.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/cmake/developer_package/frontends/frontends.cmake b/cmake/developer_package/frontends/frontends.cmake index d2aa0410476245..0815297a11a5eb 100644 --- a/cmake/developer_package/frontends/frontends.cmake +++ b/cmake/developer_package/frontends/frontends.cmake @@ -304,6 +304,9 @@ macro(ov_add_frontend) # then we need to mark it to be CXX ABI free ov_abi_free_target(${TARGET_NAME}) + # public target name + set_target_properties(${TARGET_NAME} PROPERTIES EXPORT_NAME frontend::${OV_FRONTEND_NAME}) + # installation if(NOT OV_FRONTEND_SKIP_INSTALL) @@ -351,9 +354,6 @@ macro(ov_add_frontend) COMPONENT ${dev_component} ${OV_CPACK_COMP_CORE_DEV_EXCLUDE_ALL} FILES_MATCHING PATTERN "*.hpp") - - # public target name - set_target_properties(${TARGET_NAME} PROPERTIES EXPORT_NAME frontend::${OV_FRONTEND_NAME}) endif() else() # skipped frontend has to be installed in static libraries case From fc105b06be4fb554f0cb075534cb4f1baa95605d Mon Sep 17 00:00:00 2001 From: Katarzyna Mitrus Date: Tue, 29 Oct 2024 19:14:14 +0100 Subject: [PATCH 091/233] [STFT][Op][Python] Fix STFT Python API to pass attribute (#27311) ### Details: - Fix STFT Python API to pass "transpose_frames" attribute ### Tickets: - 147160 --- .../python/src/openvino/runtime/opset15/ops.py | 2 +- src/bindings/python/tests/test_graph/test_create_op.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/bindings/python/src/openvino/runtime/opset15/ops.py b/src/bindings/python/src/openvino/runtime/opset15/ops.py index 45b01a11bc3588..b3a131602af703 100644 --- a/src/bindings/python/src/openvino/runtime/opset15/ops.py +++ b/src/bindings/python/src/openvino/runtime/opset15/ops.py @@ -326,7 +326,7 @@ def stft( :return: The new node performing STFT operation. """ inputs = as_nodes(data, window, frame_size, frame_step, name=name) - return _get_node_factory_opset15().create("STFT", inputs) + return _get_node_factory_opset15().create("STFT", inputs, {"transpose_frames": transpose_frames}) @nameable_op diff --git a/src/bindings/python/tests/test_graph/test_create_op.py b/src/bindings/python/tests/test_graph/test_create_op.py index 87787e1e29bc32..98d0ec3583882c 100644 --- a/src/bindings/python/tests/test_graph/test_create_op.py +++ b/src/bindings/python/tests/test_graph/test_create_op.py @@ -2492,8 +2492,8 @@ def test_stft(): window = ov.parameter([7], name="window", dtype=np.float32) frame_size = ov.constant(np.array(11, dtype=np.int32)) frame_step = ov.constant(np.array(3, dtype=np.int32)) - transpose_frames = True + transpose_frames = False op = ov_opset15.stft(data, window, frame_size, frame_step, transpose_frames) assert op.get_type_name() == "STFT" @@ -2501,6 +2501,14 @@ def test_stft(): assert op.get_output_element_type(0) == Type.f32 assert op.get_output_shape(0) == [4, 13, 6, 2] + transpose_frames = True + op = ov_opset15.stft(data, window, frame_size, frame_step, transpose_frames) + + assert op.get_type_name() == "STFT" + assert op.get_output_size() == 1 + assert op.get_output_element_type(0) == Type.f32 + assert op.get_output_shape(0) == [4, 6, 13, 2] + def test_search_sorted(): sorted_sequence = ov.parameter([7, 256, 200, 200], name="sorted", dtype=np.float32) From bd6cf01d4fa0bf6ecc250b8293fba08a518c2805 Mon Sep 17 00:00:00 2001 From: Taylor Yeonbok Lee Date: Tue, 29 Oct 2024 11:16:15 -0700 Subject: [PATCH 092/233] [GPU] Fix sdpa opt accuracy (#27262) ### Details: - Fix accuracy for sdpa_opt ### Tickets: - 154583 --- .../src/kernel_selector/cl_kernels/sdpa_opt.cl | 14 +++++++------- 1 file changed, 7 
insertions(+), 7 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index 8e6be800f37cf0..c114332f393c0e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -190,7 +190,7 @@ KERNEL(sdpa_opt)( // SLM for query inputs __local INPUT0_TYPE query_local[HEAD_SIZE * TARGET_SEQ_LEN_BLOCK_SIZE]; // SLM for intermediate QK results - __local OUTPUT_TYPE qk_local[SEQ_LEN_PARTITION_SIZE * TARGET_SEQ_LEN_BLOCK_SIZE]; + __local SOFTMAX_ACCUMULATOR_TYPE qk_local[SEQ_LEN_PARTITION_SIZE * TARGET_SEQ_LEN_BLOCK_SIZE]; // SLM buffers for SoftMax calculation and qk_max/qk_sums results aggregation across all WG __local SOFTMAX_ACCUMULATOR_TYPE qk_max_vals[SUBGROUPS_PER_WG * TARGET_SEQ_LEN_BLOCK_SIZE]; __local SOFTMAX_ACCUMULATOR_TYPE qk_sum_vals[SUBGROUPS_PER_WG * TARGET_SEQ_LEN_BLOCK_SIZE]; @@ -259,7 +259,7 @@ KERNEL(sdpa_opt)( uint key_offset = INPUT1_GET_INDEX(b_idx, b1_idx, start_partition_idx + seq_len, 0); #endif - INPUT0_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {INPUT0_VAL_ZERO}; + SOFTMAX_ACCUMULATOR_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {SOFTMAX_ACCUMULATOR_VAL_ZERO}; #if IS_KV_COMPRESSED const uint comp_offset = GET_COMPRESSION_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len, 0); @@ -294,7 +294,7 @@ KERNEL(sdpa_opt)( } unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { - acc[seq_idx] = mad(query_vals_reg[i], key_vals[i], acc[seq_idx]); + acc[seq_idx] = mad(TO_SOFTMAX_ACCUMULATOR_TYPE(query_vals_reg[i]), TO_SOFTMAX_ACCUMULATOR_TYPE(key_vals[i]), acc[seq_idx]); } query_offset += HEAD_SIZE; @@ -326,7 +326,7 @@ KERNEL(sdpa_opt)( } unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { - acc[seq_idx] = mad(query_vals_reg[i], key_vals[i], acc[seq_idx]); + acc[seq_idx] = mad(TO_SOFTMAX_ACCUMULATOR_TYPE(query_vals_reg[i]), TO_SOFTMAX_ACCUMULATOR_TYPE(key_vals[i]), acc[seq_idx]); } query_offset += HEAD_SIZE; @@ -358,7 +358,7 @@ KERNEL(sdpa_opt)( } unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { - acc[seq_idx] = mad(query_vals_reg[i], key_vals[i], acc[seq_idx]); + acc[seq_idx] = mad(TO_SOFTMAX_ACCUMULATOR_TYPE(query_vals_reg[i]), TO_SOFTMAX_ACCUMULATOR_TYPE(key_vals[i]), acc[seq_idx]); } query_offset += HEAD_SIZE; @@ -389,7 +389,7 @@ KERNEL(sdpa_opt)( query_vals_reg = query_local[query_offset + i * SUBGROUP_SIZE]; } - acc[seq_idx] = mad(query_vals_reg, key_vals, acc[seq_idx]); + acc[seq_idx] = mad(TO_SOFTMAX_ACCUMULATOR_TYPE(query_vals_reg), TO_SOFTMAX_ACCUMULATOR_TYPE(key_vals), acc[seq_idx]); query_offset += HEAD_SIZE; } } @@ -405,7 +405,7 @@ KERNEL(sdpa_opt)( // Wait until all SG finishes their calculations and apply scale and attention mask to the results barrier(CLK_LOCAL_MEM_FENCE); - INPUT0_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; + SOFTMAX_ACCUMULATOR_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; const uint seq_idx_end = 1; for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { // Iterate over all values QK values in SLM and apply scale and attention mask From 4d29e2ecc959d82fb4c2fc8b1fd974f39e9f1501 Mon Sep 17 00:00:00 2001 From: Septimiu Neaga <111509085+SeptimiuIoachimNeagaIntel@users.noreply.github.com> Date: Tue, 29 Oct 2024 21:09:11 +0200 Subject: [PATCH 093/233] Enable - disabling ORT optimizations flag in protopipe app (#27182) ### Details: - Enable - disabling ORT optimizations flag - in protopipe app Co-authored-by: Maksim Doronin --- 
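A minimal sketch (not a literal excerpt) of how the new optional `opt_level` value flows into the ONNX Runtime network package built by Protopipe, mirroring the `getNetPackage` change below; the `cv::gapi::Generic` template argument and the exact YAML spelling (e.g. `opt_level: 2` on the inference operation) are assumptions on top of what the diff shows:

```cpp
cv::gapi::onnx::Params<cv::gapi::Generic> network{tag, params.model_path};
network.cfgSessionOptions(params.session_options);
if (params.opt_level.has_value()) {
    // Only set when the scenario YAML provides `opt_level`; otherwise ONNX Runtime
    // keeps its default graph optimization level.
    network.cfgOptLevel(params.opt_level.value());
}
cfgExecutionProvider(network, params.ep);  // existing EP handling, unchanged
return cv::gapi::networks(network);
```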
src/plugins/intel_npu/tools/protopipe/README.md | 1 + src/plugins/intel_npu/tools/protopipe/src/parser/config.cpp | 3 +++ .../intel_npu/tools/protopipe/src/scenario/inference.hpp | 1 + .../intel_npu/tools/protopipe/src/simulation/simulation.cpp | 3 +++ 4 files changed, 8 insertions(+) diff --git a/src/plugins/intel_npu/tools/protopipe/README.md b/src/plugins/intel_npu/tools/protopipe/README.md index afe6e8cffbc8c3..00849ad8bddc9a 100644 --- a/src/plugins/intel_npu/tools/protopipe/README.md +++ b/src/plugins/intel_npu/tools/protopipe/README.md @@ -97,6 +97,7 @@ The dependency graph in Protopipe is specified by: - `tag` - **Required**. The unique name of operation. - `type` - **Optional**. The operation type: _Infer_, _CPU_, _Compound_ (**Default**: _Infer_) - `repeat_count` - **Optional**. Runs operation over specified number of iterations. + - `opt_level` - **Optional**. Configures optimization level for ONNX Runtime. - `connections` - The list of connections between operations. Supported operation types diff --git a/src/plugins/intel_npu/tools/protopipe/src/parser/config.cpp b/src/plugins/intel_npu/tools/protopipe/src/parser/config.cpp index 34099d36a69fdb..c2a1bd6415d595 100644 --- a/src/plugins/intel_npu/tools/protopipe/src/parser/config.cpp +++ b/src/plugins/intel_npu/tools/protopipe/src/parser/config.cpp @@ -404,6 +404,9 @@ struct convert { if (node["ep"]) { params.ep = node["ep"].as(); } + if (node["opt_level"]) { + params.opt_level = node["opt_level"].as(); + } return true; } }; diff --git a/src/plugins/intel_npu/tools/protopipe/src/scenario/inference.hpp b/src/plugins/intel_npu/tools/protopipe/src/scenario/inference.hpp index c4fd85aa26721a..e4568c671438bc 100644 --- a/src/plugins/intel_npu/tools/protopipe/src/scenario/inference.hpp +++ b/src/plugins/intel_npu/tools/protopipe/src/scenario/inference.hpp @@ -104,6 +104,7 @@ struct ONNXRTParams { }; // NB: std::monostate stands for the default MLAS Execution provider using EP = std::variant; + std::optional opt_level; EP ep; }; diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/simulation.cpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/simulation.cpp index 52f57c2881a3b6..5b1743651b6ef1 100644 --- a/src/plugins/intel_npu/tools/protopipe/src/simulation/simulation.cpp +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/simulation.cpp @@ -79,6 +79,9 @@ static void cfgExecutionProvider(cv::gapi::onnx::Params& netw static cv::gapi::GNetPackage getNetPackage(const std::string& tag, const ONNXRTParams& params) { cv::gapi::onnx::Params network{tag, params.model_path}; network.cfgSessionOptions(params.session_options); + if (params.opt_level.has_value()) { + network.cfgOptLevel(params.opt_level.value()); + } cfgExecutionProvider(network, params.ep); return cv::gapi::networks(network); } From 874bf8a120359ff4fc0b4e934de0b3e251835425 Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Tue, 29 Oct 2024 23:23:16 +0000 Subject: [PATCH 094/233] [NPUW] Fix optimized out check (#27313) --- .../intel_npu/src/plugin/npuw/just_sync_infer_request.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index 4a9a3e06a0aa16..0070e6be2d2041 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -378,7 +378,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const 
std::shared_ptrm_compiled_submodels[i]; - if (!comp_model_desc.compiled_model || !comp_model_desc.replaced_by) { + if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) { continue; } const auto real_idx = comp_model_desc.replaced_by.value(); From 95a6f183d11286c4296c122c221f1faa4b3d9b06 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Wed, 30 Oct 2024 08:00:22 +0400 Subject: [PATCH 095/233] Revert "[GPU] Fixes for hybrid quantization (#27127)" (#27308) This reverts commit c21f572cc45193232d76aa21e821e92445b18725. Signed-off-by: Vladimir Paramuzov --- .../impls/onednn/fully_connected_onednn.hpp | 2 +- .../src/plugin/transformations_pipeline.cpp | 59 ++++--------------- 2 files changed, 14 insertions(+), 47 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp index 39423980521042..f4495fb5dd1645 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp @@ -50,7 +50,7 @@ struct FullyConnectedImplementationManager : public ImplementationManager { bool compressed_case = fc_prim->compressed_weights && one_of(in0_dt, {data_types::f16, data_types::f32, data_types::i8}) && one_of(wei_dt, {data_types::u8, data_types::i8, data_types::u4, data_types::i4}) && - one_of(out_dt, {data_types::f16, data_types::f32, data_types::u8, data_types::i8}); + one_of(out_dt, {data_types::f16, data_types::f32}); if (!f16f16_case && !f32f32_case && !u8s8_case && !compressed_case) return false; diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 770aa387da8a60..305e21a5000149 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -15,11 +15,8 @@ #include "intel_gpu/plugin/transformations_pipeline.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" #include "intel_gpu/runtime/itt.hpp" -#include "low_precision/add.hpp" #include "low_precision/convolution.hpp" #include "low_precision/convolution_backprop_data.hpp" -#include "low_precision/fold_convert.hpp" -#include "low_precision/fuse_convert.hpp" #include "low_precision/group_convolution.hpp" #include "low_precision/low_precision.hpp" #include "low_precision/mat_mul.hpp" @@ -28,9 +25,7 @@ #include "low_precision/pull_reshape_through_dequantization.hpp" #include "low_precision/pull_transpose_through_dequantization.hpp" #include "low_precision/recurrent_cell.hpp" -#include "low_precision/rt_info/bias_attribute.hpp" #include "low_precision/strided_slice.hpp" -#include "low_precision/transpose.hpp" #include "openvino/core/deprecated.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/core/validation_util.hpp" @@ -51,7 +46,6 @@ #include "openvino/op/reshape.hpp" #include "openvino/op/rnn_cell.hpp" #include "openvino/op/rnn_sequence.hpp" -#include "openvino/op/scaled_dot_product_attention.hpp" #include "openvino/op/squeeze.hpp" #include "openvino/op/unsqueeze.hpp" #include "openvino/op/util/sub_graph_base.hpp" @@ -319,9 +313,13 @@ void TransformationsPipeline::apply(std::shared_ptr func) { // it expects to have the same data type for weights and zero points (apply it only for u8 data type, since other compression // types are not supported by oneDNN) manager.register_pass(supported_woq_types, !device_info.supports_immad); - 
pass_config->set_callback([&](const std::shared_ptr node) { - return !is_decompression_multiply(node); - }); + + // Need to check if transformations work correctly for mixed models with both compression and quantization at the same time. + if (!is_model_quantized) { + pass_config->set_callback([&](const std::shared_ptr node) { + return !is_decompression_multiply(node); + }); + } const bool keep_precision_sensitive_in_fp32_1 = true; const bool convert_input_output_precision = false; @@ -690,6 +688,12 @@ void TransformationsPipeline::apply(std::shared_ptr func) { auto lptPassConfig = lptManager.get_pass_config(); // quantized LSTMSequence / GPUSequence are not supported yet. Avoid extra transformation lptPassConfig->disable(); + lptPassConfig->set_callback([](const_node_ptr& node) -> bool { + if (const auto mulitply = std::dynamic_pointer_cast(node)) { + return !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(mulitply); + } + return false; + }); lptPassConfig->set_callback([func, defaultPrecisions](const_node_ptr& node) -> bool { auto fillStaticChannel = [func](const ov::PartialShape& shape, size_t& channel) -> bool { const auto rank = shape.rank(); @@ -726,43 +730,6 @@ void TransformationsPipeline::apply(std::shared_ptr func) { || WeightableLayerTransformation::isAsymmetricOnWeights(node, defaultPrecisions); }); - lptPassConfig->set_callback([&](const_node_ptr& node) -> bool { - for (auto& user : node->get_users()) { - if (ov::is_type(user)) - return true; - } - - return false; - }); - - lptPassConfig->set_callback([](const_node_ptr& node) -> bool { - return ov::is_type(node) && !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(node); - }); - - lptPassConfig->set_callback([](const_node_ptr& node) -> bool { - return ov::marked_as_bias(node); - }); - lptPassConfig->set_callback([](const_node_ptr& node) -> bool { - const auto& consumers = node->get_output_target_inputs(0); - if (consumers.size() == 1) { - const auto consumer = consumers.begin()->get_node()->shared_from_this(); - return ov::is_type(consumer) && is_decompression_multiply(consumer); - } - return false; - }); - lptPassConfig->set_callback([](const_node_ptr& node) -> bool { - if (ov::is_type(node)) { - return ov::is_type(node) && is_decompression_multiply(node); - } else if (ov::is_type(node)) { - const auto& consumers = node->get_output_target_inputs(0); - if (consumers.size() == 1) { - const auto consumer = consumers.begin()->get_node()->shared_from_this(); - return ov::is_type(consumer) && is_decompression_multiply(consumer); - } - } - return false; - }); - lptPassConfig->set_callback([&](const_node_ptr& node) -> bool { // disable MultiplyToGroupConvolution if Multiply with Constant can be fused From 961b891ea423427e74b47d24629b33efd866c793 Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Wed, 30 Oct 2024 14:37:18 +0900 Subject: [PATCH 096/233] [GPU] update onednn (#27322) --- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index 32ad05ab263b78..062d247e7853b1 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit 32ad05ab263b782d4a4455ea85f5de009cf607c4 +Subproject commit 062d247e7853b14ed287a130cc2dc221187430aa From f12f086d4185765b06f2a659e0d01406b31fd634 Mon Sep 17 00:00:00 2001 From: Andrew Kwangwoong Park Date: Wed, 30 Oct 2024 15:05:57 
+0900 Subject: [PATCH 097/233] [GPU] Add per layer scaling for FC to fix accuracy issue regarding fp16 overflow (#27291) ### Details: - Fix LLM accuracy issue due to fp16 overflow when using decompression_post_opt in fully_connected_gpu_bf_tiled_opt kernel - In the fc kernel, to optimize grouped scale, we calculate acc first as mad of activation (fp16) * weight (int4) first , and then apply scale value. This can cause accuracy issue when only multiply of activation and weight overflows. - In this case we can resolve the issue by applying scale down to the activation. - Implement per layer scaling for FCs ### Tickets: - 154583 --------- Signed-off-by: Andrew Park --- .../runtime/properties/hint/__init__.py | 1 + .../pyopenvino/core/properties/properties.cpp | 1 + .../tests/test_runtime/test_properties.py | 5 + .../include/openvino/runtime/properties.hpp | 6 + .../intel_gpu/src/plugin/compiled_model.cpp | 1 + src/plugins/intel_gpu/src/plugin/plugin.cpp | 4 +- .../transformations/fc_per_layer_scaling.cpp | 81 ++++++++++++ .../transformations/fc_per_layer_scaling.hpp | 19 +++ .../src/plugin/transformations_pipeline.cpp | 2 + .../src/runtime/execution_config.cpp | 1 + .../fc_per_layer_scaling_test.cpp | 117 ++++++++++++++++++ 11 files changed, 237 insertions(+), 1 deletion(-) create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/fc_per_layer_scaling.cpp create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/fc_per_layer_scaling.hpp create mode 100644 src/plugins/intel_gpu/tests/unit/transformations/fc_per_layer_scaling_test.cpp diff --git a/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py b/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py index dd90ded374ca11..d1dce289d09941 100644 --- a/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py +++ b/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py @@ -23,3 +23,4 @@ from openvino._pyopenvino.properties.hint import allow_auto_batching from openvino._pyopenvino.properties.hint import dynamic_quantization_group_size from openvino._pyopenvino.properties.hint import kv_cache_precision +from openvino._pyopenvino.properties.hint import activations_scale_factor diff --git a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp index a6b30bd773001f..564e5f69f5ee14 100644 --- a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp +++ b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp @@ -101,6 +101,7 @@ void regmodule_properties(py::module m) { wrap_property_RW(m_hint, ov::hint::allow_auto_batching, "allow_auto_batching"); wrap_property_RW(m_hint, ov::hint::dynamic_quantization_group_size, "dynamic_quantization_group_size"); wrap_property_RW(m_hint, ov::hint::kv_cache_precision, "kv_cache_precision"); + wrap_property_RW(m_hint, ov::hint::activations_scale_factor, "activations_scale_factor"); // Submodule intel_cpu py::module m_intel_cpu = diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py index 32eb48f6765f41..6065d72196b44b 100644 --- a/src/bindings/python/tests/test_runtime/test_properties.py +++ b/src/bindings/python/tests/test_runtime/test_properties.py @@ -335,6 +335,11 @@ def test_properties_ro(ov_property_ro, expected_value): ((64, 64),), ), (hints.kv_cache_precision, "KV_CACHE_PRECISION", ((Type.f32, Type.f32),)), + ( + hints.activations_scale_factor, + 
"ACTIVATIONS_SCALE_FACTOR", + ((0.0, 0.0),), + ), ( intel_cpu.denormals_optimization, "CPU_DENORMALS_OPTIMIZATION", diff --git a/src/inference/include/openvino/runtime/properties.hpp b/src/inference/include/openvino/runtime/properties.hpp index 627314748bbe9c..5674c75dd546d7 100644 --- a/src/inference/include/openvino/runtime/properties.hpp +++ b/src/inference/include/openvino/runtime/properties.hpp @@ -580,6 +580,12 @@ static constexpr Property dynamic_quantization */ static constexpr Property kv_cache_precision{"KV_CACHE_PRECISION"}; +/** + * @brief This property scales down activations to prevent overflows when inference precision is f16. + * @ingroup ov_runtime_cpp_prop_api + */ +static constexpr Property activations_scale_factor{"ACTIVATIONS_SCALE_FACTOR"}; + } // namespace hint /** diff --git a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp index 15ff4447b4bafe..233bc97c249cd4 100644 --- a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp +++ b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp @@ -257,6 +257,7 @@ ov::Any CompiledModel::get_property(const std::string& name) const { ov::PropertyName{ov::hint::num_requests.name(), PropertyMutability::RO}, ov::PropertyName{ov::hint::inference_precision.name(), PropertyMutability::RO}, ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RO}, + ov::PropertyName{ov::hint::activations_scale_factor.name(), PropertyMutability::RO}, ov::PropertyName{ov::device::id.name(), PropertyMutability::RO}, ov::PropertyName{ov::execution_devices.name(), PropertyMutability::RO}, }; diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 9aba7ee1a117eb..d3d70ec92cd23c 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -540,6 +540,7 @@ std::vector Plugin::get_caching_properties() const { ov::PropertyName{ov::hint::execution_mode.name(), PropertyMutability::RW}, ov::PropertyName{ov::hint::performance_mode.name(), PropertyMutability::RW}, ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RW}, + ov::PropertyName{ov::hint::activations_scale_factor.name(), PropertyMutability::RW}, }; return caching_properties; @@ -585,7 +586,8 @@ std::vector Plugin::get_supported_properties() const { ov::PropertyName{ov::hint::inference_precision.name(), PropertyMutability::RW}, ov::PropertyName{ov::hint::enable_cpu_pinning.name(), PropertyMutability::RW}, ov::PropertyName{ov::device::id.name(), PropertyMutability::RW}, - ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RW} + ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RW}, + ov::PropertyName{ov::hint::activations_scale_factor.name(), PropertyMutability::RW} }; return supported_properties; diff --git a/src/plugins/intel_gpu/src/plugin/transformations/fc_per_layer_scaling.cpp b/src/plugins/intel_gpu/src/plugin/transformations/fc_per_layer_scaling.cpp new file mode 100644 index 00000000000000..618578919d4024 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/fc_per_layer_scaling.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fc_per_layer_scaling.hpp" + +#include "intel_gpu/op/fully_connected_compressed.hpp" +#include "intel_gpu/op/placeholder.hpp" + +#include "openvino/op/multiply.hpp" +#include 
"openvino/core/rt_info.hpp" +#include "openvino/pass/pattern/op/pattern.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "transformations/utils/utils.hpp" + +namespace ov { +namespace intel_gpu { + +FullyConnectedPerLayerScaling::FullyConnectedPerLayerScaling(float scale_factor) { + using namespace ov::pass::pattern; + + auto data_m = any_input(); + auto weights_m = any_input(); + auto bias_m = any_input(); + auto fc_compressed_wo_zp_m = wrap_type({data_m, weights_m, bias_m, any_input()}, consumers_count(1)); + auto fc_compressed_w_zp_m = wrap_type({data_m, weights_m, bias_m, any_input(), any_input()}, consumers_count(1)); + auto fc_compressed_m = std::make_shared(OutputVector{fc_compressed_wo_zp_m, fc_compressed_w_zp_m}); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](Matcher& m) { + if (scale_factor == 0.f || scale_factor == 1.f) + return false; + auto fc = std::dynamic_pointer_cast(m.get_match_root()); + if (!fc || transformation_callback(fc)) + return false; + + const auto& pattern_map = m.get_pattern_value_map(); + const auto& data = pattern_map.at(data_m).get_node_shared_ptr(); + const auto& bias = pattern_map.at(bias_m).get_node_shared_ptr(); + + ov::Shape scale_const_shape = {1}; + std::vector scale_down_value = {(1.f / scale_factor)}; + std::vector scale_up_value = {scale_factor}; + std::shared_ptr scale_down_const_f16 = std::make_shared(ov::element::f16, scale_const_shape, scale_down_value); + std::shared_ptr scale_down_const_f32 = std::make_shared(ov::element::f32, scale_const_shape, scale_down_value); + std::shared_ptr scale_up_const_f16 = std::make_shared(ov::element::f16, scale_const_shape, scale_up_value); + std::shared_ptr scale_up_const_f32 = std::make_shared(ov::element::f32, scale_const_shape, scale_up_value); + + std::shared_ptr scale_down_const = (data->get_element_type() == ov::element::f16) ? scale_down_const_f16 : scale_down_const_f32; + auto scale_down = std::make_shared(data, scale_down_const); + scale_down->set_friendly_name(fc->get_friendly_name() + "_scale_down"); + ov::copy_runtime_info(fc, scale_down); + fc->input(0).replace_source_output(scale_down); + + // If FC has bias as input, scaling must be applied to bias as well + if (!std::dynamic_pointer_cast(bias)) { + std::shared_ptr bias_scale_down_const = (bias->get_element_type() == ov::element::f16) ? scale_down_const_f16 : scale_down_const_f32; + auto bias_scale_down = std::make_shared(bias, bias_scale_down_const); + bias_scale_down->set_friendly_name(fc->get_friendly_name() + "_bias_scale_down"); + ov::copy_runtime_info(fc, bias_scale_down); + fc->input(2).replace_source_output(bias_scale_down); + } + + auto target_inputs = fc->get_output_target_inputs(0); + std::shared_ptr scale_up_const = (fc->get_element_type() == ov::element::f16) ? 
scale_up_const_f16 : scale_up_const_f32; + auto scale_up = std::make_shared(fc, scale_up_const); + scale_up->set_friendly_name(fc->get_friendly_name() + "_scale_up"); + ov::copy_runtime_info(fc, scale_up); + for (auto& in : target_inputs) { + in.replace_source_output(scale_up); + } + + return true; + }; + + auto m = std::make_shared(fc_compressed_m, "FullyConnectedPerLayerScaling"); + this->register_matcher(m, callback); +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/fc_per_layer_scaling.hpp b/src/plugins/intel_gpu/src/plugin/transformations/fc_per_layer_scaling.hpp new file mode 100644 index 00000000000000..5c0d7d07f5b411 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/fc_per_layer_scaling.hpp @@ -0,0 +1,19 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + +class FullyConnectedPerLayerScaling: public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("FullyConnectedPerLayerScaling", "0"); + FullyConnectedPerLayerScaling(float scale_factor); +}; + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 305e21a5000149..a33a15fbbe6a1a 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -65,6 +65,7 @@ #include "plugin/transformations/move_fc_reshape_to_weights.hpp" #include "plugin/transformations/bcast_and_pad_zp_buffers.hpp" #include "plugin/transformations/print_model_statistics.hpp" +#include "plugin/transformations/fc_per_layer_scaling.hpp" #include "plugin/transformations/swiglu_fusion.hpp" #include "plugin/transformations/transpose_fusion.hpp" #include "plugin/transformations/indirect_kv_cache.hpp" @@ -846,6 +847,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); manager.register_pass(); manager.register_pass(device_info.supports_immad); + manager.register_pass(config.get_property(ov::hint::activations_scale_factor)); if (!device_info.supports_immad) { manager.register_pass(); diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index c48f3f02fa9f6a..f3b9058f7ebdc8 100644 --- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -61,6 +61,7 @@ void ExecutionConfig::set_default() { std::make_tuple(ov::hint::kv_cache_precision, ov::element::undefined), std::make_tuple(ov::intel_gpu::hint::enable_kernels_reuse, false), std::make_tuple(ov::weights_path, ""), + std::make_tuple(ov::hint::activations_scale_factor, 0.f), // Legacy API properties std::make_tuple(ov::intel_gpu::nv12_two_inputs, false), diff --git a/src/plugins/intel_gpu/tests/unit/transformations/fc_per_layer_scaling_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/fc_per_layer_scaling_test.cpp new file mode 100644 index 00000000000000..2d2f21b57d7152 --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/transformations/fc_per_layer_scaling_test.cpp @@ -0,0 +1,117 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "common_test_utils/graph_comparator.hpp" +#include "common_test_utils/ov_test_utils.hpp" + +#include +#include + +#include 
"openvino/op/constant.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/pass/manager.hpp" + +#include +#include "plugin/transformations/fc_per_layer_scaling.hpp" +#include "intel_gpu/op/placeholder.hpp" +#include "intel_gpu/op/fully_connected_compressed.hpp" + +using namespace testing; +using namespace ov::intel_gpu; + +namespace ov { +namespace test { +namespace intel_gpu { + +TEST_F(TransformationTestsF, FullyConnectedPerLayerScalingTest1) { + float scale_factor = 2.f; + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{ -1, 16 }); + auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 }); + auto no_bias = std::make_shared(); + auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto zp_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto fc_compressed = std::make_shared(input, weights_const, no_bias, scale_const, zp_const); + auto convert = std::make_shared(fc_compressed, ov::element::f32); + auto result = std::make_shared(convert); + + model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + manager.register_pass(scale_factor); + } + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{ -1, 16 }); + auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 }); + auto no_bias = std::make_shared(); + auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto zp_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto scale_down_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 1.f / scale_factor }); + auto scale_down = std::make_shared(input, scale_down_const); + auto fc_compressed = std::make_shared(scale_down, weights_const, no_bias, scale_const, zp_const); + auto scale_up_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { scale_factor }); + auto scale_up = std::make_shared(fc_compressed, scale_up_const); + auto convert = std::make_shared(scale_up, ov::element::f32); + auto result = std::make_shared(convert); + + model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, FullyConnectedPerLayerScalingTest2) { + float scale_factor = 2.f; + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{ -1, 16 }); + auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 }); + auto bias = std::make_shared(ov::element::f16, ov::Shape{ 1, 32 }); + auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto zp_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto fc_compressed = std::make_shared(input, weights_const, bias, scale_const, zp_const); + auto convert = std::make_shared(fc_compressed, ov::element::f32); + auto result = std::make_shared(convert); + + model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + manager.register_pass(scale_factor); + } + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{ -1, 16 }); + auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 }); + auto bias = std::make_shared(ov::element::f16, ov::Shape{ 1, 32 }); + auto scale_const = 
ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto zp_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto scale_down_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 1.f / scale_factor }); + auto scale_down = std::make_shared(input, scale_down_const); + auto bias_scale_down_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 1.f / scale_factor }); + auto bias_scale_down = std::make_shared(bias, scale_down_const); + auto fc_compressed = std::make_shared(scale_down, weights_const, bias_scale_down, scale_const, zp_const); + auto scale_up_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { scale_factor }); + auto scale_up = std::make_shared(fc_compressed, scale_up_const); + auto convert = std::make_shared(scale_up, ov::element::f32); + auto result = std::make_shared(convert); + + model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, FullyConnectedPerLayerScalingTest3) { + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{ -1, 16 }); + auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 }); + auto no_bias = std::make_shared(); + auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto zp_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto fc_compressed = std::make_shared(input, weights_const, no_bias, scale_const, zp_const); + auto convert = std::make_shared(fc_compressed, ov::element::f32); + auto result = std::make_shared(convert); + + model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + manager.register_pass(1.f); + } +} + +} // namespace intel_gpu +} // namespace test +} // namespace ov \ No newline at end of file From 9036b592d36a40808a8c9e0ab22a9dbb75e33cfc Mon Sep 17 00:00:00 2001 From: Nikolay Shchegolev Date: Wed, 30 Oct 2024 10:20:10 +0400 Subject: [PATCH 098/233] [CPU][OMP] Handle exception outside parallel region (#27303) ### Details: - *Handle exception inside OMP threads to avoid immediate program interruption.* ### Tickets: - *152606* Co-authored-by: Ilya Lavrenov --- src/plugins/intel_cpu/src/graph.cpp | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index f9bfa9334eae8f..f3f3a379fc2af7 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -1297,23 +1297,40 @@ class UpdateNodes : public UpdateNodesBase { if (origin_nested_levels < 2) { set_max_nested_levels(2); } + // In OpenMP, an exception that is thrown in a parallel region must be caught and handled in the same region by the same thread. + // Therefore, need to pass the error message and throw a new exception outside the parallel region. + const char* what = nullptr; #pragma omp parallel #pragma omp sections { #pragma omp section { - updateDynParams(startCounter, stopIndx); + try { + updateDynParams(startCounter, stopIndx); + } catch (std::exception& e) { + what = e.what(); + } catch (...) { + what = "[ CPU ] Could not update dynamic parameters."; + } } #pragma omp section { - updateShapes(startCounter, stopIndx); + try { + updateShapes(startCounter, stopIndx); + } catch (std::exception& e) { + what = e.what(); + } catch (...) 
{ + what = "[ CPU ] Could not update shapes."; + } } } if (origin_nested_levels != 2) { set_max_nested_levels(origin_nested_levels); } + + OPENVINO_ASSERT(what == nullptr, what); } }; #endif From 8da8a300994a3b3c52dd7a0d7c75e4db74f845f2 Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Wed, 30 Oct 2024 15:26:17 +0900 Subject: [PATCH 099/233] [GPU] model cache fix from kv cache compression (#27323) ### Details: - model cache was not working because of load/save mismatch --- .../intel_gpu/primitives/scaled_dot_product_attention.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp index 1fd5b43824d0a7..77e1c5ae71099e 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp @@ -116,6 +116,7 @@ struct scaled_dot_product_attention : public primitive_base::save(ob); ob << is_causal; + ob << is_kv_compressed; ob << has_attn_mask_input; ob << has_scale_input; ob << indirect_axis; @@ -123,7 +124,6 @@ struct scaled_dot_product_attention : public primitive_base Date: Wed, 30 Oct 2024 10:14:58 +0200 Subject: [PATCH 100/233] [intel-npu] max memalloc quickfix for grext 1.8 windows drivers (#27317) ### Details: - another quickfix for maximum memory allocation property, to enable UD44 windows drivers too - addition to https://github.com/openvinotoolkit/openvino/pull/27270 ### Tickets: - *EISW-143246* --- src/plugins/intel_npu/src/backend/src/zero_device.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/plugins/intel_npu/src/backend/src/zero_device.cpp b/src/plugins/intel_npu/src/backend/src/zero_device.cpp index 439b5fbd59f4f9..58bcd0eb7cc944 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_device.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_device.cpp @@ -175,6 +175,14 @@ uint64_t ZeroDevice::getTotalMemSize() const { // we are safe here, can return the value directly from driver return query.total; } +#if defined(_WIN32) || defined(__CYGWIN__) + // Special case for windows drivers with graph_extension v 1.8 + if (_initStructs->isExtensionSupported(std::string("ZE_extension_graph_1_8"), ZE_MAKE_VERSION(1, 8))) { + // query here returns total system memory in KB, which we need to + // divide by 2 (OS limitation) and convert to bytes + return (query.total << 9); + } +#endif // Default for older drivers: return 2GB return LEGACY_MAX_MEM_ALLOC_SIZE_BYTES; From 9263641442f513758f7f7d6a772f598f888f20b2 Mon Sep 17 00:00:00 2001 From: Sebastian Golebiewski Date: Wed, 30 Oct 2024 09:25:14 +0100 Subject: [PATCH 101/233] [DOCS] Fixing formatting in the STFT article. (#27312) Fixing formatting issues in the `Short Time Fourier Transformation for real-valued input` article. 
--- .../operation-specs/signals/stft-15.rst | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/signals/stft-15.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/signals/stft-15.rst index 581c5062f67520..bcc420f5db25c9 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/signals/stft-15.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/signals/stft-15.rst @@ -31,25 +31,25 @@ Short Time Fourier Transformation for real-valued input (STFT) **Inputs** -* **1**: ``signal`` - Tensor of type *T* and 1D shape [signal_size] or 2D shape [batch, signal_size] with signal data for the STFT. **Required.** -* **2**: ``window`` - Tensor of type *T* and 1D shape [window_length], specifying the window values for the signal slice multiplication. **Required.** -* **3**: ``frame_size`` - Scalar tensor of type *T_INT* describing the size of a single frame of the signal to be provided as input to FFT. **Required.** -* **4**: ``frame_step`` - Scalar tensor of type *T_INT* describing The distance (number of samples) between successive frames. **Required.** +* **1**: ``signal`` - Tensor of type *T* and 1D shape [signal_size] or 2D shape [batch, signal_size] with signal data for the STFT. **Required.** +* **2**: ``window`` - Tensor of type *T* and 1D shape [window_length], specifying the window values for the signal slice multiplication. **Required.** +* **3**: ``frame_size`` - Scalar tensor of type *T_INT* describing the size of a single frame of the signal to be provided as input to FFT. **Required.** +* **4**: ``frame_step`` - Scalar tensor of type *T_INT* describing The distance (number of samples) between successive frames. **Required.** **Outputs** -* **1**: The result of STFT operation, tensor of the same type as input ``signal`` tensor and shape: +* **1**: The result of STFT operation, tensor of the same type as input ``signal`` tensor and shape: - + When ``transpose_frames == false`` the output shape is ``[frames, fft_results, 2]`` for 1D signal input or [batch, frames, fft_results, 2] for 2D signal input. - + When ``transpose_frames == true`` the output shape is [fft_results, frames, 2] for 1D signal input or [batch, fft_results, frames, 2]`` for 2D signal input. + * When ``transpose_frames == false`` the output shape is ``[frames, fft_results, 2]`` for 1D signal input or ``[batch, frames, fft_results, 2]`` for 2D signal input. + * When ``transpose_frames == true`` the output shape is ``[fft_results, frames, 2]`` for 1D signal input or ``[batch, fft_results, frames, 2]`` for 2D signal input. - where: + where: - + ``batch`` is a batch size dimension - + ``frames`` is a number calculated as ``(signal_shape[-1] - frame_size) / frame_step) + 1`` - + ``fft_results`` is a number calculated as ``(frame_size / 2) + 1`` - + ``2`` is the last dimension is for complex value real and imaginary part + * ``batch`` is a batch size dimension + * ``frames`` is a number calculated as ``(signal_shape[-1] - frame_size) / frame_step) + 1`` + * ``fft_results`` is a number calculated as ``(frame_size / 2) + 1`` + * ``2`` is the last dimension is for complex value real and imaginary part **Types** @@ -61,7 +61,7 @@ Short Time Fourier Transformation for real-valued input (STFT) **Examples**: -*Example 1D signal, transpose_frames=false: * +*Example 1D signal, transpose_frames=false:* .. 
code-block:: xml :force: @@ -87,7 +87,7 @@ Short Time Fourier Transformation for real-valued input (STFT) -*Example 1D signal, transpose_frames=true: * +*Example 1D signal, transpose_frames=true:* .. code-block:: xml :force: @@ -112,7 +112,7 @@ Short Time Fourier Transformation for real-valued input (STFT) -*Example 2D signal, transpose_frames=false: * +*Example 2D signal, transpose_frames=false:* .. code-block:: xml :force: @@ -140,7 +140,7 @@ Short Time Fourier Transformation for real-valued input (STFT) -*Example 2D signal, transpose_frames=true: * +*Example 2D signal, transpose_frames=true:* .. code-block:: xml :force: From 22c6740f79e8de50f6653cec7266fe9a5186caa9 Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Wed, 30 Oct 2024 12:44:08 +0400 Subject: [PATCH 102/233] [PT FE] Unify conversion pipeline for ExportedProgram from memory and disk (#27324) **Details:** Before the fix, conversions of ExportedProgram from memory and disk use different decomposition sets **Tickets:** TBD Signed-off-by: Kazantsev, Roman --- .../moc_frontend/pytorch_frontend_utils.py | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py b/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py index d3b77c9a61f566..486f72d87fd89d 100644 --- a/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py +++ b/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py @@ -21,6 +21,22 @@ def extract_module_extensions(args): return {extension.module: extension for extension in extensions if isinstance(extension, ModuleExtension)} +def get_decoder_for_exported_program(model): + from openvino.frontend.pytorch.fx_decoder import TorchFXPythonDecoder + import torch + + from packaging import version + if version.parse(torch.__version__) >= version.parse("2.2"): + from torch._decomp import get_decompositions + from openvino.frontend.pytorch.torchdynamo.decompositions import get_export_decomposition_list + decomp = get_decompositions(get_export_decomposition_list()) + model = model.run_decompositions(decomp_table=decomp) + gm = model.module() + log.debug(gm.code) + decoder = TorchFXPythonDecoder(gm, dynamic_shapes=True) + return decoder + + def get_pytorch_decoder(model, example_inputs, args): try: from openvino.frontend.pytorch.ts_decoder import TorchScriptPythonDecoder @@ -49,15 +65,7 @@ def get_pytorch_decoder(model, example_inputs, args): inputs = prepare_torch_inputs(example_inputs) if not isinstance(model, (TorchScriptPythonDecoder, TorchFXPythonDecoder)): if hasattr(torch, "export") and isinstance(model, (torch.export.ExportedProgram)): - from packaging import version - if version.parse(torch.__version__) >= version.parse("2.2"): - from torch._decomp import get_decompositions - from openvino.frontend.pytorch.torchdynamo.decompositions import get_export_decomposition_list - decomp = get_decompositions(get_export_decomposition_list()) - model = model.run_decompositions(decomp_table=decomp) - gm = model.module() - log.debug(gm.code) - decoder = TorchFXPythonDecoder(gm, dynamic_shapes=True) + decoder = get_decoder_for_exported_program(model) else: decoder = TorchScriptPythonDecoder( model, @@ -111,12 +119,7 @@ def get_pytorch_decoder_for_model_on_disk(argv, args): try: exported_program = torch.export.load(input_model) if hasattr(torch, "export") and isinstance(exported_program, (torch.export.ExportedProgram)): - from packaging import version - if version.parse(torch.__version__) >= 
version.parse("2.2"): - exported_program = exported_program.run_decompositions() - gm = exported_program.module() - decoder = TorchFXPythonDecoder(gm, dynamic_shapes=True) - argv.input_model = decoder + argv.input_model = get_decoder_for_exported_program(exported_program) argv.framework = 'pytorch' return True except: From 11cf409183ee45e930240c00cd3526f62db14abb Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Wed, 30 Oct 2024 13:29:55 +0400 Subject: [PATCH 103/233] [GPU] Disable OneDNN for unknown arch via dpas flag faking (#27326) ### Details: - This patch enforces dpas availability flag to false when HW architecture is unknown to onednn to fallback to OCL kernels which are supposed to be more generic and more forward compatible. - Also, added an architecture check in each onednn-based impl if in the future we'll stop relying on `supports_immad` flag when decide whether to use onednn or not. Signed-off-by: Vladimir Paramuzov --- .../src/graph/impls/onednn/concatenation_onednn.hpp | 2 +- .../src/graph/impls/onednn/convolution_onednn.hpp | 2 +- .../src/graph/impls/onednn/deconvolution_onednn.hpp | 2 +- .../src/graph/impls/onednn/fully_connected_onednn.hpp | 2 +- .../intel_gpu/src/graph/impls/onednn/gemm_onednn.hpp | 2 +- .../intel_gpu/src/graph/impls/onednn/pooling_onednn.hpp | 2 +- .../intel_gpu/src/graph/impls/onednn/reduce_onednn.hpp | 2 +- .../intel_gpu/src/graph/impls/onednn/reorder_onednn.hpp | 2 +- src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp | 8 ++++++++ 9 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/concatenation_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/concatenation_onednn.hpp index e85bda18a034da..9e0a3fa5cfb390 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/concatenation_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/concatenation_onednn.hpp @@ -19,7 +19,7 @@ struct ConcatenationImplementationManager : public ImplementationManager { bool validate_impl(const program_node& node) const override { assert(node.is_type()); const auto& info = node.get_program().get_engine().get_device_info(); - if (!info.supports_immad) + if (!info.supports_immad || info.arch == gpu_arch::unknown) return false; static const std::vector supported_types = { ov::element::f16, ov::element::u8, ov::element::i8 }; diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.hpp index a5616167506f70..c3f599fc5db9f6 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.hpp @@ -24,7 +24,7 @@ struct ConvolutionImplementationManager : public ImplementationManager { bool validate_impl(const program_node& node) const override { assert(node.is_type()); const auto& info = node.get_program().get_engine().get_device_info(); - if (!info.supports_immad) + if (!info.supports_immad || info.arch == gpu_arch::unknown) return false; const auto& conv_node = node.as(); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.hpp index 949c979ed77e80..039cf36261caa0 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.hpp @@ -20,7 +20,7 @@ struct DeconvolutionImplementationManager : public ImplementationManager { bool validate_impl(const 
program_node& node) const override { assert(node.is_type()); const auto& info = node.get_program().get_engine().get_device_info(); - if (!info.supports_immad) + if (!info.supports_immad || info.arch == gpu_arch::unknown) return false; const auto& deconv_node = node.as(); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp index f4495fb5dd1645..a601b2c74c09e3 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp @@ -22,7 +22,7 @@ struct FullyConnectedImplementationManager : public ImplementationManager { bool validate_impl(const program_node& node) const override { assert(node.is_type()); const auto& info = node.get_program().get_engine().get_device_info(); - if (!info.supports_immad) + if (!info.supports_immad || info.arch == gpu_arch::unknown) return false; const auto& fc_node = node.as(); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.hpp index f89d3e588735e2..6c576d177043ee 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.hpp @@ -19,7 +19,7 @@ struct GemmImplementationManager : public ImplementationManager { bool validate_impl(const program_node& node) const override { assert(node.is_type()); const auto& info = node.get_program().get_engine().get_device_info(); - if (!info.supports_immad) + if (!info.supports_immad || info.arch == gpu_arch::unknown) return false; const auto& gemm_node = node.as(); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/pooling_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/pooling_onednn.hpp index 343fe66771de25..4710b0c77b83c7 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/pooling_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/pooling_onednn.hpp @@ -20,7 +20,7 @@ struct PoolingImplementationManager : public ImplementationManager { bool validate_impl(const program_node& node) const override { assert(node.is_type()); const auto& info = node.get_program().get_engine().get_device_info(); - if (!info.supports_immad) + if (!info.supports_immad || info.arch == gpu_arch::unknown) return false; const auto& in_layout = node.get_input_layout(0); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/reduce_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/reduce_onednn.hpp index fbdf64131ff384..68d963fd9e369f 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/reduce_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/reduce_onednn.hpp @@ -49,7 +49,7 @@ struct ReduceImplementationManager : public ImplementationManager { bool validate_impl(const program_node& node) const override { assert(node.is_type()); const auto& info = node.get_program().get_engine().get_device_info(); - if (!info.supports_immad) + if (!info.supports_immad || info.arch == gpu_arch::unknown) return false; const auto& reduce_node = node.as(); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/reorder_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/reorder_onednn.hpp index b671f5e210e75c..ad08c516e939d8 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/reorder_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/reorder_onednn.hpp @@ -57,7 +57,7 @@ struct ReorderImplementationManager : public ImplementationManager 
{ return true; const auto& info = node.get_program().get_engine().get_device_info(); - if (!info.supports_immad) + if (!info.supports_immad || info.arch == gpu_arch::unknown) return false; if (!one_of(input_fmt.value, supported_formats) || !one_of(output_fmt.value, supported_formats)) diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp index 88801b8b2b4e61..7ab48308cfeaf7 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp @@ -330,6 +330,14 @@ device_info init_device_info(const cl::Device& device, const cl::Context& contex ngen::Product product = {ngen::ProductFamily::Unknown, 0}; jit_generator::detectHWInfo(context.get(), device.get(), hw, product); info.arch = convert_ngen_arch(hw); + // We change the value of this flag to avoid OneDNN usage for the platforms unknown to OneDNN + // This is required to guarantee some level of forward compatibility for the new HW generations + // as OneDNN code generators are not generic and typically requires some updates for the new architectures + // Ideally, we shouldn't do that as OCL impls sometimes also check this flag, but in order to avoid that + // we need to ensure that graph transformations are not relying on this flag as indicator that onednn will be used + if (product.family == ngen::ProductFamily::Unknown) { + info.supports_immad = false; + } #else // ENABLE_ONEDNN_FOR_GPU info.arch = gpu_arch::unknown; #endif // ENABLE_ONEDNN_FOR_GPU From cb292c750056a956b66bc7871dcf8688e4ca0a1e Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 30 Oct 2024 13:53:21 +0400 Subject: [PATCH 104/233] add support aten::__ior__ (#27315) ### Details: - *add support `aten::__ior__`, `aten::__iand__`, `aten::__ixor__`* ### Tickets: - *CVS-156301* --- src/frontends/pytorch/src/op_table.cpp | 3 + .../pytorch_tests/test_bitwise_ops.py | 58 +++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index d0e388b5d08cf1..607f0bd32db80d 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -331,12 +331,15 @@ OP_CONVERTER(translate_zeros_like_fx); const std::unordered_map get_supported_ops_ts() { return { {"aten::__and__", op::translate_bitwise_and}, + {"aten::__iand__", op::inplace_op}, {"aten::__derive_index", op::translate_derive_index}, {"aten::__getitem__", op::translate_getitem}, {"aten::__not__", op::translate_1to1_match_1_inputs}, {"aten::__or__", op::translate_bitwise_or}, + {"aten::__ior__", op::inplace_op}, {"aten::__range_length", op::translate_range_length}, {"aten::__xor__", op::translate_bitwise_xor}, + {"aten::__ixor__", op::inplace_op}, {"aten::_convolution", op::translate_convolution}, {"aten::_convolution_mode", op::translate_convolution_mode}, {"aten::_native_multi_head_attention", op::translate_native_multi_head_attention}, diff --git a/tests/layer_tests/pytorch_tests/test_bitwise_ops.py b/tests/layer_tests/pytorch_tests/test_bitwise_ops.py index a400f6dcd76d17..125402b4dbec17 100644 --- a/tests/layer_tests/pytorch_tests/test_bitwise_ops.py +++ b/tests/layer_tests/pytorch_tests/test_bitwise_ops.py @@ -140,3 +140,61 @@ def test_bitwise_operators(self, lhs_dtype, rhs_dtype, lhs_shape, rhs_shape, ie_ trace_model=True, freeze_model=False, ) + + +class TestBitwiseInplaceOp(PytorchLayerTest): + def _prepare_input(self, lhs_shape, rhs_shape, dtype): + choices = np.array([0, 1, 
255, 7]) + x = np.random.choice(choices, lhs_shape).astype(dtype) + y = np.random.choice(choices, rhs_shape).astype(dtype) + return x, y + + def create_model(self, op): + class aten_bitwise(torch.nn.Module): + def __init__(self, op) -> None: + super().__init__() + if op == "aten::__ior__": + self.forward = self.forward_or + if op == "aten::__iand__": + self.forward = self.forward_and + if op == "aten::__ixor__": + self.forward = self.forward_xor + + def forward_or(self, lhs, rhs): + return lhs.__ior__(rhs) + + def forward_and(self, lhs, rhs): + return lhs.__iand__(rhs) + + def forward_xor(self, lhs, rhs): + return lhs.__ixor__(rhs) + + return aten_bitwise(op), None, op + + @pytest.mark.nightly + @pytest.mark.precommit + @pytest.mark.parametrize("dtype", ["bool", "int32"]) + @pytest.mark.parametrize( + ("lhs_shape", "rhs_shape"), + [ + ([2, 3], [2, 3]), + ([2, 3], []), + ], + ) + @pytest.mark.parametrize("op", ["aten::__ior__", "aten::__iand__", "aten::__ixor__"]) + def test_bitwise_operators(self, op, dtype, lhs_shape, rhs_shape, ie_device, precision, ir_version): + if ie_device == "GPU" and dtype != "bool": + pytest.xfail(reason="bitwise ops are not supported on GPU") + self._test( + *self.create_model(op), + ie_device, + precision, + ir_version, + kwargs_to_prepare_input={ + "dtype": dtype, + "lhs_shape": lhs_shape, + "rhs_shape": rhs_shape, + }, + trace_model=True, + freeze_model=False, + ) \ No newline at end of file From a7ff891fa552532091576ea39e883c30c5fcf241 Mon Sep 17 00:00:00 2001 From: David Nam Date: Wed, 30 Oct 2024 18:41:08 +0800 Subject: [PATCH 105/233] [GPU] Init tensor.data when allocating inputs for string type (#27269) ### Details: - When the element type is string and the input data is an empty string, a segmentation fault occurs unless each element of tensor.data is initialized. ### Tickets: - 148921 --- src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp | 6 ++++++ .../layer_tests/tensorflow_tests/test_tf_LookupTableSize.py | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp index 26771117e2e786..985336b801b9d3 100644 --- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp @@ -592,6 +592,12 @@ void SyncInferRequest::allocate_input(const ov::Output& port, si auto element_type = port.get_element_type(); m_user_inputs[input_idx] = { create_host_tensor(shape, element_type), TensorOwner::PLUGIN }; + if (element_type == ov::element::string) { + // When the element type is string and the input data is an empty string, + // a segmentation fault occurs unless each element of tensor.data is initialized.
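// ----------------------------------------------------------------------------
// Illustrative aside, not part of the patch above: a minimal, self-contained
// sketch of why raw host-tensor storage that holds std::string elements must be
// explicitly constructed before it is read or assigned. All names below are
// made up for illustration; only the standard-library calls are assumed.
#include <cstddef>
#include <memory>
#include <string>

void illustrate_string_storage_init() {
    const std::size_t n = 4;
    // Raw, uninitialized bytes, similar to what a freshly allocated host tensor provides.
    std::unique_ptr<unsigned char[]> raw(new unsigned char[n * sizeof(std::string)]);
    auto* data = reinterpret_cast<std::string*>(raw.get());
    // Assigning through `data` at this point (even an empty string) is undefined
    // behaviour, which is what surfaces as the segmentation fault described above.
    std::uninitialized_fill_n(data, n, std::string());  // construct n empty strings in place
    // ... the elements can now be used safely ...
    std::destroy_n(data, n);  // destroy the strings before the raw storage is released
}
// ----------------------------------------------------------------------------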
+ auto data = m_user_inputs.at(input_idx).ptr->data(); + std::uninitialized_fill_n(data, m_user_inputs.at(input_idx).ptr->get_size(), std::string()); + } ov::ISyncInferRequest::set_tensor(port, m_user_inputs.at(input_idx).ptr); } diff --git a/tests/layer_tests/tensorflow_tests/test_tf_LookupTableSize.py b/tests/layer_tests/tensorflow_tests/test_tf_LookupTableSize.py index e0050c245f1321..4cd5b05f3e86d4 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_LookupTableSize.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_LookupTableSize.py @@ -69,8 +69,6 @@ def create_lookup_table_size_net(self, hash_table_type, keys_type, values_type, def test_lookup_table_size(self, hash_table_type, params, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): keys_type = params['keys_type'] - if ie_device == 'GPU' and keys_type == str: - pytest.skip("148921: Segmentation fault on GPU") self._test(*self.create_lookup_table_size_net(hash_table_type=hash_table_type, **params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) From 2441dcdbcf2f9e996679f72f70faf7ba611fe928 Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Wed, 30 Oct 2024 16:09:03 +0400 Subject: [PATCH 106/233] [TF FE] Stabilize tests for UnsortedSegmentSum operation on all platforms (#27325) **Details:** Stabilize tests for UnsortedSegmentSum operation on all platforms **Ticket:** TBD --------- Signed-off-by: Kazantsev, Roman --- .../test_tf_UnsortedSegmentSum.py | 37 +++++++------------ 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_UnsortedSegmentSum.py b/tests/layer_tests/tensorflow_tests/test_tf_UnsortedSegmentSum.py index 3369aeb8aad231..ccf7c16896270c 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_UnsortedSegmentSum.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_UnsortedSegmentSum.py @@ -2,11 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 import numpy as np -import platform import pytest import tensorflow as tf from common.tf_layer_test_class import CommonTFLayerTest +rng = np.random.default_rng(23254) + class TestUnsortedSegmentSum(CommonTFLayerTest): def _prepare_input(self, inputs_info): @@ -15,10 +16,10 @@ def _prepare_input(self, inputs_info): data_shape = inputs_info['data:0'] segment_ids_shape = inputs_info['segment_ids:0'] inputs_data = {} - inputs_data['data:0'] = np.random.randint(-50, 50, data_shape).astype(self.data_type) + inputs_data['data:0'] = rng.integers(-10, 10, data_shape).astype(self.data_type) # segment_ids can have negative values - inputs_data['segment_ids:0'] = np.random.randint(-self.num_segments_val, self.num_segments_val, - segment_ids_shape) + inputs_data['segment_ids:0'] = rng.integers(-self.num_segments_val, self.num_segments_val, + segment_ids_shape).astype(self.segment_ids_type) return inputs_data def create_unsorted_segment_sum_net(self, data_shape, segment_ids_shape, num_segments_val, data_type, @@ -48,28 +49,18 @@ def create_unsorted_segment_sum_net(self, data_shape, segment_ids_shape, num_seg ] @pytest.mark.parametrize("params", test_data_basic) - @pytest.mark.parametrize("data_type", [ - np.float32, np.int32 - ]) - @pytest.mark.parametrize("segment_ids_type", [ - np.int32, np.int64 - ]) - @pytest.mark.parametrize("num_segments_type", [ - np.int32, np.int64 - ]) + @pytest.mark.parametrize("data_type", [np.float32, np.int32]) + @pytest.mark.parametrize("segment_ids_type", [np.int32, np.int64]) + @pytest.mark.parametrize("num_segments_type", [np.int32, np.int64]) 
@pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', - reason='Ticket - 122716') def test_unsorted_segment_sum_basic(self, params, data_type, segment_ids_type, num_segments_type, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): - if use_legacy_frontend: - pytest.skip("UnsortedSegmentSum operation is not supported via legacy frontend.") if ie_device == 'GPU': - pytest.skip("GPU error: Can't choose implementation for embedding_segment_sum:UnsortedSegmentSum node") - self._test( - *self.create_unsorted_segment_sum_net(**params, data_type=data_type, segment_ids_type=segment_ids_type, - num_segments_type=num_segments_type), - ie_device, precision, ir_version, temp_dir=temp_dir, - use_legacy_frontend=use_legacy_frontend) + pytest.skip("156362: No layout format available for embeddingsegmentssum:UnsortedSegmentSum on GPU") + self._test(*self.create_unsorted_segment_sum_net(**params, + data_type=data_type, segment_ids_type=segment_ids_type, + num_segments_type=num_segments_type), + ie_device, precision, ir_version, temp_dir=temp_dir, + use_legacy_frontend=use_legacy_frontend) From 118efc85baaa1c1ea04b280ead81e540b24acf15 Mon Sep 17 00:00:00 2001 From: Andrzej Kopytko Date: Wed, 30 Oct 2024 13:35:40 +0100 Subject: [PATCH 107/233] Docs Added searching by new coveo Category (#27335) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- .../templates/layout.html | 2 +- .../sphinx_setup/_static/css/coveo_custom.css | 19 +- docs/sphinx_setup/_static/js/custom.js | 41 ++-- docs/sphinx_setup/_templates/layout.html | 11 +- docs/sphinx_setup/_templates/search.html | 221 ++++++++---------- docs/sphinx_setup/conf.py | 1 - 6 files changed, 134 insertions(+), 161 deletions(-) diff --git a/docs/openvino_sphinx_theme/openvino_sphinx_theme/templates/layout.html b/docs/openvino_sphinx_theme/openvino_sphinx_theme/templates/layout.html index 25acb3c1e5cbda..a2ab53c6a57a83 100644 --- a/docs/openvino_sphinx_theme/openvino_sphinx_theme/templates/layout.html +++ b/docs/openvino_sphinx_theme/openvino_sphinx_theme/templates/layout.html @@ -28,7 +28,7 @@ {# The data-cfasync attribute disables CloudFlare's Rocket loader so that #} {# mode/theme are correctly set before the browser renders the page. #} {# https://github.com/pydata/pydata-sphinx-theme/pull/1045 #} - + - - - - + + - - - - {% endblock %} - {% block docs_navbar %} {{ super() }} {% include 'baner.html' %} diff --git a/docs/sphinx_setup/_templates/search.html b/docs/sphinx_setup/_templates/search.html index 3519f6e7e02f19..5430f24f74aa8c 100644 --- a/docs/sphinx_setup/_templates/search.html +++ b/docs/sphinx_setup/_templates/search.html @@ -2,133 +2,100 @@ {% set title = _('Search') %} {%- block content %} - {% block docs_navbar %} - {{ super() }} - {% include 'baner.html' %} - {% endblock %} +{% block docs_navbar %} +{{ super() }} +{% include 'baner.html' %} +{% endblock %} - {% block body %} - - - - - - - - - - - -
- {% endblock %} - - {%- block scripts_end %} - {{ _webpack.body_post() }} - {%- endblock %} +{% block body %} + + + + + + + + +
+{% endblock %} +{%- block scripts_end %} +{{ _webpack.body_post() }} {%- endblock %} + +{%- endblock %} \ No newline at end of file diff --git a/docs/sphinx_setup/conf.py b/docs/sphinx_setup/conf.py index def41af5943b3c..01c74de0175bcf 100644 --- a/docs/sphinx_setup/conf.py +++ b/docs/sphinx_setup/conf.py @@ -193,7 +193,6 @@ 'css/textfield.css', 'css/tabs.css', 'css/coveo_custom.css', - 'https://static.cloud.coveo.com/atomic/v2/themes/coveo.css', 'https://cdn.jsdelivr.net/npm/@splidejs/splide@4.1.4/dist/css/splide.min.css', ] From a5a09418cf4f047be8a58a1484da1b45488f8dba Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Wed, 30 Oct 2024 22:23:42 +0900 Subject: [PATCH 108/233] [GPU] Fix cache mode and weights path interaction (#27328) ### Details: - Currently ov::CacheMode::OPTIMIZE_SIZE behaves like ov::CacheMode::OPTIMIZE_SPEED if weights_path is provided. This change fixes that. - Additionally, after this change if cache is saved with OPTIMIZE_SIZE and the user tries to load with OPTIMIZE_SPEED (or vice versa), import_model() will fail and the workload will behave like during the first launch, according to the cache mode set by the user. - This change also tightens the weights_path value validation - only files with ".bin" extension will be accepted. However, if the user provides the path to the wrong bin file, the execution will still fail - there's no way to validate if the bin file is correct without storing information about it in the cache. ### Tickets: - 156265 --------- Co-authored-by: Tomasz Krupa --- .../util/include/openvino/util/weights_path.hpp | 15 +++++++++++++++ src/common/util/src/weights_path.cpp | 14 ++++++++++++++ src/plugins/intel_gpu/src/graph/program.cpp | 4 +++- .../intel_gpu/src/plugin/compiled_model.cpp | 8 ++++++-- src/plugins/intel_gpu/src/plugin/plugin.cpp | 11 ++++++++++- .../intel_gpu/src/plugin/program_builder.cpp | 15 +++++++++------ 6 files changed, 57 insertions(+), 10 deletions(-) create mode 100644 src/common/util/include/openvino/util/weights_path.hpp create mode 100644 src/common/util/src/weights_path.cpp diff --git a/src/common/util/include/openvino/util/weights_path.hpp b/src/common/util/include/openvino/util/weights_path.hpp new file mode 100644 index 00000000000000..db97484be98d35 --- /dev/null +++ b/src/common/util/include/openvino/util/weights_path.hpp @@ -0,0 +1,15 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/util/common_util.hpp" + +namespace ov { +namespace util { + +bool validate_weights_path(std::string& weights_path); + +} // namespace ov +} // namespace util diff --git a/src/common/util/src/weights_path.cpp b/src/common/util/src/weights_path.cpp new file mode 100644 index 00000000000000..9cf2336f064dd0 --- /dev/null +++ b/src/common/util/src/weights_path.cpp @@ -0,0 +1,14 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + + +#include "openvino/util/weights_path.hpp" + +bool ov::util::validate_weights_path(std::string& weights_path) { + if (weights_path.empty() || !ov::util::ends_with(weights_path, ".bin")) { + return false; + } + + return true; +} diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 1e2e84043dc82b..07fad4873659cd 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -7,6 +7,7 @@ #include "openvino/core/type.hpp" #include "openvino/runtime/system_conf.hpp" #include 
"openvino/runtime/threading/cpu_streams_info.hpp" +#include "openvino/util/weights_path.hpp" #include "intel_gpu/runtime/memory.hpp" #include "intel_gpu/runtime/engine.hpp" @@ -1839,7 +1840,8 @@ void program::load(cldnn::BinaryInputBuffer& ib) { std::shared_ptr mapped_memory = nullptr; std::string weights_path = _config.get_property(ov::weights_path); - if (!weights_path.empty()) { + if (_config.get_property(ov::cache_mode) == ov::CacheMode::OPTIMIZE_SIZE && + ov::util::validate_weights_path(weights_path)) { mapped_memory = ov::load_mmap_object(weights_path); } diff --git a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp index 233bc97c249cd4..527e08f07432ef 100644 --- a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp +++ b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp @@ -5,6 +5,7 @@ #include "openvino/runtime/iplugin.hpp" #include "openvino/runtime/intel_gpu/properties.hpp" #include "openvino/runtime/internal_properties.hpp" +#include "openvino/util/weights_path.hpp" #include "intel_gpu/graph/serialization/binary_buffer.hpp" #include "intel_gpu/runtime/itt.hpp" @@ -169,14 +170,17 @@ std::shared_ptr CompiledModel::create_infer_request() co void CompiledModel::export_model(std::ostream& model) const { // If ov::CacheMode::OPTIMIZE_SIZE is set, do the export iff it's possible to do weightless caching // which requires the weights_path. - if (m_config.get_property(ov::cache_mode) == ov::CacheMode::OPTIMIZE_SIZE && - m_config.get_property(ov::weights_path).empty()) + ov::CacheMode cache_mode = m_config.get_property(ov::cache_mode); + std::string weights_path = m_config.get_property(ov::weights_path); + if (cache_mode == ov::CacheMode::OPTIMIZE_SIZE && + !ov::util::validate_weights_path(weights_path)) return; OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "CompiledModel::export_model"); OPENVINO_ASSERT(!m_graphs.empty(), "[GPU] Model not loaded"); cldnn::BinaryOutputBuffer ob(model); + ob << cldnn::make_data(&cache_mode, sizeof(ov::CacheMode)); // Inputs { diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index d3d70ec92cd23c..7d010a9b590e2e 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -35,6 +35,7 @@ #include "openvino/runtime/performance_heuristics.hpp" #include "openvino/runtime/properties.hpp" #include "openvino/util/common_util.hpp" +#include "openvino/util/weights_path.hpp" #include "transformations/common_optimizations/dimension_tracking.hpp" #include "transformations/init_node_info.hpp" #include "transformations/rt_info/fused_names_attribute.hpp" @@ -330,8 +331,16 @@ std::shared_ptr Plugin::import_model(std::istream& model, cldnn::BinaryInputBuffer ib(model, context_impl->get_engine()); + ov::CacheMode cache_mode; + ib >> cldnn::make_data(&cache_mode, sizeof(ov::CacheMode)); + + if (cache_mode != config.get_property(ov::cache_mode)) { + return nullptr; + } + + std::string weights_path = config.get_property(ov::weights_path); if (config.get_property(ov::cache_mode) == ov::CacheMode::OPTIMIZE_SIZE && - config.get_property(ov::weights_path).empty()) { + !ov::util::validate_weights_path(weights_path)) { return nullptr; } diff --git a/src/plugins/intel_gpu/src/plugin/program_builder.cpp b/src/plugins/intel_gpu/src/plugin/program_builder.cpp index 510d715e7ac805..899110872ba633 100644 --- a/src/plugins/intel_gpu/src/plugin/program_builder.cpp +++ b/src/plugins/intel_gpu/src/plugin/program_builder.cpp @@ 
-305,12 +305,15 @@ void ProgramBuilder::add_primitive(const ov::Node& op, std::shared_ptrorigin_op_name = op.get_friendly_name(); prim->origin_op_type_name = op.get_type_name(); - if (auto data_prim = dynamic_cast(prim.get())) { - auto rt_info = op.get_rt_info(); - auto weightless_cache_attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()); - if (weightless_cache_attr != rt_info.end()) { - data_prim->bin_offset = weightless_cache_attr->second.as().bin_offset; - data_prim->original_size = weightless_cache_attr->second.as().original_size; + if (this->m_config.get_property(ov::cache_mode) == ov::CacheMode::OPTIMIZE_SIZE) { + if (auto data_prim = dynamic_cast(prim.get())) { + auto rt_info = op.get_rt_info(); + auto weightless_cache_attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()); + if (weightless_cache_attr != rt_info.end()) { + data_prim->bin_offset = weightless_cache_attr->second.as().bin_offset; + data_prim->original_size = + weightless_cache_attr->second.as().original_size; + } } } From 37f5dd3455279eed6708783a4f8fce880290bc06 Mon Sep 17 00:00:00 2001 From: Andrzej Kopytko Date: Wed, 30 Oct 2024 14:35:58 +0100 Subject: [PATCH 109/233] Docs Unhide columns in datatableJs (#27338) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- .../generative-ai-performance.rst | 15 ++++++--------- .../sphinx_setup/_static/js/openVinoDataTables.js | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst index d0a04f16ceb6bd..b8256af650e2f8 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst @@ -17,16 +17,13 @@ running on an Intel® Core™ Ultra 7-165H, Intel® Core™ Ultra 7-265V, and In -.. tab-set:: - - .. tab-item:: OpenVINO - - .. csv-table:: - :class: modeldata stripe - :name: supportedModelsTableOv - :header-rows: 1 - :file: ../../_static/benchmarks_files/llm_models.csv +.. csv-table:: + :class: modeldata stripe + :name: supportedModelsTableOv + :header-rows: 1 + :file: ../../_static/benchmarks_files/llm_models.csv +| .. 
grid:: 1 1 2 2 :gutter: 4 diff --git a/docs/sphinx_setup/_static/js/openVinoDataTables.js b/docs/sphinx_setup/_static/js/openVinoDataTables.js index 6f7231db424e89..b3f56b4a8de3e0 100644 --- a/docs/sphinx_setup/_static/js/openVinoDataTables.js +++ b/docs/sphinx_setup/_static/js/openVinoDataTables.js @@ -1,4 +1,17 @@ $(document).ready(function () { + var pageTitle = document.title; + var columnDefs; + if(pageTitle.includes('Most Efficient Large Language Models for AI PC')) + { + columnDefs= [ + { "visible": false, "targets": [1, 2, 3, 4, 5] } + ] + } + else + { + columnDefs=[] + } + var table = $('table.modeldata').DataTable({ responsive: true, "autoWidth": false, @@ -12,6 +25,7 @@ $(document).ready(function () { [10, 25, 50, -1], ['10 rows', '25 rows', '50 rows', 'Show all rows'] ], + "columnDefs": columnDefs, layout: { topStart: { buttons: [ From b60449ea7c7255004fd1087e2f5aaad9f4404eb8 Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Thu, 31 Oct 2024 00:08:20 +0800 Subject: [PATCH 110/233] [DOCS] Fix submodule update for building OV on Windows (#26549) - Add missing --recursive args for submodule update - Referenece guide for linux: https://github.com/openvinotoolkit/openvino/blob/6dc3f5538057caed5dd2eda0797aec445b6105bf/docs/dev/build_linux.md?plain=1#L23 --- docs/dev/build_windows.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/dev/build_windows.md b/docs/dev/build_windows.md index 10049485202cca..4a9761f5364046 100644 --- a/docs/dev/build_windows.md +++ b/docs/dev/build_windows.md @@ -25,7 +25,7 @@ Supported configurations: ```sh git clone https://github.com/openvinotoolkit/openvino.git cd openvino - git submodule update --init + git submodule update --init --recursive ``` 2. Create build directory: From 967a730722fa69853da1cc3c62b4ef672ae03b7a Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Wed, 30 Oct 2024 16:42:17 +0000 Subject: [PATCH 111/233] [NPUW] Support 3d gather in head (#27258) --- .../plugin/npuw/partitioning/partitioning.cpp | 2 +- .../plugin/npuw/partitioning/patterns/opt.cpp | 66 ++++++++++++++----- .../plugin/npuw/partitioning/patterns/opt.hpp | 4 +- .../intel_npu/src/plugin/npuw/util.cpp | 28 +++++++- 4 files changed, 77 insertions(+), 23 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp index 954df49e39f99b..99705fef30e8a8 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp @@ -1788,7 +1788,7 @@ void Partitioner::optimize(const std::string& func_name) { // Run Head/Tail passes ov::pass::GraphRewrite rewr; - rewr.add_matcher(std::ref(ctx)); + rewr.add_matcher(std::ref(ctx)); rewr.add_matcher(std::ref(ctx)); rewr.add_matcher(std::ref(ctx)); // NB: This pass is disabled for reason! 
It doesn't make things better diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp index 997d0e5108f8b9..9693e2e8f2b753 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp @@ -59,11 +59,21 @@ Context::PPtr Context::concat(ov::ParameterVector&& v, std::size_t dim) { } Context::PPtr Context::unpack(Context::PPtr w, Context::PPtr z, Context::PPtr s, ov::element::Type type) { - // FIXME: Assume CW only - NPUW_ASSERT(w->get_shape().size() == 2); - NPUW_ASSERT(z->get_shape().size() == 2); - NPUW_ASSERT(s->get_shape().size() == 2); - auto new_param = std::make_shared(type, w->get_shape()); + const auto& w_shape = w->get_shape(); + const auto& s_shape = s->get_shape(); + + Context::PPtr new_param; + if (w_shape.size() == 3 && s_shape.size() == 3) { + // Assume already reshaped tensor (as it does with unpack) + ov::Shape new_shape = {w_shape[0], w_shape[1] * w_shape[2]}; + new_param = std::make_shared(type, new_shape); + } else if (w_shape.size() == 2 && s_shape.size() == 2) { + new_param = std::make_shared(type, w_shape); + } else { + NPUW_ASSERT(false && "Yet unsupported combination"); + } + + NPUW_ASSERT(new_param); params_to_unpack[new_param] = {w, z, s}; return new_param; } @@ -350,8 +360,8 @@ DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) { if (ov::element::i4 == matched_qweight->get_element_type() && qweight_shape.size() == 3 && ov::element::f16 == matched_qcoeff->get_element_type() && qcoeff_shape.size() == 3 && - act_shape.size() == 3 && act_shape[1] == 1 && qcoeff_shape[0] == qweight_shape[0] && qcoeff_shape[2] == 1 && - qcoeff_shape[1] == qweight_shape[1] && !matched_matmul->get_transpose_a() && + act_shape.size() == 3 && act_shape[0] == 1 && act_shape[1] == 1 && qcoeff_shape[0] == qweight_shape[0] && + qcoeff_shape[2] == 1 && qcoeff_shape[1] == qweight_shape[1] && !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) { // Mark W closure to transpose, and transpose the respective parameter ctx.get().permute(matched_qweight, {1, 0, 2}); @@ -378,9 +388,6 @@ DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) { auto split_a = std::make_shared(rshp_act, split_axis, NSPLIT); auto split_w = std::make_shared(matched_qweight, split_axis, NSPLIT); - std::vector rshp_scale_v = {1, 1, qcoeff_shape[0]}; - auto rshp_scale_c = std::make_shared(ov::element::i32, ov::Shape{3}, rshp_scale_v); - // Do the CW MM for every split std::vector> to_concat; for (std::size_t i = 0; i < NSPLIT; i++) { @@ -583,9 +590,13 @@ DQMatMulGQ2iP::DQMatMulGQ2iP(Context::Ref ctx) { auto qcoeff_shape = matched_qcoeff->output(0).get_shape(); auto act_shape = matched_out_mmi.get_shape(); + const auto just_one = [](std::size_t a, std::size_t b) { + return (a == 1 && b > 1) || (a > 1 && b == 1); + }; + if (ov::element::i4 == matched_qweight->get_element_type() && qweight_shape.size() == 3 && ov::element::f16 == matched_qcoeff->get_element_type() && qcoeff_shape.size() == 3 && - act_shape.size() == 3 && act_shape[1] > 1 && // multi-token case + act_shape.size() == 3 && just_one(act_shape[0], act_shape[1]) && // multi-token case qcoeff_shape[0] == qweight_shape[0] && qcoeff_shape[1] == qweight_shape[1] && qcoeff_shape[2] == 1 && !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) { // Mark W closure to transpose, and transpose the respective parameter @@ -601,9 +612,12 @@ 
DQMatMulGQ2iP::DQMatMulGQ2iP(Context::Ref ctx) { matched_qcoeff->set_partial_shape(ts_shape); matched_qcoeff->validate_and_infer_types(); + // Select proper activation shape + std::size_t act_dim = act_shape[0] > act_shape[1] ? 0 : 1; + // Reshape the Act to group format const auto NSPLIT = qweight_shape[1]; - std::vector rshp_act_v = {act_shape[1], NSPLIT, act_shape[2] / NSPLIT}; + std::vector rshp_act_v = {act_shape[act_dim], NSPLIT, act_shape[2] / NSPLIT}; auto rshp_act_c = std::make_shared(ov::element::i32, ov::Shape{3}, rshp_act_v); auto rshp_act = std::make_shared(matched_out_mmi, rshp_act_c, false); @@ -615,7 +629,7 @@ DQMatMulGQ2iP::DQMatMulGQ2iP(Context::Ref ctx) { auto split_w = std::make_shared(matched_qweight, split_axis_w, NSPLIT); auto split_s = std::make_shared(matched_qcoeff, split_axis_w, NSPLIT); - std::vector r_a_v = {1, act_shape[1], act_shape[2] / NSPLIT}; + std::vector r_a_v = {1, act_shape[act_dim], act_shape[2] / NSPLIT}; auto r_a_c = std::make_shared(ov::element::i32, ov::Shape{3}, r_a_v); // Do the CW MM for every split @@ -642,6 +656,13 @@ DQMatMulGQ2iP::DQMatMulGQ2iP(Context::Ref ctx) { out = std::make_shared(out, ov::element::f32); } + if (act_shape[0] > act_shape[1]) { + std::vector new_out_size = {act_shape[0], act_shape[1], qweight_shape[0]}; + auto new_out_shape = + std::make_shared(ov::element::i32, ov::Shape{3}, new_out_size); + out = std::make_shared(out, new_out_shape, false); + } + // Now.. Reconnect the matmul readers to the new output (reducesum) for (auto&& r : matched_matmul->output(0).get_target_inputs()) { r.replace_source_output(out); @@ -933,7 +954,7 @@ DQLiftGatherSymGQ::DQLiftGatherSymGQ() { // the respective block (mainly, a head) was turned a function // (e.g. with FUNCALL_FOR_ALL) As in this case the DQDictMatMulCWu // compile-time converts asymmetric MM to fp16, do the same thing here -DQUnpackDictGatherCWu::DQUnpackDictGatherCWu(Context::Ref ctx) { +DQUnpackDictGatheru::DQUnpackDictGatheru(Context::Ref ctx) { auto pids = opp::wrap_type(); auto cvtids = opp::optional({pids->output(0)}); @@ -966,14 +987,23 @@ DQUnpackDictGatherCWu::DQUnpackDictGatherCWu(Context::Ref ctx) { // Strip down the DQ subgraph, replace the original Q-ed closure tensor with unpacked fp16 auto new_wi = ctx.get().unpack(matched_qweight, matched_qzerop, matched_qcoeff, ov::element::f16); - auto gather_c = std::make_shared(ov::element::i32, ov::Shape{}, 0); - auto new_g = std::make_shared(new_wi, matched_out_ids, gather_c); + auto w_shape = matched_node_qweight->get_shape(); + auto new_w_shape = new_wi->get_shape(); + std::shared_ptr gather_in = new_wi; + if (new_w_shape.size() == 2 && w_shape.size() == 3) { + NPUW_ASSERT(new_w_shape[0] == w_shape[0] && w_shape[1] * w_shape[2] == new_w_shape[1]); + auto new_const = std::make_shared(ov::element::i32, ov::Shape{3}, w_shape); + gather_in = std::make_shared(new_wi, new_const, false); + } + NPUW_ASSERT(gather_in); + auto gather_c = std::make_shared(ov::element::i32, ov::Shape{}, 0); + auto new_g = std::make_shared(gather_in, matched_out_ids, gather_c); matched_node_cvt->input(0).replace_source_output(new_g); return true; // root has changed }; - register_matcher(std::make_shared(qcvtm, "DQDictGatherCWu"), std::move(callback)); + register_matcher(std::make_shared(qcvtm, "DQDictGatheru"), std::move(callback)); } // This is a follow-up to DQLiftGatherSymGQ step, which happens if the respective @@ -1013,7 +1043,7 @@ DQUnpackDictGatherGQi::DQUnpackDictGatherGQi(Context::Ref ctx) { return true; // root has changed }; - 
register_matcher(std::make_shared(qcvtm, "DQDictGatherCWu"), std::move(callback)); + register_matcher(std::make_shared(qcvtm, "DQDictGatherGQu"), std::move(callback)); } // Identify the case* where the FP16/32 vocab tensor is gathered with diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp index a66012d4a85fb8..323d443fa781f4 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp @@ -112,9 +112,9 @@ class DQLiftGatherSymGQ : public ov::pass::MatcherPass { // Head vocab unpacks -class DQUnpackDictGatherCWu : public ov::pass::MatcherPass { +class DQUnpackDictGatheru : public ov::pass::MatcherPass { public: - DQUnpackDictGatherCWu(Context::Ref ctx); + DQUnpackDictGatheru(Context::Ref ctx); }; class DQUnpackDictGatherGQi : public ov::pass::MatcherPass { diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.cpp b/src/plugins/intel_npu/src/plugin/npuw/util.cpp index e9deb34ee2ded7..99a53430295a89 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.cpp @@ -176,6 +176,7 @@ void ov::npuw::util::unpack(const ov::SoPtr& from, const auto& from_shape = from->get_shape(); const auto& scale_shape = scale->get_shape(); + const auto& zerop_shape = zerop->get_shape(); if (type_from == ov::element::u4) { if (scale_shape.size() == 3 && scale_shape[0] == from_shape[0] && scale_shape[1] == 1 && @@ -194,8 +195,31 @@ void ov::npuw::util::unpack(const ov::SoPtr& from, NPUW_ASSERT(false); } } else if (type_from == ov::element::u8) { - // Only support CW for now - if (scale_shape.size() == 2 && scale_shape[0] == from_shape[0] && scale_shape[1] == 1) { + if (scale_shape.size() == 3 && scale_shape[1] == 1 && scale_shape[2] == 1) { + // Special case for broadcasting vocab by 2 dimensions + // FIXME: all this logic probably should be in some specific unpack or another util function + const auto& from_strides = from->get_strides(); + const auto& zerop_strides = zerop->get_strides(); + const auto& scale_strides = scale->get_strides(); + ov::Tensor wraped_from(from->get_element_type(), + ov::Shape{from_shape[0], from_shape[1] * from_shape[2]}, + from->data(), + ov::Strides{from_strides[0], from_strides[2]}); + ov::Tensor wraped_zerop(zerop->get_element_type(), + ov::Shape{zerop_shape[0], zerop_shape[1] * zerop_shape[2]}, + zerop->data(), + ov::Strides{zerop_strides[0], zerop_strides[2]}); + ov::Tensor wraped_scale(scale->get_element_type(), + ov::Shape{scale_shape[0], scale_shape[1] * scale_shape[2]}, + scale->data(), + ov::Strides{scale_strides[0], scale_strides[2]}); + + ov::npuw::util::XARCH::unpack_u8f16(ov::get_tensor_impl(wraped_from), + ov::get_tensor_impl(wraped_zerop), + ov::get_tensor_impl(wraped_scale), + to, + unpack_options); + } else if (scale_shape.size() == 2 && scale_shape[0] == from_shape[0] && scale_shape[1] == 1) { ov::npuw::util::XARCH::unpack_u8f16(from, zerop, scale, to, unpack_options); } else { NPUW_ASSERT(false); From 6e350492163a983aa10eb879e9d1179866f9ebf8 Mon Sep 17 00:00:00 2001 From: barnasm1 Date: Wed, 30 Oct 2024 18:14:32 +0100 Subject: [PATCH 112/233] squeeze v15 implementation (#26995) ### Details: - Add v15::Squeeze class with support dynamic rank result based on v0::Squeeze ### Tickets: - [*154023*](https://jira.devtools.intel.com/browse/CVS-154023) --- src/core/include/openvino/op/squeeze.hpp | 57 ++- 
.../include/openvino/op/util/squeeze_base.hpp | 39 ++ .../include/openvino/opsets/opset15_tbl.hpp | 2 +- .../include/squeeze_shape_inference.hpp | 119 +++++- src/core/src/op/squeeze.cpp | 112 +++-- src/core/src/op/util/squeeze_base.cpp | 91 ++++ src/core/tests/CMakeLists.txt | 1 + src/core/tests/type_prop/squeeze.cpp | 399 ++++++++++++++---- src/core/tests/visitors/op/squeeze.cpp | 9 + src/core/tests/visitors/op/unary_ops.hpp | 9 +- .../src/shape_inference/shape_inference.cpp | 1 + .../squeeze_shape_inference_test.cpp | 136 +++++- .../tests/functional/op_reference/squeeze.cpp | 63 ++- .../src/op_impl_check/single_op_graph.cpp | 10 +- .../include/common_test_utils/type_prop.hpp | 21 + 15 files changed, 887 insertions(+), 182 deletions(-) create mode 100644 src/core/include/openvino/op/util/squeeze_base.hpp create mode 100644 src/core/src/op/util/squeeze_base.cpp diff --git a/src/core/include/openvino/op/squeeze.hpp b/src/core/include/openvino/op/squeeze.hpp index 8c27f29d66df66..dde456aa2eef47 100644 --- a/src/core/include/openvino/op/squeeze.hpp +++ b/src/core/include/openvino/op/squeeze.hpp @@ -4,7 +4,7 @@ #pragma once -#include "openvino/op/op.hpp" +#include "openvino/op/util/squeeze_base.hpp" namespace ov { namespace op { @@ -12,30 +12,65 @@ namespace v0 { /// \brief Squeeze operation. /// /// \ingroup ov_ops_cpp_api -class OPENVINO_API Squeeze : public Op { +class OPENVINO_API Squeeze : public util::SqueezeBase { public: OPENVINO_OP("Squeeze", "opset1"); Squeeze(); - Squeeze(const Output& data, const Output& axes); + /// \brief Constructs a squeeze v0 operation. + /// + /// \param data Input tensor with data Squeeze(const Output& data); + /// \brief Constructs a squeeze v0 operation. + /// + /// \param data Input tensor with data + /// \param axis The axis along which to squeeze the input tensor. + Squeeze(const Output& data, const Output& axes); void validate_and_infer_types() override; bool evaluate(TensorVector& outputs, const TensorVector& inputs) const override; - bool has_evaluate() const override; - bool evaluate_lower(TensorVector& outputs) const override; - bool evaluate_upper(TensorVector& outputs) const override; - bool evaluate_symbol(TensorSymbolVector& output_symbols) const override; - bool constant_fold(OutputVector& output_values, const OutputVector& inputs_values) override; - bool can_constant_fold(const OutputVector& inputs_values) const override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - bool is_dynamic() const override; - private: Output get_default_axes_input() const; }; } // namespace v0 + +namespace v15 { +/// \brief Squeeze operation. +/// +/// \ingroup ov_ops_cpp_api +class OPENVINO_API Squeeze : public util::SqueezeBase { +public: + OPENVINO_OP("Squeeze", "opset15"); + + Squeeze(); + /// \brief Constructs a squeeze v15 operation. + /// + /// \param data Input tensor with data + /// \param allow_axis_skip Shape inference result dynamic rank if selected axis has 1 in range of its dynamic + Squeeze(const Output& data, const bool allow_axis_skip = false); + /// \brief Constructs a squeeze v15 operation. + /// + /// \param data Input tensor with data + /// \param axis The axis along which to squeeze the input tensor. 
+ /// \param allow_axis_skip Shape inference result dynamic rank if selected axis has 1 in range of its dynamic + Squeeze(const Output& data, const Output& axes, const bool allow_axis_skip = false); + + void validate_and_infer_types() override; + bool evaluate(TensorVector& outputs, const TensorVector& inputs) const override; + + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + + bool visit_attributes(AttributeVisitor& visitor) override; + + bool get_allow_axis_skip() const; + +private: + Output get_default_axes_input() const; + bool m_allow_axis_skip{}; +}; +} // namespace v15 } // namespace op } // namespace ov diff --git a/src/core/include/openvino/op/util/squeeze_base.hpp b/src/core/include/openvino/op/util/squeeze_base.hpp new file mode 100644 index 00000000000000..50d960824e10d2 --- /dev/null +++ b/src/core/include/openvino/op/util/squeeze_base.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/op/op.hpp" + +namespace ov { +namespace op { +namespace util { +/// \brief Squeeze operation. +/// +/// \ingroup ov_ops_cpp_api +class OPENVINO_API SqueezeBase : public Op { +public: + OPENVINO_OP("Squeeze", "util"); + SqueezeBase() = default; + /// \brief Constructs a squeeze operation. + /// + /// \param data Input tensor with data + SqueezeBase(const Output& data); + /// \brief Constructs a squeeze operation. + /// + /// \param data Input tensor with data + /// \param axis The axis along which to squeeze the input tensor. + SqueezeBase(const Output& data, const Output& axes); + + bool has_evaluate() const override; + bool evaluate_lower(TensorVector& outputs) const override; + bool evaluate_upper(TensorVector& outputs) const override; + bool evaluate_symbol(TensorSymbolVector& output_symbols) const override; + bool can_constant_fold(const OutputVector& inputs_values) const override; + bool constant_fold(OutputVector& output_values, const OutputVector& inputs_values) override; + bool is_dynamic() const override; +}; +} // namespace util +} // namespace op +} // namespace ov diff --git a/src/core/include/openvino/opsets/opset15_tbl.hpp b/src/core/include/openvino/opsets/opset15_tbl.hpp index 9a49e421f9ad8e..8d12420719bb6b 100644 --- a/src/core/include/openvino/opsets/opset15_tbl.hpp +++ b/src/core/include/openvino/opsets/opset15_tbl.hpp @@ -97,7 +97,7 @@ _OPENVINO_OP_REG(Sqrt, ov::op::v0) _OPENVINO_OP_REG(SpaceToDepth, ov::op::v0) _OPENVINO_OP_REG(Split, ov::op::v1) _OPENVINO_OP_REG(SquaredDifference, ov::op::v0) -_OPENVINO_OP_REG(Squeeze, ov::op::v0) +_OPENVINO_OP_REG(Squeeze, ov::op::v15) _OPENVINO_OP_REG(StridedSlice, ov::op::v1) _OPENVINO_OP_REG(Subtract, ov::op::v1) _OPENVINO_OP_REG(Tan, ov::op::v0) diff --git a/src/core/shape_inference/include/squeeze_shape_inference.hpp b/src/core/shape_inference/include/squeeze_shape_inference.hpp index ee71b5452db1c3..31eeea5d36a9ea 100644 --- a/src/core/shape_inference/include/squeeze_shape_inference.hpp +++ b/src/core/shape_inference/include/squeeze_shape_inference.hpp @@ -11,6 +11,117 @@ namespace ov { namespace op { namespace v0 { +template > +std::vector shape_infer(const Squeeze* op, + const std::vector& input_shapes, + const ITensorAccessor& ta = make_tensor_accessor()) { + using DimType = typename T::value_type; + + const auto number_of_inputs = input_shapes.size(); + OPENVINO_ASSERT(!input_shapes.empty()); + + const auto& arg_shape = input_shapes[0]; + const auto& arg_rank = arg_shape.rank(); + auto 
output_shapes = std::vector(1); + auto& output_shape = output_shapes[0]; + + std::unique_ptr> unique_axes; + + if (number_of_inputs == 1) { + unique_axes.reset(new std::set()); + } else if (number_of_inputs == 2) { + const auto& axes_shape = input_shapes[1]; + NODE_VALIDATION_CHECK(op, + axes_shape.is_dynamic() || ov::util::is_rank_compatible_any_of(axes_shape.rank(), {0, 1}), + "Second input (axes) should not be of rank higher than 1. Got: ", + axes_shape.rank().get_length()); + + std::vector axes; + if (arg_rank.is_static() && axes_shape.is_static()) { + if (auto axes = get_input_const_data_as(op, 1, ta)) { + // The values of `axes` input are known + ov::util::try_normalize_axes(*axes, arg_rank, *op); + unique_axes.reset(new std::set(axes->cbegin(), axes->cend())); + } else if (arg_rank.get_length() > 0 && shape_size(axes_shape.to_shape()) == 1) { + // The `axes` input is a single element tensor which is unique by definition, deducing output rank + const auto has_squeezable_dim = + std::any_of(arg_shape.cbegin(), arg_shape.cend(), [](const DimType& dim) { + return dim.compatible(1); + }); + if (has_squeezable_dim) { + output_shape = PartialShape::dynamic(arg_rank.get_length() - 1); + } else { + output_shape = arg_shape; + } + return output_shapes; + } + } + } else { + // Invalid number of inputs, empty error message for backward compatibility. + NODE_VALIDATION_CHECK(op, false); + } + + if (arg_rank.is_static() && (unique_axes != nullptr)) { + output_shape.resize(0); + if (unique_axes->empty()) { + // if only first input provided or axes are empty remove all dimensions equal to 1. + if (std::any_of(arg_shape.cbegin(), arg_shape.cend(), [](const DimType& d) { + return d.is_dynamic() && d.compatible(1); + })) { + // we are unsure if dynamic dimensions would be equal to 1 or not, so we set dynamic output rank + output_shape = PartialShape::dynamic(); + return output_shapes; + } else { + std::copy_if(arg_shape.cbegin(), + arg_shape.cend(), + std::back_inserter(output_shape), + [](const DimType& dim) { + return !dim.compatible(1); + }); + } + } else { + int64_t idx = 0; + auto rm_axis_iter = unique_axes->cbegin(); + auto rm_axis_end = unique_axes->cend(); + + // Returns true if dimension not squeezable on axis from input axes. + const auto not_squeezable_at_axis = [&rm_axis_iter, &rm_axis_end, &idx](const DimType& dim) { + if ((rm_axis_iter != rm_axis_end) && (*rm_axis_iter == idx++)) { + ++rm_axis_iter; + // Ignore: Pointed by axis, but not squeezable + return !dim.compatible(1); + } else { + return true; + } + }; + + std::copy_if(arg_shape.cbegin(), + arg_shape.cend(), + std::back_inserter(output_shape), + not_squeezable_at_axis); + } + } else { + output_shape = PartialShape::dynamic(); + } + return output_shapes; +} +} // namespace v0 + +namespace v15 { +template +bool apply_allow_axis_skip(const ov::op::v15::Squeeze* const op, + const std::unique_ptr>& unique_axes, + const T& arg_shape) { + using DimType = typename T::value_type; + int64_t i{-1}; + + return op->get_allow_axis_skip() && + std::any_of(arg_shape.cbegin(), arg_shape.cend(), [&unique_axes, &i](const DimType& d) { + ++i; + // Squeeze result with dynamic rank if 1 is in range of selected dynamic dimension. + return d.is_dynamic() && d.compatible(1) && unique_axes->find(i) != unique_axes->end(); + }); +} /** * \brief Do Squeeze shape inference. 
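// ----------------------------------------------------------------------------
// Illustrative aside, a sketch rather than part of the patch: how the new
// allow_axis_skip flag is expected to change v15::Squeeze shape inference when
// the selected axis is dynamic and therefore may or may not be 1 at runtime.
// The shapes in the trailing comments are the expected results under that
// reading of the shape-inference code above.
#include <iostream>
#include <memory>

#include "openvino/op/constant.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/squeeze.hpp"

void illustrate_allow_axis_skip() {
    using namespace ov;
    // Dimension 1 is dynamic, so at runtime it may or may not be equal to 1.
    auto data = std::make_shared<op::v0::Parameter>(element::f32, PartialShape{2, Dimension::dynamic(), 4});
    auto axes = op::v0::Constant::create(element::i64, Shape{1}, {1});

    // allow_axis_skip = false: the axis is treated as squeezable and the rank drops by one.
    auto strict = std::make_shared<op::v15::Squeeze>(data, axes, false);
    std::cout << strict->get_output_partial_shape(0) << std::endl;   // expected: [2,4]

    // allow_axis_skip = true: the squeeze may be skipped at runtime, so the
    // statically inferred result has dynamic rank.
    auto relaxed = std::make_shared<op::v15::Squeeze>(data, axes, true);
    std::cout << relaxed->get_output_partial_shape(0) << std::endl;  // expected: dynamic rank, printed as [...]
}
// ----------------------------------------------------------------------------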
@@ -59,7 +170,7 @@ std::vector shape_infer(const Squeeze* op, return dim.compatible(1); }); if (has_squeezable_dim) { - output_shape = PartialShape::dynamic(arg_rank.get_length() - 1); + output_shape = PartialShape::dynamic(); } else { output_shape = arg_shape; } @@ -71,7 +182,9 @@ std::vector shape_infer(const Squeeze* op, NODE_VALIDATION_CHECK(op, false); } - if (arg_rank.is_static() && (unique_axes != nullptr)) { + if (!arg_rank.is_static() || (unique_axes == nullptr) || apply_allow_axis_skip(op, unique_axes, arg_shape)) { + output_shape = PartialShape::dynamic(); + } else if (arg_rank.is_static() && (unique_axes != nullptr)) { output_shape.resize(0); if (unique_axes->empty()) { // if only first input provided or axes are empty remove all dimensions equal to 1. @@ -115,6 +228,6 @@ std::vector shape_infer(const Squeeze* op, } return output_shapes; } -} // namespace v0 +} // namespace v15 } // namespace op } // namespace ov diff --git a/src/core/src/op/squeeze.cpp b/src/core/src/op/squeeze.cpp index 1b34a4e48a4faf..b79165ca4f5543 100644 --- a/src/core/src/op/squeeze.cpp +++ b/src/core/src/op/squeeze.cpp @@ -6,31 +6,19 @@ #include -#include "bound_evaluate.hpp" #include "itt.hpp" -#include "openvino/core/validation_util.hpp" -#include "openvino/op/constant.hpp" #include "squeeze_shape_inference.hpp" namespace ov { namespace op { namespace v0 { -namespace validate { -namespace { +Squeeze::Squeeze() : util::SqueezeBase() {} -bool axes_has_and_set_bound(const Node& op) { - return (op.get_input_size() < 2) || op.get_input_tensor(1).has_and_set_bound(); -} -} // namespace -} // namespace validate - -Squeeze::Squeeze() : Op() {} - -Squeeze::Squeeze(const Output& data, const Output& axes) : Op({data, axes}) { +Squeeze::Squeeze(const Output& data, const Output& axes) : util::SqueezeBase(data, axes) { constructor_validate_and_infer_types(); } -Squeeze::Squeeze(const Output& data) : Op({data}) { +Squeeze::Squeeze(const Output& data) : util::SqueezeBase(data) { constructor_validate_and_infer_types(); } @@ -69,62 +57,68 @@ bool Squeeze::evaluate(TensorVector& outputs, const TensorVector& inputs) const return true; } -bool Squeeze::has_evaluate() const { - OV_OP_SCOPE(v0_Squeeze_has_evaluate); - const auto validate_axes_type = [](const element::Type& et) -> bool { - switch (et) { - case element::i8: - case element::i16: - case element::i32: - case element::i64: - case element::u8: - case element::u16: - case element::u32: - case element::u64: - return true; - default: - return false; - } - }; - - return (get_input_size() < 2) || validate_axes_type(get_input_element_type(1)); -} +} // namespace v0 -bool Squeeze::evaluate_lower(TensorVector& output_values) const { - OV_OP_SCOPE(v0_Squeeze_evaluate_lower); - return validate::axes_has_and_set_bound(*this) && default_lower_bound_evaluator(this, output_values); +namespace v15 { +Squeeze::Squeeze() : util::SqueezeBase() {} + +Squeeze::Squeeze(const Output& data, const bool allow_axis_skip) + : util::SqueezeBase(data), + m_allow_axis_skip{allow_axis_skip} { + constructor_validate_and_infer_types(); } -bool Squeeze::evaluate_upper(TensorVector& output_values) const { - OV_OP_SCOPE(v0_Squeeze_evaluate_upper); - return validate::axes_has_and_set_bound(*this) && default_upper_bound_evaluator(this, output_values); +Squeeze::Squeeze(const Output& data, const Output& axes, const bool allow_axis_skip) + : util::SqueezeBase(data, axes), + m_allow_axis_skip{allow_axis_skip} { + constructor_validate_and_infer_types(); } -bool Squeeze::evaluate_symbol(TensorSymbolVector& 
output_symbols) const { - return validate::axes_has_and_set_bound(*this) && ov::util::default_symbol_evaluator(this, output_symbols); +std::shared_ptr Squeeze::clone_with_new_inputs(const OutputVector& new_args) const { + OV_OP_SCOPE(v15_Squeeze_clone_with_new_inputs); + check_new_args_count(this, new_args); + + switch (new_args.size()) { + case 1: + return std::make_shared(new_args[0], m_allow_axis_skip); + case 2: + return std::make_shared(new_args[0], new_args[1], m_allow_axis_skip); + default: + OPENVINO_THROW("Incorrect number of new arguments"); + } } -bool Squeeze::can_constant_fold(const OutputVector& inputs_values) const { - return get_output_partial_shape(0).is_static() && !is_const_fold_disabled(); +void Squeeze::validate_and_infer_types() { + OV_OP_SCOPE(v15_Squeeze_validate_and_infer_types); + + const auto input_shapes = ov::util::get_node_input_partial_shapes(*this); + const auto output_shapes = shape_infer(this, input_shapes); + + set_output_type(0, get_input_element_type(0), output_shapes[0]); } -bool Squeeze::constant_fold(OutputVector& output_values, const OutputVector& inputs_values) { - OV_OP_SCOPE(v0_Squeeze_constant_fold); - if (!can_constant_fold(inputs_values)) { - return false; - } +bool Squeeze::evaluate(TensorVector& outputs, const TensorVector& inputs) const { + OV_OP_SCOPE(v15_Squeeze_evaluate); + OPENVINO_ASSERT(outputs.size() == 1); - if (auto data_const = std::dynamic_pointer_cast(inputs_values[0].get_node_shared_ptr())) { - const auto& shape = get_output_shape(0); - output_values[0] = std::make_shared(*data_const, shape); - return true; - } - return false; + const auto output_shapes = + shape_infer(this, ov::util::get_tensors_partial_shapes(inputs), make_tensor_accessor(inputs)); + outputs[0].set_shape(output_shapes.front().get_shape()); + + std::memcpy(outputs[0].data(), inputs[0].data(), outputs[0].get_byte_size()); + return true; } -bool Squeeze::is_dynamic() const { - return get_output_partial_shape(0).is_dynamic(); +bool Squeeze::visit_attributes(AttributeVisitor& visitor) { + OV_OP_SCOPE(v15_Squeeze_visit_attributes); + visitor.on_attribute("allow_axis_skip", m_allow_axis_skip); + return true; } -} // namespace v0 + +bool Squeeze::get_allow_axis_skip() const { + OV_OP_SCOPE(v15_Squeeze_get_allow_axis_skip); + return m_allow_axis_skip; +} +} // namespace v15 } // namespace op } // namespace ov diff --git a/src/core/src/op/util/squeeze_base.cpp b/src/core/src/op/util/squeeze_base.cpp new file mode 100644 index 00000000000000..be5a20cbb58620 --- /dev/null +++ b/src/core/src/op/util/squeeze_base.cpp @@ -0,0 +1,91 @@ +#include "openvino/op/util/squeeze_base.hpp" + +#include "bound_evaluate.hpp" +#include "itt.hpp" +#include "openvino/core/validation_util.hpp" +#include "openvino/op/constant.hpp" + +namespace ov { +namespace op { + +namespace validate { +namespace { + +bool axes_has_and_set_bound(const Node& op) { + return (op.get_input_size() < 2) || op.get_input_tensor(1).has_and_set_bound(); +} +} // namespace +} // namespace validate + +namespace util { +SqueezeBase::SqueezeBase(const Output& data, const Output& axes) : Op({data, axes}) { + constructor_validate_and_infer_types(); +} + +SqueezeBase::SqueezeBase(const Output& data) : Op({data}) { + constructor_validate_and_infer_types(); +} + +bool SqueezeBase::has_evaluate() const { + OV_OP_SCOPE(util_SqueezeBase_has_evaluate); + const auto validate_axes_type = [](const element::Type& et) -> bool { + switch (et) { + case element::i8: + case element::i16: + case element::i32: + case element::i64: + case 
element::u8: + case element::u16: + case element::u32: + case element::u64: + return true; + default: + return false; + } + }; + + return (get_input_size() < 2) || validate_axes_type(get_input_element_type(1)); +} + +bool SqueezeBase::evaluate_lower(TensorVector& output_values) const { + OV_OP_SCOPE(util_SqueezeBase_evaluate_lower); + return validate::axes_has_and_set_bound(*this) && default_lower_bound_evaluator(this, output_values); +} + +bool SqueezeBase::evaluate_upper(TensorVector& output_values) const { + OV_OP_SCOPE(util_SqueezeBase_evaluate_upper); + return validate::axes_has_and_set_bound(*this) && default_upper_bound_evaluator(this, output_values); +} + +bool SqueezeBase::evaluate_symbol(TensorSymbolVector& output_symbols) const { + OV_OP_SCOPE(util_SqueezeBase_evaluate_symbol); + return validate::axes_has_and_set_bound(*this) && ov::util::default_symbol_evaluator(this, output_symbols); +} + +bool SqueezeBase::can_constant_fold(const OutputVector& inputs_values) const { + OV_OP_SCOPE(util_SqueezeBase_can_constant_fold); + return get_output_partial_shape(0).is_static() && !is_const_fold_disabled(); +} + +bool SqueezeBase::constant_fold(OutputVector& output_values, const OutputVector& inputs_values) { + OV_OP_SCOPE(util_SqueezeBase_constant_fold); + if (!can_constant_fold(inputs_values)) { + return false; + } + + if (auto data_const = std::dynamic_pointer_cast(inputs_values[0].get_node_shared_ptr())) { + const auto& shape = get_output_shape(0); + output_values[0] = std::make_shared(*data_const, shape); + return true; + } + return false; +} + +bool SqueezeBase::is_dynamic() const { + OV_OP_SCOPE(util_SqueezeBase_is_dynamic); + return get_output_partial_shape(0).is_dynamic(); +} + +} // namespace util +} // namespace op +} // namespace ov diff --git a/src/core/tests/CMakeLists.txt b/src/core/tests/CMakeLists.txt index c3ed58783ac946..89acd7bd1809d0 100644 --- a/src/core/tests/CMakeLists.txt +++ b/src/core/tests/CMakeLists.txt @@ -18,6 +18,7 @@ set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/threading.cpp if(SUGGEST_OVERRIDE_SUPPORTED) set_source_files_properties(ov_tensor_test.cpp type_prop/multiclass_nms.cpp + type_prop/squeeze.cpp PROPERTIES COMPILE_OPTIONS -Wno-suggest-override) endif() diff --git a/src/core/tests/type_prop/squeeze.cpp b/src/core/tests/type_prop/squeeze.cpp index c7d81fd97c2786..7be05de1876d9f 100644 --- a/src/core/tests/type_prop/squeeze.cpp +++ b/src/core/tests/type_prop/squeeze.cpp @@ -7,193 +7,261 @@ #include "common_test_utils/test_assertions.hpp" #include "common_test_utils/type_prop.hpp" #include "openvino/op/broadcast.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/gather.hpp" -#include "openvino/op/shape_of.hpp" -#include "openvino/op/unsqueeze.hpp" #include "sequence_generator.hpp" using namespace std; using namespace ov; using namespace testing; -TEST(type_prop, squeeze_axes_invalid_value) { +namespace { + +template +class SqueezelOperator : public TypePropOpTest {}; + +using SqueezeTypes = ::testing::Types; + +TYPED_TEST_SUITE(SqueezelOperator, SqueezeTypes); + +TYPED_TEST(SqueezelOperator, squeeze_axes_invalid_value) { auto param = make_shared(element::f32, Shape{1, 2, 3, 4}); auto axes_node = make_shared(element::u64, Shape{2}, vector{0, 2}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), (PartialShape{2, 3, 4})); } -TEST(type_prop, squeeze_single_input) { 
+TYPED_TEST(SqueezelOperator, squeeze_single_input) { auto param = make_shared(element::f32, PartialShape{1, -1, 3, 4}); - auto s = make_shared(param); - EXPECT_EQ(s->get_output_partial_shape(0), PartialShape::dynamic()); + const auto squeeze = this->make_op(param); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_axes_invalid_rank) { +TYPED_TEST(SqueezelOperator, squeeze_axes_invalid_rank) { auto param = make_shared(element::f32, Shape{1, 2, 3, 4}); auto axes_node = make_shared(element::i32, Shape{2, 1}, vector{0, 2}); - OV_EXPECT_THROW(auto s = make_shared(param, axes_node), + OV_EXPECT_THROW(const auto squeeze = this->make_op(param, axes_node), NodeValidationFailure, HasSubstr("Second input (axes) should not be of rank higher than 1.")); } -TEST(type_prop, squeeze_incorrect_negative_axes) { +TYPED_TEST(SqueezelOperator, squeeze_incorrect_negative_axes) { auto param = make_shared(element::f32, Shape{1, 4, 1, 4, 1, 8}); auto axes_node = make_shared(element::i64, Shape{2}, vector{-6, -10}); - OV_EXPECT_THROW(auto s = make_shared(param, axes_node), + OV_EXPECT_THROW(const auto squeeze = this->make_op(param, axes_node), ov::Exception, HasSubstr("Axis -10 out of the tensor rank range")); } -TEST(type_prop, squeeze_data_static_param_axes_1D_single_elem_static_shape_no_squeezable_dims) { +TYPED_TEST(SqueezelOperator, squeeze_data_static_param_axes_1D_single_elem_static_shape_no_squeezable_dims) { auto param = std::make_shared(ov::element::f32, PartialShape{2, 2, 4}); const auto axes_node = std::make_shared(element::u64, PartialShape{1}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), (PartialShape{2, 2, 4})); } -TEST(type_prop, squeeze_data_static_param_axes_1D_two_elem_static_shape_squeezable_dims_two) { +TYPED_TEST(SqueezelOperator, squeeze_data_static_param_axes_1D_two_elem_static_shape_squeezable_dims_two) { auto param = std::make_shared(ov::element::f32, PartialShape{1, 2, 1, 4}); const auto axes_node = std::make_shared(element::u64, PartialShape{2}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_data_static_param_axes_1D_two_elem_static_shape_squeezable_dims_one) { +TYPED_TEST(SqueezelOperator, squeeze_data_static_param_axes_1D_two_elem_static_shape_squeezable_dims_one) { auto param = std::make_shared(ov::element::f32, PartialShape{2, 1, 4}); const auto axes_node = std::make_shared(element::u64, PartialShape{2}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_data_static_param_axes_1D_single_elem_static_shape_squeezable_dims_one) { +TEST(TypePropSqueezelOperatorV0, squeeze_data_static_param_axes_1D_single_elem_static_shape_squeezable_dims_one) { auto param = std::make_shared(ov::element::f32, PartialShape{2, 1, 4}); const auto axes_node = std::make_shared(element::u64, PartialShape{1}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = std::make_shared(param, axes_node); 
EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic(2)); } -TEST(type_prop, squeeze_data_static_param_axes_scalar_static_shape_squeezable_dims_one) { +TEST(TypePropSqueezelOperatorV15, squeeze_data_static_param_axes_1D_single_elem_static_shape_squeezable_dims_one) { + auto param = std::make_shared(ov::element::f32, PartialShape{2, 1, 4}); + const auto axes_node = std::make_shared(element::u64, PartialShape{1}); + const auto squeeze = std::make_shared(param, axes_node); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); +} + +TEST(TypePropSqueezelOperatorV0, squeeze_data_static_param_axes_scalar_static_shape_squeezable_dims_one) { auto param = std::make_shared(ov::element::f32, PartialShape{2, 1, 4}); const auto axes_node = std::make_shared(element::u64, PartialShape{}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = std::make_shared(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic(2)); } -TEST(type_prop, squeeze_data_scalar_param_axes_1D_single_elem_static_shape) { +TEST(TypePropSqueezelOperatorV15, squeeze_data_static_param_axes_scalar_static_shape_squeezable_dims_one) { + auto param = std::make_shared(ov::element::f32, PartialShape{2, 1, 4}); + const auto axes_node = std::make_shared(element::u64, PartialShape{}); + const auto squeeze = std::make_shared(param, axes_node); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); +} + +TYPED_TEST(SqueezelOperator, squeeze_data_scalar_param_axes_1D_single_elem_static_shape) { auto param = std::make_shared(ov::element::f32, PartialShape{}); const auto axes_node = std::make_shared(element::u64, PartialShape{1}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_data_dynamic_param_axes_1D_two_elem_static_shape_squeezable_dims_equal) { +TYPED_TEST(SqueezelOperator, squeeze_data_dynamic_param_axes_1D_two_elem_static_shape_squeezable_dims_equal) { auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 3}, {4, -1}}); const auto axes_node = std::make_shared(element::u64, PartialShape{2}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_data_static_param_axes_1D_two_elem_static_shape_squeezable_dims_more) { +TYPED_TEST(SqueezelOperator, squeeze_data_static_param_axes_1D_two_elem_static_shape_squeezable_dims_more) { auto param = std::make_shared(ov::element::f32, PartialShape{1, 2, 1, 3, 1}); const auto axes_node = std::make_shared(element::u64, PartialShape{2}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_data_static_param_axes_1D_single_elem_static_shape_squeezable_dims_more) { +TEST(TypePropSqueezelOperatorV0, 
squeeze_data_static_param_axes_1D_single_elem_static_shape_squeezable_dims_more) { auto param = std::make_shared(ov::element::f32, PartialShape{1, 2, 1, 3, 1}); const auto axes_node = std::make_shared(element::u64, PartialShape{1}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = std::make_shared(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic(4)); } -TEST(type_prop, squeeze_data_static_param_axes_scalar_static_shape_squeezable_dims_more) { +TEST(TypePropSqueezelOperatorV15, squeeze_data_static_param_axes_1D_single_elem_static_shape_squeezable_dims_more) { + auto param = std::make_shared(ov::element::f32, PartialShape{1, 2, 1, 3, 1}); + const auto axes_node = std::make_shared(element::u64, PartialShape{1}); + const auto squeeze = std::make_shared(param, axes_node); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); +} + +TEST(TypePropSqueezelOperatorV0, squeeze_data_static_param_axes_scalar_static_shape_squeezable_dims_more) { auto param = std::make_shared(ov::element::f32, PartialShape{1, 2, 1, 3, 1}); const auto axes_node = std::make_shared(element::u64, PartialShape{}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = std::make_shared(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic(4)); } -TEST(type_prop, squeeze_data_dynamic_param_axes_1D_two_elem_static_shape_squeezable_dims_more) { +TEST(TypePropSqueezelOperatorV15, squeeze_data_static_param_axes_scalar_static_shape_squeezable_dims_more) { + auto param = std::make_shared(ov::element::f32, PartialShape{1, 2, 1, 3, 1}); + const auto axes_node = std::make_shared(element::u64, PartialShape{}); + const auto squeeze = std::make_shared(param, axes_node); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); +} + +TYPED_TEST(SqueezelOperator, squeeze_data_dynamic_param_axes_1D_two_elem_static_shape_squeezable_dims_more) { auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 3}, {4, -1}}); const auto axes_node = std::make_shared(element::u64, PartialShape{2}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_data_dynamic_param_axes_1D_single_elem_static_shape_squeezable_dims_more) { +TEST(TypePropSqueezelOperatorV0, squeeze_data_dynamic_param_axes_1D_single_elem_static_shape_squeezable_dims_more) { auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 3}, {4, -1}}); const auto axes_node = std::make_shared(element::u64, PartialShape{1}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = std::make_shared(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic(3)); } -TEST(type_prop, squeeze_data_dynamic_param_axes_scalar_static_shape_squeezable_dims_more) { +TEST(TypePropSqueezelOperatorV15, squeeze_data_dynamic_param_axes_1D_single_elem_static_shape_squeezable_dims_more) { + auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 
3}, {4, -1}}); + const auto axes_node = std::make_shared(element::u64, PartialShape{1}); + const auto squeeze = std::make_shared(param, axes_node); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); +} + +TEST(TypePropSqueezelOperatorV0, squeeze_data_dynamic_param_axes_scalar_static_shape_squeezable_dims_more) { auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 3}, {4, -1}}); const auto axes_node = std::make_shared(element::u64, PartialShape{}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = std::make_shared(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic(3)); } -TEST(type_prop, squeeze_data_dyamic_param_axes_1D_two_elem_static_shape_squeezable_dims_one) { +TEST(TypePropSqueezelOperatorV15, squeeze_data_dynamic_param_axes_scalar_static_shape_squeezable_dims_more) { + auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 3}, {4, -1}}); + const auto axes_node = std::make_shared(element::u64, PartialShape{}); + const auto squeeze = std::make_shared(param, axes_node); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); +} + +TYPED_TEST(SqueezelOperator, squeeze_data_dyamic_param_axes_1D_two_elem_static_shape_squeezable_dims_one) { auto param = std::make_shared(ov::element::f32, PartialShape{2, -1, 4}); const auto axes_node = std::make_shared(element::u64, PartialShape{2}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_data_dynamic_param_axes_1D_three_elem_static_shape_squeezable_dims_two) { +TYPED_TEST(SqueezelOperator, squeeze_data_dynamic_param_axes_1D_three_elem_static_shape_squeezable_dims_two) { auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 3}, {4, -1}}); const auto axes_node = std::make_shared(element::u64, PartialShape{3}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_data_dynamic_param_axes_1D_single_elem_static_shape_squeezable_dims_less) { +TEST(TypePropSqueezelOperatorV0, squeeze_data_dynamic_param_axes_1D_single_elem_static_shape_squeezable_dims_less) { auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 3}, {4, -1}}); const auto axes_node = std::make_shared(element::u64, PartialShape{1}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = std::make_shared(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic(3)); } +TEST(TypePropSqueezelOperatorV15, squeeze_data_dynamic_param_axes_1D_single_elem_static_shape_squeezable_dims_less) { + auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 3}, {4, -1}}); + const auto axes_node = std::make_shared(element::u64, PartialShape{1}); + const auto squeeze = std::make_shared(param, axes_node); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + 
EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); +} + using SqueezeTypePropTestParam = std::tuple, // Squeeze axis PartialShape // Expected shape @@ -288,26 +356,44 @@ INSTANTIATE_TEST_SUITE_P(type_prop_shrink_shape_default_axes, TEST_P(SqueezeTest, partial_shape_dimension_propagation_const_axis_i32) { const auto axes_node = std::make_shared(element::i32, Shape{axes.size()}, axes); - const auto squeeze = std::make_shared(param, axes_node); - - EXPECT_EQ(squeeze->get_element_type(), element::f32); - EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + } + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + } } TEST_P(SqueezeTest, partial_shape_dimension_propagation_parameter_axes_no_data) { const auto axes_node = std::make_shared(element::u64, PartialShape{Shape{axes.size()}}); - const auto squeeze = std::make_shared(param, axes_node); - - EXPECT_EQ(squeeze->get_element_type(), element::f32); - EXPECT_TRUE(squeeze->get_output_partial_shape(0).compatible(exp_shape)); + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_TRUE(squeeze->get_output_partial_shape(0).compatible(exp_shape)); + } + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_TRUE(squeeze->get_output_partial_shape(0).compatible(exp_shape)); + } } TEST_P(SqueezeTest, partial_shape_dimension_propagation_dynamic_axes) { const auto axes_node = std::make_shared(element::u64, PartialShape::dynamic()); - const auto squeeze = std::make_shared(param, axes_node); - - EXPECT_EQ(squeeze->get_element_type(), element::f32); - EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); + } + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); + } } TEST_P(SqueezeTest, symbols_propagation) { @@ -321,9 +407,14 @@ TEST_P(SqueezeTest, symbols_propagation) { param = make_shared(element::f32, p_shape); const auto axes_node = std::make_shared(element::i32, Shape{axes.size()}, axes); - const auto squeeze = std::make_shared(param, axes_node); - - EXPECT_EQ(get_shape_symbols(squeeze->get_output_partial_shape(0)), exp_symbols); + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(get_shape_symbols(squeeze->get_output_partial_shape(0)), exp_symbols); + } + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(get_shape_symbols(squeeze->get_output_partial_shape(0)), exp_symbols); + } } using SqueezeShapeTests = SqueezeTest; @@ -336,10 +427,16 @@ INSTANTIATE_TEST_SUITE_P(type_prop_shrink_shape_no_axes, TEST_P(SqueezeShapeTests, shape_dimension_propagation_const_axis_i64) { param = std::make_shared(element::f64, p_shape.to_shape()); const auto axes_node = std::make_shared(element::i64, Shape{axes.size()}, axes); - const auto squeeze = std::make_shared(param, axes_node); - - 
EXPECT_EQ(squeeze->get_element_type(), element::f64); - EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape.to_shape()); + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f64); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape.to_shape()); + } + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f64); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape.to_shape()); + } } using SqueezeNoAxesTest = SqueezeTest; @@ -350,10 +447,16 @@ INSTANTIATE_TEST_SUITE_P(type_prop_shrink_shape_no_axes, PrintToStringParamName()); TEST_P(SqueezeNoAxesTest, partial_shape_dimension_propagation_no_axes) { - const auto squeeze = std::make_shared(param); - - EXPECT_EQ(squeeze->get_element_type(), element::f32); - EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + { + const auto squeeze = std::make_shared(param); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + } + { + const auto squeeze = std::make_shared(param); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + } } using SqueezeScalarAxisTest = SqueezeTest; @@ -368,25 +471,35 @@ INSTANTIATE_TEST_SUITE_P( TEST_P(SqueezeScalarAxisTest, axis_value_as_vector) { const auto axes_node = std::make_shared(element::i32, Shape{}, axes); - const auto squeeze = std::make_shared(param, axes_node); - - EXPECT_EQ(squeeze->get_element_type(), element::f32); - EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + } + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + } } TEST_P(SqueezeScalarAxisTest, axis_value_as_integer) { const auto axes_node = std::make_shared(element::i32, Shape{}, axes.front()); - const auto squeeze = std::make_shared(param, axes_node); - - EXPECT_EQ(squeeze->get_element_type(), element::f32); - EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + } + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + } } using SqueezeBoundTest = UnSqueezeBoundTest; -INSTANTIATE_TEST_SUITE_P( - type_prop_bounds_propagate, - SqueezeBoundTest, +const auto test_values_in = Values(std::make_tuple(PartialShape::dynamic(6), PartialShape::dynamic(1)), std::make_tuple(PartialShape{Dimension(-1)}, PartialShape{Dimension(-1)}), std::make_tuple(PartialShape{Dimension::dynamic(), 8}, PartialShape{Dimension::dynamic()}), @@ -394,34 +507,136 @@ INSTANTIATE_TEST_SUITE_P( std::make_tuple(PartialShape{Dimension(20, -1), Dimension::dynamic()}, PartialShape{{20, -1}}), std::make_tuple(PartialShape{Dimension(-1, 5), Dimension::dynamic()}, PartialShape{Dimension(-1, 5)}), std::make_tuple(PartialShape{15}, PartialShape{15}), - std::make_tuple(PartialShape{2, 6}, PartialShape{2})), - PrintToStringParamName()); + std::make_tuple(PartialShape{2, 6}, PartialShape{2})); + 
+INSTANTIATE_TEST_SUITE_P(type_prop_bounds_propagate, SqueezeBoundTest, test_values_in, PrintToStringParamName()); /** * \brief Check symbol and dynamic value propagation. * * Test use evaluate symbol, lower/upper. */ -TEST_P(SqueezeBoundTest, propagate_symbol_and_dynamic_value) { +TEST_P(SqueezeBoundTest, propagate_symbol_and_dynamic_value_squeeze_v0) { PartialShape symboled_shape = PartialShape{p_shape}; in_symbols = set_shape_symbols(symboled_shape); - constexpr auto et = element::i64; - const auto symboled_param = std::make_shared(et, symboled_shape); - const auto symboled_shape_of = std::make_shared(symboled_param); + const auto squeeze = create_squeeze(symboled_shape); + const auto bc = std::make_shared(param, squeeze); + + EXPECT_EQ(bc->get_output_partial_shape(0), exp_shape); + const auto symbols = get_shape_symbols(bc->get_output_partial_shape(0)); + EXPECT_THAT(symbols, ElementsAre(in_symbols.front())); +} - const auto zero = std::vector{0}; - const auto axis = std::make_shared(et, Shape{}, zero); - const auto indices = std::make_shared(et, Shape{}, zero); - const auto gather = std::make_shared(symboled_shape_of, indices, axis); - const auto axis_1 = std::make_shared(et, Shape{2}, std::vector{0, 1}); - const auto unsqueeze = std::make_shared(gather, axis_1); - const auto squeeze = std::make_shared(unsqueeze, axis); +/** + * \brief Check symbol and dynamic value propagation. + * + * Test use evaluate symbol, lower/upper. + */ +TEST_P(SqueezeBoundTest, propagate_symbol_and_dynamic_value_squeeze_v15) { + PartialShape symboled_shape = PartialShape{p_shape}; + + in_symbols = set_shape_symbols(symboled_shape); + const auto squeeze = create_squeeze(symboled_shape); const auto bc = std::make_shared(param, squeeze); EXPECT_EQ(bc->get_output_partial_shape(0), exp_shape); const auto symbols = get_shape_symbols(bc->get_output_partial_shape(0)); EXPECT_THAT(symbols, ElementsAre(in_symbols.front())); } + +using SqueezeAxesDynamicRankTestParam = decltype(std::tuple_cat(SqueezeTypePropTestParam{}, std::make_tuple(false))); +class SqueezeAxesDynamicRank : public ::testing::TestWithParam { +protected: + ov::PartialShape p_shape{}, exp_shape{}; + std::vector axes{}; + bool allow_axis_skip{}; +}; + +INSTANTIATE_TEST_SUITE_P( + SqueezeAxesDynamicRankTests, + SqueezeAxesDynamicRank, + ::testing::Values( + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{}, PartialShape::dynamic(), false), + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{}, PartialShape::dynamic(), true), + + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{0}, PartialShape{2, -1, 4}, false), + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{0}, PartialShape{2, -1, 4}, true), + + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{2}, PartialShape{1, 2, 4}, false), + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{2}, PartialShape::dynamic(), true), + + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{0, 2}, PartialShape{2, 4}, false), + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{0, 2}, PartialShape::dynamic(), true), + + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{1}, PartialShape{1, 2, -1, 4}, false), + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{1}, PartialShape{1, 2, -1, 4}, true), + + std::make_tuple(PartialShape{2, 4}, std::vector{1}, PartialShape{2, 4}, false), + std::make_tuple(PartialShape{2, 4}, std::vector{1}, PartialShape{2, 4}, true), + + std::make_tuple(PartialShape{2, {3, 5}}, std::vector{}, PartialShape{2, {3, 5}}, false), + 
std::make_tuple(PartialShape{2, {3, 5}}, std::vector{}, PartialShape{2, {3, 5}}, true), + + std::make_tuple(PartialShape{1, 2, -1}, std::vector{0, 1}, PartialShape{2, -1}, false), + std::make_tuple(PartialShape{1, 2, -1}, std::vector{0, 1}, PartialShape{2, -1}, true), + + std::make_tuple(PartialShape{1, 2, -1}, std::vector{1}, PartialShape{1, 2, -1}, false), + std::make_tuple(PartialShape{1, 2, -1}, std::vector{1}, PartialShape{1, 2, -1}, true), + + std::make_tuple(PartialShape{1, 1, -1}, std::vector{0, 1}, PartialShape{-1}, false), + std::make_tuple(PartialShape{1, 1, -1}, std::vector{0, 1}, PartialShape{-1}, true), + + std::make_tuple(PartialShape{1, 1, -1}, std::vector{1}, PartialShape{1, -1}, false), + std::make_tuple(PartialShape{1, 1, -1}, std::vector{1}, PartialShape{1, -1}, true), + + std::make_tuple(PartialShape{1, 2, 3}, std::vector{}, PartialShape{2, 3}, false), + std::make_tuple(PartialShape{1, 2, 3}, std::vector{}, PartialShape{2, 3}, true))); + +TEST_P(SqueezeAxesDynamicRank, squeeze_axes_dynamic_rank_param) { + const auto& params = GetParam(); + p_shape = std::get<0>(params); + axes = std::get<1>(params); + exp_shape = std::get<2>(params); + allow_axis_skip = std::get<3>(params); + + auto param = make_shared(element::f32, p_shape); + auto axes_node = make_shared(element::u64, Shape{axes.size()}, axes); + const auto squeeze = std::make_shared(param, axes_node, allow_axis_skip); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + EXPECT_EQ(squeeze->get_allow_axis_skip(), allow_axis_skip); +} + +TEST(SqueezeDynamicAxis, squeeze_dynamic_non_const_single_axis) { + auto p_shape = PartialShape{1, 2, -1, 4}; + auto exp_shape = PartialShape::dynamic(); + auto allow_axis_skip = true; + + auto param = make_shared(element::f32, p_shape); + auto axes_node = make_shared(element::i32, Shape{1}); + const auto squeeze = std::make_shared(param, axes_node, allow_axis_skip); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + EXPECT_EQ(squeeze->get_allow_axis_skip(), allow_axis_skip); +} + +TEST(SqueezeDynamicAxis, squeeze_dynamic_non_const_axes) { + auto p_shape = PartialShape{1, 2, -1, 4}; + auto exp_shape = PartialShape::dynamic(); + auto allow_axis_skip = true; + + auto param = make_shared(element::f32, p_shape); + auto axes_node = make_shared(element::i32, PartialShape{-1}); + const auto squeeze = std::make_shared(param, axes_node, allow_axis_skip); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + EXPECT_EQ(squeeze->get_allow_axis_skip(), allow_axis_skip); +} + +} // namespace diff --git a/src/core/tests/visitors/op/squeeze.cpp b/src/core/tests/visitors/op/squeeze.cpp index 6eb1674b26329a..be596a5fb1dc67 100644 --- a/src/core/tests/visitors/op/squeeze.cpp +++ b/src/core/tests/visitors/op/squeeze.cpp @@ -6,7 +6,16 @@ #include "unary_ops.hpp" +namespace v0 { using Types = ::testing::Types, UnaryOperatorType>; INSTANTIATE_TYPED_TEST_SUITE_P(visitor_without_attribute, UnaryOperatorVisitor, Types, UnaryOperatorTypeName); +} // namespace v0 + +namespace v15 { +using Types = ::testing::Types, + UnaryOperatorTypeWithAttribute>; + +INSTANTIATE_TYPED_TEST_SUITE_P(visitor_single_attribute, UnaryOperatorVisitor, Types, UnaryOperatorTypeName); +} // namespace v15 diff --git a/src/core/tests/visitors/op/unary_ops.hpp b/src/core/tests/visitors/op/unary_ops.hpp index 
3bef2429983e9f..6cc2afda62e253 100644 --- a/src/core/tests/visitors/op/unary_ops.hpp +++ b/src/core/tests/visitors/op/unary_ops.hpp @@ -9,12 +9,17 @@ #include "openvino/op/parameter.hpp" #include "visitors/visitors.hpp" -template +template class UnaryOperatorType { public: using op_type = T; static constexpr ov::element::Type_t element_type = ELEMENT_TYPE; + static constexpr int expected_attr_count = ATTRIBUTES_COUNT; }; + +template +using UnaryOperatorTypeWithAttribute = UnaryOperatorType; + template class UnaryOperatorVisitor : public testing::Test {}; @@ -43,7 +48,7 @@ TYPED_TEST_P(UnaryOperatorVisitor, No_Attribute_4D) { EXPECT_NO_THROW(auto g_op_func = ov::as_type_ptr(builder.create())); - const auto expected_attr_count = 0; + const auto expected_attr_count = TypeParam::expected_attr_count; EXPECT_EQ(builder.get_value_map_size(), expected_attr_count); } diff --git a/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp b/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp index bb2d5e5e84b267..1921169f83afd7 100644 --- a/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp +++ b/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp @@ -407,6 +407,7 @@ using IStaticShapeInferFactory = template <> const IStaticShapeInferFactory::TRegistry IStaticShapeInferFactory::registry{ // opset15 + _OV_OP_SHAPE_INFER_MASK_REG(op::v15::Squeeze, ShapeInferTA, util::bit::mask(1)), _OV_OP_SHAPE_INFER_MASK_REG(op::v15::SearchSorted, ShapeInferTA, util::bit::mask()), _OV_OP_SHAPE_INFER_MASK_REG(op::v15::StringTensorUnpack, ShapeInferTA, util::bit::mask(0)), _OV_OP_SHAPE_INFER_MASK_REG(op::v15::StringTensorPack, ShapeInferTA, util::bit::mask(0, 1)), diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/squeeze_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/squeeze_shape_inference_test.cpp index 69da74b10a2f45..5f790135780013 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/squeeze_shape_inference_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/squeeze_shape_inference_test.cpp @@ -16,14 +16,16 @@ using namespace ov; using namespace ov::intel_cpu; using namespace testing; -class SqueezeStaticShapeInferenceAssertTest : public OpStaticShapeInferenceTest { +namespace v0 { + +class SqueezeV0StaticShapeInferenceAssertTest : public OpStaticShapeInferenceTest { protected: void SetUp() override { output_shapes = ShapeVector(1); } }; -TEST_F(SqueezeStaticShapeInferenceAssertTest, no_axes) { +TEST_F(SqueezeV0StaticShapeInferenceAssertTest, no_axes) { const auto arg = std::make_shared(element::f64, PartialShape{-1, -1}); const auto axes = std::make_shared(element::i64, PartialShape{1}); const auto op = make_op(arg, axes); @@ -35,7 +37,7 @@ TEST_F(SqueezeStaticShapeInferenceAssertTest, no_axes) { HasSubstr("Check 'constant != nullptr'")); } -TEST_F(SqueezeStaticShapeInferenceAssertTest, parameter_static_shape_axes_no_data) { +TEST_F(SqueezeV0StaticShapeInferenceAssertTest, parameter_static_shape_axes_no_data) { const auto arg = std::make_shared(element::f64, Shape{2, 1, 3, 1}); const auto axes = std::make_shared(element::i64, Shape{2}); const auto op = make_op(arg, axes); @@ -52,11 +54,11 @@ using TestParams = std::tuple; -class SqueezeStaticShapeInferenceTest : public SqueezeStaticShapeInferenceAssertTest, +class SqueezeV0StaticShapeInferenceTest : public SqueezeV0StaticShapeInferenceAssertTest, public WithParamInterface { protected: void SetUp() override { - 
SqueezeStaticShapeInferenceAssertTest::SetUp(); + SqueezeV0StaticShapeInferenceAssertTest::SetUp(); std::tie(input_shapes, axes, exp_shape) = GetParam(); output_shapes = ShapeVector(1); @@ -68,7 +70,7 @@ class SqueezeStaticShapeInferenceTest : public SqueezeStaticShapeInferenceAssert }; INSTANTIATE_TEST_SUITE_P(1d_shapes, - SqueezeStaticShapeInferenceTest, + SqueezeV0StaticShapeInferenceTest, Values(make_tuple(ShapeVector{{1}, {1}}, std::vector{-1}, StaticShape({})), make_tuple(ShapeVector{{6}, {1}}, std::vector{-1}, StaticShape({6})), make_tuple(ShapeVector{{1}, {1}}, std::vector{0}, StaticShape({}))), @@ -76,7 +78,7 @@ INSTANTIATE_TEST_SUITE_P(1d_shapes, INSTANTIATE_TEST_SUITE_P( multi_dim_shapes, - SqueezeStaticShapeInferenceTest, + SqueezeV0StaticShapeInferenceTest, Values(make_tuple(ShapeVector{{1, 2, 3, 1}, {2}}, std::vector{0, 3}, StaticShape({2, 3})), make_tuple(ShapeVector{{2, 1, 1, 4}, {2}}, std::vector{2, 1}, StaticShape({2, 4})), make_tuple(ShapeVector{{2, 1, 1, 4, 1}, {2}}, std::vector{0, 1, -2, -1}, StaticShape({2, 1, 4})), @@ -92,7 +94,7 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( multi_dim_shapes_repeated_axis, - SqueezeStaticShapeInferenceTest, + SqueezeV0StaticShapeInferenceTest, Values(make_tuple(ShapeVector{{2, 1, 3}, {2}}, std::vector{1, 1}, StaticShape({2, 3})), make_tuple(ShapeVector{{3, 1, 2, 1}, {3}}, std::vector{1, -1, 1}, StaticShape({3, 2})), make_tuple(ShapeVector{{3, 1, 2, 1}, {3}}, std::vector{1, -1, 1, -1}, StaticShape({3, 2})), @@ -100,7 +102,7 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(ShapeVector{{2, 6, 7, 8, 1}, {2}}, std::vector{-1, -1}, StaticShape({2, 6, 7, 8}))), PrintToStringParamName()); -TEST_P(SqueezeStaticShapeInferenceTest, shape_inference_empty_const_map) { +TEST_P(SqueezeV0StaticShapeInferenceTest, shape_inference_empty_const_map) { const auto axes_node = std::make_shared(element::i64, Shape{axes.size()}, axes); const auto op = make_op(arg, axes_node); @@ -109,8 +111,8 @@ TEST_P(SqueezeStaticShapeInferenceTest, shape_inference_empty_const_map) { ASSERT_EQ(output_shapes.front(), exp_shape); } -TEST_P(SqueezeStaticShapeInferenceTest, shape_inference_with_const_map) { - const auto axes_node = std::make_shared(element::i64, Shape{1}); +TEST_P(SqueezeV0StaticShapeInferenceTest, shape_inference_with_const_map) { + const auto axes_node = std::make_shared(element::i64, ov::PartialShape::dynamic()); const auto op = make_op(arg, axes_node); const auto axes_tensor = axes.empty() ? 
ov::Tensor(element::i64, ov::Shape{axes.size()}) @@ -121,3 +123,115 @@ TEST_P(SqueezeStaticShapeInferenceTest, shape_inference_with_const_map) { ASSERT_EQ(output_shapes.front(), exp_shape); } + +} // namespace v0 + +namespace v15 { + +class SqueezeV15StaticShapeInferenceAssertTest : public OpStaticShapeInferenceTest { +protected: + void SetUp() override { + output_shapes = ShapeVector(1); + } +}; + +TEST_F(SqueezeV15StaticShapeInferenceAssertTest, no_axes) { + const auto arg = std::make_shared(element::f64, PartialShape{-1, -1}); + const auto axes = std::make_shared(element::i64, PartialShape{1}); + const auto op = make_op(arg, axes); + + input_shapes = ShapeVector{{5, 6}, axes->get_shape()}; + + OV_EXPECT_THROW(shape_inference(op.get(), input_shapes), + NodeValidationFailure, + HasSubstr("Check 'constant != nullptr'")); +} + +TEST_F(SqueezeV15StaticShapeInferenceAssertTest, parameter_static_shape_axes_no_data) { + const auto arg = std::make_shared(element::f64, Shape{2, 1, 3, 1}); + const auto axes = std::make_shared(element::i64, Shape{2}); + const auto op = make_op(arg, axes); + + input_shapes = ShapeVector{arg->get_shape(), axes->get_shape()}; + + OV_EXPECT_THROW(shape_inference(op.get(), input_shapes), + NodeValidationFailure, + HasSubstr("Check 'constant != nullptr'")); +} + +using TestParams = std::tuple, // Squeeze axes + StaticShape // Expected shape + >; + +class SqueezeV15StaticShapeInferenceTest : public SqueezeV15StaticShapeInferenceAssertTest, + public WithParamInterface { +protected: + void SetUp() override { + SqueezeV15StaticShapeInferenceAssertTest::SetUp(); + std::tie(input_shapes, axes, exp_shape) = GetParam(); + + output_shapes = ShapeVector(1); + arg = std::make_shared(element::f32, input_shapes.front().get_shape()); + } + + std::vector axes; + std::shared_ptr arg; +}; + +INSTANTIATE_TEST_SUITE_P(1d_shapes, + SqueezeV15StaticShapeInferenceTest, + Values(make_tuple(ShapeVector{{1}, {1}}, std::vector{-1}, StaticShape({})), + make_tuple(ShapeVector{{6}, {1}}, std::vector{-1}, StaticShape({6})), + make_tuple(ShapeVector{{1}, {1}}, std::vector{0}, StaticShape({}))), + PrintToStringParamName()); + +INSTANTIATE_TEST_SUITE_P( + multi_dim_shapes, + SqueezeV15StaticShapeInferenceTest, + Values(make_tuple(ShapeVector{{1, 2, 3, 1}, {2}}, std::vector{0, 3}, StaticShape({2, 3})), + make_tuple(ShapeVector{{2, 1, 1, 4}, {2}}, std::vector{2, 1}, StaticShape({2, 4})), + make_tuple(ShapeVector{{2, 1, 1, 4, 1}, {2}}, std::vector{0, 1, -2, -1}, StaticShape({2, 1, 4})), + make_tuple(ShapeVector{{1, 3, 1, 2, 1}, {3}}, std::vector{0, 2, 4}, StaticShape({3, 2})), + make_tuple(ShapeVector{{1, 3, 1, 2, 1}, {3}}, std::vector{4, 2, 0}, StaticShape({3, 2})), + make_tuple(ShapeVector{{1, 3, 1, 2, 1}, {3}}, std::vector{2, 0, 4}, StaticShape({3, 2})), + make_tuple(ShapeVector{{10, 1, 0, 1, 3, 1, 1}, {4}}, + std::vector{1, -1, 3, -2}, + StaticShape({10, 0, 3})), + make_tuple(ShapeVector{{10, 1, 0, 1, 3, 1, 1}, {}}, std::vector{}, StaticShape({10, 0, 3})), + make_tuple(ShapeVector{{2, 1, 7, 8, 3}, {1}}, std::vector{1}, StaticShape({2, 7, 8, 3}))), + PrintToStringParamName()); + +INSTANTIATE_TEST_SUITE_P( + multi_dim_shapes_repeated_axis, + SqueezeV15StaticShapeInferenceTest, + Values(make_tuple(ShapeVector{{2, 1, 3}, {2}}, std::vector{1, 1}, StaticShape({2, 3})), + make_tuple(ShapeVector{{3, 1, 2, 1}, {3}}, std::vector{1, -1, 1}, StaticShape({3, 2})), + make_tuple(ShapeVector{{3, 1, 2, 1}, {3}}, std::vector{1, -1, 1, -1}, StaticShape({3, 2})), + make_tuple(ShapeVector{{1, 3, 1, 2, 1}, {3}}, 
std::vector{2, -1, 2, -1, 0}, StaticShape({3, 2})), + make_tuple(ShapeVector{{2, 6, 7, 8, 1}, {2}}, std::vector{-1, -1}, StaticShape({2, 6, 7, 8}))), + PrintToStringParamName()); + +TEST_P(SqueezeV15StaticShapeInferenceTest, shape_inference_empty_const_map) { + const auto axes_node = std::make_shared(element::i64, Shape{axes.size()}, axes); + const auto op = make_op(arg, axes_node); + + output_shapes = shape_inference(op.get(), input_shapes); + + ASSERT_EQ(output_shapes.front(), exp_shape); +} + +TEST_P(SqueezeV15StaticShapeInferenceTest, shape_inference_with_const_map) { + const auto axes_node = std::make_shared(element::i64, ov::PartialShape::dynamic()); + const auto op = make_op(arg, axes_node); + + const auto axes_tensor = axes.empty() ? ov::Tensor(element::i64, ov::Shape{axes.size()}) + : ov::Tensor(element::i64, ov::Shape{axes.size()}, axes.data()); + const auto constant_data = std::unordered_map{{1, axes_tensor}}; + + output_shapes = shape_inference(op.get(), input_shapes, constant_data); + + ASSERT_EQ(output_shapes.front(), exp_shape); +} + +} // namespace v15 diff --git a/src/plugins/template/tests/functional/op_reference/squeeze.cpp b/src/plugins/template/tests/functional/op_reference/squeeze.cpp index 8bf1902c403a1b..e397c7c403cec8 100644 --- a/src/plugins/template/tests/functional/op_reference/squeeze.cpp +++ b/src/plugins/template/tests/functional/op_reference/squeeze.cpp @@ -63,7 +63,7 @@ struct SqueezeParams { bool m_axes_node; }; -class ReferenceSqueezeLayerTest : public testing::TestWithParam, public CommonReferenceTest { +class ReferenceSqueezeLayerTestBase : public testing::TestWithParam, public CommonReferenceTest { public: void SetUp() override { const auto params = GetParam(); @@ -90,7 +90,12 @@ class ReferenceSqueezeLayerTest : public testing::TestWithParam, } private: - static std::shared_ptr CreateFunction(const SqueezeParams& params) { + virtual std::shared_ptr CreateFunction(const SqueezeParams&) = 0; +}; + +class ReferenceSqueezeLayerTest : public ReferenceSqueezeLayerTestBase { +private: + std::shared_ptr CreateFunction(const SqueezeParams& params) override { const auto in = std::make_shared(params.m_input_type, params.m_input_shape); std::shared_ptr axes_node = NULL; std::shared_ptr squeeze = NULL; @@ -180,4 +185,58 @@ INSTANTIATE_TEST_SUITE_P(smoke_Squeeze_With_Hardcoded_Refs, ::testing::ValuesIn(generateCombinedParamsForSqueeze()), ReferenceSqueezeLayerTest::getTestCaseName); +class ReferenceSqueezeV15LayerTest : public ReferenceSqueezeLayerTestBase { +private: + std::shared_ptr CreateFunction(const SqueezeParams& params) override { + const auto in = std::make_shared(params.m_input_type, params.m_input_shape); + std::shared_ptr axes_node = NULL; + std::shared_ptr squeeze = NULL; + if (params.m_axes_node) { + axes_node = + std::make_shared(params.m_axes_type, params.m_axes_shape, params.m_axes_value.data()); + squeeze = std::make_shared(in, axes_node); + } else { + squeeze = std::make_shared(in); + } + + return std::make_shared(squeeze, ParameterVector{in}); + } +}; + +TEST_P(ReferenceSqueezeV15LayerTest, CompareWithHardcodedRefs) { + Exec(); +} + +INSTANTIATE_TEST_SUITE_P(smoke_Squeeze_With_Hardcoded_Refs, + ReferenceSqueezeV15LayerTest, + ::testing::ValuesIn(generateCombinedParamsForSqueeze()), + ReferenceSqueezeV15LayerTest::getTestCaseName); + +class ReferenceSqueezeV15AttributeSetLayerTest : public ReferenceSqueezeLayerTestBase { +private: + std::shared_ptr CreateFunction(const SqueezeParams& params) override { + const auto in = 
std::make_shared(params.m_input_type, params.m_input_shape); + std::shared_ptr axes_node = NULL; + std::shared_ptr squeeze = NULL; + if (params.m_axes_node) { + axes_node = + std::make_shared(params.m_axes_type, params.m_axes_shape, params.m_axes_value.data()); + squeeze = std::make_shared(in, axes_node, true); + } else { + squeeze = std::make_shared(in, true); + } + + return std::make_shared(squeeze, ParameterVector{in}); + } +}; + +TEST_P(ReferenceSqueezeV15AttributeSetLayerTest, CompareWithHardcodedRefs) { + Exec(); +} + +INSTANTIATE_TEST_SUITE_P(smoke_Squeeze_With_Hardcoded_Refs, + ReferenceSqueezeV15AttributeSetLayerTest, + ::testing::ValuesIn(generateCombinedParamsForSqueeze()), + ReferenceSqueezeV15AttributeSetLayerTest::getTestCaseName); + } // namespace diff --git a/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/single_op_graph.cpp b/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/single_op_graph.cpp index f38427b7b192ed..bcb259cd49b60f 100644 --- a/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/single_op_graph.cpp +++ b/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/single_op_graph.cpp @@ -1193,7 +1193,15 @@ std::shared_ptr generate(const std::shared_ptr & const auto axes = std::make_shared(ov::element::i64, ov::Shape{2}, std::vector{0, 2}); auto Node = std::make_shared(params.at(0), axes); ov::ResultVector results{std::make_shared(Node)}; - return std::make_shared(results, params, "SqueezeGraph"); + return std::make_shared(results, params, "SqueezeV0Graph"); +} + +std::shared_ptr generate(const std::shared_ptr &node) { + ov::ParameterVector params{std::make_shared(ov::element::f32, ov::Shape{{1, 4, 1, 1, 2}})}; + const auto axes = std::make_shared(ov::element::i64, ov::Shape{2}, std::vector{0, 2}); + auto Node = std::make_shared(params.at(0), axes); + ov::ResultVector results{std::make_shared(Node)}; + return std::make_shared(results, params, "SqueezeV15Graph"); } std::shared_ptr generate(const std::shared_ptr &node) { diff --git a/src/tests/test_utils/common_test_utils/include/common_test_utils/type_prop.hpp b/src/tests/test_utils/common_test_utils/include/common_test_utils/type_prop.hpp index 2a953ad27740e6..e097a3ab957d13 100644 --- a/src/tests/test_utils/common_test_utils/include/common_test_utils/type_prop.hpp +++ b/src/tests/test_utils/common_test_utils/include/common_test_utils/type_prop.hpp @@ -7,7 +7,11 @@ #include "gmock/gmock.h" #include "openvino/core/dimension.hpp" #include "openvino/core/partial_shape.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/gather.hpp" #include "openvino/op/parameter.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/op/unsqueeze.hpp" #define EXPECT_HAS_SUBSTRING(haystack, needle) EXPECT_PRED_FORMAT2(testing::IsSubstring, needle, haystack) @@ -56,6 +60,23 @@ class UnSqueezeBoundTest : public testing::WithParamInterface(ov::element::f32, ov::PartialShape{1}); } + template + auto create_squeeze(ov::PartialShape symboled_shape) -> std::shared_ptr { + constexpr auto et = ov::element::i64; + const auto symboled_param = std::make_shared(et, symboled_shape); + const auto symboled_shape_of = std::make_shared(symboled_param); + + const auto zero = std::vector{0}; + const auto axis = std::make_shared(et, ov::Shape{}, zero); + const auto indices = std::make_shared(et, ov::Shape{}, zero); + const auto gather = std::make_shared(symboled_shape_of, 
indices, axis); + const auto axis_1 = std::make_shared(et, ov::Shape{2}, std::vector{0, 1}); + const auto unsqueeze = std::make_shared(gather, axis_1); + const auto squeeze = std::make_shared(unsqueeze, axis); + + return squeeze; + } + ov::TensorSymbol in_symbols; }; From 1ec99ee520cdd6fe5dd25c83a36d440170b4f888 Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Wed, 30 Oct 2024 20:37:51 +0100 Subject: [PATCH 113/233] [PT FE] Support torch 2.5.1 (#27334) ### Details: - *Support torch 2.5.1* - *Update test requirements* ### Tickets: - *ticket-id* Signed-off-by: Maxim Vafin --- tests/constraints.txt | 2 +- tests/requirements_pytorch | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/constraints.txt b/tests/constraints.txt index c6e2e5e65f96fe..2272151565ca8a 100644 --- a/tests/constraints.txt +++ b/tests/constraints.txt @@ -28,5 +28,5 @@ networkx<=3.3 flax<=0.10.0 --extra-index-url https://download.pytorch.org/whl/cpu -torch~=2.5.0; platform_system != "Darwin" or platform_machine != "x86_64" +torch~=2.5.1; platform_system != "Darwin" or platform_machine != "x86_64" torch~=2.2.0; platform_system == "Darwin" and platform_machine == "x86_64" diff --git a/tests/requirements_pytorch b/tests/requirements_pytorch index c2873210003b7d..56446beba12600 100644 --- a/tests/requirements_pytorch +++ b/tests/requirements_pytorch @@ -3,13 +3,13 @@ # optimum still requires numpy<2.0.0 numpy==1.26.4; python_version < "3.12" numpy==2.1.1; python_version >= "3.12" -torch==2.5.0; platform_system != "Darwin" or platform_machine != "x86_64" +torch==2.5.1; platform_system != "Darwin" or platform_machine != "x86_64" torch==2.2.2; platform_system == "Darwin" and platform_machine == "x86_64" --extra-index-url https://download.pytorch.org/whl/cpu -torchvision==0.20.0; platform_system != "Darwin" or platform_machine != "x86_64" +torchvision==0.20.1; platform_system != "Darwin" or platform_machine != "x86_64" torchvision==0.17.2; platform_system == "Darwin" and platform_machine == "x86_64" -torchaudio==2.5.0; platform_system != "Darwin" or platform_machine != "x86_64" +torchaudio==2.5.1; platform_system != "Darwin" or platform_machine != "x86_64" torchaudio==2.2.2; platform_system == "Darwin" and platform_machine == "x86_64" # transformers 4.45.1 is available # but optimum still requires <4.45.0 From a6eb53506e9b7fe6a29f128f16f3988da870bec6 Mon Sep 17 00:00:00 2001 From: virajwad <84867530+virajwad@users.noreply.github.com> Date: Wed, 30 Oct 2024 12:50:41 -0700 Subject: [PATCH 114/233] [ONNX] Added QuickGelu from com.microsoft domain (#27238) ### Details: - Microsoft Contrib Operator "QuickGelu" for ONNX RT ### Tickets: - CVS-152783 --------- Co-authored-by: Georgy Krivoruchko --- .../src/op/com.microsoft/quick_gelu.cpp | 58 +++++++++++++++++++ .../models/com.microsoft/quick_gelu.prototxt | 52 +++++++++++++++++ .../tests/onnx_import_com_microsoft.in.cpp | 26 +++++++++ 3 files changed, 136 insertions(+) create mode 100644 src/frontends/onnx/frontend/src/op/com.microsoft/quick_gelu.cpp create mode 100644 src/frontends/onnx/tests/models/com.microsoft/quick_gelu.prototxt diff --git a/src/frontends/onnx/frontend/src/op/com.microsoft/quick_gelu.cpp b/src/frontends/onnx/frontend/src/op/com.microsoft/quick_gelu.cpp new file mode 100644 index 00000000000000..c4144be9b5ff44 --- /dev/null +++ b/src/frontends/onnx/frontend/src/op/com.microsoft/quick_gelu.cpp @@ -0,0 +1,58 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "core/operator_set.hpp" 
+#include "exceptions.hpp" +#include "openvino/frontend/exception.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/sigmoid.hpp" +#include "utils/common.hpp" + +using namespace ov::op; + +namespace ov { +namespace frontend { +namespace onnx { +namespace com_microsoft { +namespace opset_1 { +ov::OutputVector quick_gelu(const ov::frontend::onnx::Node& node) { + // Original Documentation: + // https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#com.microsoft.QuickGelu + // Goal: Compute x * Sigmoid(alpha * x) + common::default_op_checks(node, 1); + + const auto inputs = node.get_ov_inputs(); + const auto& x = inputs[0]; + + // Constrain input type to float16, float, double (f64), bfloat16 + auto element_type = x.get_element_type(); + CHECK_VALID_NODE(node, + element_type == ov::element::f16 || element_type == ov::element::f32 || + element_type == ov::element::f64 || element_type == ov::element::bf16, + "Unsupported input x type, accepted FP16, FP32, FP64, BFP16 but got: ", + element_type); + + // Get attribute from node + const float alpha = node.get_attribute_value("alpha"); + + // Numpy broadcasting rule is automatically applied with mismatched shapes according to: + // https://docs.openvino.ai/2022.3/openvino_docs_ops_arithmetic_Multiply_1.html "Tensor with dimension of size 1 + // will be implicitly broadcasted to match the size of the second tensor." Convert alpha to tensor with size 1 + const auto alpha_tensor = std::make_shared(ov::element::f32, Shape{1}, alpha); + + auto alpha_x = std::make_shared(alpha_tensor, x); + auto sig_alpha_x = std::make_shared(alpha_x); + auto result = std::make_shared(x, sig_alpha_x); + + return {result}; +} // func end + +ONNX_OP("QuickGelu", OPSET_SINCE(1), com_microsoft::opset_1::quick_gelu, MICROSOFT_DOMAIN); + +} // namespace opset_1 +} // namespace com_microsoft +} // namespace onnx +} // namespace frontend +} // namespace ov diff --git a/src/frontends/onnx/tests/models/com.microsoft/quick_gelu.prototxt b/src/frontends/onnx/tests/models/com.microsoft/quick_gelu.prototxt new file mode 100644 index 00000000000000..4fb110fd485833 --- /dev/null +++ b/src/frontends/onnx/tests/models/com.microsoft/quick_gelu.prototxt @@ -0,0 +1,52 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + name: "test_quick_gelu" + node { + input: "X" + output: "Y" + op_type: "QuickGelu" + attribute { + name: "alpha" + f: 0.9974269270896912 + type: FLOAT + } + domain: "com.microsoft" + } + input { + name: "X" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 5 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 5 + } + } + } + } + } +} +opset_import { + domain: "com.microsoft" + version: 1 +} \ No newline at end of file diff --git a/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp b/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp index da8189926a4546..900fc025d8d9ab 100644 --- a/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp +++ b/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp @@ -1330,3 +1330,29 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_matmulnbits_3x17) { } test_case.run(); } + +OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_quickgelu) { + const auto model = convert_model("com.microsoft/quick_gelu.onnx"); + auto test_case = ov::test::TestCase(model, s_device); + + const std::vector input_X{1, 2, 3, 
4, 5, 6, 7, 8, 9, 10}; + const std::vector output{0.7305524f, + 1.7605114f, + 2.8566725f, + 3.9273243f, + 4.9661055f, + 5.984934f, + 6.9935064f, + 7.997261f, + 8.998864f, + 9.999535f}; + + test_case.add_input(Shape{2, 5}, input_X); + test_case.add_expected_output(Shape{2, 5}, output); + + if (std::string("${BACKEND_NAME}") == std::string("IE_GPU")) { + test_case.run_with_tolerance_as_fp(0.0001f); + } else { + test_case.run(); + } +} From e44ea54991ffd6fe3b1e6b085676672f0fb4dc7b Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Wed, 30 Oct 2024 21:58:17 +0000 Subject: [PATCH 115/233] [NPUW] Revert check in preemptive tensor set (#27345) https://github.com/openvinotoolkit/openvino/pull/27313 --- .../intel_npu/src/plugin/npuw/just_sync_infer_request.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index 0070e6be2d2041..26363e66e55d2a 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -378,7 +378,9 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrm_compiled_submodels[i]; - if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) { + // FIXME: figure out our cases and if this should be replaced with && + // Note: replaced_by is utilized below unconditionally + if (!comp_model_desc.compiled_model || !comp_model_desc.replaced_by) { continue; } const auto real_idx = comp_model_desc.replaced_by.value(); From b5a59532a8b36a8bcc5066d78c3c83ff258f5f01 Mon Sep 17 00:00:00 2001 From: "Min, Byungil" Date: Thu, 31 Oct 2024 09:49:13 +0900 Subject: [PATCH 116/233] [GPU] Support int8 dyn-quan FC (#27027) ### Details: - Support FC dynamic quantize for 8Bit Asym model - Enable SLM for 8bit weight ### Tickets: - CVS-152990 --------- Signed-off-by: Min, Byung-il Signed-off-by: Min, Byungil --- .../fully_connected_gpu_bf_tiled.cl | 372 +++++++++++------- .../include/batch_headers/int4_utils.cl | 7 + .../fully_connected_kernel_bf_tiled.cpp | 93 +++-- .../test_cases/fully_connected_gpu_test.cpp | 160 +++++++- 4 files changed, 456 insertions(+), 176 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl index 70c55bfb73b8f5..ef4cc76650e0f3 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl @@ -22,36 +22,47 @@ #if FC_KERNEL_DYNAMIC_QUANTIZE KERNEL(quantize_input)( const __global INPUT0_TYPE* input, - __global char* quantized_input, - __global INPUT0_TYPE* de_quan_scale) { + __global DQ_TYPE* quantized_input, + __global INPUT0_TYPE* quan_var +) { const uint offset = get_global_id(0); const uint input_offset = offset * QUANTIZE_GROUP_SIZE; const uint quantize_block = QUANTIZE_GROUP_SIZE / 4; - half4 input_0[quantize_block]; - char4 quantized_value[quantize_block]; - half max[quantize_block]; + MAKE_VECTOR_TYPE(INPUT0_TYPE, INPUT_LOAD_SIZE) input_0[quantize_block]; + MAKE_VECTOR_TYPE(DQ_TYPE, INPUT_LOAD_SIZE) quantized_value[quantize_block]; + INPUT0_TYPE max[quantize_block]; unroll_for (uint i = 0 ; i < quantize_block ; ++i) { input_0[i] = vload4(0, &input[input_offset + i * 4]); max[i] = fmax(fmax(fabs(input_0[i][0]), fabs(input_0[i][1])), 
fmax(fabs(input_0[i][2]), fabs(input_0[i][3]))); } - half max_value = 0.001; - for (uint i = 0 ; i < quantize_block; i+=8) { - half temp = fmax(fmax(fmax(max[i], max[i+1]), fmax(max[i+2], max[i+3])), + INPUT0_TYPE max_value = 0.001; + for (uint i = 0 ; i < quantize_block ; i+=8) { + INPUT0_TYPE temp = fmax(fmax(fmax(max[i], max[i+1]), fmax(max[i+2], max[i+3])), fmax(fmax(max[i+4], max[i+5]), fmax(max[i+6], max[i+7]))); max_value = fmax(max_value, temp); } - half quan_scale = max_value / 128; - - unroll_for (uint i = 0 ; i < quantize_block ; ++i) { - quantized_value[i] = CAT(convert_, MAKE_VECTOR_TYPE(char, INPUT_LOAD_SIZE))(input_0[i] / (half4)quan_scale); + half quan_scale = (half)max_value / 127; + #if COMPRESSED_WEIGHTS_INT8 + half quantized_sum = 0; + #endif + for (uint i = 0 ; i < quantize_block ; ++i) { + half4 buff = input_0[i] / (half4)quan_scale; + quantized_value[i] = CAT(CAT(convert_, MAKE_VECTOR_TYPE(DQ_TYPE, INPUT_LOAD_SIZE)), _rte)(buff); + #if COMPRESSED_WEIGHTS_INT8 + quantized_sum += (buff[0] + buff[1] + buff[2] + buff[3]); + #endif vstore4(quantized_value[i], 0, &quantized_input[input_offset + i * 4]); } - de_quan_scale[offset] = quan_scale; + // Pair of quantizing_scale and quantized activation_sum for each group + quan_var[offset * 2] = quan_scale; + #if COMPRESSED_WEIGHTS_INT8 + quan_var[(offset * 2) + 1] = quantized_sum; + #endif } #else // !FC_KERNEL_DYNAMIC_QUANTIZE @@ -189,6 +200,7 @@ inline void FUNC(fc_bf_tiled_kernel_default)( #else uint gid = (uint)get_group_id(0); #endif + uint sglid = (uint)get_sub_group_local_id(); // Dispatch as bs_fs_bsv_fsv, where bsv = DISPATCH_BSV and fsv = DISPATCH_FSV. @@ -212,10 +224,9 @@ inline void FUNC(fc_bf_tiled_kernel_default)( ACCUMULATOR_VEC_TYPE acc[TILE_B] = { }; INPUT_VEC_TYPE in_0[TILE_B] = { }; -#if !USE_SLM - FILTER_VEC_TYPE wei = 0; -#endif - + #if !USE_SLM || !COMPRESSED_WEIGHTS_INT4 + FILTER_VEC_TYPE wei = 0; + #endif #if OUTPUT_3D uint out_b0 = out_b / OUTPUT_FEATURE_NUM; @@ -743,19 +754,31 @@ inline void FUNC(fc_bf_tiled_kernel_default)( // ===================================================================================================================================== } + + + // Dyc Quantize #if USE_SLM && DYNAMIC_QUANTIZE -#define PACKED_DQ_TYPE int -#define DQ_VEC_TYPE MAKE_VECTOR_TYPE(DQ_TYPE, TILE_IFM) -#define DQ_SLM_FILTER_VEC MAKE_VECTOR_TYPE(DQ_TYPE, 4) + +#if COMPRESSED_WEIGHTS_INT4 + #define SLM_WEIGHT_TYPE DQ_TYPE +#else + #define SLM_WEIGHT_TYPE FILTER_TYPE +#endif + +#define PACKED_DQ_TYPE uint +#define ACCUM_DQ_TYPE int #define DQ_SLM_FILTER_PACKED_VEC MAKE_VECTOR_TYPE(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE) -#define DQ_SLM_FILTER_UNPACKED_VEC MAKE_VECTOR_TYPE(DQ_TYPE, FILTER_ELEMENTS_PER_LOAD) -#define DQ_FILTER_VEC_TYPE MAKE_VECTOR_TYPE(DQ_TYPE, TILE_K_OFM) +#define SLM_WEIGHT_VEC MAKE_VECTOR_TYPE(SLM_WEIGHT_TYPE, INPUT_LOAD_SIZE) +#define SLM_WEIGHT_UNPACKED_VEC MAKE_VECTOR_TYPE(SLM_WEIGHT_TYPE, FILTER_ELEMENTS_PER_LOAD) +#define WEIGHT_VEC_TYPE MAKE_VECTOR_TYPE(SLM_WEIGHT_TYPE, TILE_K_OFM) +#define MAKE_DQ_TYPE_VEC(x) MAKE_VECTOR_TYPE(DQ_TYPE, x) #define TO_DQ_TYPE(x) CAT(CAT(convert_, DQ_TYPE),_sat)(x) #define TO_DQ_VEC_TYPE(x) CAT(convert_, DQ_VEC_TYPE)(x) -#define TO_DQ_SLM_FILTER_UNPACKED_VEC(x) CAT(convert_, DQ_SLM_FILTER_UNPACKED_VEC)(x) -#define TO_DQ_FILTER_VEC_TYPE(x) CAT(convert_, DQ_FILTER_VEC_TYPE)(x) +#define TO_ACCUM_DQ_TYPE(x) CAT(convert_, ACCUM_DQ_TYPE)(x) +#define TO_SLM_WEIGHT_UNPACKED_VEC(x) CAT(convert_, SLM_WEIGHT_UNPACKED_VEC)(x) +#define TO_WEIGHT_VEC_TYPE(x) 
CAT(convert_, WEIGHT_VEC_TYPE)(x) #define AS_TYPE_N_(type, n, x) as_##type##n(x) #define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x) @@ -764,8 +787,8 @@ inline void FUNC(fc_bf_tiled_kernel_default)( inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input, - __global char* quantized_input, - __global INPUT0_TYPE* scale, + __global DQ_TYPE* quantized_input, + __global INPUT0_TYPE* quan_var, // pair of params for each quantizing group : scale, activation_sum #if DECOMPRESSION_SCALE_TERM const __global DECOMPRESSION_SCALE_TYPE* decompression_scale, #endif @@ -774,7 +797,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( #endif __global OUTPUT_TYPE* output, const __global FILTER_TYPE* weights - , __local int* wei_local_mem + , __local uint* wei_local_mem #if BIAS_TERM , const __global BIAS_TYPE* biases #endif @@ -801,28 +824,32 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( uint out_f = gid * (TILE_OFM * SIMD); uint out_b = LWS_BATCHES * TILE_B * (uint)get_group_id(2) + local_id * TILE_B; -#if OUTPUT_3D - uint out_b0 = out_b / OUTPUT_FEATURE_NUM; - uint out_b1 = out_b % OUTPUT_FEATURE_NUM; - uint input_offset = out_b0 * INPUT0_BATCH_PITCH + out_b1 * INPUT0_FEATURE_PITCH + INPUT0_OFFSET; -#else - uint input_offset = out_b * TILE_IN_B_PITCH + INPUT0_OFFSET; -#endif + #if OUTPUT_3D + uint out_b0 = out_b / OUTPUT_FEATURE_NUM; + uint out_b1 = out_b % OUTPUT_FEATURE_NUM; + uint input_offset = out_b0 * INPUT0_BATCH_PITCH + out_b1 * INPUT0_FEATURE_PITCH + INPUT0_OFFSET; + #else + uint input_offset = out_b * TILE_IN_B_PITCH + INPUT0_OFFSET; + #endif -#if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2 - const int power_of_two_for_simd = 5; - const int power_of_two_for_osv = 6; - const uint osv64_weight_base = (( (int) (out_f >> power_of_two_for_osv) ) << power_of_two_for_osv); - const uint osv_weight_stride = (INPUT_ELEMENTS_COUNT >> 1); - const uint out_f_offset = (int)((out_f >> power_of_two_for_simd) & 0x1) << power_of_two_for_simd; - // out_f(32) : 0 * osv_weight_stride + 32; - // out_f(64) : 64 * osv_weight_stride + 0; - // out_f(128) : 64 * osv_weight_stride + 32; - // ... - uint weights_offset = osv64_weight_base * osv_weight_stride + out_f_offset; -#else - uint weights_offset = out_f * (INPUT_ELEMENTS_COUNT / 2); -#endif + #if COMPRESSED_WEIGHTS_INT4 + #if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2 + const int power_of_two_for_simd = 5; + const int power_of_two_for_osv = 6; + const uint osv64_weight_base = (( (int) (out_f >> power_of_two_for_osv) ) << power_of_two_for_osv); + const uint osv_weight_stride = (INPUT_ELEMENTS_COUNT >> 1); + const uint out_f_offset = (int)((out_f >> power_of_two_for_simd) & 0x1) << power_of_two_for_simd; + // out_f(32) : 0 * osv_weight_stride + 32; + // out_f(64) : 64 * osv_weight_stride + 0; + // out_f(128) : 64 * osv_weight_stride + 32; + // ... 
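+            // For reference, the offset computed below reduces to:
+            //   weights_offset = (out_f / 64) * 64 * (INPUT_ELEMENTS_COUNT / 2) + ((out_f / 32) % 2) * 32
+            // i.e. the base of the 64-output-channel block that owns out_f (each output channel occupies
+            // INPUT_ELEMENTS_COUNT / 2 bytes because two int4 weights are packed per byte), plus 32 when
+            // out_f lands in the upper half of that block.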
+ uint weights_offset = osv64_weight_base * osv_weight_stride + out_f_offset; + #else + uint weights_offset = out_f * (INPUT_ELEMENTS_COUNT / 2); + #endif + #else + uint weights_offset = out_f * INPUT_ELEMENTS_COUNT; + #endif ACCUMULATOR_VEC_TYPE acc[TILE_B] = { }; @@ -831,38 +858,42 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( PACKED_DQ_TYPE packed_in_0[HALF_TILE_B] = { }; // Packing char4 inputs to 1 integer INPUT0_TYPE de_quantize_scale[TILE_B]; -#if COMPRESSED_WEIGHTS && DECOMPRESSION_SCALE_GROUPS_NUM == 1 - #if DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) == 0 - ACCUMULATOR_VEC_TYPE d_scale = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(DECOMPRESSION_SCALE_TYPE, TILE_OFM, decompression_scale, out_f)); - #elif DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) != 0 - ACCUMULATOR_VEC_TYPE d_scale = 0; - unroll_for(uint of = 0; of < TILE_OFM; ++of) { - uint offset = out_f + of*SIMD + get_sub_group_local_id(); - if (offset < DECOMPRESSION_SCALE_LENGTH) - ((ACCUMULATOR_TYPE*)(&d_scale))[of] = decompression_scale[offset]; - } - #else - ACCUMULATOR_VEC_TYPE d_scale = decompression_scale[0]; + #if COMPRESSED_WEIGHTS_INT8 + INPUT0_TYPE activation_sum[TILE_B] = { }; #endif - ACCUMULATOR_TYPE* d_scales = (ACCUMULATOR_TYPE*)(&d_scale); -#endif + #if COMPRESSED_WEIGHTS && DECOMPRESSION_SCALE_GROUPS_NUM == 1 + #if DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) == 0 + ACCUMULATOR_VEC_TYPE d_scale = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(DECOMPRESSION_SCALE_TYPE, TILE_OFM, decompression_scale, out_f)); + #elif DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) != 0 + ACCUMULATOR_VEC_TYPE d_scale = 0; + unroll_for(uint of = 0; of < TILE_OFM; ++of) { + uint offset = out_f + of*SIMD + get_sub_group_local_id(); + if (offset < DECOMPRESSION_SCALE_LENGTH) + ((ACCUMULATOR_TYPE*)(&d_scale))[of] = decompression_scale[offset]; + } + #else + ACCUMULATOR_VEC_TYPE d_scale = decompression_scale[0]; + #endif -#if COMPRESSED_WEIGHTS && DECOMPRESSION_ZP_TERM && DECOMPRESSION_ZP_GROUPS_NUM == 1 && !DECOMPRESSION_ZP_SCALAR - #if DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) == 0 - ACCUMULATOR_VEC_TYPE d_zp = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(DECOMPRESSION_ZP_TYPE, TILE_OFM, decompression_zp, out_f)); - #elif DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) != 0 - ACCUMULATOR_VEC_TYPE d_zp = 0; - unroll_for(uint of = 0; of < TILE_OFM; ++of) { - uint offset = out_f + of*SIMD + get_sub_group_local_id(); - if (offset < DECOMPRESSION_ZP_LENGTH) - ((ACCUMULATOR_TYPE*)(&d_zp))[of] = decompression_zp[offset]; - } - #else - ACCUMULATOR_VEC_TYPE d_zp = decompression_zp[0]; + ACCUMULATOR_TYPE* d_scales = (ACCUMULATOR_TYPE*)(&d_scale); + #endif + + #if COMPRESSED_WEIGHTS && DECOMPRESSION_ZP_TERM && DECOMPRESSION_ZP_GROUPS_NUM == 1 && !DECOMPRESSION_ZP_SCALAR + #if DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) == 0 + ACCUMULATOR_VEC_TYPE d_zp = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(DECOMPRESSION_ZP_TYPE, TILE_OFM, decompression_zp, out_f)); + #elif DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) != 0 + ACCUMULATOR_VEC_TYPE d_zp = 0; + unroll_for(uint of = 0; of < TILE_OFM; ++of) { + uint offset = out_f + of*SIMD + get_sub_group_local_id(); + if (offset < DECOMPRESSION_ZP_LENGTH) + ((ACCUMULATOR_TYPE*)(&d_zp))[of] = decompression_zp[offset]; + } + #else + ACCUMULATOR_VEC_TYPE d_zp = decompression_zp[0]; 
+ #endif + ACCUMULATOR_TYPE* d_zps = (ACCUMULATOR_TYPE*)(&d_zp); #endif - ACCUMULATOR_TYPE* d_zps = (ACCUMULATOR_TYPE*)(&d_zp); -#endif // ===================================================================================================================================== // Main computation loop @@ -871,7 +902,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( uint idx_sglid = (sglid * TILE_K) % TILE_IFM_ELEMENTS_SIZE; // same index for sglid 0~7 : to tile_k direction uint batch_sglid = (sglid * TILE_K) / TILE_IFM_ELEMENTS_SIZE; // 0 to 1 : to batch direction - const uint scale_pitch = TILE_IN_B_PITCH / QUANTIZE_GROUP_SIZE; + const uint scale_pitch = (TILE_IN_B_PITCH / QUANTIZE_GROUP_SIZE); MAKE_VECTOR_TYPE(int, TILE_B) acc_tmp[TILE_OFM] = { }; __attribute__((opencl_unroll_hint(1))) for (uint ni = 0; ni < iterations; ++ni) { @@ -881,14 +912,20 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( // Load quantizing info from pre-quantizing kernel tiled_input_0[bi] = vload4(0, &quantized_input[in_offset]); // Packing : Get 4(B)x4(K) integer vector (packing to 4x1 vector) - packed_in_0[bi] = as_int(tiled_input_0[bi]); + packed_in_0[bi] = as_uint(tiled_input_0[bi]); // Next batch in_offset += (TILE_IN_B_PITCH * 2); #if NUM_LOOP_IN_DYN_QUAN_GROUP == 1 - de_quantize_scale[bi * 2] = scale[scale_offset]; - de_quantize_scale[bi * 2 + 1] = scale[scale_offset+ scale_pitch]; + de_quantize_scale[bi * 2] = quan_var[scale_offset * 2]; + de_quantize_scale[bi * 2 + 1] = quan_var[scale_offset * 2 + scale_pitch * 2]; + #if COMPRESSED_WEIGHTS_INT8 + // Need additional accumulation of quantized activation along the dyn-quan group + // to use i8 multiplier for int8 weight + activation_sum[bi * 2] = quan_var[scale_offset * 2 + 1]; + activation_sum[bi * 2 + 1] = quan_var[scale_offset * 2 + 1 + scale_pitch * 2]; + #endif scale_offset += (scale_pitch * 2); #endif } @@ -896,7 +933,10 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( #if NUM_LOOP_IN_DYN_QUAN_GROUP > 1 if (ni % NUM_LOOP_IN_DYN_QUAN_GROUP == 0) { unroll_for (uint bi = 0; bi < TILE_B; ++bi) { - de_quantize_scale[bi] = scale[scale_offset]; + de_quantize_scale[bi] = quan_var[scale_offset * 2]; + #if COMPRESSED_WEIGHTS_INT8 + activation_sum[bi] = quan_var[scale_offset * 2 + 1]; + #endif scale_offset += scale_pitch; } } @@ -916,49 +956,64 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( barrier(CLK_LOCAL_MEM_FENCE); #endif - __local int* char_slm_weight = (__local int*)wei_local_mem; + __local uint* char_slm_weight = (__local uint*)wei_local_mem; - #if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2 - uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE * 2; + #if COMPRESSED_WEIGHTS_INT4 + #if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2 + uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE * 2; + #else + uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_ACTUAL_LOAD_BLOCK_SIZE; + #endif #else - uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_ACTUAL_LOAD_BLOCK_SIZE; + uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * TILE_K_OFM_PACKED; #endif uint wei_local_idx = local_id * SIMD * FILTER_LOAD_ITERS * (FILTER_LOAD_BLOCK_SIZE/2) + sglid * 2; - // DECOMPRESSION_SCALE_POST_OP SHOULD be enabled for dynamic quantize FC : scale is ACCUMULATOR_VAL_ONE + #if COMPRESSED_WEIGHTS_INT8 + ACCUMULATOR_TYPE wei_zp[TILE_OFM] = { }; + #endif + + // DQ_DECOMPRESSION_SCALE_POST_OP SHOULD be enabled for dynamic quantize FC : 
scale is ACCUMULATOR_VAL_ONE unroll_for(uint load_iter = 0; load_iter < FILTER_LOAD_ITERS; ++load_iter) { - #if FILTER_LAYOUT_OS_IYX_OSV16 - SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx); - SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + ((IFM_SIZE / 2) * 16))); - DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked; - // loaded weights 'wei_packed' of os_iyx_osv16 format have continuous values along TILE_K. So no need to transpose while unpacking - dq_wei_unpacked.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0)); - dq_wei_unpacked.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1)); - #elif FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2 - SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx); - SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + (FILTER_LOAD_BLOCK_SIZE * SIMD))); - DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked; - DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked_tmp; - dq_wei_unpacked_tmp.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0)); - dq_wei_unpacked_tmp.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1)); - dq_wei_unpacked.s01 = dq_wei_unpacked_tmp.s01; - dq_wei_unpacked.s23 = dq_wei_unpacked_tmp.s45; - dq_wei_unpacked.s45 = dq_wei_unpacked_tmp.s23; - dq_wei_unpacked.s67 = dq_wei_unpacked_tmp.s67; - #else - SLM_FILTER_PACKED_VEC wei_packed = BLOCK_READN(FILTER_TYPE, FILTER_LOAD_BLOCK_SIZE, weights, weights_idx); - DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD *)&wei_packed)); + #if COMPRESSED_WEIGHTS_INT4 + #if FILTER_LAYOUT_OS_IYX_OSV16 + SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx); + SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + ((IFM_SIZE / 2) * 16))); + SLM_WEIGHT_UNPACKED_VEC dq_wei_unpacked; + // loaded weights 'wei_packed' of os_iyx_osv16 format have continuous values along TILE_K. 
So no need to transpose while unpacking + dq_wei_unpacked.s0123 = (UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0))); + dq_wei_unpacked.s4567 = (UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1))); + #elif FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2 + SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx); + SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + (FILTER_LOAD_BLOCK_SIZE * SIMD))); + SLM_WEIGHT_UNPACKED_VEC dq_wei_unpacked; + SLM_WEIGHT_UNPACKED_VEC dq_wei_unpacked_tmp; + dq_wei_unpacked_tmp.s0123 = (UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0))); + dq_wei_unpacked_tmp.s4567 = (UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1))); + dq_wei_unpacked.s01 = dq_wei_unpacked_tmp.s01; + dq_wei_unpacked.s23 = dq_wei_unpacked_tmp.s45; + dq_wei_unpacked.s45 = dq_wei_unpacked_tmp.s23; + dq_wei_unpacked.s67 = dq_wei_unpacked_tmp.s67; + #else + SLM_FILTER_PACKED_VEC wei_packed = BLOCK_READN(FILTER_TYPE, FILTER_LOAD_BLOCK_SIZE, weights, weights_idx); + SLM_WEIGHT_UNPACKED_VEC dq_wei_unpacked = (UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD *)&wei_packed))); + #endif + #else // COMPRESSED_WEIGHTS_INT8 + SLM_WEIGHT_UNPACKED_VEC dq_wei_unpacked; + WEIGHT_VEC_TYPE wei_packed = TO_WEIGHT_VEC_TYPE(FILTER_BLOCK_READ(weights, weights_idx)); + dq_wei_unpacked.s0123 = wei_packed.s0246; + dq_wei_unpacked.s4567 = wei_packed.s1357; #endif - // Calculate zero-point and scale only for DECOMPRESSION_SCALE_POST_OP enabled + // Calculate zero-point and scale only for DQ_DECOMPRESSION_SCALE_POST_OP enabled // Calculate weight : w = (w - dzp) * ds // if DECOMPRESSION_ZP_TERM is not enabled, then dzp is ACCUMULATOR_VAL_ZERO. 
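+            // The zero-point is handled in two different ways in this kernel:
+            //  - int4 weights: dzp is subtracted from every unpacked weight right here, before it is stored to SLM;
+            //  - int8 weights (COMPRESSED_WEIGHTS_INT8): the per-element subtraction is skipped, wei_zp is kept per
+            //    output channel instead, and the integer accumulator is corrected later with the per-group
+            //    activation_sum produced by the quantize kernel, using sum((w - zp) * a) == sum(w * a) - zp * sum(a)
+            //    (see modified_calc_buff below).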
- #if DECOMPRESSION_ZP_TERM + #if DECOMPRESSION_ZP_TERM && !COMPRESSED_WEIGHTS_INT8 #if DECOMPRESSION_ZP_SCALAR - DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(DECOMPRESSION_ZP_VALUE); + SLM_WEIGHT_UNPACKED_VEC dzp = (SLM_WEIGHT_UNPACKED_VEC)(DECOMPRESSION_ZP_VALUE); dq_wei_unpacked -= dzp; #elif DECOMPRESSION_ZP_GROUPS_NUM > 1 - DQ_TYPE* w = (DQ_TYPE*)(&dq_wei_unpacked); + SLM_WEIGHT_TYPE* w = (SLM_WEIGHT_TYPE*)(&dq_wei_unpacked); const uint ni_offset = ni * TILE_IFM * SIMD + local_id * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE; unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { const uint offset_ofm = out_f + fi*SIMD + sglid; @@ -966,11 +1021,11 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( const uint offset_ifm = ni_offset + load_iter * FILTER_LOAD_BLOCK_SIZE + kii; const uint zp_offset = (offset_ofm % DECOMPRESSION_ZP_BATCH_NUM) * DECOMPRESSION_ZP_BATCH_PITCH + (offset_ifm / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH; - w[W_DYN_QUAN_IDX] = w[W_DYN_QUAN_IDX] - TO_DQ_TYPE(decompression_zp[zp_offset]); + w[W_DYN_QUAN_IDX] = w[W_DYN_QUAN_IDX] - CAT(CAT(convert_, SLM_WEIGHT_TYPE),_rte)(decompression_zp[zp_offset]); } } #else - DQ_TYPE* w = (DQ_TYPE*)(&dq_wei_unpacked); + SLM_WEIGHT_TYPE* w = (SLM_WEIGHT_TYPE*)(&dq_wei_unpacked); unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { unroll_for(uint kii = 0; kii < FILTER_LOAD_BLOCK_SIZE; ++kii) { w[W_DYN_QUAN_IDX] = w[W_DYN_QUAN_IDX] - d_zps[fi % DECOMPRESSION_ZP_LENGTH]; @@ -979,29 +1034,58 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( #endif #endif + #if COMPRESSED_WEIGHTS_INT8 + unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { + #if DECOMPRESSION_ZP_TERM + #if DECOMPRESSION_ZP_SCALAR + wei_zp[fi] = (TO_ACCUMULATOR_TYPE)(DECOMPRESSION_ZP_VALUE); + #elif DECOMPRESSION_ZP_GROUPS_NUM > 1 + #if FILTER_LOAD_BLOCK_SIZE % DECOMPRESSION_ZP_GROUP_SIZE != 0 + #error "FC bf_tiled kernel: Not support DECOMPRESSION_ZP_GROUPS_NUM > 1" + #endif + + const uint ni_offset = ni * TILE_IFM * SIMD + local_id * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE; + const uint offset_ofm = out_f + fi*SIMD + sglid; + const uint offset_ifm = ni_offset + load_iter * FILTER_LOAD_BLOCK_SIZE; + const uint zp_offset = (offset_ofm % DECOMPRESSION_ZP_BATCH_NUM) * DECOMPRESSION_ZP_BATCH_PITCH + + (offset_ifm / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH; + wei_zp[fi] = TO_ACCUMULATOR_TYPE(decompression_zp[zp_offset]); + #else + wei_zp[fi] = TO_ACCUMULATOR_TYPE(d_zps[fi % DECOMPRESSION_ZP_LENGTH]); + #endif + #else + wei_zp[fi] = ACCUMULATOR_VAL_ZERO; + #endif + } + #endif + #if FILTER_LOAD_BLOCK_SIZE == 2 - DQ_SLM_FILTER_VEC wei_1 = {dq_wei_unpacked.s01, dq_wei_unpacked.s23}; - char_slm_weight[wei_local_idx] = as_int(wei_1); + SLM_WEIGHT_VEC wei_1 = {dq_wei_unpacked.s01, dq_wei_unpacked.s23}; + char_slm_weight[wei_local_idx] = as_uint(wei_1); #elif FILTER_LOAD_BLOCK_SIZE == 4 - DQ_SLM_FILTER_VEC wei_1 = {dq_wei_unpacked.s01, dq_wei_unpacked.s23}; - char_slm_weight[wei_local_idx] = as_int(wei_1); - DQ_SLM_FILTER_VEC wei_2 = {dq_wei_unpacked.s45, dq_wei_unpacked.s67}; - char_slm_weight[wei_local_idx+1] = as_int(wei_2); + SLM_WEIGHT_VEC wei_1 = {dq_wei_unpacked.s01, dq_wei_unpacked.s23}; + char_slm_weight[wei_local_idx] = as_uint(wei_1); + SLM_WEIGHT_VEC wei_2 = {dq_wei_unpacked.s45, dq_wei_unpacked.s67}; + char_slm_weight[wei_local_idx+1] = as_uint(wei_2); #elif FILTER_LOAD_BLOCK_SIZE == 8 - DQ_SLM_FILTER_VEC wei_1 = {dq_wei_unpacked.s01, dq_wei_unpacked.s23}; - char_slm_weight[wei_local_idx] = as_int(wei_1); - DQ_SLM_FILTER_VEC wei_2 
= {dq_wei_unpacked.s45, dq_wei_unpacked.s67}; - char_slm_weight[wei_local_idx+1] = as_int(wei_2); - DQ_SLM_FILTER_VEC wei_3 = {dq_wei_unpacked.s89, dq_wei_unpacked.sab}; - char_slm_weight[wei_local_idx+2] = as_int(wei_3); - DQ_SLM_FILTER_VEC wei_4 = {dq_wei_unpacked.scd, dq_wei_unpacked.sef}; - char_slm_weight[wei_local_idx+3] = as_int(wei_4); + SLM_WEIGHT_VEC wei_1 = {dq_wei_unpacked.s01, dq_wei_unpacked.s23}; + char_slm_weight[wei_local_idx] = as_uint(wei_1); + SLM_WEIGHT_VEC wei_2 = {dq_wei_unpacked.s45, dq_wei_unpacked.s67}; + char_slm_weight[wei_local_idx+1] = as_uint(wei_2); + SLM_WEIGHT_VEC wei_3 = {dq_wei_unpacked.s89, dq_wei_unpacked.sab}; + char_slm_weight[wei_local_idx+2] = as_uint(wei_3); + SLM_WEIGHT_VEC wei_4 = {dq_wei_unpacked.scd, dq_wei_unpacked.sef}; + char_slm_weight[wei_local_idx+3] = as_uint(wei_4); #else #error "FC bf_tiled kernel: unsupported FILTER_LOAD_BLOCK_SIZE for SLM kernel" #endif wei_local_idx += SIMD * (FILTER_LOAD_BLOCK_SIZE/2); - weights_idx += SIMD * FILTER_ACTUAL_LOAD_BLOCK_SIZE; + #if COMPRESSED_WEIGHTS_INT8 + weights_idx += SIMD * TILE_K_OFM_PACKED; + #else + weights_idx += SIMD * FILTER_ACTUAL_LOAD_BLOCK_SIZE; + #endif } wei_local_idx = sglid * 2; @@ -1014,11 +1098,11 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( #endif // Compute input * weight : packed char4 type - char8 weight = vload8(0, (__local char *)(&char_slm_weight[wei_local_idx + 16*2*ki])); - char4 first_weight = weight.s0123; - char4 second_weight = weight.s4567; + WEIGHT_VEC_TYPE weight = vload8(0, (__local SLM_WEIGHT_TYPE *)(&char_slm_weight[wei_local_idx + 16*2*ki])); + SLM_WEIGHT_VEC first_weight = weight.s0123; + SLM_WEIGHT_VEC second_weight = weight.s4567; unroll_for (uint bi = 0; bi < TILE_B; ++bi) { - char4 input_val = as_char4(_sub_group_shuffle(packed_in_0[bi / 2], (bi % 2) * 8 + ki)); + MAKE_DQ_TYPE_VEC(4) input_val = AS_DQ_TYPE_4(_sub_group_shuffle(packed_in_0[bi / 2], (bi % 2) * 8 + ki)); acc_tmp[0][bi] = imad_SW(acc_tmp[0][bi], input_val, first_weight); acc_tmp[1][bi] = imad_SW(acc_tmp[1][bi], input_val, second_weight); } @@ -1038,7 +1122,12 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH]; #endif - ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += convert_half(((int *)(&acc_tmp[fi]))[bi]) * ds * de_quantize_scale[bi]; + #if COMPRESSED_WEIGHTS_INT8 + ACCUM_DQ_TYPE modified_calc_buff = ((int *)(&acc_tmp[fi]))[bi] - ((float)(wei_zp[fi]) * (convert_float)(activation_sum[bi])); + ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += (convert_half)(convert_float(modified_calc_buff) * (float)ds * (float)de_quantize_scale[bi]); + #else + ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += convert_half(((int *)(&acc_tmp[fi]))[bi]) * ds * de_quantize_scale[bi]; + #endif acc_tmp[fi][bi] = 0; } } @@ -1060,7 +1149,12 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH]; #endif - ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += convert_half(((int *)(&acc_tmp[fi]))[bi]) * ds * de_quantize_scale[bi]; + #if COMPRESSED_WEIGHTS_INT8 + ACCUM_DQ_TYPE modified_calc_buff = ((int *)(&acc_tmp[fi]))[bi] - ((float)(wei_zp[fi]) * (convert_float)(activation_sum[bi])); + ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += (convert_half)(convert_float(modified_calc_buff) * (float)ds * (float)de_quantize_scale[bi]); + #else + ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += convert_half(((int *)(&acc_tmp[fi]))[bi]) * ds * de_quantize_scale[bi]; + #endif acc_tmp[fi][bi] = 0; } } @@ -1169,13 +1263,13 @@ KERNEL(fc)( , FUSED_OPS_DECLS #endif #if 
DYNAMIC_QUANTIZE - , __global char* quantized_input - , __global INPUT0_TYPE* de_quan_scale + , __global DQ_TYPE* quantized_input + , __global INPUT0_TYPE* quan_var #endif ) { #if USE_SLM #if DYNAMIC_QUANTIZE - __local int dq_wei_local_mem[SIMD * TILE_OFM * SIMD]; + __local uint dq_wei_local_mem[SIMD * TILE_OFM * SIMD]; #else __local ACCUMULATOR_TYPE wei_local_mem[TILE_IFM * SIMD * TILE_OFM * SIMD]; #endif @@ -1321,7 +1415,7 @@ KERNEL(fc)( OPTIONAL_SHAPE_INFO_TENSOR input, quantized_input, - de_quan_scale, + quan_var, #if DECOMPRESSION_SCALE_TERM decompression_scale, #endif @@ -1368,7 +1462,7 @@ KERNEL(fc)( OPTIONAL_SHAPE_INFO_TENSOR input, quantized_input, - de_quan_scale, + quan_var, #if DECOMPRESSION_SCALE_TERM decompression_scale, #endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/int4_utils.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/int4_utils.cl index 68d778475f5601..99ff124e3a39f9 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/int4_utils.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/int4_utils.cl @@ -59,6 +59,13 @@ inline char4 unpack_to_char(uint4x4_t v) __attribute__((overloadable)) { return (char4)(v0.s0, v0.s1, v1.s0, v1.s1); } +inline uchar4 unpack_to_uchar(uint4x4_t v) __attribute__((overloadable)) { + uchar2 v0 = unpack_to_uchar(v.s0); + uchar2 v1 = unpack_to_uchar(v.s1); + return (uchar4)(v0.s0, v0.s1, v1.s0, v1.s1); +} + + inline char4 unpack_transposed_to_char(int4x4_t v) __attribute__((overloadable)) { char2 v0 = unpack_to_char(v.s0); char2 v1 = unpack_to_char(v.s1); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index 6604def1a69093..178e1ea405b6bb 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -8,6 +8,7 @@ #include #include "common_types.h" +static constexpr size_t lws_batches = 8; static constexpr size_t simd = 16; static constexpr size_t min_quantize_grp_size = 32; static constexpr size_t min_slm_size = 256; @@ -50,6 +51,17 @@ static std::pair get_output_aligned_bf_size(const fully_connecte return {output_b, output_f}; } +static bool is_weight_dyn_quantizable(const fully_connected_params& params) { + auto weight_type = params.weights.GetDType(); + if (weight_type == WeightsType::INT4 || weight_type == WeightsType::UINT4) + return true; + // UINT8 weight type is supported by FC dyn-quantize(with SLM). 
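+    // Note: only unsigned 8-bit weights are accepted here; WeightsType::INT8 falls through to the
+    // `return false` below and keeps the non-dyn-quan path.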
+ if (weight_type == WeightsType::UINT8) + return true; + + return false; +} + // DYNAMIC_QUANTIZE static size_t get_dynamic_quantize_group_size(const fully_connected_params& params) { auto dynamic_quantization_group_size = params.dynamic_quantization_group_size; @@ -91,7 +103,7 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para return 0; } -static bool should_dynamic_quantize(const fully_connected_params& params) { +static bool should_dynamic_quantize(const fully_connected_params& params, bool print_log = false) { size_t dynamic_quantization_group_size = get_dynamic_quantize_group_size(params); if (params.inputs[0].GetFirstElementOffset() != 0) @@ -110,11 +122,17 @@ static bool should_dynamic_quantize(const fully_connected_params& params) { const size_t scale_group_size = params.weights.IFM().v / params.decompression_scale.Feature().v; if ((scale_group_size % simd == 0) && (input_f % dynamic_quantization_group_size == 0) && (params.is_shape_agnostic || (params.inputs[0].Batch().v > 1 && input_b > min_slm_size)) && - params.inputs[0].GetDType() == Datatype::F16 && - (params.weights.GetDType() == WeightsType::INT4 || params.weights.GetDType() == WeightsType::UINT4)) { - GPU_DEBUG_TRACE_DETAIL << " Dynamic quantizing for FC : scale_group_size " << scale_group_size << ", Input (" << - kernel_selector::toString(params.inputs[0].GetDType()) << ", " << kernel_selector::toString(params.outputs[0].GetLayout()) << - ") B: " << params.inputs[0].Batch().v << ", F: " << params.inputs[0].Feature().v << ", Y: " << params.inputs[0].Y().v << std ::endl; + params.inputs[0].GetDType() == Datatype::F16 && is_weight_dyn_quantizable(params)) { + if (print_log) { + GPU_DEBUG_TRACE_DETAIL << " Dynamic quantizing for FC : scale_group_size: " << scale_group_size << + ", Dyn-quan group size: " << dynamic_quantization_group_size << + ", Type(I:" << kernel_selector::toString(params.inputs[0].GetDType()) << + ", O:" << kernel_selector::toString(params.outputs[0].GetDType()) << + ", W:" << kernel_selector::toString(params.weights.GetDType()) << + "), Format(W:" << kernel_selector::toString(params.weights.GetLayout()) << + ") B: " << params.inputs[0].Batch().v << ", F: " << params.inputs[0].Feature().v << + ", Y: " << params.inputs[0].Y().v << std ::endl; + } return true; } @@ -204,8 +222,9 @@ DeviceFeaturesKey FullyConnected_bf_tiled::get_required_device_features_key(cons } bool FullyConnected_bf_tiled::Validate(const Params& params) const { - if (!Parent::Validate(params)) + if (!Parent::Validate(params)) { return false; + } auto& fc_params = static_cast(params); auto& input = fc_params.inputs[0]; @@ -314,21 +333,21 @@ bool TuneParamsSelector::VerifyTuneParams(const fully_connected_params& params, if (tparams.tile_ofm * simd > 64) return false; - bool is_i4_u4 = (params.weights.GetDType() == WeightsType::INT4 || params.weights.GetDType() == WeightsType::UINT4); + bool is_dyn_quantable_type = is_weight_dyn_quantizable(params); if (tparams.kernel_type == FullyConnected_bf_tiled::KernelType::SLM) { const auto required_batch_alignment = 64; if (!params.is_shape_agnostic && (!IsAligned(output_b, required_batch_alignment) || output_b < min_slm_size)) return false; const auto required_tile_b = 8; - if ((tparams.tile_b != required_tile_b) && !is_i4_u4) + if ((tparams.tile_b != required_tile_b) && !is_dyn_quantable_type) return false; const auto required_tile_ofm = 2; if (tparams.tile_ofm != required_tile_ofm) return false; - if (params.weights.GetDType() != WeightsType::INT4 && 
params.weights.GetDType() != WeightsType::UINT4) + if (!is_dyn_quantable_type) return false; if (params.engineInfo.deviceType != dev_type::integrated_gpu) @@ -340,7 +359,7 @@ bool TuneParamsSelector::VerifyTuneParams(const fully_connected_params& params, return true; } - if (params.compressed && is_i4_u4) { + if (params.compressed && is_dyn_quantable_type) { if (!(tparams.tile_ofm == 2 || tparams.tile_ofm == 4)) return false; if (tparams.tile_ofm == 4 && tparams.outer_ofm == 2 && !is_suitable_outer_ofm(params, output_f)) @@ -382,11 +401,10 @@ FullyConnected_bf_tiled::GetAutoTuneParams(const fully_connected_params& params, while (max_tile_ofm * 2 * simd <= output_f && max_tile_ofm < 4) max_tile_ofm *= 2; - if (params.weights.GetDType() == WeightsType::UINT4 || params.weights.GetDType() == WeightsType::INT4) { + if (params.weights.GetDType() == WeightsType::UINT4 || params.weights.GetDType() == WeightsType::INT4 || + (is_weight_dyn_quantizable(params) && should_dynamic_quantize(params))) { + // Only 4bit weight type is fully optimized to use SLM. In default kernel, SLM is not applied to 8bit weight. if (!params.is_shape_agnostic && batch == 1) { - if (should_dynamic_quantize(params)) - return selector.Default(tune_params(1, 2, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT)); - // Tuning for Meteor Lake if (is_weight_vertical(params, output_f)) { if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) { @@ -411,9 +429,11 @@ FullyConnected_bf_tiled::GetAutoTuneParams(const fully_connected_params& params, selector.Case(tune_params(16, 2, 2, 4, 1, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)) .Case(tune_params(16, 2, 1, 4, 1, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)); } + selector.Case(tune_params(8, 2, 2, 4, 1, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)) .Case(tune_params(8, 2, 1, 4, 1, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)); } + if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) return selector.Default(tune_params(8, 1, 1, 4, 1, 1, 1, EXE_MODE_DEFAULT)); else if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) @@ -501,7 +521,6 @@ FullyConnected_bf_tiled::SetDefault(const fully_connected_params& params, int au auto batch_threads = threads.first; auto feature_threads = threads.second; - const size_t lws_batches = 8; const size_t aligned_batch = Align(batch_threads, lws_batches); // Each WG calculates 8x8 batches (TILE_B x LWS[2] size) const bool can_use_slm = tparams.kernel_type == KernelType::SLM; @@ -550,7 +569,6 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para WeightsType weights_dt = params.weights.GetDType(); if (weights_dt == WeightsType::UINT4 || weights_dt == WeightsType::INT4) { tile_k_ofm_packed /= 2; - jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE", weights_dt, tile_k_ofm)); const size_t scale_group_size = params.weights.IFM().v / params.decompression_scale.Feature().v; // Do not use SCALE_POST_OP for SLM kernel, since it demonstrates worse performance @@ -581,7 +599,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para if (dispatchData.use_slm) { OPENVINO_ASSERT(dispatchData.tile_n == 2, "[GPU] Unsupported TILE_OFM size for SLM kernel configuration"); - OPENVINO_ASSERT(weights_dt == WeightsType::INT4 || weights_dt == WeightsType::UINT4, "[GPU] Unsupported FC weights type for SLM kernel configuration"); + OPENVINO_ASSERT(is_weight_dyn_quantizable(params), "[GPU] Unsupported FC weights type for SLM kernel configuration"); auto lws_batches = dispatchData.lws[2]; auto 
total_weights_elements = simd * dispatchData.tile_n * simd * dispatchData.tile_mk; // SIMD * TILE_OFM * SIMD * TILE_IFM @@ -608,15 +626,19 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para jit.AddConstant(MakeJitConstant("LWS_BATCHES", lws_batches)); jit.AddConstant(MakeJitConstant("FILTER_LOAD_ITERS", weights_load_iters)); - if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) { - jit.AddConstant(MakeJitConstant("FILTER_ACTUAL_LOAD_BLOCK_SIZE", block_read_size / 2)); - jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE_PRELOAD", params.weights.GetDType(), weights_elements_per_load / 2)); - } else if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) { - jit.AddConstant(MakeJitConstant("FILTER_ACTUAL_LOAD_BLOCK_SIZE", block_read_size / 2)); - jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE_PRELOAD", params.weights.GetDType(), weights_elements_per_load / 2)); + if (weights_dt == WeightsType::INT4 || weights_dt == WeightsType::UINT4) { + if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) { + jit.AddConstant(MakeJitConstant("FILTER_ACTUAL_LOAD_BLOCK_SIZE", block_read_size / 2)); + jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE_PRELOAD", params.weights.GetDType(), weights_elements_per_load / 2)); + } else if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) { + jit.AddConstant(MakeJitConstant("FILTER_ACTUAL_LOAD_BLOCK_SIZE", block_read_size / 2)); + jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE_PRELOAD", params.weights.GetDType(), weights_elements_per_load / 2)); + } else { + jit.AddConstant(MakeJitConstant("FILTER_ACTUAL_LOAD_BLOCK_SIZE", block_read_size)); + jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE_PRELOAD", params.weights.GetDType(), weights_elements_per_load)); + } } else { jit.AddConstant(MakeJitConstant("FILTER_ACTUAL_LOAD_BLOCK_SIZE", block_read_size)); - jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE_PRELOAD", params.weights.GetDType(), weights_elements_per_load)); } jit.AddConstant(MakeJitConstant("FILTER_LOAD_BLOCK_SIZE", block_read_size)); @@ -629,7 +651,6 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para if (should_dynamic_quantize(params)) { jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 1)); jit.AddConstant(MakeJitConstant("DQ_DECOMPRESSION_SCALE_POST_OP", 1)); - jit.AddConstant(MakeJitConstant("DQ_TYPE", "char")); jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size)); } else { if (add_decompress_scale_post_op) @@ -637,6 +658,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0)); jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", min_quantize_grp_size)); } + jit.AddConstant(MakeJitConstant("DQ_TYPE", "char")); jit.AddConstant(MakeJitConstant("IFM_SIZE", get_input_bf_size(params).second)); jit.AddConstant(MakeJitConstant("SIMD", simd)); @@ -659,9 +681,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para } auto max_tile_b_size = dispatchData.tile_m; - if (params.compressed && - params.is_shape_agnostic && - (weights_dt == WeightsType::UINT4 || weights_dt == WeightsType::INT4)) + if (params.compressed && params.is_shape_agnostic && is_weight_dyn_quantizable(params)) max_tile_b_size = std::max(max_tile_b_size, (uint32_t)8); jit.Merge(MakeConstantLoopUnrollJitConstants(max_tile_b_size)); @@ -772,8 +792,10 @@ void 
FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const { if (kd.internalBufferSizes[0] < input_size) { kd.internalBufferSizes.clear(); - kd.internalBufferSizes.push_back(input_size); // quantized input is char type - kd.internalBufferSizes.push_back(input_size / quantize_grp_size * 2); // de_quan_scale is half type + // quantized input is char type + kd.internalBufferSizes.push_back(input_size); + // half type of de_quan_scale and activation sum for each quantized group + kd.internalBufferSizes.push_back((input_size / quantize_grp_size) * 2 * 2); } kd.kernels[0].params.workGroups.global = {std::max((input_size / quantize_grp_size), (size_t)1), 1, 1}; @@ -800,7 +822,7 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa && (fc_params.weights.GetLayout() == WeightsLayout::oiyx || fc_params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) && (fc_params.weights.GetDType() == WeightsType::INT4 || fc_params.weights.GetDType() == WeightsType::UINT4) && is_weight_horizontal(fc_params, output_f)) { - // Large N + Small K case (horizontal weight) to use [osv64_isv2] + TILE_OFM 4 for batch 1 + // Large N + small K case (horizontal weight) to use [osv64_isv2] + TILE_OFM 4 for batch 1 weights_layout = WeightsLayout::os_is_yx_osv64_isv2; } else if (fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16 && (fc_params.weights.GetDType() == WeightsType::INT4 || fc_params.weights.GetDType() == WeightsType::UINT4) @@ -947,6 +969,8 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params ¶ms, auto& quan_kernel = kd.kernels[0]; DispatchData dyn_quan_dispatch = dispatchData; auto input_size = std::max(fc_params.inputs[0].PhysicalSize(), get_input_bf_size(fc_params).second); + if (!params.is_shape_agnostic) + input_size = std::max(input_size, Align(get_input_bf_size(fc_params).first, lws_batches) * get_input_bf_size(fc_params).second); dyn_quan_dispatch.gws = {input_size / quantize_grp_size, 1, 1}; dyn_quan_dispatch.lws = {16, 1, 1}; quan_kernel.params.workGroups.global = dyn_quan_dispatch.gws; @@ -958,7 +982,6 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params ¶ms, quan_cldnn_jit.AddConstant(MakeJitConstant("FC_KERNEL_DYNAMIC_QUANTIZE", 1)); auto quan_jit = CreateJit(kernelName, quan_cldnn_jit, quan_entry_point); - FillCLKernelData(quan_kernel, dyn_quan_dispatch, params.engineInfo, @@ -977,8 +1000,10 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params ¶ms, quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0}); quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + // char type quantized input kd.internalBufferSizes.push_back(input_size); - kd.internalBufferSizes.push_back(input_size / quantize_grp_size * 2); + // half type of de_quan_scale and activation sum for each quantized group + kd.internalBufferSizes.push_back(input_size / quantize_grp_size * 2 * 2); kernel_number++; } kd.internalBufferDataType = Datatype::F16; diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp index eed9760348f669..6bf44a31add0f4 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp @@ -1589,7 +1589,7 @@ class fully_connected_gpu_tests: 
public ::testing::Test { count++; OPENVINO_ASSERT(abs_diff < 256); } - GPU_DEBUG_LOG << "---> count: " << count << ", max_diff:" << max_diff << ", avg_diff: " << (avg/count) << std::endl; + std::cout << "---> count: " << count << ", max_diff:" << max_diff << ", avg_diff: " << (avg/count) << std::endl; } void test_compressed_int4_scale(bool is_caching_test, bool is_dynamic, long int batch_num, long int scales_group_size = 128, bool is_wei_dyn = false) { @@ -2903,7 +2903,7 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); - ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bf_tiled", impl_types::ocl }; + ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl }; config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); config.set_property(ov::hint::dynamic_quantization_group_size(0)); @@ -2965,11 +2965,137 @@ class fully_connected_gpu_tests: public ::testing::Test { max_diff = abs_diff; avg += abs_diff; count++; - OPENVINO_ASSERT(abs_diff < 5); + OPENVINO_ASSERT(abs_diff < 6); } GPU_DEBUG_LOG << "---> count: " << count << ", max_diff:" << max_diff << ", avg_diff: " << (avg/count) << std::endl; OPENVINO_ASSERT((avg/count) < 0.5); } + + void test_compressed_int8_scale_dyn_quan_weight_u8(bool is_dynamic, int batch = 1, int ifm = 512, int ofm = 2048, + int quantize_group_size = 32, int scales_group_size = 128, + bool is_wzp_test = false, bool is_wzp_scalar = false) { + tests::random_generator rg(GET_SUITE_NAME); + auto& engine = get_test_engine(); + + if (engine.get_device_info().dev_type == device_type::discrete_gpu) + GTEST_SKIP(); + + long int batch_num = batch; + long int ifm_num = ifm; + long int ofm_num = ofm; + long int wzp_num = is_wzp_scalar ? 1 : ofm_num; + + auto input_ps = ov::PartialShape{ batch_num, 1, ifm_num }; + auto input_mem = engine.allocate_memory({ input_ps, data_types::f16, format::bfyx }); + + auto weights_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::u8, format::bfyx }); + auto scale_mem = engine.allocate_memory({ {ofm_num, ifm_num / scales_group_size}, data_types::f16, format::fbyx }); + auto dcomp_zp_mem = engine.allocate_memory({ {wzp_num, 1}, data_types::u8, format::bfyx }); + + + auto input_data = rg.generate_random_1d(batch_num * ifm_num, -2.f, 2.f); + set_values(input_mem, input_data); + + auto weigths_data = rg.generate_random_1d(ofm_num * ifm_num, 0, 4); + set_values(weights_mem, weigths_data); + + auto scale_data = rg.generate_random_1d(ofm_num * ifm_num / scales_group_size, -2.f, 2.f); + set_values(scale_mem, scale_data); + + if (is_wzp_test) { + auto zp_data = rg.generate_random_1d(wzp_num, 0, 2); + set_values(dcomp_zp_mem, zp_data); + } + + auto in_layout = is_dynamic ? layout{ ov::PartialShape{ -1, -1, -1 }, data_types::f16, format::bfyx } + : layout{ input_ps, data_types::f16, format::bfyx }; + + auto dcomp_zp_name = is_wzp_test ? "wzp" : ""; + auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", dcomp_zp_name, data_types::f16, 3, 2); + + if (is_wzp_test) { + fc_prim.compressed_weights = true; + fc_prim.decompression_zero_point = is_wzp_test ? 
"wzp" : ""; + } + + // Implemented dynamic quantize kernel + auto get_ref_results = [&]() { + topology topo; + topo.add(input_layout("input", in_layout)); + topo.add(data("weights", weights_mem)); + topo.add(data("scale", scale_mem)); + topo.add(data("wzp", dcomp_zp_mem)); + topo.add(fc_prim); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bf_tiled", impl_types::ocl }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); + config.set_property(ov::hint::dynamic_quantization_group_size(0)); + + network network(engine, topo, config); + network.set_input_data("input", input_mem); + + auto outputs = network.execute(); + OPENVINO_ASSERT(outputs.size() == 1); + OPENVINO_ASSERT(outputs.begin()->first == "fc_prim"); + + auto output_layout = outputs.begin()->second.get_layout(); + auto output_mem = outputs.begin()->second.get_memory(); + + return engine.reinterpret_buffer(*output_mem, output_layout); + }; + + topology topology( + input_layout("input", in_layout), + data("weights", weights_mem), + data("scale", scale_mem), + data("wzp", dcomp_zp_mem), + fc_prim + ); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_property(ov::hint::dynamic_quantization_group_size(quantize_group_size)); + + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), false); + + if (is_dynamic && !engine.get_device_info().supports_immad) { + auto inst = network->get_primitive("fc_prim"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != NULL); + auto kernel_num = (is_dynamic) ? 3 : 2; + kernel_num = (quantize_group_size < 32) ? 
2 : kernel_num; + ASSERT_EQ(impl->get_kernels().size(), size_t(kernel_num)); + } + + network->set_input_data("input", input_mem); + + auto outputs = network->execute(); + ASSERT_EQ(outputs.size(), size_t(1)); + ASSERT_EQ(outputs.begin()->first, "fc_prim"); + + auto output_mem = outputs.begin()->second.get_memory(); + cldnn::mem_lock output_ptr (output_mem, get_test_stream()); + + auto ref_output_mem = get_ref_results(); + cldnn::mem_lock output_ptr_ref (ref_output_mem, get_test_stream()); + + size_t count = 0; + float max_diff = 0.f; + float avg = 0.f; + for (size_t i = 0; i < output_ptr_ref.size(); ++i) { + auto abs_diff = std::abs((float)output_ptr_ref[i] - (float)output_ptr[i]); + if (max_diff < abs_diff) + max_diff = abs_diff; + avg += abs_diff; + count++; + OPENVINO_ASSERT(abs_diff < 8); + } + GPU_DEBUG_LOG << "---> count: " << count << ", max_diff:" << max_diff << ", avg_diff: " << (avg/count) << std::endl; + OPENVINO_ASSERT((avg/count) < 0.8); + } }; using shared_dims = std::tuple; @@ -4064,6 +4190,34 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp_sta this->test_compressed_int4_scale_dyn_quan_weight_i4(false, 320, 1024, 1024, 32, 32, true); } +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_128_large) { + this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 320, 4096, 4096, 128, 128, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_32_ifm_1024) { + this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 320, 1024, 1024, 32, 32, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_32_ifm_2048) { + this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 320, 2048, 2048, 32, 32, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_32_ifm_4096) { + this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 320, 4096, 4096, 32, 32, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_32_large_unaligned) { + this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 310, 1024, 1024, 32, 32, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_128_small) { + this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 16, 1024, 1024, 128, 128, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_128_single) { + this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 1, 1024, 1024, 128, 128, true); +} + TEST_F(fully_connected_gpu_tests, compressed_scale_bias) { this->test_compressed_scale_bias(false); } From 9ec63be1be3c3d5005367fcd708c629456982bac Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Thu, 31 Oct 2024 05:29:24 +0400 Subject: [PATCH 117/233] [GPU] Fix per-token dynamic quantization (#27332) ### Details: - Allow the DynamicQuantizeKernelOpt kernel to be selected with the default scales order - Relax DynamicQuantizeKernelRef kernel validation function --- .../dynamic_quantize/dynamic_quantize_kernel_opt.cpp | 9 +++++++-- .../dynamic_quantize/dynamic_quantize_kernel_ref.cpp | 11 +++++------ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp index b610ac2076def4..52a648679499f2 100644 --- 
a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp @@ -160,8 +160,13 @@ bool DynamicQuantizeKernelOpt::Validate(const Params& params) const { if (dq_params.group_sizes.back() != UINT64_MAX) return false; - if (!dq_params.scales_output_order.empty()) - return false; + // Allow only default scales order + const auto& scales_output_order = dq_params.scales_output_order; + if (!scales_output_order.empty()) { + for (size_t i = 0; i < scales_output_order.size(); i++) + if (scales_output_order[i] != i) + return false; + } return true; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp index b7a9b40191da4e..bd3d0f87cdc931 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp @@ -54,7 +54,9 @@ JitConstants DynamicQuantizeKernelRef::GetJitConstants(const dynamic_quantize_pa jit.AddConstant(MakeJitConstant("ASYMMETRIC_QUANTIZATION", params.use_asymmetric_quantization)); jit.AddConstant(MakeJitConstant("GROUP_SCALES_WITH_ZP", params.combine_scales_and_zp)); - const auto& group_sizes = params.group_sizes; + auto group_sizes = params.group_sizes; + group_sizes.resize(std::min((size_t)4, group_sizes.size()), 1); + for (size_t i = 0; i < group_sizes.size(); i++) { jit.AddConstant(MakeJitConstant("GROUP_SIZE_DIM" + std::to_string(i), group_sizes[i])); } @@ -68,7 +70,8 @@ CommonDispatchData DynamicQuantizeKernelRef::SetDefault(const dynamic_quantize_p OPENVINO_ASSERT(params.outputs[0].GetLayout() == DataLayout::bfyx, "It supports only 4d tensor"); - const auto& group_sizes = params.group_sizes; + auto group_sizes = params.group_sizes; + group_sizes.resize(std::min((size_t)4, group_sizes.size()), 1); auto batch_size = group_sizes[0] == 1 ? params.outputs[0].Batch().v : 1; auto feature_size = group_sizes[1] == 1 ? params.outputs[0].Feature().v : 1; auto y_size = group_sizes[2] == 1 ? 
params.outputs[0].Y().v : 1; @@ -134,10 +137,6 @@ bool DynamicQuantizeKernelRef::Validate(const Params& params) const { if (!KernelBaseOpenCL::Validate(params)) return false; - const auto& prim_params = static_cast(params); - if (prim_params.group_sizes.size() != 4) - return false; - return true; } } // namespace kernel_selector From f60b9c41446594bda517eb86b8eb99d244d7318b Mon Sep 17 00:00:00 2001 From: Wenjing Kang Date: Thu, 31 Oct 2024 13:27:54 +0800 Subject: [PATCH 118/233] Add /MT[d] to CMAKE_LANG_FLAGS_CONFIG_INIT (#27173) ### Details: Add /MT[d] to CMAKE_LANG_FLAGS_CONFIG_INIT to avoid the missing of /O2 /Ob2 /DNDEBUG flags in CMAKE_LANG_FLAGS_CONFIG ### Tickets: - *152927* Signed-off-by: Kang Wenjing --- cmake/toolchains/mt.runtime.win32.toolchain.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/toolchains/mt.runtime.win32.toolchain.cmake b/cmake/toolchains/mt.runtime.win32.toolchain.cmake index 7dd4e1e7f96ded..9a99781eac0426 100644 --- a/cmake/toolchains/mt.runtime.win32.toolchain.cmake +++ b/cmake/toolchains/mt.runtime.win32.toolchain.cmake @@ -27,6 +27,11 @@ if(use_static_runtime) foreach(build_type "" "_DEBUG" "_MINSIZEREL" "_RELEASE" "_RELWITHDEBINFO") set(flag_var "CMAKE_${lang}_FLAGS${build_type}_INIT") string(REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + if (build_type STREQUAL "_DEBUG") + set(${flag_var} "${${flag_var}} /MTd") + else() + set(${flag_var} "${${flag_var}} /MT") + endif() endforeach() endforeach() endif() From 689e04320819f0784b40de633da4908b65579415 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Thu, 31 Oct 2024 09:54:16 +0400 Subject: [PATCH 119/233] [GPU] Add subsequent reshapes optimization and dynamic paddings support for RoPE and PagedAttention (#27329) ### Details: - Added subsequent reshapes optimization - Added dynamic paddings support for RoPE and PagedAttention ### Tickets: - [CVS-156124](https://jira.devtools.intel.com/browse/CVS-156124) --- .../graph_optimizer/prepare_buffer_fusing.cpp | 60 ++++++++--- .../src/graph/include/reshape_inst.h | 15 +-- .../cl_kernels/pa_kv_cache_update_ref.cl | 47 +++++---- .../kernel_selector/cl_kernels/rope_ref.cl | 30 +++--- .../kernel_selector/cl_kernels/sdpa_opt.cl | 31 ++++-- .../kernels/rope/rope_kernel_base.cpp | 25 +---- .../optimize_subsequent_reshapes.cpp | 99 +++++++++++++++++++ .../optimize_subsequent_reshapes.hpp | 23 +++++ .../src/plugin/transformations_pipeline.cpp | 3 + .../optimize_subsequent_reshapes_test.cpp | 97 ++++++++++++++++++ 10 files changed, 341 insertions(+), 89 deletions(-) create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/optimize_subsequent_reshapes.cpp create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/optimize_subsequent_reshapes.hpp create mode 100644 src/plugins/intel_gpu/tests/unit/transformations/optimize_subsequent_reshapes_test.cpp diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 6d7d609d232947..e94714c84fdebf 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -660,23 +660,34 @@ void crop_in_place_optimization::update_in_place_crop_padding_simple_data_format if (user_info.first && user_info.first->is_type()) { auto reshape_desc = user_info.first->as().get_primitive(); auto reshape_mode = reshape_desc->mode; + auto reshape_axis = crop_axis; if (reshape_mode == 
reshape::reshape_mode::base) { - user_info.second.data_padding._dynamic_dims_mask = dyn_pad_sizes; + auto reshape_ps = user_info.second.get_partial_shape(); + auto crop_dim_val = crop_layout.get_partial_shape()[crop_axis].get_length(); + + auto mul = 1; + reshape_axis = reshape_ps.size() - 1; + for (size_t i = reshape_ps.size(); i > 1; i--) { + if (reshape_ps[i - 1].is_dynamic() || mul == crop_dim_val) + break; + + mul *= reshape_ps[i - 1].get_length(); + reshape_axis = i - 1; + } } else if (reshape_mode == reshape::reshape_mode::unsqueeze || reshape_mode == reshape::reshape_mode::squeeze) { auto reshape_ps = user_info.second.get_partial_shape(); auto output_pattern = reshape_desc->output_pattern; - auto reshape_axis = crop_axis; for (size_t i = 0; i < output_pattern.size(); i++) { if (output_pattern[i] <= static_cast(reshape_axis)) { reshape_axis += reshape_mode == reshape::reshape_mode::unsqueeze ? 1 : -1; } } - - padding::DynamicDimsMask dyn_pad_mask; - dyn_pad_mask[reshape_axis] = 1; - user_info.second.data_padding._dynamic_dims_mask = dyn_pad_mask; } + + auto reshape_dyn_pad_mask = padding::DynamicDimsMask(); + reshape_dyn_pad_mask[reshape_axis] = 1; + user_info.second.data_padding._dynamic_dims_mask = reshape_dyn_pad_mask; } return; } @@ -704,13 +715,36 @@ void crop_in_place_optimization::update_in_place_crop_padding_simple_data_format auto reshape_desc = user_info.first->as().get_primitive(); auto reshape_mode = reshape_desc->mode; if (reshape_mode == reshape::reshape_mode::base) { - auto reshape_rank = user_info.second.get_partial_shape().size(); - auto reshape_last_dim = user_info.second.get_partial_shape().to_shape()[reshape_rank - 1]; - if (lower_sizes[crop_axis]) - lower_sizes[crop_axis] /= reshape_last_dim; - if (upper_sizes[crop_axis]) - upper_sizes[crop_axis] /= reshape_last_dim; - user_info.second.data_padding = padding(lower_sizes, upper_sizes, dyn_pad_sizes); + auto reshape_ps = user_info.second.get_partial_shape(); + auto crop_dim_val = crop_layout.get_partial_shape()[crop_axis].get_length(); + + auto divider = 1; + auto reshape_axis = reshape_ps.size(); + for (size_t i = reshape_ps.size(); i > 1; i--) { + const auto& dim_value = reshape_ps[i - 1].get_length(); + if (divider * dim_value == crop_dim_val) + break; + + divider *= dim_value; + reshape_axis = i - 1; + } + reshape_axis -= 1; + + const auto output_rank = std::max(reshape_ps.size(), static_cast(4)); + std::vector reshape_lower_sizes(output_rank, 0); + std::vector reshape_upper_sizes(output_rank, 0); + padding::DynamicDimsMask reshape_dyn_pad_mask; + + reshape_lower_sizes[reshape_axis] = lower_sizes[crop_axis]; + reshape_upper_sizes[reshape_axis] = upper_sizes[crop_axis]; + reshape_dyn_pad_mask[reshape_axis] = 1; + + if (reshape_lower_sizes[reshape_axis]) + reshape_lower_sizes[reshape_axis] /= divider; + if (reshape_upper_sizes[reshape_axis]) + reshape_upper_sizes[reshape_axis] /= divider; + + user_info.second.data_padding = padding(reshape_lower_sizes, reshape_upper_sizes, reshape_dyn_pad_mask); } else { auto reshape_ps = user_info.second.get_partial_shape(); auto output_pattern = reshape_desc->output_pattern; diff --git a/src/plugins/intel_gpu/src/graph/include/reshape_inst.h b/src/plugins/intel_gpu/src/graph/include/reshape_inst.h index 1bbfd94256a50c..d6a71c20fcac8d 100644 --- a/src/plugins/intel_gpu/src/graph/include/reshape_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/reshape_inst.h @@ -59,7 +59,7 @@ struct typed_program_node : public typed_program_node_base { return false; // TODO: If user is RoPE 
or MVN and dynamic padding exists, ouput padding propagation is not supported in the base mode - if (get_users().size() == 1 && (get_users().front()->is_type() || get_users().front()->is_type())) + if (get_users().size() == 1 && get_users().front()->is_type()) return false; auto axis = input().as().get_primitive()->axis; @@ -73,14 +73,17 @@ struct typed_program_node : public typed_program_node_base { const auto& output_pshape = prim->output_partial_shape; // TODO: If the reshape's output shape is non constant, issue occurs // during shape inference due to execution order at runtime - if ((output_pshape.size() != input_rank + 1) || prim->output_pattern.empty()) + if (prim->output_pattern.empty()) return false; + // Iteratively check the total product of all static innermost dimensions + // until the crop dimension value matches or the first dynamic dimension is encountered int64_t mul = 1; - for (size_t i = input_rank - 1; i < output_pshape.size() ; i++) { - if (output_pshape[i].is_dynamic()) - return false; - mul *= output_pshape[i].get_length(); + for (size_t i = output_pshape.size(); i > 1 ; i--) { + if (output_pshape[i - 1].is_dynamic() || mul == input_last_dim_val) + break; + + mul *= output_pshape[i - 1].get_length(); } if (input_last_dim_val != mul) return false; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_kv_cache_update_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_kv_cache_update_ref.cl index ef2f78496b2cf2..8426baf719f990 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_kv_cache_update_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_kv_cache_update_ref.cl @@ -34,10 +34,14 @@ KERNEL(pa_kv_cache_update)( const uint seq_block_idx = block_indices_begins[seq_idx] + seq_len / PAGED_ATTENTION_BLOCK_SIZE; const uint block_idx = block_indices[seq_block_idx]; - uint key_value_in_offset = seq_idx * KV_HEADS_NUM * HEAD_SIZE + head_idx * HEAD_SIZE; + uint key_in_offset = INPUT0_OFFSET + + seq_idx * (KV_HEADS_NUM * HEAD_SIZE + INPUT0_PAD_BEFORE_FEATURE_NUM + INPUT0_PAD_AFTER_FEATURE_NUM) + + head_idx * HEAD_SIZE; + uint value_in_offset = INPUT1_OFFSET + + seq_idx * (KV_HEADS_NUM * HEAD_SIZE + INPUT1_PAD_BEFORE_FEATURE_NUM + INPUT1_PAD_AFTER_FEATURE_NUM) + + head_idx * HEAD_SIZE; uint key_out_offset = block_idx * KV_HEADS_NUM * HEAD_SIZE * PAGED_ATTENTION_BLOCK_SIZE + head_idx * HEAD_SIZE * PAGED_ATTENTION_BLOCK_SIZE + current_token_pos_in_block; - uint value_out_offset = block_idx * KV_HEADS_NUM * HEAD_SIZE * PAGED_ATTENTION_BLOCK_SIZE + head_idx * HEAD_SIZE * PAGED_ATTENTION_BLOCK_SIZE + current_token_pos_in_block * HEAD_SIZE; #define READ_BLOCK_SIZE GENERATE_STAGE_BLOCK_SIZE @@ -45,7 +49,7 @@ KERNEL(pa_kv_cache_update)( #define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset); #define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE) - DATA_VEC input_data = BLOCK_READ(key_data, key_value_in_offset + head_idx_index); + DATA_VEC input_data = BLOCK_READ(key_data, key_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE; @@ -56,7 +60,7 @@ KERNEL(pa_kv_cache_update)( #endif } - input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index); + input_data = BLOCK_READ(value_data, value_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint value_offset = value_out_offset + head_idx_index + sglid + 
SUBGROUP_SIZE * i; @@ -83,8 +87,13 @@ KERNEL(pa_kv_cache_update)( const uint token_start_pos = (past_len + block_start_pos - subsequence_begin_idx) % PAGED_ATTENTION_BLOCK_SIZE; - uint key_value_in_offset = block_start_pos * KV_HEADS_NUM * HEAD_SIZE + - head_idx * HEAD_SIZE; + uint key_in_offset = INPUT0_OFFSET + + block_start_pos * (KV_HEADS_NUM * HEAD_SIZE + INPUT0_PAD_AFTER_FEATURE_NUM + INPUT0_PAD_BEFORE_FEATURE_NUM) + + head_idx * HEAD_SIZE; + + uint value_in_offset = INPUT1_OFFSET + + block_start_pos * (KV_HEADS_NUM * HEAD_SIZE + INPUT1_PAD_AFTER_FEATURE_NUM + INPUT1_PAD_BEFORE_FEATURE_NUM) + + head_idx * HEAD_SIZE; const uint current_block_idx = (past_len + block_start_pos - subsequence_begin_idx) / PAGED_ATTENTION_BLOCK_SIZE; @@ -106,14 +115,14 @@ KERNEL(pa_kv_cache_update)( #define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset); #define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE) - DATA_VEC input_data = BLOCK_READ(key_data, key_value_in_offset + head_idx_index); + DATA_VEC input_data = BLOCK_READ(key_data, key_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE; key_cache_data[key_offset] = input_data[i]; } - input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index); + input_data = BLOCK_READ(value_data, value_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint value_offset = value_out_offset + head_idx_index + sglid + SUBGROUP_SIZE * i; @@ -126,14 +135,14 @@ KERNEL(pa_kv_cache_update)( #define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset); #define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE) - DATA_VEC input_data = BLOCK_READ(key_data, key_value_in_offset + head_idx_index); + DATA_VEC input_data = BLOCK_READ(key_data, key_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE; key_cache_data[key_offset] = input_data[i]; } - input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index); + input_data = BLOCK_READ(value_data, value_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint value_offset = value_out_offset + head_idx_index + sglid + SUBGROUP_SIZE * i; @@ -146,14 +155,14 @@ KERNEL(pa_kv_cache_update)( #define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset); #define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE) - DATA_VEC input_data = BLOCK_READ(key_data, key_value_in_offset + head_idx_index); + DATA_VEC input_data = BLOCK_READ(key_data, key_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE; key_cache_data[key_offset] = input_data[i]; } - input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index); + input_data = BLOCK_READ(value_data, value_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint value_offset = value_out_offset + head_idx_index + sglid + SUBGROUP_SIZE * i; @@ -166,14 +175,14 @@ KERNEL(pa_kv_cache_update)( #define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset); #define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE) - DATA_VEC input_data = 
BLOCK_READ(key_data, key_value_in_offset + head_idx_index); + DATA_VEC input_data = BLOCK_READ(key_data, key_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE; key_cache_data[key_offset] = input_data; } - input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index); + input_data = BLOCK_READ(value_data, value_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint value_offset = value_out_offset + head_idx_index + sglid + SUBGROUP_SIZE * i; @@ -181,7 +190,8 @@ KERNEL(pa_kv_cache_update)( } } - key_value_in_offset += KV_HEADS_NUM * HEAD_SIZE; + key_in_offset += (KV_HEADS_NUM * HEAD_SIZE + INPUT0_PAD_AFTER_FEATURE_NUM + INPUT0_PAD_BEFORE_FEATURE_NUM); + value_in_offset += (KV_HEADS_NUM * HEAD_SIZE + INPUT1_PAD_AFTER_FEATURE_NUM + INPUT1_PAD_BEFORE_FEATURE_NUM); key_out_offset += 1; value_out_offset += HEAD_SIZE; } @@ -194,14 +204,14 @@ KERNEL(pa_kv_cache_update)( #define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset); #define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE) - DATA_VEC input_data = BLOCK_READ(key_data, key_value_in_offset + head_idx_index); + DATA_VEC input_data = BLOCK_READ(key_data, key_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE; key_cache_data[key_offset] = input_data; } - input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index); + input_data = BLOCK_READ(value_data, value_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint value_offset = value_out_offset + head_idx_index + sglid + SUBGROUP_SIZE * i; @@ -209,7 +219,8 @@ KERNEL(pa_kv_cache_update)( } } - key_value_in_offset += KV_HEADS_NUM * HEAD_SIZE; + key_in_offset += (KV_HEADS_NUM * HEAD_SIZE + INPUT0_PAD_AFTER_FEATURE_NUM + INPUT0_PAD_BEFORE_FEATURE_NUM); + value_in_offset += (KV_HEADS_NUM * HEAD_SIZE + INPUT1_PAD_AFTER_FEATURE_NUM + INPUT1_PAD_BEFORE_FEATURE_NUM); key_out_offset += 1; value_out_offset += HEAD_SIZE; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl index 38066b4461def4..133440a21301f2 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl @@ -28,14 +28,11 @@ KERNEL(rope_ref)( uint r = rf < HALF_ROTARY_NDIMS ? rf * 2 : 0; uint f = rf < HEAD_SIZE - ROTARY_NDIMS ? rf * 2 : 0; -#ifdef ENABLE_SLICE - uint input_idx = GET_DATA_INDEX(SLICED_INPUT0, p, b, h * HEAD_SIZE, 0); - - input_idx += SLICED_FROM_START * (p * INPUT0_FEATURE_NUM + b + 1) - + SLICED_FROM_END * (p * INPUT0_FEATURE_NUM + b); -#else uint input_idx = INPUT0_GET_INDEX(p, b, h * HEAD_SIZE, 0); +#ifdef ENABLE_SLICE + input_idx += SLICED_FROM_START; #endif + uint cos_sin_p = p < INPUT1_BATCH_NUM ? p : 0; uint cos_sin_b = b < INPUT1_FEATURE_NUM ? 
b : 0; uint cos_sin_idx = INPUT1_GET_INDEX(cos_sin_p, cos_sin_b, 0, 0); @@ -69,14 +66,11 @@ KERNEL(rope_ref)( const uint h = (uint)get_global_id(2) / HALF_ROTARY_NDIMS; const uint r = (uint)get_global_id(2) % HALF_ROTARY_NDIMS; -#ifdef ENABLE_SLICE - uint input_idx = GET_DATA_INDEX(SLICED_INPUT0, b, p, h * HEAD_SIZE, 0); - - input_idx += SLICED_FROM_START * (b * INPUT0_FEATURE_NUM + p + 1) - + SLICED_FROM_END * (b * INPUT0_FEATURE_NUM + p); -#else uint input_idx = INPUT0_GET_INDEX(b, p, h * HEAD_SIZE, 0); +#ifdef ENABLE_SLICE + input_idx += SLICED_FROM_START; #endif + uint cos_sin_b = b < INPUT1_BATCH_NUM ? b : 0; uint cos_sin_p = p + INPUT1_FEATURE_NUM - INPUT0_FEATURE_NUM < INPUT1_FEATURE_NUM ? p + INPUT1_FEATURE_NUM - INPUT0_FEATURE_NUM : 0; uint cos_sin_h = h < INPUT1_SIZE_Y ? h : 0; @@ -119,15 +113,13 @@ KERNEL(rope_ref)( const uint p = (uint)get_global_id(2) / HALF_ROTARY_NDIMS; const uint r = (uint)get_global_id(2) % HALF_ROTARY_NDIMS; -#ifdef ENABLE_SLICE - uint input_idx = GET_DATA_INDEX(SLICED_INPUT0, b, h, p, 0); - - input_idx += SLICED_FROM_START * (b * INPUT0_FEATURE_NUM + h + 1) - + SLICED_FROM_END * (b * INPUT0_FEATURE_NUM + h); -#elif ENABLE_TRANSPOSE - uint input_idx = GET_DATA_INDEX(TRANSPOSED_INPUT0, b, h, p, 0); +#if ENABLE_TRANSPOSE + uint input_idx = INPUT0_GET_INDEX(b, p, h, 0); #else uint input_idx = INPUT0_GET_INDEX(b, h, p, 0); +#ifdef ENABLE_SLICE + input_idx += SLICED_FROM_START; +#endif #endif uint cos_sin_b = b < INPUT1_BATCH_NUM ? b : 0; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index c114332f393c0e..55f87e4189d9fe 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -885,9 +885,10 @@ KERNEL(sdpa_opt)( #if IS_PAGED_ATTENTION const uint block_start_pos = blocked_indexes_start[target_seq_dim]; const uint block_end_pos = blocked_indexes_end[target_seq_dim]; - - uint query_offset = block_start_pos * HEAD_SIZE * NUM_HEADS + num_heads_dim * HEAD_SIZE + head_size_idx; - const uint query_pitch = HEAD_SIZE * NUM_HEADS; + uint query_offset = INPUT0_OFFSET + + block_start_pos * (HEAD_SIZE * NUM_HEADS + INPUT0_PAD_BEFORE_FEATURE_NUM + INPUT0_PAD_AFTER_FEATURE_NUM) + + num_heads_dim * HEAD_SIZE + head_size_idx; + const uint query_pitch = (HEAD_SIZE * NUM_HEADS + INPUT0_PAD_BEFORE_FEATURE_NUM + INPUT0_PAD_AFTER_FEATURE_NUM); const uint cur_target_seq_len_size = block_end_pos - block_start_pos; #else @@ -996,8 +997,11 @@ KERNEL(sdpa_opt)( const uint heads_dim = num_heads_dim; #endif #define KEY_SEQ_OFFSET subsequence_begins[gws_seq_indexes_correspondence[target_seq_dim]] - uint key_offset = KEY_SEQ_OFFSET * HEAD_SIZE * NUM_KV_HEADS + heads_dim * HEAD_SIZE + seq_len * HEAD_SIZE * NUM_KV_HEADS; - const uint key_pitch = HEAD_SIZE * NUM_KV_HEADS; + const uint key_pitch = (HEAD_SIZE * NUM_KV_HEADS + INPUT1_PAD_BEFORE_FEATURE_NUM + INPUT1_PAD_AFTER_FEATURE_NUM); + uint key_offset = INPUT1_OFFSET + + KEY_SEQ_OFFSET * key_pitch + + heads_dim * HEAD_SIZE + + seq_len * key_pitch; #else #ifdef BEAM_TABLE_TYPE const uint b_idx = beam_table[FUNC_CALL(get_bt_index_key)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, seq_len + sglid, 0)]; @@ -1225,7 +1229,7 @@ KERNEL(sdpa_opt)( // QK*V calculation MAKE_VECTOR_TYPE(OUTPUT_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) acc_output_res = OUTPUT_VAL_ZERO; #if IS_PAGED_ATTENTION - const uint value_pitch = HEAD_SIZE * NUM_KV_HEADS; + const uint value_pitch = 
(HEAD_SIZE * NUM_KV_HEADS + INPUT2_PAD_BEFORE_FEATURE_NUM + INPUT2_PAD_AFTER_FEATURE_NUM); #else #ifdef INPUT2_DIMS_ORDER uint value_offset_base = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 0, 0); @@ -1246,7 +1250,10 @@ KERNEL(sdpa_opt)( const uint heads_dim = num_heads_dim; #endif const uint value_seq_offset = subsequence_begins[gws_seq_indexes_correspondence[target_seq_dim]]; - uint value_offset = value_seq_offset * HEAD_SIZE * NUM_KV_HEADS + heads_dim * HEAD_SIZE + (start_partition_idx + (seq_len)) * HEAD_SIZE * NUM_KV_HEADS + head_size_idx; + uint value_offset = INPUT2_OFFSET + + value_seq_offset * value_pitch + + heads_dim * HEAD_SIZE + + (start_partition_idx + (seq_len)) * value_pitch + head_size_idx; #else #ifdef BEAM_TABLE_TYPE const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len) + sglid, sgid * SUBGROUP_SIZE)]; @@ -1311,7 +1318,10 @@ KERNEL(sdpa_opt)( const uint heads_dim = num_heads_dim; #endif const uint value_seq_offset = subsequence_begins[gws_seq_indexes_correspondence[target_seq_dim]]; - uint value_offset = value_seq_offset * HEAD_SIZE * NUM_KV_HEADS + heads_dim * HEAD_SIZE + (start_partition_idx + (seq_len * SUBGROUP_SIZE)) * HEAD_SIZE * NUM_KV_HEADS + head_size_idx; + uint value_offset = INPUT2_OFFSET + + value_seq_offset * value_pitch + + heads_dim * HEAD_SIZE + + (start_partition_idx + (seq_len * SUBGROUP_SIZE)) * value_pitch + head_size_idx; #else #ifdef BEAM_TABLE_TYPE const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, sgid * SUBGROUP_SIZE)]; @@ -1379,7 +1389,10 @@ KERNEL(sdpa_opt)( const uint heads_dim = num_heads_dim; #endif const uint value_seq_offset = subsequence_begins[gws_seq_indexes_correspondence[target_seq_dim]]; - uint value_offset = value_seq_offset * HEAD_SIZE * NUM_KV_HEADS + heads_dim * HEAD_SIZE + (start_partition_idx + seq_len_leftovers_start) * HEAD_SIZE * NUM_KV_HEADS + head_size_idx; + uint value_offset = INPUT2_OFFSET + + value_seq_offset * value_pitch + + heads_dim * HEAD_SIZE + + (start_partition_idx + seq_len_leftovers_start) * value_pitch + head_size_idx; #else #ifdef BEAM_TABLE_TYPE const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len_leftovers_start + sglid, sgid * SUBGROUP_SIZE)]; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp index a48632f6c45509..130c5a69d4262c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp @@ -29,38 +29,15 @@ JitConstants RoPEKernelBase::GetJitConstants(const rope_params& params, RoPEKern if (params.slice_stop > params.slice_start) { jit.AddConstant(MakeJitConstant("ENABLE_SLICE", true)); - - auto f = toCodeString(params.inputs[0].Feature(), 1); - auto x = toCodeString(params.inputs[0].X(), 2); - auto y = toCodeString(params.inputs[0].Y(), 3); - - auto sliced_val = toCodeString(params.slice_stop - params.slice_start); - auto sliced_x = params.axis == 3 ? sliced_val : x; - auto sliced_y = params.axis == 2 ? 
sliced_val : y; - - jit.AddConstant(MakeJitConstant("SLICED_INPUT0_X_PITCH", 1)); - jit.AddConstant(MakeJitConstant("SLICED_INPUT0_Y_PITCH", sliced_x)); - jit.AddConstant(MakeJitConstant("SLICED_INPUT0_FEATURE_PITCH", sliced_x + "*" + sliced_y)); - jit.AddConstant(MakeJitConstant("SLICED_INPUT0_BATCH_PITCH", sliced_x + "*" + sliced_y + "*" + f)); - jit.AddConstant(MakeJitConstant("SLICED_INPUT0_OFFSET", 0)); jit.AddConstant(MakeJitConstant("SLICED_FROM_START", toCodeString(params.slice_start))); - if (params.axis == 2) { - jit.AddConstant(MakeJitConstant("SLICED_FROM_END", "(" + y + "-" + toCodeString(params.slice_stop) + ")")); - } else if (params.axis == 3) { - jit.AddConstant(MakeJitConstant("SLICED_FROM_END", "(" + x + "-" + toCodeString(params.slice_stop) + ")")); - } else { + if (params.axis != 2 && params.axis != 3) { OPENVINO_THROW("[GPU] Invalid axis value for RoPE operation"); } } if (params.transposed_input) { jit.AddConstant(MakeJitConstant("ENABLE_TRANSPOSE", true)); - jit.AddConstant(MakeJitConstant("TRANSPOSED_INPUT0_OFFSET", 0)); - jit.AddConstant(MakeJitConstant("TRANSPOSED_INPUT0_X_PITCH", 1)); - jit.AddConstant(MakeJitConstant("TRANSPOSED_INPUT0_Y_PITCH", "INPUT0_FEATURE_PITCH")); - jit.AddConstant(MakeJitConstant("TRANSPOSED_INPUT0_FEATURE_PITCH", "INPUT0_Y_PITCH")); - jit.AddConstant(MakeJitConstant("TRANSPOSED_INPUT0_BATCH_PITCH", "INPUT0_BATCH_PITCH")); } if (!params.is_chatglm && (params.inputs[1].has_dynamic_pad() || params.inputs[2].has_dynamic_pad())) { diff --git a/src/plugins/intel_gpu/src/plugin/transformations/optimize_subsequent_reshapes.cpp b/src/plugins/intel_gpu/src/plugin/transformations/optimize_subsequent_reshapes.cpp new file mode 100644 index 00000000000000..b87600ed36e347 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/optimize_subsequent_reshapes.cpp @@ -0,0 +1,99 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "optimize_subsequent_reshapes.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "transformations/utils/utils.hpp" + +namespace ov { +namespace intel_gpu { + +OptimizeSubsequentReshapes::OptimizeSubsequentReshapes() { + using namespace ov::pass::pattern; + using ov::pass::pattern::op::Or; + + auto dynamic_batch_only = [](Output output) { + const auto& shape = output.get_partial_shape(); + + if (shape.rank().is_dynamic()) + return false; + + if (shape.size() <= 1) + return false; + + if (shape[0].is_static()) + return false; + + for (size_t i = 1; i < shape.size(); i++) + if (shape[i].is_dynamic()) + return false; + + return true; + }; + + auto first_reshape_data = any_input(dynamic_batch_only); + auto first_reshape_pattern = ov::pass::pattern::wrap_type(); + auto first_reshape = wrap_type({ first_reshape_data, first_reshape_pattern }, + ov::pass::pattern::all_of({ dynamic_batch_only, ov::pass::pattern::consumers_count(1) })); + + auto second_reshape_pattern = ov::pass::pattern::wrap_type(); + auto second_reshape = wrap_type({ first_reshape, second_reshape_pattern }, dynamic_batch_only); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + auto input_node = pattern_map.at(first_reshape_data).get_node_shared_ptr(); + auto first_reshape_node = 
pattern_map.at(first_reshape).get_node_shared_ptr(); + auto second_reshape_node = pattern_map.at(second_reshape).get_node_shared_ptr(); + + auto input_ps = first_reshape_node->input(0).get_partial_shape(); + auto first_reshape_ps = first_reshape_node->get_output_partial_shape(0); + auto second_reshape_ps = second_reshape_node->get_output_partial_shape(0); + + auto static_dims_product = [](ov::PartialShape& ps) { + int64_t total_dims = 1; + + for (auto& dim : ps) { + if (dim.is_static()) + total_dims *= dim.get_length(); + } + + return total_dims; + }; + + if (static_dims_product(input_ps) != static_dims_product(first_reshape_ps) || + static_dims_product(first_reshape_ps) != static_dims_product(second_reshape_ps)) + return false; + + std::vector new_pattern; + for (auto& dim : second_reshape_ps) { + if (dim.is_dynamic()) { + new_pattern.push_back(0); + } else { + new_pattern.push_back(dim.get_length()); + } + } + + auto new_pattern_const = std::make_shared(ov::element::i32, ov::Shape{new_pattern.size()}, new_pattern); + auto new_reshape = std::make_shared(first_reshape_node->input(0).get_source_output(), new_pattern_const, true); + new_reshape->set_friendly_name(second_reshape_node->get_friendly_name()); + + ov::replace_node(second_reshape_node, new_reshape); + copy_runtime_info(first_reshape_node, new_reshape); + + return true; + }; + + auto m = std::make_shared(second_reshape, "OptimizeSubsequentReshapes"); + this->register_matcher(m, callback); +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/optimize_subsequent_reshapes.hpp b/src/plugins/intel_gpu/src/plugin/transformations/optimize_subsequent_reshapes.hpp new file mode 100644 index 00000000000000..3a38bb92ad5167 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/optimize_subsequent_reshapes.hpp @@ -0,0 +1,23 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + +/** + * @brief This pass looks for `Reshape [ dynamic dim, n static dims] -> Reshape [dynamic dim, n static dims]` patterns + * and replaces them with a single `Reshape [dynamic dim, n static dims]` operation. 
+ */ +class OptimizeSubsequentReshapes : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("OptimizeSubsequentReshapes", "0"); + OptimizeSubsequentReshapes(); +}; + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index a33a15fbbe6a1a..bfc348d135a813 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -76,6 +76,7 @@ #include "plugin/transformations/increase_position_ids_precision.hpp" #include "plugin/transformations/group_norm_composition.hpp" #include "plugin/transformations/dynamic_quantize_fully_connected.hpp" +#include "plugin/transformations/optimize_subsequent_reshapes.hpp" #include "transformations/common_optimizations/nop_elimination.hpp" #include "transformations/common_optimizations/rms_fusion.hpp" #include "transformations/common_optimizations/broadcast_elementwise_fusion.hpp" @@ -875,6 +876,8 @@ void TransformationsPipeline::apply(std::shared_ptr func) { pass_config->disable(); pass_config->disable(); + manager.register_pass(); + manager.register_pass(); // This Validate is needed for proper data type propagation after applying IncreasePositionIdsPrecision pass manager.register_pass(); diff --git a/src/plugins/intel_gpu/tests/unit/transformations/optimize_subsequent_reshapes_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/optimize_subsequent_reshapes_test.cpp new file mode 100644 index 00000000000000..732a14be03bf39 --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/transformations/optimize_subsequent_reshapes_test.cpp @@ -0,0 +1,97 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include + +#include "openvino/pass/manager.hpp" +#include "openvino/core/model.hpp" +#include "openvino/core/coordinate_diff.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/reshape.hpp" + +#include +#include +#include + +#include "common_test_utils/ov_test_utils.hpp" + +using namespace testing; +using namespace ov::intel_gpu; + +TEST_F(TransformationTestsF, OptimizeSubsequentReshapes1) { + { + auto input = std::make_shared(ov::element::i64, ov::PartialShape{ -1, 1, 4096 }); + auto first_reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{ 0, 0, 32, 128 }); + auto first_reshape = std::make_shared(input, first_reshape_pattern, true); + + auto second_reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{ 0, -1 }); + auto second_reshape = std::make_shared(first_reshape, second_reshape_pattern, true); + auto result = std::make_shared(second_reshape); + + model = std::make_shared(ov::NodeVector{ result }, ov::ParameterVector{ input }); + manager.register_pass(); + } + { + auto input = std::make_shared(ov::element::i64, ov::PartialShape{ -1, 1, 4096 }); + auto reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{ 0, 4096 }); + auto reshape = std::make_shared(input, reshape_pattern, true); + auto result = std::make_shared(reshape); + + model_ref = std::make_shared(ov::NodeVector{ result }, ov::ParameterVector{ input }); + } + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, OptimizeSubsequentReshapes2) { + { + auto input = std::make_shared(ov::element::i64, ov::PartialShape{ -1, 1, 4096 }); + auto 
first_reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{ 0, 0, 32, 128 }); + auto first_reshape = std::make_shared(input, first_reshape_pattern, true); + + auto second_reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{ 0, 32, 1, 0 }); + auto second_reshape = std::make_shared(first_reshape, second_reshape_pattern, true); + auto result = std::make_shared(second_reshape); + + model = std::make_shared(ov::NodeVector{ result }, ov::ParameterVector{ input }); + manager.register_pass(); + } + { + auto input = std::make_shared(ov::element::i64, ov::PartialShape{ -1, 1, 4096 }); + auto reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{ 0, 32, 1, 128 }); + auto reshape = std::make_shared(input, reshape_pattern, true); + auto result = std::make_shared(reshape); + + model_ref = std::make_shared(ov::NodeVector{ result }, ov::ParameterVector{ input }); + } + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, OptimizeSubsequentReshapes3) { + { + auto input = std::make_shared(ov::element::i64, ov::PartialShape{ -1, 32, 1, 128 }); + auto first_reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{ 0, 1, 32, 0 }); + auto first_reshape = std::make_shared(input, first_reshape_pattern, true); + + auto second_reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{ 0, -1 }); + auto second_reshape = std::make_shared(first_reshape, second_reshape_pattern, true); + auto result = std::make_shared(second_reshape); + + model = std::make_shared(ov::NodeVector{ result }, ov::ParameterVector{ input }); + manager.register_pass(); + } + { + auto input = std::make_shared(ov::element::i64, ov::PartialShape{ -1, 32, 1, 128 }); + auto reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{ 0, 4096 }); + auto reshape = std::make_shared(input, reshape_pattern, true); + auto result = std::make_shared(reshape); + + model_ref = std::make_shared(ov::NodeVector{ result }, ov::ParameterVector{ input }); + } + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} From 2e87aad94024f310a4d2338482fe1bb1167d3c2a Mon Sep 17 00:00:00 2001 From: Roman Lyamin Date: Thu, 31 Oct 2024 10:34:40 +0400 Subject: [PATCH 120/233] [GPU] Handling the case where get_state was called before set_state (#27276) ### Tickets: - *[156193](https://jira.devtools.intel.com/browse/CVS-156193)* --- src/plugins/intel_gpu/src/plugin/variable_state.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/plugins/intel_gpu/src/plugin/variable_state.cpp b/src/plugins/intel_gpu/src/plugin/variable_state.cpp index 6b1c8d0cfc993f..2b7a26ba35a292 100644 --- a/src/plugins/intel_gpu/src/plugin/variable_state.cpp +++ b/src/plugins/intel_gpu/src/plugin/variable_state.cpp @@ -123,6 +123,12 @@ ov::element::Type VariableState::get_user_specified_type() const { } ov::SoPtr VariableState::get_state() const { + if (m_memory == nullptr) { + const auto& pshape = m_layout.get_partial_shape(); + const auto& shape = get_tensor_shape(pshape); + return m_context->create_host_tensor(get_user_specified_type(), shape); + } + auto tensor = m_context->create_host_tensor(get_user_specified_type(), m_memory->get_layout().get_shape()); convert_and_copy(m_memory, tensor._ptr.get(), m_context->get_engine().get_service_stream()); From a6a113ce01a49c8259c709a312df9e0f49eac970 Mon Sep 17 00:00:00 2001 From: Roman Lyamin Date: Thu, 31 Oct 2024 10:35:01 +0400 Subject: [PATCH 121/233] 
[GPU] Use onednn impl for dynamic gemm (#27212) ### Details: - *Performance improvement for LoRA* --- .../src/graph/impls/registry/gemm_impls.cpp | 5 ++- .../tests/unit/fusions/gemm_fusion_test.cpp | 4 +-- .../tests/unit/test_cases/gemm_gpu_test.cpp | 33 +++++++++++++------ 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/registry/gemm_impls.cpp b/src/plugins/intel_gpu/src/graph/impls/registry/gemm_impls.cpp index 66947ef1a84a00..436a3bb257b483 100644 --- a/src/plugins/intel_gpu/src/graph/impls/registry/gemm_impls.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/registry/gemm_impls.cpp @@ -19,7 +19,10 @@ const std::vector>& Registry static const std::vector> impls = { OV_GPU_CREATE_INSTANCE_ONEDNN(onednn::GemmImplementationManager, shape_types::static_shape) OV_GPU_GET_INSTANCE_OCL(gemm, shape_types::static_shape) - OV_GPU_GET_INSTANCE_OCL(gemm, shape_types::dynamic_shape) + OV_GPU_GET_INSTANCE_OCL(gemm, shape_types::dynamic_shape, + [](const program_node& node) { + return !node.can_use(impl_types::onednn); + }) }; return impls; diff --git a/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp index baed5400181130..659ccaf9d8a723 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp @@ -455,8 +455,8 @@ TEST_P(gemm_2in_dynamic_add, add) { } INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_dynamic_add, ::testing::ValuesIn(std::vector{ - gemm_test_params{ CASE_GEMM_2IN_FP16_3D_1, 4, 5, "", broadcast_kinds::batch, eltwise_mode::sum }, - gemm_test_params{ CASE_GEMM_2IN_FP16_3D_1, 4, 5, "", broadcast_kinds::feature, eltwise_mode::sum }, + gemm_test_params{ CASE_GEMM_2IN_FP16_3D_1, 4, 4, "gemm_tiled_opt", broadcast_kinds::batch, eltwise_mode::sum }, + gemm_test_params{ CASE_GEMM_2IN_FP16_3D_1, 4, 4, "gemm_tiled_opt", broadcast_kinds::feature, eltwise_mode::sum }, })); class gemm_2in_act_scale_quantize_i8 : public GemmFusingTest {}; diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp index 3b41f44050e527..df493544624b64 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp @@ -473,6 +473,9 @@ class gemm_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::optimize_data(true)); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + ov::intel_gpu::ImplementationDesc gemm_impl = { format::bfyx, "", impl_types::ocl }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm_ref", gemm_impl} })); + network network(engine, topology, config); network.set_input_data("input1", input1_mem); network.set_input_data("input2", input2_mem); @@ -498,6 +501,10 @@ class gemm_gpu_tests: public ::testing::Test { ExecutionConfig config = get_test_default_config(engine); config.set_property(ov::intel_gpu::optimize_data(true)); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + + ov::intel_gpu::ImplementationDesc gemm_impl = { format::bfyx, "", impl_types::ocl }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", gemm_impl} })); + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); network->set_input_data("input1", input1_mem); 
network->set_input_data("input2", input2_mem); @@ -1246,10 +1253,12 @@ class gemm_gpu_tests: public ::testing::Test { network->set_input_data("input0", input0_mem); network->set_input_data("input1", input1_mem); - auto inst = network->get_primitive("gemm"); - auto impl = inst->get_impl(); - ASSERT_TRUE(impl != nullptr); - ASSERT_TRUE(impl->is_dynamic() == is_input_dynamic); + if (!engine.get_device_info().supports_immad) { + auto inst = network->get_primitive("gemm"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + ASSERT_TRUE(impl->is_dynamic() == is_input_dynamic); + } auto outputs = network->execute(); @@ -1533,10 +1542,12 @@ class gemm_gpu_tests: public ::testing::Test { network->set_input_data("input0", input0_mem); network->set_input_data("input1", input1_mem); - auto inst = network->get_primitive("gemm"); - auto impl = inst->get_impl(); - ASSERT_TRUE(impl != nullptr); - ASSERT_TRUE(impl->is_dynamic() == is_input_dynamic); + if (!engine.get_device_info().supports_immad) { + auto inst = network->get_primitive("gemm"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + ASSERT_TRUE(impl->is_dynamic() == is_input_dynamic); + } auto outputs = network->execute(); @@ -2853,8 +2864,10 @@ class gemm_onednn: public ::testing::Test { auto inst = network->get_primitive("gemm"); auto impl = inst->get_impl(); - ASSERT_TRUE(impl != nullptr); - ASSERT_TRUE(impl->is_dynamic()); + if (!engine.get_device_info().supports_immad) { + ASSERT_TRUE(impl != nullptr); + ASSERT_TRUE(impl->is_dynamic()); + } auto outputs = network->execute(); From 9f6826ad4058dca14b3065f9448f2aaf0bd76f07 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 07:35:47 +0100 Subject: [PATCH 122/233] Bump paddlepaddle from 2.6.0 to 2.6.2 in /src/bindings/python (#26966) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [paddlepaddle](https://github.com/paddlepaddle/paddle) from 2.6.0 to 2.6.2.
Release notes

Sourced from paddlepaddle's releases.

PaddlePaddle 2.6.1 Release Note

Release summary

In terms of new features, this release introduces support for Fake GroupWise Quant, which helps users quantize models more effectively. It also adds PGLBox, a graph neural network training engine that enables efficient multi-node, multi-GPU training of very large graph models, and adds support for custom devices, further extending the scope of PaddlePaddle's functionality. On the bug side, it resolves issues in core functionality, data loading, and network communication. Several security issues, including potential vulnerabilities, were fixed to further harden the framework code, and the security advisories were updated.

New features

  • Support Fake GroupWise quantization (#61900): adds support for a quantization method that can improve model performance and efficiency.
  • Support GPU training of graph neural networks (#60495, #62111): adds PGLBox, a graph neural network training engine that enables efficient multi-node, multi-GPU training of very large graph models.
  • Other improvements: support int8-mode inference for the tile op and add vlog statements (#60261); repeat_interleave supports Tensor inputs of bfloat16 data type (#61854); custom devices support the c_embedding operator in dynamic-graph mode (#60774); add IntrinsicOps into ir_codes_collector in the CINN framework.

Bug fixes

  • Fix a weight quantization kernel error (#60184): resolves an issue in the weight quantization kernel when n is not divisible by 64.
  • Fix a quantization-aware test issue (#61211): fixes a problem in the quantization-aware training (QAT) tests to ensure they run correctly.
  • Fix Paddle-TRT integration issues (#61806, #61605, #61966): multiple fixes to the Paddle-TRT integration, including cached key-value (KV) quantization and unit-test failures.
  • Disable the LLM_INT8 UT (#62282): disables the large language model (LLM) INT8 precision unit test to avoid unnecessary runtime.
  • Fix the test_benchmark unit-test compilation failure (#61427): fixes the test_benchmark compilation failure reported in #60092.
  • Fix the toolkit's data loader (#61867): makes the necessary corrections to the toolkit's data loader.
  • Fix put_along_axis issues (#62065): adds min/max/mean as supported values of the reduce parameter, fixes the backward gradient computation for reduce=add/mul, fixes the GPU forward computation for reduce=mul, and fixes the forward computation when size is very large.
  • Fix a compilation bug on Windows (#60308): fixes a Windows build failure where the common library could not be found.
  • Fix an OpenSSL-CPU compilation error (#62079): fixes a build failure in the cpu-openblas configuration caused by the Python library not being linked correctly.

Security fixes

Documentation

  • Documentation style improvements (#61688): improves the style and formatting of the documentation.
  • Security advisory updates (#60532, #60649): updates the 2023 security advisories to notify users of potential security issues.
Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=paddlepaddle&package-manager=pip&previous-version=2.6.0&new-version=2.6.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- src/bindings/python/constraints.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bindings/python/constraints.txt b/src/bindings/python/constraints.txt index c7837798c8aca7..a0fbf982105ad6 100644 --- a/src/bindings/python/constraints.txt +++ b/src/bindings/python/constraints.txt @@ -18,7 +18,7 @@ patchelf<=0.17.2.1 # Frontends h5py>=3.1.0,<3.13.0 docopt~=0.6.2 -paddlepaddle==2.6.0 +paddlepaddle==2.6.2 tensorflow>=1.15.5,<2.18.0 six~=1.16.0 protobuf>=3.18.1,<4.0.0 From 2f78f6f9ca0f9da93de1751948621513b50e57fa Mon Sep 17 00:00:00 2001 From: Anastasia Kuporosova Date: Thu, 31 Oct 2024 09:04:54 +0100 Subject: [PATCH 123/233] [PyOV] allow constant accept keyword args (#27346) ### Details: - using of singledispatch breaks the scenario where a user pass keywords arguments only ### Tickets: - CVS-153553 --- src/bindings/python/src/openvino/runtime/opset13/ops.py | 8 ++++---- src/bindings/python/tests/test_graph/test_constant.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/bindings/python/src/openvino/runtime/opset13/ops.py b/src/bindings/python/src/openvino/runtime/opset13/ops.py index cb201d3d4263dd..a624ffb4f79873 100644 --- a/src/bindings/python/src/openvino/runtime/opset13/ops.py +++ b/src/bindings/python/src/openvino/runtime/opset13/ops.py @@ -15,7 +15,7 @@ from openvino.runtime.op import Constant, Result from openvino.runtime.opset1 import convert_like from openvino.runtime.opset_utils import _get_node_factory -from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op +from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op, overloading from openvino.runtime.utils.types import ( NumericData, NodeInput, @@ -271,7 +271,7 @@ def scaled_dot_product_attention( return _get_node_factory_opset13().create("ScaledDotProductAttention", inputs, attributes) -@singledispatch +@overloading(Union[NumericData, np.number, bool, np.bool_, list], Union[NumericType, Type], Optional[str], bool) # type: ignore @nameable_op def constant( value: Union[NumericData, np.number, bool, np.bool_, list], @@ -339,9 +339,9 @@ def display_shared_memory_warning(warning_message: str) -> None: return Constant(_value, shared_memory=_shared_memory) -@constant.register +@overloading(Tensor, bool, Optional[str]) # type: ignore @nameable_op -def _( +def constant( # noqa: F811 tensor: Tensor, shared_memory: bool = False, name: Optional[str] = None, diff --git a/src/bindings/python/tests/test_graph/test_constant.py b/src/bindings/python/tests/test_graph/test_constant.py index e28a4ad05510f2..131654855b380a 100644 --- a/src/bindings/python/tests/test_graph/test_constant.py +++ b/src/bindings/python/tests/test_graph/test_constant.py @@ -87,7 +87,7 @@ def test_init_with_array(src_dtype, dst_dtype, shared_flag, data_getter): data = np.ascontiguousarray(data) # Create constant from based on numpy dtype or openvino type - ov_const = ops.constant(data, dtype=dst_dtype, shared_memory=shared_flag) + ov_const = ops.constant(data, dst_dtype, shared_memory=shared_flag) # Check shape and element type of Constant class assert isinstance(ov_const, Constant) @@ -842,7 +842,7 @@ def test_get_data_casting_bf16(src_dtype, dst_dtype, copy_flag): ) def test_get_data_casting_packed(src_dtype, ov_type, dst_dtype, copy_flag): data = np.array([[0, 0, 0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 1]], dtype=src_dtype) - ov_const = ops.constant(data, 
dtype=ov_type) + ov_const = ops.constant(value=data, dtype=ov_type) arr = ov_const.get_data(dtype=dst_dtype, copy=copy_flag) if dst_dtype is None: @@ -867,7 +867,7 @@ def test_const_from_tensor(shared_flag): shape = [1, 3, 32, 32] arr = np.ones(shape).astype(np.float32) ov_tensor = Tensor(arr, shape, Type.f32) - ov_const = ops.constant(ov_tensor, shared_memory=shared_flag) + ov_const = ops.constant(tensor=ov_tensor, shared_memory=shared_flag) assert isinstance(ov_const, Constant) assert np.all(list(ov_const.shape) == shape) From 66b14678502d1aada20d4d6357157b3dff2adcf8 Mon Sep 17 00:00:00 2001 From: Dmitry Matveev Date: Thu, 31 Oct 2024 08:32:49 +0000 Subject: [PATCH 124/233] NPUW: Eliminate unnecessary kvcache tensors copy (#27347) ### Details: - We mistakenly copy input parameters when we shouldn't - Yet another `||` -> `&&` change, hopefully less destructive this time ### Tickets: - *ticket-id* --- .../intel_npu/src/plugin/npuw/just_sync_infer_request.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index 26363e66e55d2a..0e0b96582a663c 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -597,7 +597,7 @@ void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) { LOG_BLOCK(); if (!is_spatial_param(sub_in_idx)) { // Input parameter is non-spatial, do normal handling - if (do_copy || m_input_allocated.count(g_tnsr->data()) == 0) { + if (m_input_allocated.count(g_tnsr->data()) == 0 && do_copy) { LOG_DEBUG("Will be copied"); copy_list.emplace_back(g_tnsr, s_port); } else { From 44b86a860ecb0a3e79e6f75627d6cc5270226e7a Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 31 Oct 2024 12:54:52 +0400 Subject: [PATCH 125/233] benchmark_app/cpp: report an error if no files were found. (#26663) Python version already reports an error in that case. benchmark_app is the only user of `readInputFilesArguments()`. It could make sense earlier to emit the warning instead of the error because other samples. Ticket 152614 --- samples/cpp/common/utils/src/args_helper.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/samples/cpp/common/utils/src/args_helper.cpp b/samples/cpp/common/utils/src/args_helper.cpp index f4a3d10ceb0b5b..ba58f98e498e90 100644 --- a/samples/cpp/common/utils/src/args_helper.cpp +++ b/samples/cpp/common/utils/src/args_helper.cpp @@ -29,8 +29,7 @@ void readInputFilesArguments(std::vector& files, const std::string& arg) { struct stat sb; if (stat(arg.c_str(), &sb) != 0) { - slog::warn << "File " << arg << " cannot be opened!" << slog::endl; - return; + throw std::invalid_argument(arg + " file or directory not found."); } if (S_ISDIR(sb.st_mode)) { struct CloseDir { @@ -43,17 +42,20 @@ void readInputFilesArguments(std::vector& files, const std::string& using Dir = std::unique_ptr; Dir dp(opendir(arg.c_str())); if (dp == nullptr) { - slog::warn << "Directory " << arg << " cannot be opened!" << slog::endl; - return; + throw std::invalid_argument(arg + " directory cannot be opened!"); } struct dirent* ep; + size_t files_size = files.size(); while (nullptr != (ep = readdir(dp.get()))) { std::string fileName = ep->d_name; if (fileName == "." 
|| fileName == "..") continue; files.push_back(arg + "/" + ep->d_name); } + if (files.size() == files_size) { + throw std::invalid_argument("No files were found in directory " + arg); + } } else { files.push_back(arg); } From 86083e0dbf8d173451a8ee47fa40496a62aea893 Mon Sep 17 00:00:00 2001 From: Mateusz Mikolajczyk Date: Thu, 31 Oct 2024 10:23:59 +0100 Subject: [PATCH 126/233] [Transformations] Add Squeeze-15 downgrade transformation (#27286) ### Details: - *Add Squeeze-15 downgrade transformation to Squeeze-0 for compatible attribute* - *...* ### Tickets: - *CVS-154027* ### PR requires [PR-26995](https://github.com/openvinotoolkit/openvino/pull/26995) to be merged --------- Co-authored-by: Michal Lukaszewski --- .../convert_squeeze15_downgrade.hpp | 23 ++++ .../common_optimizations.cpp | 2 + .../convert_squeeze15_downgrade.cpp | 40 +++++++ .../convert_squeeze15_downgrade_test.cpp | 112 ++++++++++++++++++ 4 files changed, 177 insertions(+) create mode 100644 src/common/transformations/include/transformations/op_conversions/convert_squeeze15_downgrade.hpp create mode 100644 src/common/transformations/src/transformations/op_conversions/convert_squeeze15_downgrade.cpp create mode 100644 src/common/transformations/tests/op_conversions/convert_squeeze15_downgrade_test.cpp diff --git a/src/common/transformations/include/transformations/op_conversions/convert_squeeze15_downgrade.hpp b/src/common/transformations/include/transformations/op_conversions/convert_squeeze15_downgrade.hpp new file mode 100644 index 00000000000000..c2ebfbc0f3138b --- /dev/null +++ b/src/common/transformations/include/transformations/op_conversions/convert_squeeze15_downgrade.hpp @@ -0,0 +1,23 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/matcher_pass.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { +/** + * @ingroup ov_transformation_common_api + * @brief Converts Squeeze v15 to Squeeze v0. 
+ */ +class TRANSFORMATIONS_API ConvertSqueeze15ToSqueeze0 : public MatcherPass { +public: + OPENVINO_RTTI("ConvertSqueeze15ToSqueeze0", "0"); + ConvertSqueeze15ToSqueeze0(); +}; + +} // namespace pass +} // namespace ov diff --git a/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp b/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp index 9d46b583a828f2..37ee2d12d9aebb 100644 --- a/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp @@ -98,6 +98,7 @@ #include "transformations/op_conversions/convert_softmax_downgrade.hpp" #include "transformations/op_conversions/convert_softmax_upgrade.hpp" #include "transformations/op_conversions/convert_space_to_depth.hpp" +#include "transformations/op_conversions/convert_squeeze15_downgrade.hpp" #include "transformations/op_conversions/convert_subtract.hpp" #include "transformations/op_conversions/convert_topk11_downgrade.hpp" #include "transformations/op_conversions/convert_xor_to_logical_xor.hpp" @@ -235,6 +236,7 @@ bool ov::pass::CommonOptimizations::run_on_model(const std::shared_ptr(); ADD_MATCHER(fq_fusions, FakeQuantizeMulFusion) diff --git a/src/common/transformations/src/transformations/op_conversions/convert_squeeze15_downgrade.cpp b/src/common/transformations/src/transformations/op_conversions/convert_squeeze15_downgrade.cpp new file mode 100644 index 00000000000000..50701d3d6acd56 --- /dev/null +++ b/src/common/transformations/src/transformations/op_conversions/convert_squeeze15_downgrade.cpp @@ -0,0 +1,40 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/op_conversions/convert_squeeze15_downgrade.hpp" + +#include "itt.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/op/squeeze.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "transformations/utils/utils.hpp" + +ov::pass::ConvertSqueeze15ToSqueeze0::ConvertSqueeze15ToSqueeze0() { + MATCHER_SCOPE(ConvertSqueeze15ToSqueeze0); + + const auto& squeeze_v15_pattern = pattern::wrap_type(); + + const matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](pattern::Matcher& m) { + const auto& squeeze_v15 = ov::as_type_ptr(m.get_match_root()); + if (!squeeze_v15 || transformation_callback(squeeze_v15)) { + return false; + } + std::shared_ptr squeeze_v0; + if (squeeze_v15->get_input_size() == 1) { + squeeze_v0 = std::make_shared(squeeze_v15->input_value(0)); + } else if (squeeze_v15->get_input_size() == 2 && !squeeze_v15->get_allow_axis_skip()) { + squeeze_v0 = std::make_shared(squeeze_v15->input_value(0), squeeze_v15->input_value(1)); + } else { + return false; + } + squeeze_v0->set_friendly_name(squeeze_v15->get_friendly_name()); + copy_runtime_info(squeeze_v15, squeeze_v0); + replace_node(squeeze_v15, squeeze_v0); + + return true; + }; + + auto m = std::make_shared(squeeze_v15_pattern, matcher_name); + register_matcher(m, callback); +} diff --git a/src/common/transformations/tests/op_conversions/convert_squeeze15_downgrade_test.cpp b/src/common/transformations/tests/op_conversions/convert_squeeze15_downgrade_test.cpp new file mode 100644 index 00000000000000..f3d90ab2c748bd --- /dev/null +++ b/src/common/transformations/tests/op_conversions/convert_squeeze15_downgrade_test.cpp @@ -0,0 +1,112 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: 
Apache-2.0 +// + +#include "transformations/op_conversions/convert_squeeze15_downgrade.hpp" + +#include + +#include + +#include "common_test_utils/ov_test_utils.hpp" +#include "openvino/opsets/opset1.hpp" +#include "openvino/opsets/opset15.hpp" +#include "openvino/pass/manager.hpp" +#include "transformations/utils/utils.hpp" +using namespace ov; +using namespace testing; + +namespace { + +enum class IndicesMode { NONE, CONST, PARAM }; + +std::shared_ptr create_v15_model(const IndicesMode indices_mode, + const std::vector indices_const_val, + const bool allow_axis_skip) { + const PartialShape data_shape{-1, {2, 5}, 1, {1, 5}, 4}; + const auto& data = std::make_shared(ov::element::f32, data_shape); + ov::ParameterVector params = {data}; + std::shared_ptr squeeze; + if (indices_mode == IndicesMode::NONE) { + squeeze = std::make_shared(data, allow_axis_skip); + } else if (indices_mode == IndicesMode::PARAM) { + const auto& indices = + std::make_shared(ov::element::i32, PartialShape({data_shape.rank()})); + params.push_back(indices); + squeeze = std::make_shared(data, indices, allow_axis_skip); + } else if (indices_mode == IndicesMode::CONST) { + const auto& indices = + ov::opset15::Constant::create(ov::element::i32, Shape({indices_const_val.size()}), indices_const_val); + squeeze = std::make_shared(data, indices, allow_axis_skip); + } + squeeze->set_friendly_name("squeeze15"); + return std::make_shared(squeeze->outputs(), params); +} + +std::shared_ptr create_v1_model(const IndicesMode indices_mode, const std::vector indices_const_val) { + const PartialShape data_shape{-1, {2, 5}, 1, {1, 5}, 4}; + const auto& data = std::make_shared(ov::element::f32, data_shape); + ov::ParameterVector params = {data}; + std::shared_ptr squeeze; + if (indices_mode == IndicesMode::NONE) { + squeeze = std::make_shared(data); + } else if (indices_mode == IndicesMode::PARAM) { + const auto& indices = + std::make_shared(ov::element::i32, PartialShape({data_shape.rank()})); + params.push_back(indices); + squeeze = std::make_shared(data, indices); + } else if (indices_mode == IndicesMode::CONST) { + const auto& indices = + ov::opset1::Constant::create(ov::element::i32, Shape({indices_const_val.size()}), indices_const_val); + squeeze = std::make_shared(data, indices); + } + squeeze->set_friendly_name("squeeze15"); + return std::make_shared(squeeze->outputs(), params); +} + +} // namespace + +TEST_F(TransformationTestsF, ConvertSqueeze15ToSqueeze1_no_indices_no_skip) { + manager.register_pass(); + model = create_v15_model(IndicesMode::NONE, {}, false); + model_ref = create_v1_model(IndicesMode::NONE, {}); + EXPECT_EQ(model->output(0).get_partial_shape(), model_ref->output(0).get_partial_shape()); + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); + comparator.enable(FunctionsComparator::CmpValues::NAMES); +} + +TEST_F(TransformationTestsF, ConvertSqueeze15ToSqueeze1_no_indices_skip) { + manager.register_pass(); + model = create_v15_model(IndicesMode::NONE, {}, true); + model_ref = create_v1_model(IndicesMode::NONE, {}); + EXPECT_EQ(model->output(0).get_partial_shape(), model_ref->output(0).get_partial_shape()); + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); + comparator.enable(FunctionsComparator::CmpValues::NAMES); +} + +TEST_F(TransformationTestsF, ConvertSqueeze15ToSqueeze1_const_indices_no_skip) { + manager.register_pass(); + model = 
create_v15_model(IndicesMode::CONST, {0, -4, 3}, false); + model_ref = create_v1_model(IndicesMode::CONST, {0, -4, 3}); + EXPECT_EQ(model->output(0).get_partial_shape(), model_ref->output(0).get_partial_shape()); + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); + comparator.enable(FunctionsComparator::CmpValues::NAMES); +} + +TEST_F(TransformationTestsF, ConvertSqueeze15ToSqueeze1_dynamic_indices_no_skip) { + manager.register_pass(); + model = create_v15_model(IndicesMode::PARAM, {}, false); + model_ref = create_v1_model(IndicesMode::PARAM, {}); + EXPECT_EQ(model->output(0).get_partial_shape(), model_ref->output(0).get_partial_shape()); + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); + comparator.enable(FunctionsComparator::CmpValues::NAMES); +} + +TEST_F(TransformationTestsF, ConvertSqueeze15ToSqueeze1_unsupported_skip) { + manager.register_pass(); + model = create_v15_model(IndicesMode::PARAM, {}, true); +} From c685d44493f5a4b0403038f6f1ce9f350cfc0581 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Thu, 31 Oct 2024 13:24:38 +0400 Subject: [PATCH 127/233] [Snippets][CPU][Tests] Added tests for dynamic BF16/INT8 MHA (#27169) ### Details: - *Added more tests for the validation of INT8/BF16 MHA in CPU Plugin* - *Split the large "mha.cpp" file into the several small files with the same test semantic (comment https://github.com/openvinotoolkit/openvino/pull/26547#discussion_r1796616083)* ### Tickets: - *N/A* --- .../skip_tests_config.cpp | 8 +- .../snippets/matmul.cpp | 38 +- .../shared_tests_instances/snippets/mha.cpp | 543 +++--------------- .../snippets/mha_extracted_reshape.cpp | 40 ++ .../snippets/mha_fma.cpp | 33 ++ .../snippets/mha_quantized.cpp | 103 ++++ .../snippets/mha_select.cpp | 41 ++ .../snippets/mha_split_dim_m.cpp | 121 ++++ .../snippets/mha_transposed_b.cpp | 50 ++ .../snippets/mha_with_dyn_mul.cpp | 68 +++ .../snippets/mha_wo_transpose.cpp | 151 +++++ .../snippets/transpose_matmul.cpp | 32 +- .../shared_tests_instances/snippets/utils.hpp | 48 ++ .../plugin/shared/include/snippets/mha.hpp | 3 + .../plugin/shared/src/snippets/mha.cpp | 20 +- .../include/subgraph_mha.hpp | 15 +- .../ov_snippets_models/src/subgraph_mha.cpp | 113 ++-- 17 files changed, 807 insertions(+), 620 deletions(-) create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_extracted_reshape.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_transposed_b.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_wo_transpose.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/utils.hpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp 
b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 6edc4f062536d0..90820d550df179 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -237,7 +237,6 @@ std::vector disabledTestPatterns() { R"(.*smoke_FakeQuantize.*/FakeQuantizeLayerTest.Inference.*TS=.*3.4.2.5.*LEVELS=255.*)", R"(.*smoke_FakeQuantizePerChannel.*/FakeQuantizeLayerTest.Inference.*TS=.*11.10.22.19.*LEVELS=(255|256).*netPRC=f32.*)", R"(.*smoke_MVN_5D/Mvn6LayerTest.Inference.*TS=.*3.4.2.5.*LEVELS=255.*netPRC=f16.*)", - R"(.*smoke_Snippets_MHAINT8MatMul/MHAINT8MatMul.*)", R"(.*smoke_static/ConvertFqRnnToQuantizedRnn.*2.1.5.*2.1.1.*2.1.1.*)", R"(.*smoke_InterpolateBicubicPillow_Layout_Test/InterpolateLayerCPUTest.CompareWithRefs/ShapeCalcMode=sizes_IS=\[?.2..20.?.?\]_TS.*1.17.4.4.*2.3.10.12.*1.17.4.4.*Sizes.*4.4.*10.20.*10.4.*PARAMETER.*0.0.0.0.*0.0.1.1.*2.3.*)", R"(.*smoke_LoopForCommon/LoopLayerCPUTest.CompareWithRefs/.*_netType=bf16.*)", @@ -563,7 +562,7 @@ std::vector disabledTestPatterns() { // ignored for not supported bf16 platforms retVector.emplace_back(R"(.*smoke_Snippets_EnforcePrecision_bf16.*)"); retVector.emplace_back(R"(.*smoke_Snippets_MHAWOTransposeEnforceBF16.*)"); - retVector.emplace_back(R"(.*smoke_Snippets_MHAEnforceBF16.*)"); + retVector.emplace_back(R"(.*smoke_Snippets_MHA.*EnforceBF16.*)"); retVector.emplace_back(R"(.*ConcatSDPTest.*bf16.*)"); } // [150842] Need to support dynamic K dimension of BF16|INT8 MatMul on AMX systems @@ -572,6 +571,11 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*smoke_Snippets_MatMul/MatMul.CompareWithRefImpl/.*IS\[0\]=\[\?.\?.\?.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); retVector.emplace_back(R"(.*smoke_Snippets_MatMulTransposeB.*IS\[0\]=\[\?.\?.\?.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); retVector.emplace_back(R"(.*smoke_Snippets_MatMulBias.*IS\[0\]=\[\?.\?.\?.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); + + retVector.emplace_back(R"(.*smoke_Snippets_MHAWOTransposeEnforceBF16_3D.*IS\[1\]=\[2.64.\?\].*)"); + retVector.emplace_back(R"(.*smoke_Snippets_MHA.*BF16.*/MHA.*IS\[0\]=\[(\?|1).(\?|4).(\?|12).(\?|64)\].*)"); + retVector.emplace_back(R"(.*smoke_Snippets_MHA.*BF16.*/MHA.*IS\[0\]=\[\?.\?.\?\].*)"); + retVector.emplace_back(R"(.*smoke_Snippets_(MHAINT8MatMul|MHAQuantMatMul0|MHAFQAfterMatMul_4D|smoke_Snippets_MHAFQ).*IS\[0\]=\[\?.\?.\?\.\?].*)"); } #ifdef SNIPPETS_LIBXSMM_TPP // GN in TPP requires exposing tmp Buffer results outside the loop (ticket: 151234) diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp index f5057137f9b65c..176f0cb4d46aed 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp @@ -4,44 +4,26 @@ #include "snippets/matmul.hpp" -#include "common_test_utils/test_constants.hpp" -#include "openvino/runtime/system_conf.hpp" +#include "utils.hpp" namespace ov { namespace test { namespace snippets { -#define STATIC_SHAPES(...) 
static_shapes_to_test_representation(std::vector>{__VA_ARGS__}) - namespace { -static inline std::vector> quantized_precisions() { - std::vector> prc = {}; - // In Snippets MatMul INT8 is supported only on VNNI/AMX platforms - if (ov::with_cpu_x86_avx512_core_vnni() || ov::with_cpu_x86_avx512_core_amx_int8()) { - prc.emplace_back(std::vector{element::i8, element::i8}); - prc.emplace_back(std::vector{element::u8, element::i8}); - } - return prc; -} - static inline std::vector> precisions() { - std::vector> prc = { - {element::f32, element::f32}, - }; + std::vector> prc = precision_f32(2); // Note: TPP doesn't support low precisions yet #ifndef SNIPPETS_LIBXSMM_TPP - auto quant = quantized_precisions(); + auto quant = quantized_precisions_if_supported(); std::copy(quant.begin(), quant.end(), std::back_inserter(prc)); - // In Snippets MatMul BF16 is supported only on bf16/AMX platforms - if (ov::with_cpu_x86_bfloat16() || ov::with_cpu_x86_avx512_core_amx_bf16()) { - prc.emplace_back(std::vector{element::bf16, element::bf16}); - } + auto bfloat = precision_bf16_if_supported(2); + std::copy(bfloat.begin(), bfloat.end(), std::back_inserter(prc)); #endif return prc; } - std::vector> input_shapes{ { {{}, {{2, 1, 3, 5}}}, {{}, {{1, 3, 5, 3}}} }, { {{}, {{3, 1, 32, 14}}}, {{}, {{1, 3, 14, 37}}} }, @@ -158,7 +140,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulBias, MatMulBias, INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulBiasQuantized, MatMulBiasQuantized, ::testing::Combine( ::testing::ValuesIn(input_shapes_bias), - ::testing::ValuesIn(quantized_precisions()), + ::testing::ValuesIn(quantized_precisions_if_supported()), ::testing::Values(MatMulType::MatMul), ::testing::Values(1), // Subgraph ::testing::Values(1), // Tokenized MatMul+Bias @@ -167,8 +149,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulBiasQuantized, MatMulBiasQuantized INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulsQuantized, MatMulsQuantized, ::testing::Combine( - ::testing::ValuesIn(STATIC_SHAPES({{1, 16, 128, 64}, {1, 16, 64, 128}, {128, 64}})), - ::testing::ValuesIn(quantized_precisions()), + ::testing::ValuesIn(SNIPPETS_TESTS_STATIC_SHAPES({{1, 16, 128, 64}, {1, 16, 64, 128}, {128, 64}})), + ::testing::ValuesIn(quantized_precisions_if_supported()), ::testing::Values(MatMulType::MatMul), ::testing::Values(3), // Subgraph + Reshape + Subgraph ::testing::Values(2), // Tokenized [MatMul+FQ+Matmul] and [FQ] @@ -177,8 +159,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulsQuantized, MatMulsQuantized, INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulsQuantizedSoftmax, MatMulsQuantizedSoftmax, ::testing::Combine( - ::testing::ValuesIn(STATIC_SHAPES({{1, 16, 128, 64}, {1, 16, 64, 128}, {128, 64}})), - ::testing::ValuesIn(quantized_precisions()), + ::testing::ValuesIn(SNIPPETS_TESTS_STATIC_SHAPES({{1, 16, 128, 64}, {1, 16, 64, 128}, {128, 64}})), + ::testing::ValuesIn(quantized_precisions_if_supported()), ::testing::Values(MatMulType::MatMul), ::testing::Values(3), // Subgraph + Reshape + Subgraph ::testing::Values(2), // Tokenized [MatMul+FQ+Matmul] and [FQ] diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp index 79db0b1546b2a8..63f5176684ccc1 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp @@ -1,60 +1,70 @@ -// Copyright (C) 2022 Intel Corporation +// Copyright (C) 2024 Intel 
Corporation // SPDX-License-Identifier: Apache-2.0 // #include "snippets/mha.hpp" -#include "common_test_utils/test_constants.hpp" -#include "internal_properties.hpp" -#include "utils/cpu_test_utils.hpp" -#include "openvino/runtime/system_conf.hpp" +#include "utils.hpp" namespace ov { namespace test { namespace snippets { -#define STATIC_SHAPES(...) static_shapes_to_test_representation(std::vector>{__VA_ARGS__}) namespace { -const auto& inputShapes_4D = STATIC_SHAPES( - {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}, - {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 16, 1, 1}, {1, 128, 16, 64}}, - {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 1, 1, 128}, {1, 128, 16, 64}}, - {{2, 68, 6, 92}, {2, 68, 6, 92}, {1, 1, 68, 68}, {2, 68, 6, 92}}, - {{1, 58, 16, 34}, {1, 58, 16, 34}, {1, 1, 1, 58}, {1, 58, 16, 34}}); - -const auto& inputShapes_3D = STATIC_SHAPES( - {{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, - {{68, 6, 92}, {68, 6, 92}, {1, 68, 68}, {68, 6, 92}}, - {{16, 2, 92}, {68, 2, 92}, {1, 16, 68}, {68, 2, 92}}); - -static inline bool is_bf16_supported() { - return ov::with_cpu_x86_bfloat16() || ov::with_cpu_x86_avx512_core_amx_bf16(); -} - -static inline std::vector> precision_f32(size_t count) { - std::vector> prc; - prc.emplace_back(std::vector(count, element::f32)); - return prc; -} - -static inline std::vector> precision_bf16(size_t count) { - std::vector> prc; - if (is_bf16_supported()) - prc.emplace_back(std::vector(count, element::bf16)); - return prc; +std::vector> transposedShape_4D(bool with_dynamic = true) { + auto shapes = SNIPPETS_TESTS_STATIC_SHAPES( + {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}, + {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 16, 1, 1}, {1, 128, 16, 64}}, + {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 1, 1, 128}, {1, 128, 16, 64}}, + {{2, 68, 6, 92}, {2, 68, 6, 92}, {1, 1, 68, 68}, {2, 68, 6, 92}}, + {{1, 58, 16, 34}, {1, 58, 16, 34}, {1, 1, 1, 58}, {1, 58, 16, 34}}); + if (with_dynamic) { + std::vector> dynamic_shapes = {{ + {PartialShape{-1, -1, -1, 100}, {{1, 64, 4, 100}, {2, 16, 2, 100}, {1, 72, 4, 100}}}, + {PartialShape{-1, 128, -1, 100}, {{1, 128, 4, 100}, {2, 128, 2, 100}, {1, 128, 4, 100}}}, + {PartialShape{-1, -1, -1, 128}, {{1, 4, 64, 128}, {2, 2, 16, 128}, {1, 4, 72, 128}}}, + {PartialShape{-1, 128, -1, 100}, {{1, 128, 4, 100}, {2, 128, 2, 100}, {1, 128, 4, 100}}}, + }, + { + {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {2, 16, 2, 100}, {1, 128, 3, 64}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 1, 64}, {2, 128, 2, 100}, {1, 128, 1, 64}}}, + {PartialShape{-1, -1, -1, -1}, {{2, 1, 128, 128}, {2, 2, 16, 128}, {2, 1, 128, 128}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {2, 128, 2, 100}, {1, 128, 3, 64}}}, + }, + { + {PartialShape{-1, -1, 12, 64}, {{1, 70, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 70, 12, 64}}}, + {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {2, 10, 12, 64}, {2, 1, 12, 64}, {2, 10, 12, 64}, {1, 35, 12, 64}}}, + {PartialShape{-1, 12, -1, -1}, {{2, 12, 70, 35}, {1, 12, 20, 10}, {1, 12, 20, 10}, {1, 12, 20, 1}, {2, 12, 70, 35}}}, + {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 35, 12, 64}}}, + }}; + shapes.insert(shapes.end(), dynamic_shapes.begin(), dynamic_shapes.end()); + } + return shapes; } -static ov::AnyMap enable_callback() { - return ov::AnyMap({ov::intel_cpu::snippets_mode(ov::intel_cpu::SnippetsMode::ENABLE)}); +std::vector> transposedShape_3D(bool with_dynamic = true) { + 
auto shapes = SNIPPETS_TESTS_STATIC_SHAPES( + {{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, + {{68, 6, 92}, {68, 6, 92}, {1, 68, 68}, {68, 6, 92}}, + {{16, 2, 92}, {68, 2, 92}, {1, 16, 68}, {68, 2, 92}}); + if (with_dynamic) { + shapes.push_back({ + {PartialShape{-1, -1, -1}, {{128, 3, 64}, {128, 3, 64}, {68, 6, 87}}}, + {PartialShape{-1, -1, -1}, {{128, 1, 64}, {128, 1, 64}, {13, 6, 87}}}, + {PartialShape{-1, -1, -1}, {{1, 128, 128}, {1, 128, 128}, {1, 68, 13}}}, + {PartialShape{-1, -1, -1}, {{128, 3, 64}, {128, 3, 64}, {13, 6, 87}}}, + }); + } + return shapes; } INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D, MHA, - ::testing::Combine(::testing::ValuesIn(inputShapes_4D), + ::testing::Combine(::testing::ValuesIn(transposedShape_4D()), ::testing::ValuesIn(precision_f32(4)), ::testing::Values(ov::element::f32), - ::testing::ValuesIn({false, true}), + ::testing::Values(false), ::testing::Values(MHA::default_thread_count), ::testing::Values(1), ::testing::Values(1), @@ -62,27 +72,12 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D, ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); -std::vector> inputShapes_4D_dynamic{ - { - {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {1, 70, 3, 19}, {1, 128, 3, 64}, {1, 68, 6, 87}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 128, 1, 64}, {2, 49, 1, 19}, {1, 128, 1, 64}, {2, 13, 6, 87}}}, - {PartialShape{-1, -1, -1, -1}, {{2, 1, 128, 128}, {1, 1, 70, 49}, {2, 1, 128, 128}, {1, 1, 68, 13}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {1, 49, 3, 19}, {1, 128, 3, 64}, {2, 13, 6, 87}}}, - }, - { - {PartialShape{-1, -1, 12, 64}, {{1, 70, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 70, 12, 64}}}, - {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {2, 10, 12, 64}, {2, 1, 12, 64}, {2, 10, 12, 64}, {1, 35, 12, 64}}}, - {PartialShape{-1, 12, -1, -1}, {{2, 12, 70, 35}, {1, 12, 20, 10}, {1, 12, 20, 10}, {1, 12, 20, 1}, {2, 12, 70, 35}}}, - {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 35, 12, 64}}}, - } -}; - -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_DynMHA_4D, +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D_WithScalarMul, MHA, - ::testing::Combine(::testing::ValuesIn(inputShapes_4D_dynamic), + ::testing::Combine(::testing::ValuesIn(transposedShape_4D(false)), ::testing::ValuesIn(precision_f32(4)), ::testing::Values(ov::element::f32), - ::testing::ValuesIn({false}), + ::testing::Values(true), ::testing::Values(MHA::default_thread_count), ::testing::Values(1), ::testing::Values(1), @@ -90,13 +85,12 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_DynMHA_4D, ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); - INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_3D, MHA, - ::testing::Combine(::testing::ValuesIn(inputShapes_3D), + ::testing::Combine(::testing::ValuesIn(transposedShape_3D()), ::testing::ValuesIn(precision_f32(4)), ::testing::Values(ov::element::f32), - ::testing::ValuesIn({false, true}), + ::testing::Values(false), ::testing::Values(MHA::default_thread_count), ::testing::Values(5), // [122706]: Subgraph + 4 Transpose ::testing::Values(2), // decomposed Transpose + MHA @@ -104,111 +98,23 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_3D, ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); -const auto& splitm_static_shapes = STATIC_SHAPES({{1, 128, 2, 64}, {1, 128, 2, 64}, {1, 1, 1, 1}, {1, 128, 2, 64}}); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHA_4D_SplitDimensionM_static, - MHA, - 
::testing::Combine(::testing::ValuesIn(splitm_static_shapes), - ::testing::ValuesIn(precision_f32(4)), - ::testing::Values(ov::element::f32), - ::testing::Values(true), - ::testing::Values(4), // 4 Threads - ::testing::Values(6), // Subgraph + 4 Reshapes on inputs and 1 Reshape on output - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(enable_callback())), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHA_3D_SplitDimensionM_static, - MHA, - ::testing::Combine( - ::testing::ValuesIn(STATIC_SHAPES({{384, 2, 64}, {384, 2, 64}, {1, 384, 384}, {384, 2, 64}})), - ::testing::ValuesIn(precision_f32(4)), - ::testing::Values(ov::element::f32), - ::testing::Values(true), - ::testing::Values(4), // 4 Threads - ::testing::Values(10), // Subgraph + 4 Reshapes on inputs and 1 Reshape on output + 4 Transposes - ::testing::Values(1), // MHA - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(enable_callback())), - MHA::getTestCaseName); - -std::vector> splitm_dynamic_shapes_4d = { - { - {PartialShape{-1, -1, -1, -1}, {{1, 128, 2, 64}, {1, 17, 2, 64}, {1, 128, 2, 64}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 128, 2, 64}, {1, 17, 2, 64}, {1, 128, 2, 64}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 1, 1, 128}, {1, 1, 1, 17}, {1, 1, 1, 128}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 128, 2, 64}, {1, 17, 2, 64}, {1, 128, 2, 64}}}, - }, - { - {PartialShape{-1, 128, -1, -1}, {{1, 128, 2, 64}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 64}}}, - {PartialShape{-1, -1, 128, -1}, {{1, 1, 128, 16}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 32}}}, - }, - { - {PartialShape{-1, 32, -1, -1}, {{1, 32, 2, 64}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 64}}}, - {PartialShape{-1, -1, 32, -1}, {{1, 1, 32, 16}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 32}}}, - }, - { - {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 64}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 64}}}, - {PartialShape{-1, -1, 16, -1}, {{1, 1, 16, 16}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 32}}}, - }, -}; - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHA_4D_SplitDimensionM_dynamic, - MHA, - ::testing::Combine(::testing::ValuesIn(splitm_dynamic_shapes_4d), - ::testing::ValuesIn(precision_f32(4)), - ::testing::Values(ov::element::f32), - ::testing::Values(false), - ::testing::Values(4), // 4 Threads - ::testing::Values(1), - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -std::vector> splitm_dynamic_shapes_3d = { - { - {PartialShape{-1, -1, -1}, {{128, 2, 64}, {17, 2, 64}, {128, 2, 64}}}, - {PartialShape{-1, -1, -1}, {{128, 2, 64}, {17, 2, 64}, {128, 2, 64}}}, - {PartialShape{-1, -1, -1}, {{1, 1, 128}, {1, 1, 17}, {1, 1, 128}}}, - {PartialShape{-1, -1, -1}, {{128, 2, 64}, {17, 2, 64}, {128, 2, 64}}}, - }, - { - {PartialShape{-1, 2, 64}, {{128, 2, 64}, {64, 2, 64}, {128, 2, 64}}}, - {PartialShape{-1, 2, 64}, {{128, 2, 64}, {64, 2, 64}, {128, 2, 64}}}, - {PartialShape{1, 1, -1}, {{1, 1, 128}, {1, 1, 64}, {1, 1, 128}}}, - {PartialShape{-1, 2, 64}, {{128, 2, 64}, {64, 2, 64}, {128, 2, 64}}}, - }, -}; - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHA_3D_SplitDimensionM_dynamic, - MHA, - ::testing::Combine(::testing::ValuesIn(splitm_dynamic_shapes_3d), - ::testing::ValuesIn(precision_f32(4)), - ::testing::Values(ov::element::f32), - ::testing::Values(false), - ::testing::Values(4), // 4 Threads - ::testing::Values(5), // Subgraph + 4 Transpose - 
::testing::Values(2), // MHA + one of the transposes is executed via Subgraph (because callback is disabled) - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_3D_WithScalarMul, + MHA, + ::testing::Combine(::testing::ValuesIn(transposedShape_3D(false)), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), + ::testing::Values(MHA::default_thread_count), + ::testing::Values(5), // [122706]: Subgraph + 4 Transpose + ::testing::Values(2), // decomposed Transpose + MHA + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHABF16_4D, MHA, - ::testing::Combine(::testing::ValuesIn(inputShapes_4D), - ::testing::ValuesIn(precision_bf16(4)), + ::testing::Combine(::testing::ValuesIn(transposedShape_4D()), + ::testing::ValuesIn(precision_bf16_if_supported(4)), ::testing::Values(ov::element::f32), ::testing::ValuesIn({false, true}), ::testing::Values(MHA::default_thread_count), @@ -220,7 +126,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHABF16_4D, INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceBF16, MHA, - ::testing::Combine(::testing::ValuesIn(inputShapes_4D), + ::testing::Combine(::testing::ValuesIn(transposedShape_4D()), ::testing::ValuesIn(precision_f32(4)), ::testing::Values(ov::element::bf16), ::testing::ValuesIn({false}), @@ -231,321 +137,6 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceBF16, ::testing::Values(CPUTestUtils::cpu_bf16_plugin_config)), MHA::getTestCaseName); -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAMulAdd, - MHAMulAdd, - ::testing::Combine( - ::testing::ValuesIn(STATIC_SHAPES({{1, 10, 12, 16}, {1, 10, 12, 16}, {1, 10, 12, 16}})), - ::testing::ValuesIn(precision_f32(3)), - ::testing::Values(ov::element::f32), - ::testing::ValuesIn({false}), // Need to support True for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -const auto& inputShapeSelect = STATIC_SHAPES( - // without broadcast - {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 12, 128, 128}, {1, 12, 128, 128}, {1, 128, 12, 64}}, - {{1, 94, 12, 54}, {1, 94, 12, 54}, {1, 12, 94, 94}, {1, 12, 94, 94}, {1, 12, 94, 94}, {1, 94, 12, 54}}, - // with broadcast - {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 12, 1, 1}, {1, 12, 1, 1}, {1, 128, 12, 64}}, - {{2, 52, 6, 102}, {2, 52, 6, 102}, {1, 6, 52, 52}, {1, 6, 1, 1}, {1, 6, 1, 1}, {2, 52, 6, 102}} -); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHA, - MHASelect, - ::testing::Combine(::testing::ValuesIn(inputShapeSelect), - ::testing::ValuesIn(precision_f32(6)), - ::testing::Values(ov::element::f32), - ::testing::Values(false), // Need to support True for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(2), // Less + MHA - ::testing::Values(2), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -const auto& inputShapesWOTranspose_4D = STATIC_SHAPES( - {{1, 12, 197, 64}, {1, 12, 64, 197}, {1, 12, 197, 64}}, - {{1, 12, 12, 64}, {1, 12, 64, 48}, {1, 12, 48, 64}}); -const auto& inputShapesWOTranspose_3D = STATIC_SHAPES( - {{12, 
197, 64}, {12, 64, 197}, {12, 197, 64}}, - {{12, 128, 100}, {12, 100, 128}, {12, 128, 100}}); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAWOTransposeOnInputs_4D, - MHAWOTransposeOnInputs, - ::testing::Combine(::testing::ValuesIn(inputShapesWOTranspose_4D), - ::testing::Values(std::vector{}), - ::testing::Values(ov::element::f32), - ::testing::Values(true), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAWOTranspose_4D, - MHAWOTranspose, - ::testing::Combine(::testing::ValuesIn(inputShapesWOTranspose_4D), - ::testing::ValuesIn(precision_f32(3)), - ::testing::Values(ov::element::f32), - ::testing::ValuesIn({true}), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAWOTranspose_3D, - MHAWOTranspose, - ::testing::Combine(::testing::ValuesIn(inputShapesWOTranspose_3D), - ::testing::ValuesIn(precision_f32(3)), - ::testing::Values(ov::element::f32), - ::testing::ValuesIn({true}), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -std::vector> inputShapesWOTranspose_3D_dynamic{ - { - {PartialShape{-1, -1, -1}, {{12, 19, 85}, {1, 40, 36}}}, - {PartialShape{-1, -1, -1}, {{1, 85, 19}, {2, 36, 40}}}, - {PartialShape{-1, -1, -1}, {{12, 19, 85}, {1, 40, 36}}}, - }, - { - {PartialShape{2, -1, 64}, {{2, 9, 64}, {2, 2, 64}, {2, 9, 64}}}, - {PartialShape{2, 64, -1}, {{2, 64, 9}, {2, 64, 2}, {2, 64, 9}}}, - {PartialShape{2, -1, 64}, {{2, 9, 64}, {2, 2, 64}, {2, 9, 64}}}, - }, -}; - - - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_DynMHAWOTranspose_3D, - MHAWOTranspose, - ::testing::Combine(::testing::ValuesIn(inputShapesWOTranspose_3D_dynamic), - ::testing::ValuesIn(precision_f32(3)), - ::testing::Values(ov::element::f32), - ::testing::ValuesIn({true}), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAWOTransposeBF16_4D, - MHAWOTranspose, - ::testing::Combine(::testing::ValuesIn(inputShapesWOTranspose_4D), - ::testing::ValuesIn(precision_bf16(3)), - ::testing::Values(ov::element::f32), - ::testing::ValuesIn({true}), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(5), // MHA + 4 extra Converts on inputs and output - ::testing::Values(5), // MHA + 4 extra Converts on inputs and output - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAWOTransposeBF16_3D, - MHAWOTranspose, - ::testing::Combine(::testing::ValuesIn(inputShapesWOTranspose_3D), - 
::testing::ValuesIn(precision_bf16(3)), - ::testing::Values(ov::element::f32), - ::testing::ValuesIn({true}), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(5), // MHA + 4 extra Converts on inputs and output - ::testing::Values(5), // MHA + 4 extra Converts on inputs and output - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAWOTransposeEnforceBF16_4D, - MHAWOTranspose, - ::testing::Combine(::testing::ValuesIn(inputShapesWOTranspose_4D), - ::testing::ValuesIn(precision_f32(3)), - ::testing::Values(ov::element::bf16), - ::testing::ValuesIn({true}), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(5), // MHA + 4 extra Converts on inputs and output - ::testing::Values(5), // MHA + 4 extra Converts on inputs and output - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::cpu_bf16_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAWOTransposeEnforceBF16_3D, - MHAWOTranspose, - ::testing::Combine(::testing::ValuesIn(inputShapesWOTranspose_3D), - ::testing::ValuesIn(precision_f32(3)), - ::testing::Values(ov::element::bf16), - ::testing::ValuesIn({true}), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(5), // MHA + 4 extra Converts on inputs and output - ::testing::Values(5), // MHA + 4 extra Converts on inputs and output - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::cpu_bf16_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAINT8MatMul, - MHAINT8MatMul, - ::testing::Combine(::testing::ValuesIn(std::vector>(inputShapes_4D.begin(), - inputShapes_4D.begin() + 2)), - ::testing::Values(std::vector{}), - ::testing::Values(ov::element::f32), - ::testing::Values(false), // The graph doesn't contain Multiply - ::testing::Values(MHA::default_thread_count), - ::testing::Values(6), // FQx3 on inputs + MHA + Transpose on output + Deq Mul - ::testing::Values(5), // FQx3 on inputs + MHA + Deq Mul - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAQuantMatMul0, - MHAQuantMatMul0, - ::testing::Combine( - ::testing::ValuesIn(STATIC_SHAPES({{1, 128, 768}, {1, 128, 768}, {1, 1, 1, 128}, {1, 128, 768}})), - ::testing::Values(std::vector{}), - ::testing::Values(ov::element::f32), - ::testing::Values(false), // The graph doesn't contain Multiply - ::testing::Values(MHA::default_thread_count), - ::testing::Values(9), // FQx2 on inputs + MHA + Transpose on output + 4 Reshapes + Deq Mul - ::testing::Values(4), // FQx2 on inputs + MHA + Deq Mul - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAFQAfterMatMul_4D, - MHAFQAfterMatMul, - ::testing::Combine(::testing::ValuesIn(inputShapes_4D), - ::testing::Values(std::vector{}), - ::testing::Values(ov::element::f32), - ::testing::Values(false), // The graph doesn't contain Multiply - ::testing::Values(MHA::default_thread_count), - ::testing::Values(3), // MHA + Transpose on output + Deq Mul - ::testing::Values(2), // MHA + Deq Mul - 
::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAFQ, - MHAFQ, - ::testing::Combine(::testing::ValuesIn(STATIC_SHAPES({{1, 64, 12, 64}, - {1, 64, 12, 64}, - {1, 1, 1, 64}, - {1, 64, 12, 64}})), - ::testing::Values(std::vector{}), - ::testing::Values(ov::element::f32), - ::testing::Values(false), // The graph doesn't contain Multiply - ::testing::Values(MHA::default_thread_count), - ::testing::Values(7), // Transposex2 + Subgraphsx5 - ::testing::Values(5), // MHA + Deq Mul on output + Deqs on inputs + 2 xFQ on inputs - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -std::vector> inputShapesTransposedB { - { - {{}, {{1, 12, 12, 64}}}, - {{}, {{1, 12, 48, 64}}}, - {{}, {{1, 12, 48, 64}}} - }, - { - {PartialShape{-1, 3, -1, 64}, {{1, 3, 12, 64}, {2, 3, 36, 64}}}, - {PartialShape{-1, 3, -1, 64}, {{1, 3, 14, 64}, {2, 3, 42, 64}}}, - {PartialShape{-1, 3, -1, -1}, {{1, 3, 14, 36}, {2, 3, 42, 36}}}, - }, - { - {PartialShape{2, -1, 32, -1}, {{2, 1, 32, 70}, {2, 2, 32, 96}}}, - {PartialShape{2, -1, 49, -1}, {{2, 3, 49, 70}, {2, 1, 49, 96}}}, - {PartialShape{2, -1, 49, -1}, {{2, 1, 49, 17}, {2, 2, 49, 81}}}, - }, -}; - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHATransposedB, - MHATransposedB, - ::testing::Combine(::testing::ValuesIn(inputShapesTransposedB), - ::testing::Values(std::vector{}), - ::testing::Values(ov::element::f32), - ::testing::ValuesIn({true}), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -const auto& inputShapesExtractedReshape = STATIC_SHAPES( - {{2, 196, 64}, {2, 64, 196}, {2, 14, 14, 14, 1}, {2, 14, 14, 1, 14}, {2, 196, 64}}, - {{1, 16, 10}, {1, 10, 16}, {1, 4, 4, 4, 1}, {1, 4, 4, 1, 4}, {1, 16, 10}}, - {{1, 16, 10}, {1, 10, 16}, {1, 1, 1, 1, 1}, {1, 4, 4, 4, 4}, {1, 16, 10}}, - {{1, 16, 10}, {1, 10, 16}, {1, 4, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 16, 10}}, - {{1, 4, 16, 10}, {1, 4, 10, 16}, {1, 4, 256}, {1, 4, 256}, {1, 4, 16, 10}}, - {{1, 4, 16, 10}, {1, 4, 10, 16}, {1, 1, 256}, {1, 4, 1}, {1, 4, 16, 10}}); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAWithExtractedReshape, - MHAWithExtractedReshape, - ::testing::Combine(::testing::ValuesIn(inputShapesExtractedReshape), - ::testing::Values(std::vector{}), - ::testing::Values(ov::element::f32), - ::testing::ValuesIn({true}), // False is not supported for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(3), // Extracted Add + Extracted Reshape + MHA - ::testing::Values(2), // Extracted Add + MHA - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -std::vector> inputShapes_4D_WithMul_dynamic{ - { - {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {1, 70, 3, 19}, {1, 128, 3, 64}, {1, 68, 6, 87}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 128, 1, 64}, {2, 49, 1, 19}, {1, 128, 1, 64}, {2, 13, 6, 87}}}, - {PartialShape{1}, {{1}, {1}, {1}, {1} }}, - {PartialShape{-1, -1, -1, -1}, {{2, 1, 128, 128}, {1, 1, 70, 49}, {2, 1, 128, 128}, {1, 1, 68, 13}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {1, 49, 3, 19}, {1, 128, 3, 64}, {2, 13, 6, 87}}}, - }, - { - 
{PartialShape{-1, -1, 12, 64}, {{1, 70, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 70, 12, 64}}}, - {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {2, 10, 12, 64}, {2, 1, 12, 64}, {2, 10, 12, 64}, {1, 35, 12, 64}}}, - {PartialShape{-1, 12, 64, -1}, {{1, 12, 64, 35}, {1, 12, 64, 10}, {1, 12, 64, 10}, {1, 12, 64, 1}, {1, 12, 64, 35}}}, - {PartialShape{-1, 12, -1, -1}, {{2, 12, 70, 35}, {1, 12, 20, 10}, {1, 12, 20, 10}, {1, 12, 20, 1}, {2, 12, 70, 35}}}, - {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 35, 12, 64}}}, - } -}; - -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_DynMHA_4D_WithMul, - MHAWithDynamicMul, - ::testing::Combine(::testing::ValuesIn(inputShapes_4D_WithMul_dynamic), - ::testing::ValuesIn(precision_f32(5)), - ::testing::Values(ov::element::f32), - ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHAWithDynamicMul::getTestCaseName); - } // namespace } // namespace snippets } // namespace test diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_extracted_reshape.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_extracted_reshape.cpp new file mode 100644 index 00000000000000..f3c1439395650a --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_extracted_reshape.cpp @@ -0,0 +1,40 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" + +#include "utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +namespace { + +const auto& inputShapesExtractedReshape = SNIPPETS_TESTS_STATIC_SHAPES( + {{2, 196, 64}, {2, 64, 196}, {2, 14, 14, 14, 1}, {2, 14, 14, 1, 14}, {2, 196, 64}}, + {{1, 16, 10}, {1, 10, 16}, {1, 4, 4, 4, 1}, {1, 4, 4, 1, 4}, {1, 16, 10}}, + {{1, 16, 10}, {1, 10, 16}, {1, 1, 1, 1, 1}, {1, 4, 4, 4, 4}, {1, 16, 10}}, + {{1, 16, 10}, {1, 10, 16}, {1, 4, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 16, 10}}, + {{1, 4, 16, 10}, {1, 4, 10, 16}, {1, 4, 256}, {1, 4, 256}, {1, 4, 16, 10}}, + {{1, 4, 16, 10}, {1, 4, 10, 16}, {1, 1, 256}, {1, 4, 1}, {1, 4, 16, 10}}); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAWithExtractedReshape, + MHAWithExtractedReshape, + ::testing::Combine(::testing::ValuesIn(inputShapesExtractedReshape), + ::testing::Values(std::vector{}), + ::testing::Values(ov::element::f32), + ::testing::ValuesIn({true}), // False is not supported for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(3), // Extracted Add + Extracted Reshape + MHA + ::testing::Values(2), // Extracted Add + MHA + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp new file mode 100644 index 00000000000000..4bf35e2daa690d --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp @@ -0,0 +1,33 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" + +#include "utils.hpp" + +namespace ov { +namespace test { 
+namespace snippets { + +namespace { + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAMulAdd, + MHAMulAdd, + ::testing::Combine( + ::testing::ValuesIn(SNIPPETS_TESTS_STATIC_SHAPES({{1, 10, 12, 16}, {1, 10, 12, 16}, {1, 10, 12, 16}})), + ::testing::ValuesIn(precision_f32(3)), + ::testing::Values(ov::element::f32), + ::testing::ValuesIn({false}), // Need to support True for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp new file mode 100644 index 00000000000000..0c731b74565863 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp @@ -0,0 +1,103 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" + +#include "utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +namespace { + +std::vector> inputShapesQuantized { + { + {{}, {{1, 128, 16, 64}}}, + {{}, {{1, 128, 16, 64}}}, + {{}, {{1, 16, 1, 1}}}, + {{}, {{1, 128, 16, 64}}} + }, + { + {{}, {{2, 68, 6, 92}}}, + {{}, {{2, 68, 6, 92}}}, + {{}, {{1, 1, 68, 68}}}, + {{}, {{2, 68, 6, 92}}} + }, + // K, N are static + { + {PartialShape{-1, -1, -1, 100}, {{1, 64, 4, 100}, {2, 16, 2, 100}, {1, 72, 4, 100}}}, + {PartialShape{-1, 128, -1, 100}, {{1, 128, 4, 100}, {2, 128, 2, 100}, {1, 128, 4, 100}}}, + {PartialShape{-1, -1, -1, 128}, {{1, 4, 64, 128}, {2, 2, 16, 128}, {1, 4, 72, 128}}}, + {PartialShape{-1, 128, -1, 100}, {{1, 128, 4, 100}, {2, 128, 2, 100}, {1, 128, 4, 100}}}, + }, + { + {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {2, 16, 2, 100}, {1, 128, 3, 64}, {1, 128, 12, 600}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 1, 64}, {2, 128, 2, 100}, {1, 128, 1, 64}, {1, 128, 12, 600}}}, + {PartialShape{-1, -1, -1, -1}, {{2, 1, 128, 128}, {1, 1, 1, 128}, {2, 1, 128, 128}, {1, 12, 1, 1}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {2, 128, 2, 100}, {1, 128, 3, 64}, {1, 128, 12, 600}}}, + } +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAINT8MatMul, + MHAINT8MatMul, + ::testing::Combine(::testing::ValuesIn(inputShapesQuantized), + ::testing::Values(std::vector{}), + ::testing::Values(ov::element::f32), + ::testing::Values(false), // The graph doesn't contain Multiply + ::testing::Values(MHA::default_thread_count), + ::testing::Values(6), // FQx3 on inputs + MHA + Transpose on output + Deq Mul + ::testing::Values(5), // FQx3 on inputs + MHA + Deq Mul + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAQuantMatMul0, + MHAQuantMatMul0, + ::testing::Combine( + ::testing::ValuesIn(inputShapesQuantized), + ::testing::Values(std::vector{}), + ::testing::Values(ov::element::f32), + ::testing::Values(false), // The graph doesn't contain Multiply + ::testing::Values(MHA::default_thread_count), + ::testing::Values(5), // FQx2 on inputs + MHA + Transpose on output + Deq Mul + ::testing::Values(4), // FQx2 on inputs + MHA + Deq Mul + ::testing::Values(ov::test::utils::DEVICE_CPU), + 
::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAFQAfterMatMul_4D, + MHAFQAfterMatMul, + ::testing::Combine(::testing::ValuesIn(inputShapesQuantized), + ::testing::Values(std::vector{}), + ::testing::Values(ov::element::f32), + ::testing::Values(false), // The graph doesn't contain Multiply + ::testing::Values(MHA::default_thread_count), + ::testing::Values(3), // MHA + Transpose on output + Deq Mul + ::testing::Values(2), // MHA + Deq Mul + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAFQ, + MHAFQ, + ::testing::Combine(::testing::ValuesIn(inputShapesQuantized), + ::testing::Values(std::vector{}), + ::testing::Values(ov::element::f32), + ::testing::Values(false), // The graph doesn't contain Multiply + ::testing::Values(MHA::default_thread_count), + ::testing::Values(7), // Transposex2 + Subgraphsx5 + ::testing::Values(5), // MHA + Deq Mul on output + Deqs on inputs + 2 xFQ on inputs + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp new file mode 100644 index 00000000000000..3fc1417d20b102 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp @@ -0,0 +1,41 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" + +#include "utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +namespace { + +const auto& inputShapeSelect = SNIPPETS_TESTS_STATIC_SHAPES( + // without broadcast + {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 12, 128, 128}, {1, 12, 128, 128}, {1, 128, 12, 64}}, + {{1, 94, 12, 54}, {1, 94, 12, 54}, {1, 12, 94, 94}, {1, 12, 94, 94}, {1, 12, 94, 94}, {1, 94, 12, 54}}, + // with broadcast + {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 12, 1, 1}, {1, 12, 1, 1}, {1, 128, 12, 64}}, + {{2, 52, 6, 102}, {2, 52, 6, 102}, {1, 6, 52, 52}, {1, 6, 1, 1}, {1, 6, 1, 1}, {2, 52, 6, 102}} +); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHA, + MHASelect, + ::testing::Combine(::testing::ValuesIn(inputShapeSelect), + ::testing::ValuesIn(precision_f32(6)), + ::testing::Values(ov::element::f32), + ::testing::Values(false), // Need to support True for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(2), // Less + MHA + ::testing::Values(2), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp new file mode 100644 index 00000000000000..bb5f7fe2fa5b52 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp @@ -0,0 +1,121 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" + 
+#include "utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +namespace { + +static ov::AnyMap enable_callback() { + return ov::AnyMap({ov::intel_cpu::snippets_mode(ov::intel_cpu::SnippetsMode::ENABLE)}); +} + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHA_4D_SplitDimensionM_static, + MHA, + ::testing::Combine(::testing::ValuesIn(SNIPPETS_TESTS_STATIC_SHAPES({{1, 128, 2, 64}, {1, 128, 2, 64}, {1, 1, 1, 1}, {1, 128, 2, 64}})), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), + ::testing::Values(4), // 4 Threads + ::testing::Values(6), // Subgraph + 4 Reshapes on inputs and 1 Reshape on output + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(enable_callback())), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHA_3D_SplitDimensionM_static, + MHA, + ::testing::Combine( + ::testing::ValuesIn(SNIPPETS_TESTS_STATIC_SHAPES({{384, 2, 64}, {384, 2, 64}, {1, 384, 384}, {384, 2, 64}})), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), + ::testing::Values(4), // 4 Threads + ::testing::Values(10), // Subgraph + 4 Reshapes on inputs and 1 Reshape on output + 4 Transposes + ::testing::Values(1), // MHA + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(enable_callback())), + MHA::getTestCaseName); + +std::vector> splitm_dynamic_shapes_4d = { + { + {PartialShape{-1, -1, -1, -1}, {{1, 128, 2, 64}, {1, 17, 2, 64}, {1, 128, 2, 64}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 2, 64}, {1, 17, 2, 64}, {1, 128, 2, 64}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 1, 1, 128}, {1, 1, 1, 17}, {1, 1, 1, 128}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 2, 64}, {1, 17, 2, 64}, {1, 128, 2, 64}}}, + }, + { + {PartialShape{-1, 128, -1, -1}, {{1, 128, 2, 64}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 64}}}, + {PartialShape{-1, -1, 128, -1}, {{1, 1, 128, 16}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 32}}}, + }, + { + {PartialShape{-1, 32, -1, -1}, {{1, 32, 2, 64}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 64}}}, + {PartialShape{-1, -1, 32, -1}, {{1, 1, 32, 16}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 32}}}, + }, + { + {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 64}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 64}}}, + {PartialShape{-1, -1, 16, -1}, {{1, 1, 16, 16}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 32}}}, + }, +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHA_4D_SplitDimensionM_dynamic, + MHA, + ::testing::Combine(::testing::ValuesIn(splitm_dynamic_shapes_4d), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f32), + ::testing::Values(false), + ::testing::Values(4), // 4 Threads + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +std::vector> splitm_dynamic_shapes_3d = { + { + {PartialShape{-1, -1, -1}, {{128, 2, 64}, {17, 2, 64}, {128, 2, 64}}}, + {PartialShape{-1, -1, -1}, {{128, 2, 64}, {17, 2, 64}, {128, 2, 64}}}, + {PartialShape{-1, -1, -1}, {{1, 1, 128}, {1, 1, 17}, {1, 1, 128}}}, + {PartialShape{-1, -1, -1}, {{128, 2, 64}, {17, 2, 64}, {128, 2, 64}}}, + }, + { + {PartialShape{-1, 2, 64}, {{128, 2, 64}, {64, 2, 64}, {128, 2, 64}}}, + {PartialShape{-1, 2, 64}, {{128, 2, 64}, {64, 2, 64}, {128, 2, 64}}}, + {PartialShape{1, 1, -1}, {{1, 1, 128}, {1, 1, 64}, {1, 1, 128}}}, + {PartialShape{-1, 2, 64}, {{128, 
2, 64}, {64, 2, 64}, {128, 2, 64}}}, + }, +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHA_3D_SplitDimensionM_dynamic, + MHA, + ::testing::Combine(::testing::ValuesIn(splitm_dynamic_shapes_3d), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f32), + ::testing::Values(false), + ::testing::Values(4), // 4 Threads + ::testing::Values(5), // Subgraph + 4 Transpose + ::testing::Values(2), // MHA + one of the transposes is executed via Subgraph (because callback is disabled) + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_transposed_b.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_transposed_b.cpp new file mode 100644 index 00000000000000..45260df3cab280 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_transposed_b.cpp @@ -0,0 +1,50 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" + +#include "utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +namespace { + +std::vector> inputShapesTransposedB { + { + {{}, {{1, 12, 12, 64}}}, + {{}, {{1, 12, 48, 64}}}, + {{}, {{1, 12, 48, 64}}} + }, + { + {PartialShape{-1, 3, -1, 64}, {{1, 3, 12, 64}, {2, 3, 36, 64}}}, + {PartialShape{-1, 3, -1, 64}, {{1, 3, 14, 64}, {2, 3, 42, 64}}}, + {PartialShape{-1, 3, -1, -1}, {{1, 3, 14, 36}, {2, 3, 42, 36}}}, + }, + { + {PartialShape{2, -1, 32, -1}, {{2, 1, 32, 70}, {2, 2, 32, 96}}}, + {PartialShape{2, -1, 49, -1}, {{2, 3, 49, 70}, {2, 1, 49, 96}}}, + {PartialShape{2, -1, 49, -1}, {{2, 1, 49, 17}, {2, 2, 49, 81}}}, + }, +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHATransposedB, + MHATransposedB, + ::testing::Combine(::testing::ValuesIn(inputShapesTransposedB), + ::testing::Values(std::vector{}), + ::testing::Values(ov::element::f32), + ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp new file mode 100644 index 00000000000000..7876d737af2281 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp @@ -0,0 +1,68 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" + +#include "utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +namespace { + +std::vector> transposedShape_4D_WithMul { + { + {PartialShape{-1, -1, -1, 100}, {{1, 64, 4, 100}, {2, 16, 2, 100}, {1, 72, 4, 100}}}, + {PartialShape{-1, 200, -1, 100}, {{1, 200, 4, 100}, {2, 200, 2, 100}, {1, 200, 4, 100}}}, + {PartialShape{-1, -1, 100, 200}, {{1, 4, 100, 200}, {2, 2, 100, 200}, {1, 4, 100, 200}}}, + {PartialShape{-1, -1, -1, 200}, {{1, 4, 64, 200}, {2, 2, 16, 200}, {1, 4, 72, 200}}}, + {PartialShape{-1, 200, -1, 100}, {{1, 200, 4, 100}, {2, 
200, 2, 100}, {1, 200, 4, 100}}}, + }, + { + {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {1, 70, 3, 19}, {1, 128, 3, 64}, {1, 68, 6, 87}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 1, 64}, {2, 49, 1, 19}, {1, 128, 1, 64}, {2, 13, 6, 87}}}, + {PartialShape{1}, {{1}, {1}, {1}, {1} }}, + {PartialShape{-1, -1, -1, -1}, {{2, 1, 128, 128}, {1, 1, 70, 49}, {2, 1, 128, 128}, {1, 1, 68, 13}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {1, 49, 3, 19}, {1, 128, 3, 64}, {2, 13, 6, 87}}}, + }, + { + {PartialShape{-1, -1, 12, 64}, {{1, 70, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 70, 12, 64}}}, + {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {2, 10, 12, 64}, {2, 1, 12, 64}, {2, 10, 12, 64}, {1, 35, 12, 64}}}, + {PartialShape{-1, 12, 64, -1}, {{1, 12, 64, 35}, {1, 12, 64, 10}, {1, 12, 64, 10}, {1, 12, 64, 1}, {1, 12, 64, 35}}}, + {PartialShape{-1, 12, -1, -1}, {{2, 12, 70, 35}, {1, 12, 20, 10}, {1, 12, 20, 10}, {1, 12, 20, 1}, {2, 12, 70, 35}}}, + {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 35, 12, 64}}}, + } +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHA_4D_WithDynamicMul, + MHAWithDynamicMul, + ::testing::Combine(::testing::ValuesIn(transposedShape_4D_WithMul), + ::testing::ValuesIn(precision_f32(5)), + ::testing::Values(ov::element::f32), + ::testing::Values(MHA::default_thread_count), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHAWithDynamicMul::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHA_4D_WithDynamicMul_EnforceBF16, + MHAWithDynamicMul, + ::testing::Combine(::testing::ValuesIn(transposedShape_4D_WithMul), + ::testing::ValuesIn(precision_f32(5)), + ::testing::Values(ov::element::bf16), + ::testing::Values(MHA::default_thread_count), + ::testing::Values(8), // MHA + 1 Transpose on output + 6 Converts around + ::testing::Values(7), // MHA + 6 Converts around + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHAWithDynamicMul::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_wo_transpose.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_wo_transpose.cpp new file mode 100644 index 00000000000000..0967ef27087674 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_wo_transpose.cpp @@ -0,0 +1,151 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" + +#include "utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +namespace { + +std::vector> originalShape_4D { + { {{}, {{1, 12, 197, 64}}}, {{}, {{1, 12, 64, 197}}}, {{}, {{1, 12, 197, 64}}} }, + { {{}, {{1, 12, 12, 64}}}, {{}, {{1, 12, 64, 48}}}, {{}, {{1, 12, 48, 64}}} }, + { + {PartialShape{-1, -1, -1, -1}, {{1, 3, 128, 64}, {1, 12, 197, 100}, {1, 3, 128, 64}, {1, 12, 197, 600}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 3, 64, 128}, {1, 12, 100, 197}, {1, 3, 64, 128}, {1, 12, 600, 197}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 3, 128, 64}, {1, 12, 197, 100}, {1, 3, 128, 64}, {1, 12, 197, 600}}}, + }, + { + {PartialShape{1, 4, -1, -1}, {{1, 4, 384, 64}, {1, 4, 197, 64}, {1, 4, 384, 560}}}, + {PartialShape{1, 4, -1, -1}, {{1, 4, 64, 128}, {1, 4, 64, 197}, {1, 4, 560, 
384}}}, + {PartialShape{1, 4, -1, 64}, {{1, 4, 128, 64}, {1, 4, 197, 64}, {1, 4, 384, 64}}}, + } +}; + +std::vector> originalShape_3D { + { {{}, {{12, 197, 64}}}, {{}, {{12, 64, 197}}}, {{}, {{12, 197, 64}}} }, + { {{}, {{12, 128, 100}}}, {{}, {{12, 100, 128}}}, {{}, {{12, 128, 100}}} }, + { + {PartialShape{-1, -1, 64}, {{2, 9, 64}, {1, 64, 64}, {2, 64, 64}}}, + {PartialShape{-1, 64, 124}, {{2, 64, 124}, {1, 64, 124}, {2, 64, 124}}}, + {PartialShape{-1, 124, 64}, {{2, 124, 64}, {1, 124, 64}, {2, 124, 64}}}, + }, + { + {PartialShape{-1, -1, -1}, {{12, 19, 85}, {1, 40, 36}}}, + {PartialShape{-1, -1, -1}, {{1, 85, 19}, {2, 36, 40}}}, + {PartialShape{-1, -1, -1}, {{12, 19, 85}, {1, 40, 36}}}, + }, + { + {PartialShape{2, -1, 64}, {{2, 9, 64}, {2, 4, 64}, {2, 9, 64}}}, + {PartialShape{2, 64, -1}, {{2, 64, 9}, {2, 64, 4}, {2, 64, 9}}}, + {PartialShape{2, -1, 64}, {{2, 9, 64}, {2, 4, 64}, {2, 9, 64}}}, + } +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAWOTransposeOnInputs_4D, + MHAWOTransposeOnInputs, + ::testing::Combine(::testing::ValuesIn(originalShape_4D), + ::testing::Values(std::vector{}), + ::testing::Values(ov::element::f32), + ::testing::Values(true), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAWOTranspose_4D, + MHAWOTranspose, + ::testing::Combine(::testing::ValuesIn(originalShape_4D), + ::testing::ValuesIn(precision_f32(3)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAWOTranspose_3D, + MHAWOTranspose, + ::testing::Combine(::testing::ValuesIn(originalShape_3D), + ::testing::ValuesIn(precision_f32(3)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAWOTransposeBF16_4D, + MHAWOTranspose, + ::testing::Combine(::testing::ValuesIn(originalShape_4D), + ::testing::ValuesIn(precision_bf16_if_supported(3)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(5), // MHA + 4 extra Converts on inputs and output + ::testing::Values(5), // MHA + 4 extra Converts on inputs and output + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAWOTransposeBF16_3D, + MHAWOTranspose, + ::testing::Combine(::testing::ValuesIn(originalShape_3D), + ::testing::ValuesIn(precision_bf16_if_supported(3)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(5), // 
MHA + 4 extra Converts on inputs and output + ::testing::Values(5), // MHA + 4 extra Converts on inputs and output + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAWOTransposeEnforceBF16_4D, + MHAWOTranspose, + ::testing::Combine(::testing::ValuesIn(originalShape_4D), + ::testing::ValuesIn(precision_f32(3)), + ::testing::Values(ov::element::bf16), + ::testing::Values(true), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(5), // MHA + 4 extra Converts on inputs and output + ::testing::Values(5), // MHA + 4 extra Converts on inputs and output + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::cpu_bf16_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAWOTransposeEnforceBF16_3D, + MHAWOTranspose, + ::testing::Combine(::testing::ValuesIn(originalShape_3D), + ::testing::ValuesIn(precision_f32(3)), + ::testing::Values(ov::element::bf16), + ::testing::Values(true), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(5), // MHA + 4 extra Converts on inputs and output + ::testing::Values(5), // MHA + 4 extra Converts on inputs and output + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::cpu_bf16_plugin_config)), + MHA::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp index c05087283305e4..ea7de9ccb209ad 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp @@ -6,36 +6,28 @@ #include "common_test_utils/test_constants.hpp" #include "openvino/runtime/system_conf.hpp" +#include "utils.hpp" namespace ov { namespace test { namespace snippets { -#define STATIC_SHAPES(...) 
static_shapes_to_test_representation(std::vector>{__VA_ARGS__}) - namespace { static inline std::vector> precisions(bool only_fp32 = true) { - std::vector> prc = { - {element::f32, element::f32}, - }; -// Note: low precisions are not supported by TPP yet (ticker: 130010) + std::vector> prc = precision_f32(2); +// Note: TPP doesn't support low precisions yet #ifndef SNIPPETS_LIBXSMM_TPP if (!only_fp32) { - // In Snippets MatMul INT8 is supported only on VNNI/AMX platforms - if (ov::with_cpu_x86_avx512_core_vnni() || ov::with_cpu_x86_avx512_core_amx_int8()) { - prc.emplace_back(std::vector{element::i8, element::i8}); - prc.emplace_back(std::vector{element::u8, element::i8}); - } - // In Snippets MatMul BF16 is supported only on bf16/AMX platforms - if (ov::with_cpu_x86_bfloat16() || ov::with_cpu_x86_avx512_core_amx_bf16()) { - prc.emplace_back(std::vector{element::bf16, element::bf16}); - } + auto quant = quantized_precisions_if_supported(); + std::copy(quant.begin(), quant.end(), std::back_inserter(prc)); + auto bfloat = precision_bf16_if_supported(2); + std::copy(bfloat.begin(), bfloat.end(), std::back_inserter(prc)); } #endif return prc; } namespace transpose_zero_input { -const auto& transpose_input_shapes = STATIC_SHAPES({{1, 49, 2, 23}, {2, 2, 23, 39}}); +const auto& transpose_input_shapes = SNIPPETS_TESTS_STATIC_SHAPES({{1, 49, 2, 23}, {2, 2, 23, 39}}); INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, TransposeMatMul, ::testing::Combine( ::testing::ValuesIn(transpose_input_shapes), @@ -84,7 +76,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_FullyConnected, TransposeMatMul, } // namespace transpose_zero_input namespace transpose_first_input { -const auto& transpose_input_shapes = STATIC_SHAPES({{2, 1, 49, 13}, {1, 13, 3, 39}}); +const auto& transpose_input_shapes = SNIPPETS_TESTS_STATIC_SHAPES({{2, 1, 49, 13}, {1, 13, 3, 39}}); INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, TransposeMatMul, ::testing::Combine( ::testing::ValuesIn(transpose_input_shapes), @@ -126,7 +118,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeMatMulFQ, TransposeMatMulFQ, } // namespace transpose_first_input namespace transpose_output { -const auto& transpose_input_shapes = STATIC_SHAPES({{2, 1, 49, 13}, {1, 2, 13, 39}}); +const auto& transpose_input_shapes = SNIPPETS_TESTS_STATIC_SHAPES({{2, 1, 49, 13}, {1, 2, 13, 39}}); INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, TransposeMatMul, ::testing::Combine( @@ -195,7 +187,7 @@ static inline std::vector> precisions(bool only_fp32 } INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ExplicitTransposeMatMul, ExplicitTransposeMatMul, ::testing::Combine( - ::testing::ValuesIn(STATIC_SHAPES({{1, 2, 69, 43}, {2, 49, 2, 43}})), + ::testing::ValuesIn(SNIPPETS_TESTS_STATIC_SHAPES({{1, 2, 69, 43}, {2, 49, 2, 43}})), ::testing::Values(1), // Transpose on second input ::testing::ValuesIn(precisions()), ::testing::Values(MatMulType::MatMul), @@ -223,7 +215,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_DynExplicitTransposeMatMul, ExplicitTran INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeMatMulBias, ExplicitTransposeMatMulBias, ::testing::Combine( - ::testing::ValuesIn(STATIC_SHAPES({{1, 2, 69, 43}, {2, 49, 2, 43}, {1, 1, 69, 49}})), + ::testing::ValuesIn(SNIPPETS_TESTS_STATIC_SHAPES({{1, 2, 69, 43}, {2, 49, 2, 43}, {1, 1, 69, 49}})), ::testing::Values(1), // Transpose on second input ::testing::ValuesIn(precisions()), ::testing::Values(MatMulType::MatMul), diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/utils.hpp 
b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/utils.hpp new file mode 100644 index 00000000000000..6c0d54da973086 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/utils.hpp @@ -0,0 +1,48 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "internal_properties.hpp" +#include "utils/cpu_test_utils.hpp" +#include "openvino/runtime/system_conf.hpp" + +namespace ov { +namespace test { +namespace snippets { + +#define SNIPPETS_TESTS_STATIC_SHAPES(...) static_shapes_to_test_representation(std::vector>{__VA_ARGS__}) + +static inline bool is_bf16_supported_by_brgemm() { + return ov::with_cpu_x86_bfloat16() || ov::with_cpu_x86_avx512_core_amx_bf16(); +} + +static inline bool is_i8_supported_by_brgemm() { + return ov::with_cpu_x86_avx512_core_vnni() || ov::with_cpu_x86_avx512_core_amx_int8(); +} + +static inline std::vector> precision_f32(size_t count) { + std::vector> prc; + prc.emplace_back(std::vector(count, element::f32)); + return prc; +} + +static inline std::vector> precision_bf16_if_supported(size_t count) { + std::vector> prc; + if (is_bf16_supported_by_brgemm()) + prc.emplace_back(std::vector(count, element::bf16)); + return prc; +} + +static inline std::vector> quantized_precisions_if_supported() { + std::vector> prc = {}; + // In Snippets MatMul INT8 is supported only on VNNI/AMX platforms + if (is_i8_supported_by_brgemm()) { + prc.emplace_back(std::vector{element::i8, element::i8}); + prc.emplace_back(std::vector{element::u8, element::i8}); + } + return prc; +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/include/snippets/mha.hpp b/src/tests/functional/plugin/shared/include/snippets/mha.hpp index f8198dee0218ee..34cb4d452bfb15 100644 --- a/src/tests/functional/plugin/shared/include/snippets/mha.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/mha.hpp @@ -44,6 +44,7 @@ class MHABase : virtual public SnippetsTestsCommon { void generate_inputs(const std::vector& targetInputStaticShapes) override; virtual std::shared_ptr get_subgraph() const = 0; virtual void init_params(std::vector& input_shapes, ov::element::Type& prc, ov::AnyMap& additional_config) = 0; + virtual void init_thresholds(); size_t m_thread_count; std::vector m_input_types; @@ -88,6 +89,7 @@ class MHATransposedB : public MHA { class MHAINT8MatMul : public MHA { protected: std::shared_ptr get_subgraph() const override; + void init_thresholds() override; }; class MHAQuantMatMul0 : public MHA { @@ -103,6 +105,7 @@ class MHAFQAfterMatMul : public MHA { class MHAFQ : public MHA { protected: std::shared_ptr get_subgraph() const override; + void init_thresholds() override; }; class MHAWithExtractedReshape : public MHA { diff --git a/src/tests/functional/plugin/shared/src/snippets/mha.cpp b/src/tests/functional/plugin/shared/src/snippets/mha.cpp index 351cd50856357d..8d0cb8613bc47e 100644 --- a/src/tests/functional/plugin/shared/src/snippets/mha.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/mha.cpp @@ -53,15 +53,19 @@ void MHABase::SetUp() { configuration.insert({"SNIPPETS_MODE", "IGNORE_CALLBACK"}); } - setInferenceType(prc); inType = outType = prc; + setInferenceType(prc); + init_thresholds(); +} + + void MHABase::init_thresholds() { // Note: Libxsmm calculates Exp in a slightly different way, so the abs values might differ a bit. 
Ticket: 130699 #ifdef SNIPPETS_LIBXSMM_TPP abs_threshold = 1e-6; #endif - if (prc == ov::element::bf16) + if (inType == ov::element::bf16) rel_threshold = 0.05f; -} + } std::string MHA::getTestCaseName(testing::TestParamInfo obj) { std::vector input_shapes; @@ -194,6 +198,11 @@ std::shared_ptr MHAINT8MatMul::get_subgraph() const { return std::make_shared(inputDynamicShapes); } +void MHAINT8MatMul::init_thresholds() { + MHABase::init_thresholds(); + abs_threshold = 4e-6; +} + std::shared_ptr MHAQuantMatMul0::get_subgraph() const { return std::make_shared(inputDynamicShapes); } @@ -206,6 +215,11 @@ std::shared_ptr MHAFQ::get_subgraph() const { return std::make_shared(inputDynamicShapes); } +void MHAFQ::init_thresholds() { + MHABase::init_thresholds(); + abs_threshold = 0.016; +} + std::shared_ptr MHAMulAdd::get_subgraph() const { return std::make_shared(inputDynamicShapes); } diff --git a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp index 90ab47214effee..f54f92c598a45f 100644 --- a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp +++ b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp @@ -235,9 +235,7 @@ class MHAWOTransposeSplitMFunction : public MHAWOTransposeFunction { * FakeQuantize i8 * \ / * Add - * Reshape0 - * Softmax - * Reshape1 Transpose2[0,2,1,3] + * Softmax Transpose2[0,2,1,3] * \ / * MatMul1 * FakeQuantize i8 @@ -261,9 +259,7 @@ class MHAFQAfterMatMulFunction : public SnippetsFunctionBase { * FakeQuantize i8 * \ / * Add - * Reshape0 - * Softmax - * Reshape1 FakeQuantize i8 + * Softmax FakeQuantize i8 * FakeQuantize u8 Transpose2[0,2,1,3] * \ / * MatMul1 @@ -281,20 +277,17 @@ class MHAINT8MatMulFunction : public SnippetsFunctionBase { }; /* Graph: - * FakeQuantize i8 Reshape1 - * Reshape0 Transpose1[0,2,3,1] + * FakeQuantize i8 Transpose1[0,2,3,1] * Transpose0[0,2,1,3] FakeQuantize i8 * \ / * MatMul0 * \ / - * Add Reshape2 + * Add * Softmax Transpose2[0,2,1,3] * \ / * MatMul1 * FakeQuantize i8 * Transpose3[0,2,1,3] - * Reshape3 - * Note: Reshapes are tosplit Tokenization between FQs and deq Mul and MHA since Snippets::Ignore_Callback may be enabled */ class MHAQuantMatMul0Function : public SnippetsFunctionBase { public: diff --git a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp index 1dbf8d7d22ed26..34f42ec838aa6d 100644 --- a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp +++ b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp @@ -598,38 +598,25 @@ std::shared_ptr MHAFQAfterMatMulFunction::initOriginal() const { auto transpose2Param = std::make_shared(precision, input_shapes[3]); ov::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, transpose2Param}; - const auto shape_rank = input_shapes[0].get_shape().size(); + const auto shape_rank = input_shapes[0].size(); auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 3, 1}); auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); - std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * - input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), 
- -1}; - auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, {reshape0ConstData.size()}, reshape0ConstData); - - std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), - static_cast(input_shapes[0].get_shape()[2]), - static_cast(input_shapes[0].get_shape()[1]), - static_cast(input_shapes[0].get_shape()[1])}; - auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, {reshape1ConstData.size()}, reshape1ConstData); - bool transA = false; bool transB = false; const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); const auto matMul0 = std::make_shared(transpose0, transpose1, transA, transB); auto fq0 = ov::test::utils::make_fake_quantize(matMul0, ov::element::f32, 256, {1}, - {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); + {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); const auto add = std::make_shared(fq0, addParam); - const auto reshape0 = std::make_shared(add, reshape0Const, true); - const auto softMax = std::make_shared(reshape0, 1); - const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + const auto softMax = std::make_shared(add, -1); const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); - const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto matMul1 = std::make_shared(softMax, transpose2, transA, transB); auto fq1 = ov::test::utils::make_fake_quantize(matMul1, ov::element::f32, 256, {1}, - {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); + {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); const auto transpose3 = std::make_shared(fq1, transpose3Const); ov::ResultVector results{std::make_shared(transpose3)}; @@ -642,46 +629,33 @@ std::shared_ptr MHAINT8MatMulFunction::initOriginal() const { auto transpose2Param = std::make_shared(precision, input_shapes[3]); ov::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, transpose2Param}; - const auto shape_rank = input_shapes[0].get_shape().size(); + const auto shape_rank = input_shapes[0].size(); auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 3, 1}); auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); - std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * - input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), - -1}; - auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, {reshape0ConstData.size()}, reshape0ConstData); - - std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), - static_cast(input_shapes[0].get_shape()[2]), - static_cast(input_shapes[0].get_shape()[1]), - static_cast(input_shapes[0].get_shape()[1])}; - auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, {reshape1ConstData.size()}, reshape1ConstData); - auto fq0 = ov::test::utils::make_fake_quantize(transpose0Param, ov::element::f32, 256, {1}, - {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); + {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); auto fq1 = ov::test::utils::make_fake_quantize(transpose1Param, ov::element::f32, 256, {1}, - {-35.0172004}, 
{34.7436294}, {-35.0172004}, {34.7436294}); + {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); auto fq2 = ov::test::utils::make_fake_quantize(transpose2Param, ov::element::f32, 256, {1}, - {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); + {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); bool transA = false; bool transB = false; const auto transpose0 = std::make_shared(fq0, transpose0Const); const auto transpose1 = std::make_shared(fq1, transpose1Const); const auto matMul0 = std::make_shared(transpose0, transpose1, transA, transB); auto fq3 = ov::test::utils::make_fake_quantize(matMul0, ov::element::f32, 256, {1}, - {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); + {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); const auto add = std::make_shared(fq3, addParam); - const auto reshape0 = std::make_shared(add, reshape0Const, true); - const auto softMax = std::make_shared(reshape0, 1); - const auto reshape1 = std::make_shared(softMax, reshape1Const, true); - auto fq4 = ov::test::utils::make_fake_quantize(reshape1, ov::element::f32, 256, {1}, - {0}, {0.820726}, {0}, {0.820726}); + const auto softMax = std::make_shared(add, -1); + auto fq4 = ov::test::utils::make_fake_quantize(softMax, ov::element::f32, 256, {1}, + {0}, {0.820726}, {0}, {0.820726}); const auto transpose2 = std::make_shared(fq2, transpose2Const); const auto matMul1 = std::make_shared(fq4, transpose2, transA, transB); auto fq5 = ov::test::utils::make_fake_quantize(matMul1, ov::element::f32, 256, {1}, - {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); + {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); const auto transpose3 = std::make_shared(fq5, transpose3Const); ov::ResultVector results{std::make_shared(transpose3)}; @@ -694,34 +668,20 @@ std::shared_ptr MHAQuantMatMul0Function::initOriginal() const { auto transpose2Param = std::make_shared(precision, input_shapes[3]); ov::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, transpose2Param}; - const auto channel = int64_t(12); - const auto last_dim = input_shapes[0].get_shape().back(); - OPENVINO_ASSERT(last_dim % channel == 0, "Incorrect test configuration"); - const auto new_shape = std::vector{0, 0, channel, static_cast(last_dim) / channel}; - - auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, {new_shape.size()}, new_shape); - auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, {new_shape.size()}, new_shape); - auto reshape2Const = ov::op::v0::Constant::create(ov::element::i64, {new_shape.size()}, new_shape); - auto reshape3Const = ov::op::v0::Constant::create(ov::element::i64, {input_shapes[0].size()}, std::vector{0, 0, -1}); - - auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 2, 1, 3}); - auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 2, 3, 1}); - auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 2, 1, 3}); - auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 2, 1, 3}); - - const auto reshape1 = std::make_shared(transpose1Param, reshape1Const, true); - const auto reshape2 = std::make_shared(transpose2Param, reshape2Const, true); + const auto shape_rank = input_shapes[0].size(); + auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); + auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 
2, 3, 1}); + auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); + auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); - const auto transpose1 = std::make_shared(reshape1, transpose1Const); - const auto transpose2 = std::make_shared(reshape2, transpose2Const); + const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); + const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); auto fq0 = ov::test::utils::make_fake_quantize(transpose0Param, ov::element::f32, 256, {1}, - {-12.5187311}, {12.4209289}, {-12.5187311}, {12.4209289}); + {-12.5187311}, {12.4209289}, {-12.5187311}, {12.4209289}); auto fq1 = ov::test::utils::make_fake_quantize(transpose1, ov::element::f32, 256, {1}, - {-1.43326699}, {1.42206954}, {-1.43326699}, {1.42206954}); - - const auto reshape0 = std::make_shared(fq0, reshape0Const, true); - const auto transpose0 = std::make_shared(reshape0, transpose0Const); + {-1.43326699}, {1.42206954}, {-1.43326699}, {1.42206954}); + const auto transpose0 = std::make_shared(fq0, transpose0Const); const auto matMul0 = std::make_shared(transpose0, fq1); const auto add = std::make_shared(matMul0, addParam); @@ -729,11 +689,10 @@ std::shared_ptr MHAQuantMatMul0Function::initOriginal() const { const auto matMul1 = std::make_shared(softMax, transpose2); auto fq2 = ov::test::utils::make_fake_quantize(matMul1, ov::element::f32, 256, {1}, - {-1.81826221}, {1.804057}, {-1.81826221}, {1.804057}); + {-1.81826221}, {1.804057}, {-1.81826221}, {1.804057}); const auto transpose3 = std::make_shared(fq2, transpose3Const); - const auto reshape3 = std::make_shared(transpose3, reshape3Const, true); - ov::ResultVector results{std::make_shared(reshape3)}; + ov::ResultVector results{std::make_shared(transpose3)}; return std::make_shared(results, ngraphParam, "mha"); } std::shared_ptr MHAFQFunction::initOriginal() const { @@ -743,18 +702,15 @@ std::shared_ptr MHAFQFunction::initOriginal() const { auto transpose2Param = std::make_shared(precision, input_shapes[3]); ov::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, transpose2Param}; - const auto shape_rank = input_shapes[0].get_shape().size(); + const auto shape_rank = input_shapes[0].size(); auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 3, 1}); auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); - const auto fq0 = ov::test::utils::make_fake_quantize(transpose0Param, ov::element::f32, 256, {1}, - {-5.217694}, {6.661877}, {-5.217694}, {6.661877}); - const auto fq1 = ov::test::utils::make_fake_quantize(transpose1Param, ov::element::f32, 256, {1}, - {-6.40245}, {6.45286}, {-6.40245}, {6.45286}); - const auto fq_add = ov::test::utils::make_fake_quantize(addParam, ov::element::f32, 256, {1}, - {-1000}, {0}, {-1000}, {0}); + const auto fq0 = ov::test::utils::make_fake_quantize(transpose0Param, ov::element::f32, 256, {1}, {-5.217694}, {6.661877}, {-5.217694}, {6.661877}); + const auto fq1 = ov::test::utils::make_fake_quantize(transpose1Param, ov::element::f32, 256, {1}, {-6.40245}, {6.45286}, {-6.40245}, {6.45286}); + const auto fq_add = 
ov::test::utils::make_fake_quantize(addParam, ov::element::f32, 256, {1}, {-1000}, {0}, {-1000}, {0}); bool transA = false; bool transB = false; @@ -766,16 +722,13 @@ std::shared_ptr MHAFQFunction::initOriginal() const { const auto mul_deq_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, std::vector{0.00098425}); const auto mul_deq = std::make_shared(convert, mul_deq_const); const auto mul = std::make_shared(transpose1, mul_deq); - auto fq1_1 = ov::test::utils::make_fake_quantize(mul, ov::element::f32, 256, {1}, - {-0.8003067}, {0.8066083}, {-0.8003067}, {0.8066083}); + const auto fq1_1 = ov::test::utils::make_fake_quantize(mul, ov::element::f32, 256, {1}, {-0.8003067}, {0.8066083}, {-0.8003067}, {0.8066083}); const auto matMul0 = std::make_shared(transpose0, fq1_1, transA, transB); - auto fq2 = ov::test::utils::make_fake_quantize(matMul0, ov::element::f32, 256, {1}, - {-14.50351}, {17.65645}, {-14.50351}, {17.65645}); + const auto fq2 = ov::test::utils::make_fake_quantize(matMul0, ov::element::f32, 256, {1}, {-14.50351}, {17.65645}, {-14.50351}, {17.65645}); const auto add = std::make_shared(fq2, fq_add); const auto softMax = std::make_shared(add, 3); const auto matMul1 = std::make_shared(softMax, transpose2, transA, transB); - auto fq3 = ov::test::utils::make_fake_quantize(matMul1, ov::element::f32, 256, {1}, - {-1.895786}, {2.0028071}, {-1.895786}, {2.0028071}); + auto fq3 = ov::test::utils::make_fake_quantize(matMul1, ov::element::f32, 256, {1}, {-1.895786}, {2.0028071}, {-1.895786}, {2.0028071}); const auto transpose3 = std::make_shared(fq3, transpose3Const); ov::ResultVector results{std::make_shared(transpose3)}; From f2640a2d7ee57432b66b085540709904c8525afb Mon Sep 17 00:00:00 2001 From: Wenjing Kang Date: Thu, 31 Oct 2024 17:41:31 +0800 Subject: [PATCH 128/233] Update CMAKE_LANG_FLAGS_CONFIG_INIT appending in toolchain to avoid flag repetition (#27352) ### Details: -Currently, when using this toolchain and print the following flags in [CMakeLists](https://github.com/openvinotoolkit/openvino/blob/master/CMakeLists.txt) ``` CMAKE_CXX_FLAGS_RELEASE in OV CMakeLists.txt::: /MT /MT /O2 /Ob2 /DNDEBUG /Zi /FS /Zf /ZH:SHA_256 /guard:cf /Qspectre CMAKE_C_FLAGS_RELEASE in OV CMakeLists.txt::: /MT /MT /O2 /Ob2 /DNDEBUG /Zi /FS /Zf /ZH:SHA_256 /guard:cf /Qspectre CMAKE_CXX_FLAGS_RELEASE_INIT in OV CMakeLists.txt::: /MT /MT /O2 /Ob2 /DNDEBUG CMAKE_C_FLAGS_RELEASE_INIT in OV CMakeLists.txt::: /MT /MT /O2 /Ob2 /DNDEBUG ``` So there is repetition of `/MT` in flags. The change in this PR will fix this problem. 
The flags will be: ``` CMAKE_CXX_FLAGS_RELEASE in OV CMakeLists.txt::: /MT /O2 /Ob2 /DNDEBUG /Zi /FS /Zf /ZH:SHA_256 /guard:cf /Qspectre CMAKE_C_FLAGS_RELEASE in OV CMakeLists.txt::: /MT /O2 /Ob2 /DNDEBUG /Zi /FS /Zf /ZH:SHA_256 /guard:cf /Qspectre CMAKE_CXX_FLAGS_RELEASE_INIT in OV CMakeLists.txt::: /MT /O2 /Ob2 /DNDEBUG CMAKE_C_FLAGS_RELEASE_INIT in OV CMakeLists.txt::: /MT /O2 /Ob2 /DNDEBUG ``` ### Tickets: - *152927* Signed-off-by: Kang Wenjing --- cmake/toolchains/mt.runtime.win32.toolchain.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/toolchains/mt.runtime.win32.toolchain.cmake b/cmake/toolchains/mt.runtime.win32.toolchain.cmake index 9a99781eac0426..b331d370bfe7bf 100644 --- a/cmake/toolchains/mt.runtime.win32.toolchain.cmake +++ b/cmake/toolchains/mt.runtime.win32.toolchain.cmake @@ -28,9 +28,9 @@ if(use_static_runtime) set(flag_var "CMAKE_${lang}_FLAGS${build_type}_INIT") string(REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") if (build_type STREQUAL "_DEBUG") - set(${flag_var} "${${flag_var}} /MTd") + set(${flag_var} "/MTd") else() - set(${flag_var} "${${flag_var}} /MT") + set(${flag_var} "/MT") endif() endforeach() endforeach() From 272843d81ad242f2622b8951d922baa299ccdfc1 Mon Sep 17 00:00:00 2001 From: Artemy Skrebkov Date: Thu, 31 Oct 2024 09:44:05 +0000 Subject: [PATCH 129/233] Add support for shape and data_shape parameters (#27314) ### Details: - Move helper function for reshaping to `npu_tools_utils` - Introduce `shape` and `data_shape` params ### Tickets: - E144161 --------- Signed-off-by: Skrebkov, Artemy --- .../tools/common/include/tools_helpers.hpp | 181 ++++++++++++++++++ .../tools/compile_tool/CMakeLists.txt | 3 +- .../intel_npu/tools/compile_tool/main.cpp | 109 +---------- .../tools/compile_tool/tools_helpers.hpp | 81 -------- .../tools/single-image-test/main.cpp | 132 +++++-------- 5 files changed, 236 insertions(+), 270 deletions(-) create mode 100644 src/plugins/intel_npu/tools/common/include/tools_helpers.hpp delete mode 100644 src/plugins/intel_npu/tools/compile_tool/tools_helpers.hpp diff --git a/src/plugins/intel_npu/tools/common/include/tools_helpers.hpp b/src/plugins/intel_npu/tools/common/include/tools_helpers.hpp new file mode 100644 index 00000000000000..e9743594ad8711 --- /dev/null +++ b/src/plugins/intel_npu/tools/common/include/tools_helpers.hpp @@ -0,0 +1,181 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include + +#include "openvino/openvino.hpp" + +struct InputInfo { + ov::element::Type type; + ov::PartialShape partialShape; + ov::Shape dataShape; + ov::Layout layout; +}; +using InputsInfo = std::map; + +std::string parameterNameToTensorName(std::string& name, std::vector>& inputs_info) { + auto count_name = std::any_of(inputs_info.begin(), inputs_info.end(), [name](ov::Output& port) { + return port.get_names().count(name) > 0; + }); + if (count_name) { + return name; + } else { + auto inputInfo = std::find_if(inputs_info.begin(), inputs_info.end(), [name](ov::Output& port) { + return name == port.get_node()->get_friendly_name(); + }); + if (inputInfo == inputs_info.end()) { + throw std::runtime_error("Provided I/O name \"" + name + + "\" is not found neither in tensor names nor in nodes names."); + } + return inputInfo->get_any_name(); + } +} + +std::map> parseInputParameters(std::string& parameter_string, + std::vector>& input_info) { + // Parse parameter string like "input0[value0],input1[value1]" or "[value]" (applied to 
all + // inputs) + std::map> return_value; + std::string search_string = parameter_string; + auto start_pos = search_string.find_first_of('['); + auto input_name = search_string.substr(0, start_pos); + while (start_pos != std::string::npos) { + auto end_pos = search_string.find_first_of(']'); + if (end_pos == std::string::npos) + break; + input_name = search_string.substr(0, start_pos); + auto input_value = search_string.substr(start_pos + 1, end_pos - start_pos - 1); + if (!input_name.empty()) { + return_value[parameterNameToTensorName(input_name, input_info)].push_back(input_value); + } else { + for (auto& item : input_info) { + return_value[item.get_any_name()].push_back(input_value); + } + } + search_string = search_string.substr(end_pos + 1); + if (search_string.empty() || (search_string.front() != ',' && search_string.front() != '[')) + break; + if (search_string.front() == ',') { + if (search_string.length() > 1) + search_string = search_string.substr(1); + else + throw std::logic_error("Can't parse input parameter string, there is nothing after the comma " + + parameter_string); + } + start_pos = search_string.find_first_of('['); + } + if (!search_string.empty()) + throw std::logic_error("Can't parse input parameter string: " + parameter_string); + return return_value; +} + +void boundDynamicShape(std::shared_ptr& model) { + for (auto&& item : model->get_parameters()) { + auto shape = item->get_partial_shape(); + if (shape.is_static()) { + continue; + } + auto rank = shape.rank(); + if (rank.is_dynamic()) { + throw std::logic_error("Rank \"" + rank.to_string() + "\" of the shape \"" + shape.to_string() + + "\" is dynamic which is not supported by NPU"); + } + auto layout = item->get_layout(); + if (!ov::layout::has_batch(layout)) { + item->set_layout(ov::Layout(layout.to_string().insert(1, "N,"))); + layout = item->get_layout(); + } + if (shape[ov::layout::batch_idx(layout)].is_dynamic()) { + std::cout << "WARNING: Shape \"" + shape.to_string() + "\"" + + " has dynamic batch size which is not supported by NPU\n" + " Setting batch to 1 forcibly" + << std::endl; + ov::set_batch(model, 1); + } + shape = item->get_partial_shape(); + if (shape.is_dynamic()) { + throw std::logic_error("Model's input shape \"" + shape.to_string() + "\"" + + " is dynamic which is not supported by NPU"); + } + } +} + +void setModelBatch(std::shared_ptr& model, uint32_t batch = 1) { + if (batch == 1) { + return; + } + for (auto&& item : model->get_parameters()) { + auto shape = item->get_partial_shape(); + auto rank = shape.rank(); + if (rank.is_dynamic()) { + throw std::logic_error("Rank \"" + rank.to_string() + "\" of the shape \"" + shape.to_string() + + "\" is dynamic which is not supported by NPU"); + } + auto layout = item->get_layout(); + if (!ov::layout::has_batch(layout)) { + item->set_layout(ov::Layout(layout.to_string().insert(1, "N,"))); + layout = item->get_layout(); + } + if (shape[ov::layout::batch_idx(layout)].is_dynamic()) { + throw std::logic_error("ERROR: Shape \"" + shape.to_string() + "\"" + + " has dynamic batch size which is not supported by NPU\n" + "Cannot apply fixed batch: " + + std::to_string(batch) + + ". 
Please remove the parameter from config: \"override_model_batch_size\""); + } + ov::set_batch(model, batch); + } +} + +void reshape(ov::OutputVector inputsInfo, InputsInfo& infoMap, std::shared_ptr& model, + std::string& shapeString, int overrideModelBatchSize, std::string_view device) { + std::vector infoMaps; + if (!shapeString.empty()) { + std::map> shapesMap = parseInputParameters(shapeString, inputsInfo); + + if (overrideModelBatchSize != 1) { + throw std::logic_error(R"(Incompatible params: "shape" and "override_model_batch_size")"); + } + for (auto& item : inputsInfo) { + InputInfo info; + auto name = item.get_any_name(); + + if (!shapesMap.empty()) { + if (shapesMap.count(name)) { + if (shapesMap.at(name).size() > 1) { + // Example: -shape input1[..][..] + throw std::logic_error("shape command line parameter doesn't support multiple " + "shapes for one input."); + } + info.partialShape = shapesMap.at(name)[0]; + } else { + info.partialShape = item.get_partial_shape(); + } + } + infoMap[name] = std::move(info); + infoMaps.push_back(infoMap); + } + std::map newShapes; + for (auto& item : infoMaps) { + for (auto& map : item) { + if (!newShapes.count(map.first)) { + newShapes[map.first] = map.second.partialShape; + } + } + } + model->reshape(newShapes); + } else { + if (device.find("NPU") != std::string::npos || + // FIXME: SIT on CPU also requires to bound dynamic shapes + device.find("CPU") != std::string::npos || device.find("TEMPLATE") != std::string::npos) { + boundDynamicShape(model); + } + + setModelBatch(model, overrideModelBatchSize); + } +} diff --git a/src/plugins/intel_npu/tools/compile_tool/CMakeLists.txt b/src/plugins/intel_npu/tools/compile_tool/CMakeLists.txt index 66ff751b9f5162..fc485030359428 100644 --- a/src/plugins/intel_npu/tools/compile_tool/CMakeLists.txt +++ b/src/plugins/intel_npu/tools/compile_tool/CMakeLists.txt @@ -24,7 +24,8 @@ ov_add_target(ADD_CPPLINT PRIVATE openvino::runtime gflags - Threads::Threads) + Threads::Threads + npu_tools_utils) set_target_properties(${TARGET_NAME} PROPERTIES FOLDER ${CMAKE_CURRENT_SOURCE_DIR} diff --git a/src/plugins/intel_npu/tools/compile_tool/main.cpp b/src/plugins/intel_npu/tools/compile_tool/main.cpp index 471fd55bb82b3f..7a088d1afc69e2 100644 --- a/src/plugins/intel_npu/tools/compile_tool/main.cpp +++ b/src/plugins/intel_npu/tools/compile_tool/main.cpp @@ -14,11 +14,12 @@ #include -#include "openvino/core/partial_shape.hpp" -#include "openvino/openvino.hpp" +#include +#include #include "tools_helpers.hpp" + static constexpr char help_message[] = "Optional. Print the usage message."; static constexpr char model_message[] = "Required. 
Path to the XML model."; @@ -168,64 +169,6 @@ bool isFP32(const ov::element::Type& type) { return type == ov::element::f32; } -void boundDynamicShape(std::shared_ptr& model) { - for (auto&& item : model->get_parameters()) { - auto shape = item->get_partial_shape(); - if (shape.is_static()) { - continue; - } - auto rank = shape.rank(); - if (rank.is_dynamic()) { - throw std::logic_error("Rank \"" + rank.to_string() + "\" of the shape \"" + shape.to_string() + - "\" is dynamic which is not supported by NPU"); - } - auto layout = item->get_layout(); - if (!ov::layout::has_batch(layout)) { - item->set_layout(ov::Layout(layout.to_string().insert(1, "N,"))); - layout = item->get_layout(); - } - if (shape[ov::layout::batch_idx(layout)].is_dynamic()) { - std::cout << "WARNING: Shape \"" + shape.to_string() + "\"" + - " has dynamic batch size which is not supported by NPU\n" - " Setting batch to 1 forcibly" - << std::endl; - ov::set_batch(model, 1); - } - shape = item->get_partial_shape(); - if (shape.is_dynamic()) { - throw std::logic_error("Model's input shape \"" + shape.to_string() + "\"" + - " is dynamic which is not supported by NPU"); - } - } -} - -void setModelBatch(std::shared_ptr& model, uint32_t batch = 1) { - if (batch == 1) { - return; - } - for (auto&& item : model->get_parameters()) { - auto shape = item->get_partial_shape(); - auto rank = shape.rank(); - if (rank.is_dynamic()) { - throw std::logic_error("Rank \"" + rank.to_string() + "\" of the shape \"" + shape.to_string() + - "\" is dynamic which is not supported by NPU"); - } - auto layout = item->get_layout(); - if (!ov::layout::has_batch(layout)) { - item->set_layout(ov::Layout(layout.to_string().insert(1, "N,"))); - layout = item->get_layout(); - } - if (shape[ov::layout::batch_idx(layout)].is_dynamic()) { - throw std::logic_error("ERROR: Shape \"" + shape.to_string() + "\"" + - " has dynamic batch size which is not supported by NPU\n" - "Cannot apply fixed batch: " + - std::to_string(batch) + - ". Please remove the parameter from config: \"override_model_batch_size\""); - } - ov::set_batch(model, batch); - } -} - void configurePrePostProcessing(std::shared_ptr& model, const std::string& ip, const std::string& op, const std::string& iop, const std::string& il, const std::string& ol, const std::string& iol, const std::string& iml, const std::string& oml, @@ -475,50 +418,6 @@ std::string getFileNameFromPath(const std::string& path, using TimeDiff = std::chrono::milliseconds; -void reshape(ov::OutputVector inputs_info, InputsInfo& info_map, std::shared_ptr& model) { - std::vector info_maps; - if (!FLAGS_shape.empty()) { - std::map> shapes_map = parseInputParameters(FLAGS_shape, inputs_info); - - if (FLAGS_override_model_batch_size != 1) { - throw std::logic_error("Incompatible params: \"shape\" and \"override_model_batch_size\""); - } - for (auto& item : inputs_info) { - InputInfo info; - auto name = item.get_any_name(); - - if (!shapes_map.empty()) { - if (shapes_map.count(name)) { - if (shapes_map.at(name).size() > 1) { - // Example: -shape input1[..][..] 
- throw std::logic_error("shape command line parameter doesn't support multiple " - "shapes for one input."); - } - info.partialShape = shapes_map.at(name)[0]; - } else { - info.partialShape = item.get_partial_shape(); - } - } - info_map[name] = std::move(info); - info_maps.push_back(info_map); - } - std::map newShapes; - for (auto& item : info_maps) { - for (auto& map : item) { - if (!newShapes.count(map.first)) { - newShapes[map.first] = map.second.partialShape; - } - } - } - model->reshape(newShapes); - } else { - if (FLAGS_d.find("NPU") != std::string::npos) { - boundDynamicShape(model); - } - - setModelBatch(model, FLAGS_override_model_batch_size); - } -} int main(int argc, char* argv[]) { try { @@ -552,7 +451,7 @@ int main(int argc, char* argv[]) { InputsInfo info_map; std::cout << "Performing reshape" << std::endl; - reshape(std::move(inputs_info), info_map, model); + reshape(std::move(inputs_info), info_map, model, FLAGS_shape, FLAGS_override_model_batch_size, FLAGS_d); std::cout << "Configuring model pre & post processing" << std::endl; configurePrePostProcessing(model, FLAGS_ip, FLAGS_op, FLAGS_iop, FLAGS_il, FLAGS_ol, FLAGS_iol, FLAGS_iml, diff --git a/src/plugins/intel_npu/tools/compile_tool/tools_helpers.hpp b/src/plugins/intel_npu/tools/compile_tool/tools_helpers.hpp deleted file mode 100644 index 6d42fd142b8971..00000000000000 --- a/src/plugins/intel_npu/tools/compile_tool/tools_helpers.hpp +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "openvino/openvino.hpp" - -struct InputInfo { - ov::element::Type type; - ov::PartialShape partialShape; - ov::Shape dataShape; - ov::Layout layout; -}; -using InputsInfo = std::map; - -std::string parameterNameToTensorName(std::string& name, std::vector>& inputs_info) { - auto count_name = std::any_of(inputs_info.begin(), inputs_info.end(), [name](ov::Output& port) { - return port.get_names().count(name) > 0; - }); - if (count_name) { - return name; - } else { - auto inputInfo = std::find_if(inputs_info.begin(), inputs_info.end(), [name](ov::Output& port) { - return name == port.get_node()->get_friendly_name(); - }); - if (inputInfo == inputs_info.end()) { - throw std::runtime_error("Provided I/O name \"" + name + - "\" is not found neither in tensor names nor in nodes names."); - } - return inputInfo->get_any_name(); - } -} - -std::map> parseInputParameters(std::string& parameter_string, - std::vector>& input_info) { - // Parse parameter string like "input0[value0],input1[value1]" or "[value]" (applied to all - // inputs) - std::map> return_value; - std::string search_string = parameter_string; - auto start_pos = search_string.find_first_of('['); - auto input_name = search_string.substr(0, start_pos); - while (start_pos != std::string::npos) { - auto end_pos = search_string.find_first_of(']'); - if (end_pos == std::string::npos) - break; - input_name = search_string.substr(0, start_pos); - auto input_value = search_string.substr(start_pos + 1, end_pos - start_pos - 1); - if (!input_name.empty()) { - return_value[parameterNameToTensorName(input_name, input_info)].push_back(input_value); - } else { - for (auto& item : input_info) { - return_value[item.get_any_name()].push_back(input_value); - } - } - search_string = search_string.substr(end_pos + 1); - if (search_string.empty() || (search_string.front() != ',' && search_string.front() != '[')) - break; - if 
(search_string.front() == ',') { - if (search_string.length() > 1) - search_string = search_string.substr(1); - else - throw std::logic_error("Can't parse input parameter string, there is nothing after the comma " + - parameter_string); - } - start_pos = search_string.find_first_of('['); - } - if (!search_string.empty()) - throw std::logic_error("Can't parse input parameter string: " + parameter_string); - return return_value; -} diff --git a/src/plugins/intel_npu/tools/single-image-test/main.cpp b/src/plugins/intel_npu/tools/single-image-test/main.cpp index 4018982b022ed3..5658c18650243b 100644 --- a/src/plugins/intel_npu/tools/single-image-test/main.cpp +++ b/src/plugins/intel_npu/tools/single-image-test/main.cpp @@ -4,9 +4,11 @@ // #include "image_quality_helper.hpp" +#include "openvino/core/partial_shape.hpp" #include "semantic_segmentation_helpers.hpp" #include "tensor_utils.hpp" #include "yolo_helpers.hpp" +#include "tools_helpers.hpp" #include #include @@ -31,7 +33,8 @@ using TensorMap = std::map; struct TensorDescriptor { ov::element::Type precision; - ov::Shape shape; + ov::PartialShape shape; + ov::Shape dataShape; ov::Layout layout; }; @@ -83,6 +86,15 @@ DEFINE_string(oml, "", " is supported"); DEFINE_bool(img_as_bin, false, "Force binary input even if network expects an image"); DEFINE_bool(pc, false, "Report performance counters"); +DEFINE_string( + shape, "", + "Optional. Set shape for model input. For example, \"input1[1,3,224,224],input2[1,4]\" or \"[1,3,224,224]\"" + " in case of one input size. This parameter affects model input shape and can be dynamic." + " For dynamic dimensions use symbol `?` or '-1'. Ex. [?,3,?,?]." + " For bounded dimensions specify range 'min..max'. Ex. [1..10,3,?,?]."); +DEFINE_string(data_shape, "", + "Required for models with dynamic shapes. Set shape for input blobs. Only one shape can be set." 
+ "In case of one input size: \"[1,3,224,224]\""); // for using input image mean and scale static constexpr char mean_values_message[] = @@ -1450,65 +1462,6 @@ std::pair runInfer(ov::InferRequest& inferRequest, ov::Compi return std::make_pair(out, profData); } -void boundDynamicShape(std::shared_ptr& model) { - for (auto&& item : model->get_parameters()) { - auto shape = item->get_partial_shape(); - if (shape.is_static()) { - continue; - } - auto rank = shape.rank(); - if (rank.is_dynamic()) { - throw std::logic_error("Rank \"" + rank.to_string() + "\" of the shape \"" + shape.to_string() + - "\" is dynamic which is not supported by SIT"); - } - auto layout = item->get_layout(); - if (!ov::layout::has_batch(layout)) { - item->set_layout(ov::Layout(layout.to_string().insert(1, "N,"))); - layout = item->get_layout(); - } - if (shape[ov::layout::batch_idx(layout)].is_dynamic()) { - std::cout << "WARNING: Shape \"" + shape.to_string() + "\"" + - " has dynamic batch size which is not supported by SIT\n" - " Setting batch to 1 forcibly" - << std::endl; - ov::set_batch(model, 1); - } - shape = item->get_partial_shape(); - if (shape.is_dynamic()) { - throw std::logic_error("Model's input shape \"" + shape.to_string() + "\"" + - " is dynamic which is not supported by SIT"); - } - } -} - -void setModelBatch(std::shared_ptr& model, uint32_t batch) { - if (batch == 1) { - return; - } - - // New batch value is applicable if the model has non dynamic inputs/outputs only - // Amend layout by adding N if it has no batch dimension - for (auto&& item : model->get_parameters()) { - auto shape = item->get_partial_shape(); - auto rank = shape.rank(); - if (rank.is_dynamic()) { - throw std::logic_error("Rank \"" + rank.to_string() + "\" of the shape \"" + shape.to_string() + - "\" is dynamic which is not supported by SIT"); - } - auto layout = item->get_layout(); - if (!ov::layout::has_batch(layout)) { - item->set_layout(ov::Layout(layout.to_string().insert(1, "N,"))); - } - - shape = item->get_partial_shape(); - if (shape.is_dynamic()) { - throw std::logic_error("Model's input shape \"" + shape.to_string() + "\"" + - " is dynamic which is not supported by SIT"); - } - } - ov::set_batch(model, batch); -} - // FIXME: User must provide layout explicitly. // No "default" layout for IRv11 models. 
static ov::Layout getLayoutByRank(const size_t rank) { @@ -1558,8 +1511,8 @@ bool testSSDDetection(const TensorMap& outputs, const TensorMap& references, const ov::Tensor& reference = references.begin()->second; const TensorDescriptor& inputDescriptor = inputDescriptors.begin()->second; - const auto imgWidth = inputDescriptor.shape.at(ov::layout::width_idx(inputDescriptor.layout)); - const auto imgHeight = inputDescriptor.shape.at(ov::layout::height_idx(inputDescriptor.layout)); + const auto imgWidth = inputDescriptor.dataShape.at(ov::layout::width_idx(inputDescriptor.layout)); + const auto imgHeight = inputDescriptor.dataShape.at(ov::layout::height_idx(inputDescriptor.layout)); auto confThresh = FLAGS_confidence_threshold; auto probTolerance = FLAGS_prob_tolerance; @@ -1592,8 +1545,8 @@ bool testYoloV2(const TensorMap& outputs, const TensorMap& references, const Ten const TensorDescriptor& inputDescriptor = inputDescriptors.begin()->second; - const auto imgWidth = inputDescriptor.shape.at(ov::layout::width_idx(inputDescriptor.layout)); - const auto imgHeight = inputDescriptor.shape.at(ov::layout::height_idx(inputDescriptor.layout)); + const auto imgWidth = inputDescriptor.dataShape.at(ov::layout::width_idx(inputDescriptor.layout)); + const auto imgHeight = inputDescriptor.dataShape.at(ov::layout::height_idx(inputDescriptor.layout)); double confThresh = FLAGS_confidence_threshold; double probTolerance = FLAGS_prob_tolerance; double boxTolerance = FLAGS_box_tolerance; @@ -1624,8 +1577,8 @@ bool testYoloV3(const TensorMap& outputs, const TensorMap& references, const Ten "Mismatch between the number of model outputs and the number of references"); const TensorDescriptor& inputDescriptor = inputDescriptors.begin()->second; - const auto imgWidth = inputDescriptor.shape.at(ov::layout::width_idx(inputDescriptor.layout)); - const auto imgHeight = inputDescriptor.shape.at(ov::layout::height_idx(inputDescriptor.layout)); + const auto imgWidth = inputDescriptor.dataShape.at(ov::layout::width_idx(inputDescriptor.layout)); + const auto imgHeight = inputDescriptor.dataShape.at(ov::layout::height_idx(inputDescriptor.layout)); double confThresh = FLAGS_confidence_threshold; double probTolerance = FLAGS_prob_tolerance; @@ -1663,8 +1616,8 @@ bool testYoloV4(const TensorMap& outputs, const TensorMap& references, const Ten "Mismatch between the number of model outputs and the number of references"); const TensorDescriptor& inputDescriptor = inputDescriptors.begin()->second; - const auto imgWidth = inputDescriptor.shape.at(ov::layout::width_idx(inputDescriptor.layout)); - const auto imgHeight = inputDescriptor.shape.at(ov::layout::height_idx(inputDescriptor.layout)); + const auto imgWidth = inputDescriptor.dataShape.at(ov::layout::width_idx(inputDescriptor.layout)); + const auto imgHeight = inputDescriptor.dataShape.at(ov::layout::height_idx(inputDescriptor.layout)); double confThresh = FLAGS_confidence_threshold; double probTolerance = FLAGS_prob_tolerance; @@ -1733,6 +1686,16 @@ bool testMeanIoU(const TensorMap& outputs, const TensorMap& references, const La return compare_mean_IoU(iou, semSegThreshold, classes); } +static ov::Shape parseDataShape(const std::string& dataShapeStr) { + std::vector dataShape; + std::istringstream ss(dataShapeStr); + std::string token; + while (std::getline(ss, token, ',')) { + dataShape.push_back(std::stoul(token)); + } + return ov::Shape(dataShape); +} + static int runSingleImageTest() { std::cout << "Run single image test" << std::endl; try { @@ -1814,12 +1777,12 @@ static 
int runSingleImageTest() { auto model = core.read_model(FLAGS_network); nameIOTensors(model); - setModelBatch(model, FLAGS_override_model_batch_size); - if (FLAGS_device.find("NPU") != std::string::npos || - // FIXME: SIT on CPU also requires to bound dynamic shapes - FLAGS_device.find("CPU") != std::string::npos || FLAGS_device.find("TEMPLATE") != std::string::npos) { - boundDynamicShape(model); - } + auto inputs_info = std::const_pointer_cast(model)->inputs(); + InputsInfo info_map; + + std::cout << "Performing reshape" << std::endl; + reshape(std::move(inputs_info), info_map, model, FLAGS_shape, + FLAGS_override_model_batch_size, FLAGS_device); ov::preprocess::PrePostProcessor ppp(model); @@ -1856,11 +1819,11 @@ static int runSingleImageTest() { inModelLayout.has_value()) { inLayerModelLayout = inModelLayout.value(); } else { - const auto shape = inputInfo[i].get_shape(); + const auto shape = inputInfo[i].get_partial_shape(); inLayerModelLayout = getLayoutByRank(shape.size()); std::cout << "WARNING: Configuring preprocessing. Since --iml option isn't set, input model " "layout for layer \"" - << inputInfo[i].get_any_name() << "\" is infered from shape: " << toString(shape) + << inputInfo[i].get_any_name() << "\" is infered from shape: " << shape.to_string() << " rank (" << shape.size() << ") as " << inLayerModelLayout.to_string() << std::endl; } @@ -1917,11 +1880,11 @@ static int runSingleImageTest() { outModelLayout.has_value()) { outLayerModelLayout = outModelLayout.value(); } else { - const auto shape = outputInfo[i].get_shape(); + const auto shape = outputInfo[i].get_partial_shape(); outLayerModelLayout = getLayoutByRank(shape.size()); std::cout << "WARNING: Configuring preprocessing. Since --oml option isn't set, output model " "layout for layer \"" - << outputInfo[i].get_any_name() << "\" is infered from shape: " << toString(shape) + << outputInfo[i].get_any_name() << "\" is infered from shape: " << shape.to_shape() << " rank (" << shape.size() << ") as " << outLayerModelLayout.to_string() << std::endl; } @@ -1933,6 +1896,7 @@ static int runSingleImageTest() { } } + std::cout << "Compile model" << std::endl; compiledModel = core.compile_model(ppp.build(), FLAGS_device); } else { std::cout << "Import network " << FLAGS_network << std::endl; @@ -1994,7 +1958,8 @@ static int runSingleImageTest() { // Load the input data for (const auto& inputInfo : inputsInfo) { - const ov::Shape& shape = inputInfo.get_shape(); + const auto& shape = inputInfo.get_partial_shape(); + const auto dataShape = shape.is_static() ? shape.get_shape() : parseDataShape(FLAGS_data_shape); const ov::element::Type& precision = inputInfo.get_element_type(); // Determine the input layout @@ -2012,19 +1977,20 @@ static int runSingleImageTest() { inputLayout = getLayoutByRank(shape.size()); std::cout << "WARNING: Loading input data. 
Since --iml option isn't set, input model layout for " "layer \"" - << inputInfo.get_any_name() << "\" is infered from shape: " << toString(shape) + << inputInfo.get_any_name() << "\" is infered from shape: " << shape.to_shape() << " rank (" << shape.size() << ") as " << inputLayout.to_string() << std::endl; } - inputDescriptors.emplace(inputInfo.get_any_name(), TensorDescriptor{precision, shape, inputLayout}); + inputDescriptors.emplace(inputInfo.get_any_name(), TensorDescriptor{precision, shape, + dataShape, inputLayout}); std::cout << "Load input #" << inputInd << " from " << inputFiles[inputInd] << " as " << precision << " " << inputLayout.to_string() << " " << shape << std::endl; const ov::Tensor tensor = !FLAGS_img_as_bin - ? loadInput(precision, shape, inputLayout, inputFiles[inputInd], FLAGS_color_format) - : loadInput(precision, shape, inputLayout, inputFiles[inputInd], FLAGS_color_format, + ? loadInput(precision, dataShape, inputLayout, inputFiles[inputInd], FLAGS_color_format) + : loadInput(precision, dataShape, inputLayout, inputFiles[inputInd], FLAGS_color_format, inputBinPrecisionForOneInfer[numberOfTestCase][inputInd]); std::ostringstream ostr; ostr << netFileName << "_input_" << inputInd << "_case_" << numberOfTestCase << ".blob"; From c902a0144a45aff068c15726fb27773feaa1f2ea Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Thu, 31 Oct 2024 19:33:21 +0900 Subject: [PATCH 130/233] [GPU] update onednn (#27349) --- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index 062d247e7853b1..1722066ad4c0f1 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit 062d247e7853b14ed287a130cc2dc221187430aa +Subproject commit 1722066ad4c0f15495f2d0fcbe9deb2bfd188c36 From a0b73e0a7a69873582301a460365792183101ab3 Mon Sep 17 00:00:00 2001 From: Przemyslaw Wysocki Date: Thu, 31 Oct 2024 12:07:18 +0100 Subject: [PATCH 131/233] [PyOV] Extend Python API with `Squeeze-15` (#27281) ### Details: - This PR includes commits from https://github.com/openvinotoolkit/openvino/pull/26995 ### Tickets: - CVS-154024 --------- Signed-off-by: p-wysocki Co-authored-by: Michal Barnas Co-authored-by: Roman Kazantsev Co-authored-by: Michal Lukaszewski --- .../src/openvino/runtime/opset15/__init__.py | 2 +- .../src/openvino/runtime/opset15/ops.py | 39 ++++++++++++++ .../python/tests/test_graph/test_ops_fused.py | 11 ---- .../python/tests/test_graph/test_squeeze.py | 51 +++++++++++++++++++ 4 files changed, 91 insertions(+), 12 deletions(-) create mode 100644 src/bindings/python/tests/test_graph/test_squeeze.py diff --git a/src/bindings/python/src/openvino/runtime/opset15/__init__.py b/src/bindings/python/src/openvino/runtime/opset15/__init__.py index 6cc9c24827a85f..c4dd48d9087ae1 100644 --- a/src/bindings/python/src/openvino/runtime/opset15/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset15/__init__.py @@ -188,7 +188,7 @@ from openvino.runtime.opset1.ops import split from openvino.runtime.opset1.ops import sqrt from openvino.runtime.opset1.ops import squared_difference -from openvino.runtime.opset1.ops import squeeze +from openvino.runtime.opset15.ops import squeeze from openvino.runtime.opset15.ops import stft from openvino.runtime.opset1.ops import strided_slice from openvino.runtime.opset1.ops import subtract diff --git 
a/src/bindings/python/src/openvino/runtime/opset15/ops.py b/src/bindings/python/src/openvino/runtime/opset15/ops.py index b3a131602af703..93aacb29572340 100644 --- a/src/bindings/python/src/openvino/runtime/opset15/ops.py +++ b/src/bindings/python/src/openvino/runtime/opset15/ops.py @@ -348,3 +348,42 @@ def search_sorted( inputs = as_nodes(sorted_sequence, values, name=name) attributes = {"right_mode": right_mode} return _get_node_factory_opset15().create("SearchSorted", inputs, attributes) + + +@nameable_op +def squeeze( + data: NodeInput, + axes: Optional[NodeInput] = None, + allow_axis_skip: bool = False, + name: Optional[str] = None, +) -> Node: + """Perform squeeze operation on input tensor. + + :param data: The node with data tensor. + :param axes: Optional list of integers, indicating the dimensions to squeeze. + Negative indices are supported. One of: input node or array. + :param allow_axis_skip: If true, shape inference results in a dynamic rank, when + selected axis has value 1 in its dynamic range. Used only if axes input + is given. Defaults to false. + :param name: Optional new name for output node. + :return: The new node performing a squeeze operation on input tensor. + + Remove single-dimensional entries from the shape of a tensor. + Takes an optional parameter `axes` with a list of axes to squeeze. + If `axes` is not provided, all the single dimensions will be removed from the shape. + + For example: + + Inputs: tensor with shape [1, 2, 1, 3, 1, 1], axes=[2, 4] + + Result: tensor with shape [1, 2, 3, 1] + """ + if axes is None: + inputs = as_nodes(data, name=name) + else: + inputs = as_nodes(data, axes, name=name) + return _get_node_factory_opset15().create( + "Squeeze", + inputs, + {"allow_axis_skip": allow_axis_skip} + ) diff --git a/src/bindings/python/tests/test_graph/test_ops_fused.py b/src/bindings/python/tests/test_graph/test_ops_fused.py index bdbf4a1a9f1f9c..2bab743bfd7afb 100644 --- a/src/bindings/python/tests/test_graph/test_ops_fused.py +++ b/src/bindings/python/tests/test_graph/test_ops_fused.py @@ -110,17 +110,6 @@ def test_clamp_operator(): assert list(model.get_output_shape(0)) == [2, 2] -def test_squeeze_operator(): - data_shape = [1, 2, 1, 3, 1, 1] - parameter_data = ov.parameter(data_shape, name="Data", dtype=np.float32) - axes = [2, 4] - model = ov.squeeze(parameter_data, axes) - - assert model.get_type_name() == "Squeeze" - assert model.get_output_size() == 1 - assert list(model.get_output_shape(0)) == [1, 2, 3, 1] - - def test_squared_difference_operator(): x1_shape = [1, 2, 3, 4] x2_shape = [2, 3, 4] diff --git a/src/bindings/python/tests/test_graph/test_squeeze.py b/src/bindings/python/tests/test_graph/test_squeeze.py new file mode 100644 index 00000000000000..869d84a0414841 --- /dev/null +++ b/src/bindings/python/tests/test_graph/test_squeeze.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import openvino.runtime.opset1 as ov_opset1 +import openvino.runtime.opset15 as ov_opset15 +import numpy as np +import pytest + + +def test_squeeze_v1_operator(): + data_shape = [1, 2, 1, 3, 1, 1] + parameter_data = ov_opset1.parameter(data_shape, name="Data", dtype=np.float32) + axes = [2, 4] + model = ov_opset1.squeeze(parameter_data, axes) + + assert model.get_type_name() == "Squeeze" + assert model.get_output_size() == 1 + assert list(model.get_output_shape(0)) == [1, 2, 3, 1] + + +@pytest.mark.parametrize(("input_shape", "axes", "allow_axis_skip", "expected_shape"), [ + ((1, 
2, 1, 3, 1, 1), [1, 2, 4], True, [1, 2, 3, 1]), + ((1, 2, 1, 3, 1, 1), [1, 2, 4], False, [1, 2, 3, 1]), + ((2, -1, 3), [1], False, [2, 3]) +]) +def test_squeeze_v15_operator(input_shape, axes, allow_axis_skip, expected_shape): + parameter_data = ov_opset15.parameter(input_shape, name="Data", dtype=np.float32) + model = ov_opset15.squeeze(parameter_data, axes, allow_axis_skip, name="Squeeze") + + assert model.get_type_name() == "Squeeze" + assert model.get_output_size() == 1 + assert list(model.get_output_shape(0)) == expected_shape + + +def test_squeeze_v15_dynamic_rank_output(): + parameter_data = ov_opset15.parameter((2, -1, 3), name="Data", dtype=np.float32) + model = ov_opset15.squeeze(parameter_data, [1], True, name="Squeeze") + + assert model.get_type_name() == "Squeeze" + assert model.get_output_size() == 1 + assert model.get_output_partial_shape(0).to_string() == "[...]" + + +def test_squeeze_v15_axes_not_given(): + parameter_data = ov_opset15.parameter((1, 3, 1, 1, 3, 5), name="Data", dtype=np.float32) + model = ov_opset15.squeeze(data=parameter_data, name="Squeeze") + + assert model.get_type_name() == "Squeeze" + assert model.get_output_size() == 1 + assert list(model.get_output_shape(0)) == [3, 3, 5] From b9a94c3f8b83deb41ba2e748150d70157784f96b Mon Sep 17 00:00:00 2001 From: Ivan Tikhonov Date: Thu, 31 Oct 2024 15:08:57 +0400 Subject: [PATCH 132/233] [ONNX] Update DequantizeLinear21 converter (#27351) ### Details: Aligned the converter with the canonical form of the dequantization subgraph. The Reshape op has been moved up right after the Constant; it will be const-folded in MOC, which is fine because Reshape constant folding doesn't copy the constant, it just copies a pointer. ConvertLike ops were replaced with Convert. This is a fairly rough change, so we may need to add a check that the scale is a constant and use Convert only in that case; if the scale is not a constant, we should probably keep ConvertLike.
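A minimal sketch of that suggested check (illustrative only, not part of this patch), assuming `zp` and `scale` are `ov::Output<ov::Node>` values inside the converter; the helper name `convert_zero_point` is invented for the example:

```cpp
#include <memory>

#include "openvino/core/type.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/convert_like.hpp"

// Sketch: when the scale is a Constant its element type is already known, so a plain
// Convert is enough; otherwise keep ConvertLike so the zero-point follows the scale type.
ov::Output<ov::Node> convert_zero_point(const ov::Output<ov::Node>& zp,
                                        const ov::Output<ov::Node>& scale) {
    if (ov::as_type_ptr<ov::op::v0::Constant>(scale.get_node_shared_ptr())) {
        return std::make_shared<ov::op::v0::Convert>(zp, scale.get_element_type());
    }
    return std::make_shared<ov::op::v1::ConvertLike>(zp, scale);
}
```

With a constant scale the target type is fixed at conversion time, so the extra graph dependency introduced by ConvertLike buys nothing; the fallback only matters when the scale type is not known until runtime.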
### Tickets: - *https://jira.devtools.intel.com/browse/CVS-156329* --- .../frontend/src/op/dequantize_linear.cpp | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp b/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp index b09bc73467bc10..d7b5214f3e53f4 100644 --- a/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp +++ b/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp @@ -221,19 +221,8 @@ ov::OutputVector dequantize_linear(const ov::frontend::onnx::Node& node) { FRONT_END_GENERAL_CHECK(src_x.get_partial_shape().is_static(), "DequantizeLinear cannot operate with dynamic shapes of input X"); - const auto& unsqueezed_axes = std::make_shared(ov::element::i64, Shape{1}, std::vector{1}); - - if (inputs.size() > 2) { - zp = inputs[2]; - if (zp.get_element_type() != scale.get_element_type()) { - zp = std::make_shared(zp, scale); - } - zp = std::make_shared(zp, unsqueezed_axes); - } - const auto axis = node.get_attribute_value("axis", 1); const auto block_size = static_cast(node.get_attribute_value("block_size", 0)); - const auto scale_type = scale.get_element_type(); FRONT_END_GENERAL_CHECK(axis == 0, "Axis != 0 isn't supported"); FRONT_END_GENERAL_CHECK(block_size > 0, "block_size must be greater than zero"); @@ -241,16 +230,30 @@ ov::OutputVector dequantize_linear(const ov::frontend::onnx::Node& node) { src_x.get_shape()[0] % block_size == 0, "DequantizeLinear doesn't support case when first dimension of X cannot be divided by block_size"); - const auto& x = src_x.get_element_type() == scale_type ? src_x : std::make_shared(src_x, scale); + ov::Output broadcastable_x = op::util::reshape( + src_x, + Shape{static_cast(src_x.get_shape()[0]) / block_size, block_size, src_x.get_shape()[1]}); + + const auto& unsqueezed_axes = std::make_shared(ov::element::i64, Shape{1}, std::vector{1}); + + const auto scale_type = scale.get_element_type(); + if (inputs.size() > 2) { + zp = inputs[2]; + if (zp.get_element_type() != scale.get_element_type()) { + zp = std::make_shared(zp, scale_type); + } + zp = std::make_shared(zp, unsqueezed_axes); + } + + const auto& x = src_x.get_element_type() == scale_type ? 
broadcastable_x + : std::make_shared(broadcastable_x, scale_type); // For further broadcasting scales and zp - reshape input to a shape [x.shape[0]/block_size, block_size, x.shape[1]] - ov::Output broadcastable_x = - op::util::reshape(x, Shape{static_cast(x.get_shape()[0]) / block_size, block_size, x.get_shape()[1]}); // Adding additional dimension for broadcasting scale = std::make_shared(scale, unsqueezed_axes); if (zp.get_node_shared_ptr()) { - broadcastable_x = std::make_shared(broadcastable_x, zp); + broadcastable_x = std::make_shared(x, zp); } const auto& scaled_x = std::make_shared(broadcastable_x, scale); From a488aec3812c8998028bab7e5996bb1c057f162e Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Fri, 1 Nov 2024 09:20:33 +0400 Subject: [PATCH 133/233] [TF FE] Run string ops tests on ARM (#27367) **Details:** Since openvino-tokenizers is built for ARM in the precommit, we are ready to switch on the String ops tests **Ticket:** TBD Signed-off-by: Kazantsev, Roman --- .../tensorflow_tests/test_tf_LookupTableFind.py | 8 -------- .../tensorflow_tests/test_tf_RaggedTensorToSparse.py | 6 ------ .../tensorflow_tests/test_tf_RaggedTensorToTensor.py | 10 ---------- .../tensorflow_tests/test_tf_StaticRegexReplace.py | 6 ------ .../tensorflow_tests/test_tf_StringLower.py | 10 ---------- .../tensorflow_tests/test_tf_StringSplitV2.py | 6 ------ .../tensorflow_tests/test_tf_StringToHashBucketFast.py | 6 ------ 7 files changed, 52 deletions(-) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_LookupTableFind.py b/tests/layer_tests/tensorflow_tests/test_tf_LookupTableFind.py index bd1422f8719cea..97177a5adeec13 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_LookupTableFind.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_LookupTableFind.py @@ -1,8 +1,6 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import platform - import numpy as np import pytest import tensorflow as tf @@ -91,12 +89,6 @@ def test_lookup_table_find(self, hash_table_type, keys_shape, params, ie_device, if ie_device == 'GPU' or run_in_jenkins(): pytest.skip("operation extesion is not supported on GPU or " "No layout format available for gather:LookupTableFind issue") - if params['keys_type'] == str: - if platform.system() in ('Darwin') or platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', - 'ARM64']: - pytest.xfail(reason='126314, 132699: Build tokenizers for ARM and MacOS') self._test(*self.create_lookup_table_find_net(hash_table_type=hash_table_type, keys_shape=keys_shape, **params), ie_device, precision, ir_version, temp_dir=temp_dir, diff --git a/tests/layer_tests/tensorflow_tests/test_tf_RaggedTensorToSparse.py b/tests/layer_tests/tensorflow_tests/test_tf_RaggedTensorToSparse.py index 621b8430f64fdc..f0832676f0f982 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_RaggedTensorToSparse.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_RaggedTensorToSparse.py @@ -1,8 +1,6 @@ # Copyright (C) 2022-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import platform - import numpy as np import pytest import tensorflow as tf @@ -55,10 +53,6 @@ def create_ragged_tensor_to_sparse_net(self, rt_dense_values_shape, rt_dense_val ]) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', 'ARM64'], - reason='126314, 132699: Build tokenizers for ARM and MacOS') def test_ragged_tensor_to_sparse(self, rt_dense_values_shape,
rt_dense_values_type, rt_nested_splits, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): if ie_device == 'GPU' or run_in_jenkins(): diff --git a/tests/layer_tests/tensorflow_tests/test_tf_RaggedTensorToTensor.py b/tests/layer_tests/tensorflow_tests/test_tf_RaggedTensorToTensor.py index 39afde0a2c6b08..0267874eb98b35 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_RaggedTensorToTensor.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_RaggedTensorToTensor.py @@ -1,8 +1,6 @@ # Copyright (C) 2022-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import platform - import numpy as np import pytest import tensorflow as tf @@ -52,10 +50,6 @@ def create_ragged_tensor_to_tensor_net(self, shape_type, shape_value, values_sha @pytest.mark.parametrize('row_partition_types', [["ROW_SPLITS"]]) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', 'ARM64'], - reason='126314, 132699: Build tokenizers for ARM and MacOS') def test_ragged_tensor_to_tensor(self, shape_type, shape_value, values_shape, values_type, default_value, row_partition_tensors, row_partition_types, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): @@ -110,10 +104,6 @@ def create_ragged_tensor_to_tensor_net(self, shape_type, shape_value, values_sha @pytest.mark.parametrize('row_partition_types', [["FIRST_DIM_SIZE", "VALUE_ROWIDS"]]) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', 'ARM64'], - reason='126314, 132699: Build tokenizers for ARM and MacOS') def test_ragged_tensor_to_tensor(self, shape_type, shape_value, values_shape, values_type, default_value, row_partition_tensors, row_partition_types, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): diff --git a/tests/layer_tests/tensorflow_tests/test_tf_StaticRegexReplace.py b/tests/layer_tests/tensorflow_tests/test_tf_StaticRegexReplace.py index ef5e135537eb84..a3fa91ad0976f5 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_StaticRegexReplace.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_StaticRegexReplace.py @@ -1,8 +1,6 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import platform - import numpy as np import pytest import tensorflow as tf @@ -41,10 +39,6 @@ def create_static_regex_replace_net(self, input_shape, pattern, rewrite, replace @pytest.mark.parametrize('replace_global', [None, True, False]) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', 'ARM64'], - reason='Ticket - 126314, 132699') def test_static_regex_replace(self, input_shape, pattern, rewrite, replace_global, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): diff --git a/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py b/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py index f4c9e7260d7afb..5787c0b8318801 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py @@ -3,7 +3,6 @@ import numpy as np import os -import platform import pytest import tensorflow as tf from common.tf_layer_test_class import CommonTFLayerTest @@ -46,10 +45,6 @@ def create_string_lower_net(self, input_shape, encoding, 
strings_dictionary): ['第一句話在這裡', '第二句話在這裡', '第三句話在這裡']]) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', 'ARM64'], - reason='Ticket - 126314, 132699') def test_string_lower(self, input_shape, encoding, strings_dictionary, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): if ie_device == 'GPU' or run_in_jenkins(): @@ -78,10 +73,6 @@ def create_string_lower_model(self, output_dir): @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', 'ARM64'], - reason='Ticket - 126314, 132699') def test_string_lower_with_ovc(self, ie_device, temp_dir, precision): if ie_device == 'GPU' or run_in_jenkins(): pytest.skip("operation extension is not supported on GPU") @@ -90,7 +81,6 @@ def test_string_lower_with_ovc(self, ie_device, temp_dir, precision): return_code, _, _ = generate_ir_ovc(input_model_path, {'output_model': output_model_path}) assert return_code == 0, "OVC tool is failed for conversion model {}".format(input_model_path) - import openvino_tokenizers import openvino as ov core = ov.Core() compiled_model = core.compile_model(output_model_path, ie_device) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_StringSplitV2.py b/tests/layer_tests/tensorflow_tests/test_tf_StringSplitV2.py index 3745d07926bc43..84d7c269ce598f 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_StringSplitV2.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_StringSplitV2.py @@ -1,8 +1,6 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import platform - import numpy as np import pytest import tensorflow as tf @@ -42,10 +40,6 @@ def create_string_split_v2_net(self, input_shape, sep, maxsplit): @pytest.mark.parametrize('maxsplit', [None, -1, 5, 10]) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', 'ARM64'], - reason='126314, 132699: Build tokenizers for ARM and MacOS') def test_string_split_v2(self, input_shape, sep, maxsplit, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): diff --git a/tests/layer_tests/tensorflow_tests/test_tf_StringToHashBucketFast.py b/tests/layer_tests/tensorflow_tests/test_tf_StringToHashBucketFast.py index 08812fe7b46228..5fefb8117f3dcf 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_StringToHashBucketFast.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_StringToHashBucketFast.py @@ -1,8 +1,6 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import platform - import numpy as np import pytest import tensorflow as tf @@ -45,10 +43,6 @@ def create_string_to_hash_bucket_fast_net(self, input_shape, strings_dictionary, ['', ' ', '12345 ']]) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', 'ARM64'], - reason='Ticket - 126314, 132699') def test_string_to_hash_bucket_fast(self, input_shape, num_buckets, strings_dictionary, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): From 5833781ddbc476d77cf5593f1f8b34758988b9a8 Mon Sep 17 00:00:00 2001 From: Georgy Krivoruchko Date: Fri, 1 Nov 2024 12:03:18 +0400 Subject: [PATCH 134/233] [ONNX] Disabled 
constant folding for Subtract branch of DequantizeLinear-21 (#27359) ### Details: - Disabled constant folding for Subtract branch of DequantizeLinear-21 ### Tickets: - 156329 --- src/frontends/onnx/frontend/src/op/dequantize_linear.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp b/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp index d7b5214f3e53f4..47fcc7af60bf61 100644 --- a/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp +++ b/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp @@ -18,6 +18,7 @@ #include "openvino/op/subtract.hpp" #include "openvino/op/transpose.hpp" #include "openvino/op/unsqueeze.hpp" +#include "transformations/rt_info/disable_constant_folding.hpp" #include "utils/common.hpp" #include "utils/reshape.hpp" using namespace ov::op; @@ -241,6 +242,7 @@ ov::OutputVector dequantize_linear(const ov::frontend::onnx::Node& node) { zp = inputs[2]; if (zp.get_element_type() != scale.get_element_type()) { zp = std::make_shared(zp, scale_type); + disable_constant_folding(zp.get_node_shared_ptr()); } zp = std::make_shared(zp, unsqueezed_axes); } From af389b482381b445a3d7bb6ed6c7de3a5320da87 Mon Sep 17 00:00:00 2001 From: Evgenya Nugmanova Date: Fri, 1 Nov 2024 12:14:46 +0400 Subject: [PATCH 135/233] Broadcast: symbol propagation (#27357) ### Details: - *Improves symbol propagation in LLMs and allows for better ShapeOf optimization* Signed-off-by: Evgeniia Nugmanova --- src/core/include/openvino/op/util/broadcast_base.hpp | 1 + src/core/src/op/util/broadcast_base.cpp | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/src/core/include/openvino/op/util/broadcast_base.hpp b/src/core/include/openvino/op/util/broadcast_base.hpp index 2e500eb611c04c..6300559ac8cf00 100644 --- a/src/core/include/openvino/op/util/broadcast_base.hpp +++ b/src/core/include/openvino/op/util/broadcast_base.hpp @@ -63,6 +63,7 @@ class OPENVINO_API BroadcastBase : public Op { bool evaluate_lower(TensorVector& outputs) const override; bool evaluate_upper(TensorVector& outputs) const override; + bool evaluate_symbol(ov::TensorSymbolVector& output_symbols) const override; PartialShape get_result_shape_pdpd(const PartialShape& arg0_shape, const PartialShape& target_shape, diff --git a/src/core/src/op/util/broadcast_base.cpp b/src/core/src/op/util/broadcast_base.cpp index 59154e45e2b37a..c2c838afeb38bd 100644 --- a/src/core/src/op/util/broadcast_base.cpp +++ b/src/core/src/op/util/broadcast_base.cpp @@ -471,3 +471,10 @@ bool ov::op::util::BroadcastBase::evaluate_upper(ov::TensorVector& output_values return false; return default_upper_bound_evaluator(this, output_values); } + +bool ov::op::util::BroadcastBase::evaluate_symbol(ov::TensorSymbolVector& output_symbols) const { + if (!input_value(1).get_tensor().has_and_set_bound() || + (get_input_size() > 2 && !input_value(2).get_tensor().has_and_set_bound())) + return false; + return default_symbol_evaluator(this, {0}, output_symbols); +} From caa1e6af13139692a34cf37787c9c79f949bcaaa Mon Sep 17 00:00:00 2001 From: Bogdan Pereanu Date: Fri, 1 Nov 2024 10:42:50 +0200 Subject: [PATCH 136/233] [NPU] Create compiler adapter class (#27006) ### Details: - *Create a new CompilerAdapter interface that hides different implementations of CIP and CID* - *iCompiler remains an interface only for CIP. 
This keeps CIP (developed in another repository) decoupled from L0* - we still use NetworkMetadata in the plugin flow; it needs to be decided later whether it is still needed or whether it can be removed - *Graph object is created by compiler_adapter* - *Backend doesn't create/initialize the graph any longer* - *Moving common objects for backend and compiler_adapter to utils/zero/* - *Destroy blob on the import path after we load the weights into the NPU memory* - *Create a new property to postpone weights loading until the creation of the first inference request; by default loading is performed right after the model is compiled - NPU_DEFER_WEIGHTS_LOAD* A short description of the new format: ![Screenshot 2024-10-30 151129](https://github.com/user-attachments/assets/89f86c36-f3e8-4906-8394-7cd0ae5617a2) ### Tickets: - *CVS-153081* --------- Signed-off-by: Bogdan Pereanu --- src/plugins/intel_npu/README.md | 2 +- src/plugins/intel_npu/cmake/features.cmake | 25 +- src/plugins/intel_npu/src/CMakeLists.txt | 13 +- .../al/include/intel_npu/config/runtime.hpp | 32 + .../src/al/include/intel_npu/icompiler.hpp | 162 +-- .../al/include/intel_npu/network_metadata.hpp | 127 ++ .../intel_npu/npu_private_properties.hpp | 7 + .../intel_npu/src/al/src/config/runtime.cpp | 1 + .../intel_npu/src/backend/CMakeLists.txt | 29 - .../src/backend/include/zero_backend.hpp | 9 +- .../src/backend/include/zero_device.hpp | 10 +- .../src/backend/include/zero_executor.hpp | 86 -- .../src/backend/include/zero_host_tensor.hpp | 2 +- .../backend/include/zero_infer_request.hpp | 9 +- .../src/backend/include/zero_memory.hpp | 2 +- .../src/backend/include/zero_pipeline.hpp | 13 +- .../src/backend/include/zero_profiling.hpp | 2 +- .../backend/include/zero_remote_tensor.hpp | 2 +- .../src/backend/src/zero_backend.cpp | 28 +- .../intel_npu/src/backend/src/zero_device.cpp | 36 +- .../src/backend/src/zero_executor.cpp | 187 --- .../src/backend/src/zero_infer_request.cpp | 74 +- .../src/backend/src/zero_pipeline.cpp | 62 +- .../intel_npu/src/common/CMakeLists.txt | 2 +- .../intel_npu/common/icompiled_model.hpp | 11 +- .../include/intel_npu/common/igraph.hpp | 103 ++ .../common/include/intel_npu/common/npu.hpp | 16 +- .../intel_npu/common/sync_infer_request.hpp | 3 +- .../src/common/src/sync_infer_request.cpp | 2 +- .../include/driver_compiler_adapter.hpp | 50 - .../include/zero_compiler_in_driver.hpp | 201 --- .../compiler/src/driver_compiler_adapter.cpp | 130 -- .../compiler/src/zero_compiler_in_driver.cpp | 1081 ----------------- .../CMakeLists.txt | 7 +- .../include/custom_stream_buffer.hpp | 4 +- .../include/driver_compiler_adapter.hpp | 64 + .../compiler_adapter/include/driver_graph.hpp | 50 + .../include/ir_serializer.hpp} | 4 +- .../include/plugin_compiler_adapter.hpp | 37 + .../compiler_adapter/include/plugin_graph.hpp | 49 + .../include/ze_graph_ext_wrappers.hpp | 159 +++ .../ze_graph_ext_wrappers_interface.hpp | 42 + .../src/driver_compiler_adapter.cpp | 606 +++++++++ .../src/compiler_adapter/src/driver_graph.cpp | 164 +++ .../src/ir_serializer.cpp} | 8 +- .../src/plugin_compiler_adapter.cpp | 160 +++ .../src/compiler_adapter/src/plugin_graph.cpp | 132 ++ .../src/precomp.hpp | 0 .../src/ze_graph_ext_wrappers.cpp | 568 +++++++++ .../intel_npu/src/plugin/CMakeLists.txt | 30 +- .../src/plugin/include/compiled_model.hpp | 37 +- .../intel_npu/src/plugin/include/compiler.hpp | 20 - .../intel_npu/src/plugin/include/plugin.hpp | 4 +- .../intel_npu/src/plugin/src/backends.cpp | 7 +- .../src/plugin/src/compiled_model.cpp | 138 +--
.../intel_npu/src/plugin/src/compiler.cpp | 101 -- .../intel_npu/src/plugin/src/plugin.cpp | 82 +- .../intel_npu/utils/zero}/zero_init.hpp | 6 +- .../intel_npu/utils/zero}/zero_types.hpp | 2 - .../intel_npu/utils/zero/zero_utils.hpp | 31 +- .../intel_npu/utils/zero}/zero_wrappers.hpp | 23 +- .../intel_npu/src/utils/src/CMakeLists.txt | 3 +- .../src/utils/src/zero/CMakeLists.txt | 34 +- .../src => utils/src/zero}/zero_init.cpp | 9 +- .../src => utils/src/zero}/zero_wrappers.cpp | 28 +- .../intel_npu/tests/functional/CMakeLists.txt | 10 +- .../custom_stream.cpp | 5 +- .../ov_infer_request/compile_and_infer.cpp | 4 +- .../functional/behavior/work_with_devices.hpp | 2 +- .../internal/overload/compile_and_infer.hpp | 8 +- .../overload/compiled_model/property.cpp | 2 +- .../behavior/compiled_model/properties.cpp | 2 +- .../intel_npu/thirdparty/CMakeLists.txt | 3 +- 73 files changed, 2620 insertions(+), 2544 deletions(-) create mode 100644 src/plugins/intel_npu/src/al/include/intel_npu/network_metadata.hpp delete mode 100644 src/plugins/intel_npu/src/backend/include/zero_executor.hpp delete mode 100644 src/plugins/intel_npu/src/backend/src/zero_executor.cpp create mode 100644 src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp delete mode 100644 src/plugins/intel_npu/src/compiler/include/driver_compiler_adapter.hpp delete mode 100644 src/plugins/intel_npu/src/compiler/include/zero_compiler_in_driver.hpp delete mode 100644 src/plugins/intel_npu/src/compiler/src/driver_compiler_adapter.cpp delete mode 100644 src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp rename src/plugins/intel_npu/src/{compiler => compiler_adapter}/CMakeLists.txt (85%) rename src/plugins/intel_npu/src/{compiler => compiler_adapter}/include/custom_stream_buffer.hpp (95%) create mode 100644 src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp create mode 100644 src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp rename src/plugins/intel_npu/src/{compiler/include/graph_transformations.hpp => compiler_adapter/include/ir_serializer.hpp} (93%) create mode 100644 src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp create mode 100644 src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp create mode 100644 src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp create mode 100644 src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers_interface.hpp create mode 100644 src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp create mode 100644 src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp rename src/plugins/intel_npu/src/{compiler/src/graph_transformations.cpp => compiler_adapter/src/ir_serializer.cpp} (94%) create mode 100644 src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp create mode 100644 src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp rename src/plugins/intel_npu/src/{compiler => compiler_adapter}/src/precomp.hpp (100%) create mode 100644 src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp delete mode 100644 src/plugins/intel_npu/src/plugin/include/compiler.hpp delete mode 100644 src/plugins/intel_npu/src/plugin/src/compiler.cpp rename src/plugins/intel_npu/src/{backend/include => utils/include/intel_npu/utils/zero}/zero_init.hpp (95%) rename src/plugins/intel_npu/src/{backend/include => utils/include/intel_npu/utils/zero}/zero_types.hpp (99%) rename 
src/plugins/intel_npu/src/{backend/include => utils/include/intel_npu/utils/zero}/zero_wrappers.hpp (90%) rename src/plugins/intel_npu/src/{backend/src => utils/src/zero}/zero_init.cpp (98%) rename src/plugins/intel_npu/src/{backend/src => utils/src/zero}/zero_wrappers.cpp (91%) diff --git a/src/plugins/intel_npu/README.md b/src/plugins/intel_npu/README.md index b7508c68704e32..980faa71a15937 100644 --- a/src/plugins/intel_npu/README.md +++ b/src/plugins/intel_npu/README.md @@ -78,7 +78,7 @@ There is currently no support for multiple devices, which means only one level-z ### Inference pipeline -The result of the model compilation is represented through a NetworkDescription. This model description is passed by the plugin to the driver to create a level zero graph instance and obtain a graph handle that can later be used to execute multiple inferences in parallel for the same model. Since the same model instance is shared across all subsequent inference objects, this initialization step is performed by default right after the model is compiled and it can be postponed until the creation of the first inference request through the use of an environment variable: "IE_NPU_CREATE_EXECUTOR" (IE_NPU_CREATE_EXECUTOR=0 to postpone the initialization). +The result of the model compilation is represented through an IGraph object, which contains a valid level zero graph handle that can later be used to execute multiple inferences in parallel for the same model. By default, weights are loaded into the NPU memory right after the model is compiled, but this step can be postponed until the creation of the first inference request through the use of an internal NPU property: "NPU_DEFER_WEIGHTS_LOAD". Users can create one or more inference requests for a compiled model using OpenVINO API: diff --git a/src/plugins/intel_npu/cmake/features.cmake b/src/plugins/intel_npu/cmake/features.cmake index 0dde0f9d67f6e5..7d34c52c6d1292 100644 --- a/src/plugins/intel_npu/cmake/features.cmake +++ b/src/plugins/intel_npu/cmake/features.cmake @@ -4,29 +4,10 @@ ov_option(ENABLE_MLIR_COMPILER "Enable compilation of npu_mlir_compiler libraries" ON) -ov_option(ENABLE_NPU_RUNTIME_COMMON "Enable compilation of npu runtime common libraries" ON) +ov_option(ENABLE_NPU_PLUGIN_ENGINE "Enable compilation of NPU plugin engine" ON) -# if ENABLE_ZEROAPI_BACKEND=ON, it adds the ze_loader dependency for driver compiler -ov_dependent_option(ENABLE_NPU_PLUGIN_ENGINE "Enable compilation of NPU plugin engine" ON "ENABLE_NPU_RUNTIME_COMMON" OFF) - -ov_dependent_option(ENABLE_ZEROAPI_BACKEND "Enable zero-api as a plugin backend" ON "ENABLE_NPU_RUNTIME_COMMON;ENABLE_NPU_PLUGIN_ENGINE" OFF) - -ov_dependent_option(ENABLE_DRIVER_COMPILER_ADAPTER "Enable NPU Compiler inside driver" ON "ENABLE_ZEROAPI_BACKEND" OFF) - -if((NOT ENABLE_NPU_PLUGIN_ENGINE OR NOT ENABLE_NPU_RUNTIME_COMMON) AND ENABLE_TESTS) - message(FATAL_ERROR "Tests depends on npu plugin engine and npu runtime common libraries!") -endif() - -if((NOT ENABLE_NPU_PLUGIN_ENGINE OR NOT ENABLE_NPU_RUNTIME_COMMON) AND ENABLE_ZEROAPI_BACKEND) - message(FATAL_ERROR "Zero backend depends on npu plugin engine and npu common libraries!") -endif() - -if(NOT ENABLE_ZEROAPI_BACKEND AND ENABLE_DRIVER_COMPILER_ADAPTER) - message(FATAL_ERROR "Compiler adapter depends on zero backend to use same context!") -endif() - -if(NOT BUILD_SHARED_LIBS AND NOT ENABLE_MLIR_COMPILER AND NOT ENABLE_DRIVER_COMPILER_ADAPTER) - message(FATAL_ERROR "No compiler found for static build!") +if(NOT ENABLE_NPU_PLUGIN_ENGINE AND 
ENABLE_TESTS) + message(FATAL_ERROR "Tests depends on npu plugin engine!") endif() ov_dependent_option(ENABLE_IMD_BACKEND "Enable InferenceManagerDemo based NPU AL backend" OFF "NOT WIN32;NOT CMAKE_CROSSCOMPILING" OFF) diff --git a/src/plugins/intel_npu/src/CMakeLists.txt b/src/plugins/intel_npu/src/CMakeLists.txt index 5530eb1f3e59e5..f5d1fd5b41226c 100644 --- a/src/plugins/intel_npu/src/CMakeLists.txt +++ b/src/plugins/intel_npu/src/CMakeLists.txt @@ -9,18 +9,9 @@ add_subdirectory(utils) add_subdirectory(al) -if (ENABLE_NPU_RUNTIME_COMMON) +if (ENABLE_NPU_PLUGIN_ENGINE) add_subdirectory(common) -endif() - -if(ENABLE_DRIVER_COMPILER_ADAPTER AND ENABLE_ZEROAPI_BACKEND) - add_subdirectory(compiler) -endif() - -if(ENABLE_ZEROAPI_BACKEND) + add_subdirectory(compiler_adapter) add_subdirectory(backend) -endif() - -if (ENABLE_NPU_PLUGIN_ENGINE) add_subdirectory(plugin) endif() diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp index d52c25f6a3e6a5..510ab7fc43b0c8 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp @@ -131,6 +131,38 @@ struct CREATE_EXECUTOR final : OptionBase { } }; +// +// DEFER_WEIGHTS_LOAD +// + +struct DEFER_WEIGHTS_LOAD final : OptionBase { + static std::string_view key() { + return ov::intel_npu::defer_weights_load.name(); + } + + static int64_t defaultValue() { + return false; + } + + static constexpr std::string_view getTypeName() { + return "bool"; + } + +#ifdef NPU_PLUGIN_DEVELOPER_BUILD + static std::string_view envVar() { + return "OV_NPU_DEFER_WEIGHTS_LOAD"; + } +#endif + + static bool isPublic() { + return false; + } + + static OptionMode mode() { + return OptionMode::RunTime; + } +}; + // // NUM_STREAMS // diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/icompiler.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/icompiler.hpp index e0a02f12aa2e17..53696396603d9a 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/icompiler.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/icompiler.hpp @@ -6,128 +6,12 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include - #include "intel_npu/config/config.hpp" -#include "openvino/core/partial_shape.hpp" -#include "openvino/core/type/element_type.hpp" -#include "openvino/runtime/common.hpp" +#include "intel_npu/network_metadata.hpp" #include "openvino/runtime/profiling_info.hpp" namespace intel_npu { -/** - * @brief A helper structure used for storing metadata corresponding to one input/output entry. - */ -struct IODescriptor { - /** - * @brief The name of the input/output assigned by the compiler. - * @details This value may differ from other name attributes: - * - The compiler could have created additional inputs/outputs (e.g. for representing states). These are not - * found in the original IR model. - * - The compiler may append indices to names in the case where duplicate names are found. - * @note The prefixes introduced by the compiler in order to differentiate the special cases (e.g. states and shape - * tensors) were removed prior to initializing this field. - */ - std::string nameFromCompiler; - - ov::element::Type precision; - - ov::PartialShape shapeFromCompiler; - - /** - * @brief If set to "true", the current object describes a buffer which may be used for altering a state tensor. 
- * @details This flag is set if the compiler prefixed the name using a "read value" prefix. The state input and - * state output descriptors are also tied using the "relatedDescriptorIndex" attribute. - */ - bool isStateInput = false; - - /** - * @brief If set to "true", the current object describes a buffer which reflects the value of a state tensor. - * @details This flag is set if the compiler prefixed the name using an "assign" prefix. The state input and - * state output descriptors are also tied using the "relatedDescriptorIndex" attribute. - */ - bool isStateOutput = false; - - /** - * @brief If set to "true", the buffer of the tensor described here contains as value the shape of the referenced - * tensor. - * @details This flag is set if the compiler prefixed the name using a "shape" prefix. - * - * The referenced tensor bears the same name ("nameFromCompiler"), but its "isShapeTensor" value is set to - * "false". The two descriptors are also tied using the "relatedDescriptorIndex" attribute. - */ - bool isShapeTensor = false; - - /** - * @brief Points towards a related descriptor. - * @details The related descriptors are defined by (state input, state output) or (dynamic tensor, shape tensor) - * pairs. - */ - std::optional relatedDescriptorIndex; - - /** - * @brief The friendly name of the node extracted from the IR model. - * @details In some cases, this field is required for constructing a dummy model which uses the same input/output - * metadata as the original IR model. - * - * This field may be empty if the I/O entry is not found in the original IR model (i.e. the entry was added by the - * compiler). - */ - std::string nodeFriendlyName; - - /** - * @brief The names of the output tensors extracted from the IR model. - * @details In some cases, this field is required for constructing a dummy model which uses the same input/output - * metadata as the original IR model. - * - * This field may be empty if the I/O entry is not found in the original IR model (i.e. the entry was added by the - * compiler). - */ - std::unordered_set outputTensorNames; - - /** - * @brief The shape extracted from the IR model. - * @details The values may differ from the ones found in "shapeFromCompiler" if batching is to be handled by the - * plugin. - * - * This field may be empty if the I/O entry is not found in the original IR model (i.e. the entry was added - * by the compiler). - */ - std::optional shapeFromIRModel = std::nullopt; -}; - -struct NetworkMetadata final { - std::string name; - - std::vector inputs; - std::vector outputs; - std::vector profilingOutputs; - - size_t numStreams = 1; - - // Used primarily in the CID path to pass the level zero graph handle from compiler to the backend executor - void* graphHandle = nullptr; - - /** - * @brief Binds the (state input, state output) and (dynamic tensor, shape tensor) pairs using the - * "relatedDescriptorIndex" attribute. - * @details For state inputs, the "relatedDescriptorIndex" value is set to the index of the output which bears the - * same name. The reverse is also applied. - * - * For shape tensors, the lookup is performed in the same container (inputs or outputs). The value is once again set - * to the index of the entry which bears the same name. 
- */ - void bindRelatedDescriptors(); - -}; // namespace intel_npu - /** * @struct NetworkDescription * @brief The object returned by the compiler @@ -138,7 +22,6 @@ struct NetworkDescription final { NetworkDescription(std::vector&& compiledNetwork, NetworkMetadata&& metadata) : compiledNetwork(std::move(compiledNetwork)), metadata(std::move(metadata)) {} - NetworkDescription(NetworkMetadata&& metadata) : metadata(std::move(metadata)) {} // Force move semantics to prevent blob copies NetworkDescription(const NetworkDescription&) = delete; NetworkDescription(NetworkDescription&&) = default; @@ -151,32 +34,6 @@ struct NetworkDescription final { NetworkMetadata metadata; }; -/** - * @struct CompiledNetwork - * @brief Custom container for compiled network, used for export - * @var CompiledNetwork::data - * Pointer to the address of compiled network - * @var CompiledNetwork:size - * Size of the compiled network - * @var CompiledNetwork::ownedStorage - * Plugin owned compiled network storage that is required in case of a driver that - * doesn't support graph extension 1.7, as in this case plugin must create a copy of the compiled network. - * @note It's unsafe to store either data or size outside of the compiled network object as its destructor - * would release the owning container - */ - -struct CompiledNetwork { - const uint8_t* data; - size_t size; - CompiledNetwork(const uint8_t* data, size_t size, std::vector storage) - : data(data), - size(size), - ownedStorage(std::move(storage)) {} - -private: - std::vector ownedStorage; -}; - /** * @interface ICompiler * @brief An interface to be implemented by a concrete compiler to provide @@ -184,12 +41,6 @@ struct CompiledNetwork { */ class ICompiler : public std::enable_shared_from_this { public: - /** - * @brief Returns the maximum OpenVino opset version supported by the compiler - * @return opset version e.g. 
11 for opset11 - */ - virtual uint32_t getSupportedOpsetVersion() const = 0; - /** * @brief Transforms a network from the OpenVINO model representation to a format executable * by a NPU device @@ -216,8 +67,6 @@ class ICompiler : public std::enable_shared_from_this { * @param config a reference to NPUConfig containing plugin config options * Note: compilation options will be ignored, * since the network is already compiled - * @param netName a reference to the string describing network name - * to be used for creating network description * @return a shared pointer on an object implementing NetworkDescription interface */ virtual NetworkMetadata parse(const std::vector& network, const Config& config) const = 0; @@ -226,15 +75,6 @@ class ICompiler : public std::enable_shared_from_this { const std::vector& network, const Config& config) const = 0; - // Driver compiler can use this to release graphHandle, if we do not have executor - virtual void release([[maybe_unused]] std::shared_ptr networkDescription){}; - - virtual CompiledNetwork getCompiledNetwork(const NetworkDescription& networkDescription) { - return CompiledNetwork(networkDescription.compiledNetwork.data(), - networkDescription.compiledNetwork.size(), - networkDescription.compiledNetwork); - } - protected: virtual ~ICompiler() = default; }; diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/network_metadata.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/network_metadata.hpp new file mode 100644 index 00000000000000..b7a78b3dfd43e1 --- /dev/null +++ b/src/plugins/intel_npu/src/al/include/intel_npu/network_metadata.hpp @@ -0,0 +1,127 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +// Compiler Interface + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "intel_npu/config/config.hpp" +#include "openvino/core/partial_shape.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/runtime/common.hpp" + +namespace intel_npu { + +/** + * @brief A helper structure used for storing metadata corresponding to one input/output entry. + */ +struct IODescriptor { + /** + * @brief The name of the input/output assigned by the compiler. + * @details This value may differ from other name attributes: + * - The compiler could have created additional inputs/outputs (e.g. for representing states). These are not + * found in the original IR model. + * - The compiler may append indices to names in the case where duplicate names are found. + * @note The prefixes introduced by the compiler in order to differentiate the special cases (e.g. states and shape + * tensors) were removed prior to initializing this field. + */ + std::string nameFromCompiler; + + ov::element::Type precision; + + ov::PartialShape shapeFromCompiler; + + /** + * @brief If set to "true", the current object describes a buffer which may be used for altering a state tensor. + * @details This flag is set if the compiler prefixed the name using a "read value" prefix. The state input and + * state output descriptors are also tied using the "relatedDescriptorIndex" attribute. + */ + bool isStateInput = false; + + /** + * @brief If set to "true", the current object describes a buffer which reflects the value of a state tensor. + * @details This flag is set if the compiler prefixed the name using an "assign" prefix. The state input and + * state output descriptors are also tied using the "relatedDescriptorIndex" attribute. 
+ */ + bool isStateOutput = false; + + /** + * @brief If set to "true", the buffer of the tensor described here contains as value the shape of the referenced + * tensor. + * @details This flag is set if the compiler prefixed the name using a "shape" prefix. + * + * The referenced tensor bears the same name ("nameFromCompiler"), but its "isShapeTensor" value is set to + * "false". The two descriptors are also tied using the "relatedDescriptorIndex" attribute. + */ + bool isShapeTensor = false; + + /** + * @brief Points towards a related descriptor. + * @details The related descriptors are defined by (state input, state output) or (dynamic tensor, shape tensor) + * pairs. + */ + std::optional relatedDescriptorIndex; + + /** + * @brief The friendly name of the node extracted from the IR model. + * @details In some cases, this field is required for constructing a dummy model which uses the same input/output + * metadata as the original IR model. + * + * This field may be empty if the I/O entry is not found in the original IR model (i.e. the entry was added by the + * compiler). + */ + std::string nodeFriendlyName; + + /** + * @brief The names of the output tensors extracted from the IR model. + * @details In some cases, this field is required for constructing a dummy model which uses the same input/output + * metadata as the original IR model. + * + * This field may be empty if the I/O entry is not found in the original IR model (i.e. the entry was added by the + * compiler). + */ + std::unordered_set outputTensorNames; + + /** + * @brief The shape extracted from the IR model. + * @details The values may differ from the ones found in "shapeFromCompiler" if batching is to be handled by the + * plugin. + * + * This field may be empty if the I/O entry is not found in the original IR model (i.e. the entry was added + * by the compiler). + */ + std::optional shapeFromIRModel = std::nullopt; +}; + +struct NetworkMetadata final { + std::string name; + + std::vector inputs; + std::vector outputs; + std::vector profilingOutputs; + + size_t numStreams = 1; + + /** + * @brief Binds the (state input, state output) and (dynamic tensor, shape tensor) pairs using the + * "relatedDescriptorIndex" attribute. + * @details For state inputs, the "relatedDescriptorIndex" value is set to the index of the output which bears the + * same name. The reverse is also applied. + * + * For shape tensors, the lookup is performed in the same container (inputs or outputs). The value is once again set + * to the index of the entry which bears the same name. 
+ */ + void bindRelatedDescriptors(); + +}; // namespace intel_npu + +} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp index 0c5a04ce0c0d83..d8fabee177b2b9 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp @@ -351,6 +351,13 @@ static constexpr ov::Property batch_mode{"NPU_BATCH_MODE"}; */ static constexpr ov::Property create_executor{"NPU_CREATE_EXECUTOR"}; +/** + * @brief [Only for NPU Plugin] + * Type: boolean, default is false + * This option allows to omit loading the weights until inference is created + */ +static constexpr ov::Property defer_weights_load{"NPU_DEFER_WEIGHTS_LOAD"}; + /** * @brief Read-only property to get the name of used backend */ diff --git a/src/plugins/intel_npu/src/al/src/config/runtime.cpp b/src/plugins/intel_npu/src/al/src/config/runtime.cpp index 10f9b4a7c7222b..759956b6f597df 100644 --- a/src/plugins/intel_npu/src/al/src/config/runtime.cpp +++ b/src/plugins/intel_npu/src/al/src/config/runtime.cpp @@ -21,6 +21,7 @@ void intel_npu::registerRunTimeOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); desc.add(); desc.add(); desc.add(); diff --git a/src/plugins/intel_npu/src/backend/CMakeLists.txt b/src/plugins/intel_npu/src/backend/CMakeLists.txt index 01465a8179dc24..5a1585c0a63073 100644 --- a/src/plugins/intel_npu/src/backend/CMakeLists.txt +++ b/src/plugins/intel_npu/src/backend/CMakeLists.txt @@ -25,7 +25,6 @@ target_link_libraries(${TARGET_NAME} PRIVATE openvino::npu_al openvino::npu_common - openvino_npu_zero_result_parser ze_loader ) @@ -33,31 +32,3 @@ target_link_libraries(${TARGET_NAME} # targets install # ov_install_static_lib(${TARGET_NAME} ${NPU_INTERNAL_COMPONENT}) - -if(TARGET ze_loader) - if(NOT BUILD_SHARED_LIBS) - # Support link of static runtime in case system does not have ze_loader - install(TARGETS ze_loader EXPORT OpenVINOTargets - RUNTIME DESTINATION ${OV_CPACK_RUNTIMEDIR} COMPONENT ${NPU_PLUGIN_COMPONENT} - ARCHIVE DESTINATION ${OV_CPACK_ARCHIVEDIR} COMPONENT ${NPU_PLUGIN_COMPONENT} - LIBRARY DESTINATION ${OV_CPACK_LIBRARYDIR} COMPONENT ${NPU_PLUGIN_COMPONENT}) - - install(TARGETS utils EXPORT OpenVINOTargets - RUNTIME DESTINATION ${OV_CPACK_RUNTIMEDIR} COMPONENT ${NPU_PLUGIN_COMPONENT} - ARCHIVE DESTINATION ${OV_CPACK_ARCHIVEDIR} COMPONENT ${NPU_PLUGIN_COMPONENT} - LIBRARY DESTINATION ${OV_CPACK_LIBRARYDIR} COMPONENT ${NPU_PLUGIN_COMPONENT}) - - # export to local tree to build against static build tree - export(TARGETS ze_loader NAMESPACE openvino:: - APPEND FILE "${CMAKE_BINARY_DIR}/OpenVINOTargets.cmake") - - export(TARGETS utils NAMESPACE openvino:: - APPEND FILE "${CMAKE_BINARY_DIR}/OpenVINOTargets.cmake") - endif() - - # Support tests to run with ze_loader - install(TARGETS ze_loader - RUNTIME DESTINATION tests COMPONENT tests EXCLUDE_FROM_ALL - LIBRARY DESTINATION tests COMPONENT tests EXCLUDE_FROM_ALL) -endif() - diff --git a/src/plugins/intel_npu/src/backend/include/zero_backend.hpp b/src/plugins/intel_npu/src/backend/include/zero_backend.hpp index 68e4f9434418a6..038c7c1d2d9bf9 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_backend.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_backend.hpp @@ -9,7 +9,7 @@ #include "intel_npu/common/npu.hpp" #include "intel_npu/utils/logger/logger.hpp" -#include "zero_init.hpp" +#include 
"intel_npu/utils/zero/zero_init.hpp" namespace intel_npu { class ZeroEngineBackend final : public IEngineBackend { @@ -29,15 +29,14 @@ class ZeroEngineBackend final : public IEngineBackend { bool isCommandQueueExtSupported() const override; bool isLUIDExtSupported() const override; + const std::shared_ptr& getInitStruct() const; + void* getContext() const override; - void* getDriverHandle() const; - void* getDeviceHandle() const; - ze_graph_dditable_ext_curr_t& getGraphDdiTable() const; void updateInfo(const Config& config) override; private: - std::shared_ptr _instance; + std::shared_ptr _initStruct; std::map> _devices{}; Logger _logger; diff --git a/src/plugins/intel_npu/src/backend/include/zero_device.hpp b/src/plugins/intel_npu/src/backend/include/zero_device.hpp index e87a602613a92a..50f0d28ed210cd 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_device.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_device.hpp @@ -10,9 +10,9 @@ #include "intel_npu/common/icompiled_model.hpp" #include "intel_npu/common/npu.hpp" #include "intel_npu/utils/logger/logger.hpp" +#include "intel_npu/utils/zero/zero_init.hpp" +#include "intel_npu/utils/zero/zero_types.hpp" #include "openvino/runtime/intel_npu/remote_properties.hpp" -#include "zero_init.hpp" -#include "zero_types.hpp" namespace intel_npu { @@ -20,9 +20,6 @@ class ZeroDevice : public IDevice { public: ZeroDevice(const std::shared_ptr& initStructs); - std::shared_ptr createExecutor(const std::shared_ptr& networkDescription, - const Config& config) override; - std::string getName() const override; std::string getFullDeviceName() const override; Uuid getUuid() const override; @@ -36,7 +33,6 @@ class ZeroDevice : public IDevice { ov::device::Type getDeviceType() const override; std::shared_ptr createInferRequest(const std::shared_ptr& compiledModel, - const std::shared_ptr& executor, const Config& config) override; void updateInfo(const Config& config) override { log.setLevel(config.get()); @@ -76,8 +72,6 @@ class ZeroDevice : public IDevice { {ov::element::u8, 0.f}, {ov::element::i8, 0.f}}; - uint32_t _group_ordinal; - Logger log; }; } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/backend/include/zero_executor.hpp b/src/plugins/intel_npu/src/backend/include/zero_executor.hpp deleted file mode 100644 index eeb96defc16441..00000000000000 --- a/src/plugins/intel_npu/src/backend/include/zero_executor.hpp +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -#include - -#include - -#include "intel_npu/common/npu.hpp" -#include "intel_npu/utils/logger/logger.hpp" -#include "openvino/runtime/properties.hpp" -#include "zero_init.hpp" -#include "zero_wrappers.hpp" - -namespace intel_npu { - -class ZeroExecutor final : public IExecutor { -public: - ZeroExecutor(const std::shared_ptr& initStructs, - const std::shared_ptr& networkDescription, - const Config& config, - const uint32_t& group_ordinal); - - ZeroExecutor(const ZeroExecutor&) = delete; - ZeroExecutor& operator=(const ZeroExecutor&) = delete; - - ~ZeroExecutor() override; - - struct ArgumentDescriptor { - ze_graph_argument_properties_3_t info; - uint32_t idx; - }; - - void setArgumentValue(uint32_t argi_, const void* argv_) const; - void setWorkloadType(const ov::WorkloadType workloadType) const override; - void mutexLock() const; - void mutexUnlock() const; - inline ze_graph_handle_t graph() const { - return _graph; - } - inline std::shared_ptr getInitStructs() const { 
- return _initStructs; - } - inline const std::shared_ptr& getNetworkDesc() const { - return _networkDesc; - } - inline const std::shared_ptr& getCommandQueue() const { - return _command_queues; - } - inline const uint32_t& get_group_ordinal() const { - return _group_ordinal; - } - inline const std::vector& get_input_descriptors() const { - return _input_descriptors; - } - inline const std::vector& get_output_descriptors() const { - return _output_descriptors; - } - -private: - void initialize_graph_through_command_list() const; - - const Config _config; - Logger _logger; - - const std::shared_ptr _initStructs; - std::shared_ptr _networkDesc; - - ze_graph_dditable_ext_curr_t& _graph_ddi_table_ext; - - const uint32_t _group_ordinal; - - ze_graph_handle_t _graph = nullptr; - - std::vector _input_descriptors; - std::vector _output_descriptors; - - std::shared_ptr _command_queues; - - mutable std::mutex _mutex; -}; - -} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/backend/include/zero_host_tensor.hpp b/src/plugins/intel_npu/src/backend/include/zero_host_tensor.hpp index 52000930e2a751..a214c8e2cb2b5d 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_host_tensor.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_host_tensor.hpp @@ -5,8 +5,8 @@ #pragma once #include "intel_npu/config/config.hpp" +#include "intel_npu/utils/zero/zero_init.hpp" #include "openvino/runtime/itensor.hpp" -#include "zero_init.hpp" #include "zero_remote_tensor.hpp" namespace intel_npu { diff --git a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp index 48aad52010a4c2..31248b582250da 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp @@ -11,19 +11,17 @@ #include "intel_npu/common/sync_infer_request.hpp" #include "intel_npu/utils/logger/logger.hpp" #include "intel_npu/utils/zero/zero_utils.hpp" -#include "zero_executor.hpp" +#include "intel_npu/utils/zero/zero_wrappers.hpp" #include "zero_pipeline.hpp" #include "zero_profiling.hpp" #include "zero_remote_tensor.hpp" -#include "zero_wrappers.hpp" namespace intel_npu { class ZeroInferRequest final : public SyncInferRequest { public: - explicit ZeroInferRequest(const std::shared_ptr& backendPtr, + explicit ZeroInferRequest(const std::shared_ptr& initStructs, const std::shared_ptr& compiledModel, - const std::shared_ptr& executor, const Config& config); ov::SoPtr get_tensor(const ov::Output& port) const override; @@ -85,8 +83,7 @@ class ZeroInferRequest final : public SyncInferRequest { std::vector>& get_input_tensors_data(size_t index) const; const std::shared_ptr _initStructs; - const std::shared_ptr _executorPtr; - const ZeroExecutor* _executor; + const std::shared_ptr _graph; const Config _config; Logger _logger; diff --git a/src/plugins/intel_npu/src/backend/include/zero_memory.hpp b/src/plugins/intel_npu/src/backend/include/zero_memory.hpp index 6ecbde0d546110..992f409b86a928 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_memory.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_memory.hpp @@ -11,7 +11,7 @@ #include #include "intel_npu/utils/logger/logger.hpp" -#include "zero_init.hpp" +#include "intel_npu/utils/zero/zero_init.hpp" namespace { diff --git a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp index 62c8481d28ac1a..92a473a9fc412c 100644 --- 
a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp @@ -4,11 +4,11 @@ #pragma once +#include "intel_npu/common/igraph.hpp" #include "intel_npu/utils/zero/zero_utils.hpp" -#include "zero_executor.hpp" +#include "intel_npu/utils/zero/zero_wrappers.hpp" #include "zero_memory.hpp" #include "zero_profiling.hpp" -#include "zero_wrappers.hpp" namespace intel_npu { @@ -21,13 +21,15 @@ struct TensorData { struct Pipeline { public: Pipeline(const Config& config, - const std::shared_ptr& executorPtr, + const std::shared_ptr& initStructs, + const std::shared_ptr& graph, zeroProfiling::ProfilingPool& profiling_pool, zeroProfiling::ProfilingQuery& profiling_query, std::shared_ptr npu_profiling, const std::vector>>& inputTensorsData, const std::vector>& outputTensorsData, - const size_t numberOfCommandLists); + size_t numberOfCommandLists, + uint32_t group_ordinal); Pipeline(const Pipeline&) = delete; Pipeline& operator=(const Pipeline&) = delete; @@ -42,8 +44,7 @@ struct Pipeline { protected: const Config _config; - const ZeroExecutor* _executor; - CommandQueue& _command_queue; + std::shared_ptr _command_queue; std::vector> _command_lists; std::vector> _fences; EventPool _event_pool; diff --git a/src/plugins/intel_npu/src/backend/include/zero_profiling.hpp b/src/plugins/intel_npu/src/backend/include/zero_profiling.hpp index 505a7f0185e135..17e263a7aaf620 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_profiling.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_profiling.hpp @@ -12,8 +12,8 @@ #include "intel_npu/config/compiler.hpp" #include "intel_npu/utils/logger/logger.hpp" +#include "intel_npu/utils/zero/zero_types.hpp" #include "openvino/runtime/profiling_info.hpp" -#include "zero_types.hpp" namespace intel_npu { namespace zeroProfiling { diff --git a/src/plugins/intel_npu/src/backend/include/zero_remote_tensor.hpp b/src/plugins/intel_npu/src/backend/include/zero_remote_tensor.hpp index 0211bd5bd08962..5b08643704b651 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_remote_tensor.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_remote_tensor.hpp @@ -9,8 +9,8 @@ #include #include "intel_npu/common/remote_tensor.hpp" +#include "intel_npu/utils/zero/zero_init.hpp" #include "openvino/runtime/intel_npu/remote_properties.hpp" -#include "zero_init.hpp" namespace intel_npu { diff --git a/src/plugins/intel_npu/src/backend/src/zero_backend.cpp b/src/plugins/intel_npu/src/backend/src/zero_backend.cpp index 86af62d414b88c..55aaad102e8b8f 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_backend.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_backend.cpp @@ -14,31 +14,31 @@ namespace intel_npu { ZeroEngineBackend::ZeroEngineBackend(const Config& config) : _logger("ZeroEngineBackend", Logger::global().level()) { _logger.debug("ZeroEngineBackend - initialize started"); - _instance = std::make_shared(); + _initStruct = std::make_shared(); - auto device = std::make_shared(_instance); + auto device = std::make_shared(_initStruct); _devices.emplace(std::make_pair(device->getName(), device)); _logger.debug("ZeroEngineBackend - initialize completed"); } uint32_t ZeroEngineBackend::getDriverVersion() const { - return _instance->getDriverVersion(); + return _initStruct->getDriverVersion(); } uint32_t ZeroEngineBackend::getGraphExtVersion() const { - return _instance->getGraphDdiTable().version(); + return _initStruct->getGraphDdiTable().version(); } bool ZeroEngineBackend::isBatchingSupported() const 
{ - return _instance->isExtensionSupported("ZE_extension_graph_1_6", ZE_MAKE_VERSION(1, 6)); + return _initStruct->isExtensionSupported("ZE_extension_graph_1_6", ZE_MAKE_VERSION(1, 6)); } bool ZeroEngineBackend::isCommandQueueExtSupported() const { - return _instance->isExtensionSupported(std::string(ZE_COMMAND_QUEUE_NPU_EXT_NAME), ZE_MAKE_VERSION(1, 0)); + return _initStruct->isExtensionSupported(std::string(ZE_COMMAND_QUEUE_NPU_EXT_NAME), ZE_MAKE_VERSION(1, 0)); } bool ZeroEngineBackend::isLUIDExtSupported() const { - return _instance->isExtensionSupported(std::string(ZE_DEVICE_LUID_EXT_NAME), ZE_MAKE_VERSION(1, 0)); + return _initStruct->isExtensionSupported(std::string(ZE_DEVICE_LUID_EXT_NAME), ZE_MAKE_VERSION(1, 0)); } ZeroEngineBackend::~ZeroEngineBackend() = default; @@ -69,19 +69,11 @@ const std::vector ZeroEngineBackend::getDeviceNames() const { } void* ZeroEngineBackend::getContext() const { - return _instance->getContext(); + return _initStruct->getContext(); } -void* ZeroEngineBackend::getDriverHandle() const { - return _instance->getDriver(); -} - -void* ZeroEngineBackend::getDeviceHandle() const { - return _instance->getDevice(); -} - -ze_graph_dditable_ext_curr_t& ZeroEngineBackend::getGraphDdiTable() const { - return _instance->getGraphDdiTable(); +const std::shared_ptr& ZeroEngineBackend::getInitStruct() const { + return _initStruct; } void ZeroEngineBackend::updateInfo(const Config& config) { diff --git a/src/plugins/intel_npu/src/backend/src/zero_device.cpp b/src/plugins/intel_npu/src/backend/src/zero_device.cpp index 58bcd0eb7cc944..6e16dde3b120bf 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_device.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_device.cpp @@ -7,7 +7,6 @@ #include "intel_npu/common/itt.hpp" #include "intel_npu/utils/zero/zero_api.hpp" #include "intel_npu/utils/zero/zero_utils.hpp" -#include "zero_executor.hpp" #include "zero_host_tensor.hpp" #include "zero_infer_request.hpp" #include "zero_remote_tensor.hpp" @@ -64,38 +63,6 @@ ZeroDevice::ZeroDevice(const std::shared_ptr& initStructs device_gops[ov::element::i8] = gops; device_gops[ov::element::f16] = 0.5f * gops; } - - std::vector command_group_properties; - uint32_t command_queue_group_count = 0; - // Discover all command queue groups - THROW_ON_FAIL_FOR_LEVELZERO( - "zeDeviceGetCommandQueueGroupProperties", - zeDeviceGetCommandQueueGroupProperties(_initStructs->getDevice(), &command_queue_group_count, nullptr)); - - log.debug("ZeroDevice::ZeroDevice - resize command_queue_group_count"); - command_group_properties.resize(command_queue_group_count); - - for (auto& prop : command_group_properties) { - prop.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES; - prop.pNext = nullptr; - } - - THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetCommandQueueGroupProperties", - zeDeviceGetCommandQueueGroupProperties(_initStructs->getDevice(), - &command_queue_group_count, - command_group_properties.data())); - - // Find the corresponding command queue group. 
- log.debug("ZeroDevice::ZeroDevice - findGroupOrdinal"); - _group_ordinal = zeroUtils::findGroupOrdinal(command_group_properties, device_properties); - log.debug("ZeroDevice::ZeroDevice - init completed"); -} - -std::shared_ptr ZeroDevice::createExecutor( - const std::shared_ptr& networkDescription, - const Config& config) { - OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Device::createExecutor"); - return std::make_shared(_initStructs, networkDescription, config, _group_ordinal); } std::string ZeroDevice::getName() const { @@ -205,9 +172,8 @@ ov::device::Type ZeroDevice::getDeviceType() const { std::shared_ptr ZeroDevice::createInferRequest( const std::shared_ptr& compiledModel, - const std::shared_ptr& executor, const Config& config) { - return std::make_shared(_initStructs, compiledModel, executor, config); + return std::make_shared(_initStructs, compiledModel, config); } ov::SoPtr ZeroDevice::createRemoteTensor(std::shared_ptr context, diff --git a/src/plugins/intel_npu/src/backend/src/zero_executor.cpp b/src/plugins/intel_npu/src/backend/src/zero_executor.cpp deleted file mode 100644 index 32da2b2e0e4189..00000000000000 --- a/src/plugins/intel_npu/src/backend/src/zero_executor.cpp +++ /dev/null @@ -1,187 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "zero_executor.hpp" - -#include - -#include -#include -#include -#include - -#include "intel_npu/common/itt.hpp" -#include "intel_npu/config/common.hpp" -#include "intel_npu/prefix.hpp" -#include "intel_npu/utils/zero/zero_utils.hpp" -#include "openvino/runtime/properties.hpp" -#include "ze_command_queue_npu_ext.h" -#include "zero_device.hpp" - -using namespace intel_npu; - -ZeroExecutor::ZeroExecutor(const std::shared_ptr& initStructs, - const std::shared_ptr& networkDescription, - const Config& config, - const uint32_t& group_ordinal) - : _config(config), - _logger("Graph", _config.get()), - _initStructs(initStructs), - _networkDesc(networkDescription), - _graph_ddi_table_ext(_initStructs->getGraphDdiTable()), - _group_ordinal(group_ordinal), - _command_queues{std::make_shared(_initStructs->getDevice(), - _initStructs->getContext(), - zeroUtils::toZeQueuePriority(_config.get()), - _initStructs->getCommandQueueDdiTable(), - _config, - group_ordinal)} { - _logger.debug("ZeroExecutor::ZeroExecutor - create graph"); - OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_GRAPH, itt::domains::LevelZeroBackend, "Executor::ZeroExecutor", "graphCreate"); - - // _graph is a nullptr for CIP path, a new handle will be obtained from the driver based on the given - // compiledNetwork _graph gets (reuses) graphHandle from the compiler for CID path - if (_networkDesc->metadata.graphHandle == nullptr) { - _logger.debug("create graph handle on executor"); - ze_graph_desc_t desc{ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, - nullptr, - ZE_GRAPH_FORMAT_NATIVE, - _networkDesc->compiledNetwork.size(), - _networkDesc->compiledNetwork.data(), - nullptr}; - ze_result_t result = - _graph_ddi_table_ext.pfnCreate(_initStructs->getContext(), _initStructs->getDevice(), &desc, &_graph); - THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnCreate", result, _graph_ddi_table_ext); - - } else { - _logger.debug("reuse graph handle created from compiler"); - _graph = static_cast(_networkDesc->metadata.graphHandle); - } - - OV_ITT_TASK_NEXT(ZERO_EXECUTOR_GRAPH, "pfnGetProperties"); - _logger.debug("performing pfnGetProperties"); - ze_graph_properties_t props{}; - props.stype = ZE_STRUCTURE_TYPE_GRAPH_PROPERTIES; - - ze_result_t result = 
_graph_ddi_table_ext.pfnGetProperties(_graph, &props); - THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnGetProperties", result, _graph_ddi_table_ext); - - auto targetDriverExtVersion = _graph_ddi_table_ext.version(); - if (targetDriverExtVersion <= ZE_GRAPH_EXT_VERSION_1_1) { - OPENVINO_THROW("Incompatibility between the NPU plugin and driver! The driver version is too old, please " - "update the driver version"); - } - - OV_ITT_TASK_NEXT(ZERO_EXECUTOR_GRAPH, "pfnGetArgumentProperties3"); - _logger.debug("performing pfnGetArgumentProperties3"); - for (uint32_t index = 0; index < props.numGraphArgs; ++index) { - ze_graph_argument_properties_3_t arg3{}; - arg3.stype = ZE_STRUCTURE_TYPE_GRAPH_ARGUMENT_PROPERTIES; - ze_result_t result = _graph_ddi_table_ext.pfnGetArgumentProperties3(_graph, index, &arg3); - THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnGetArgumentProperties3", result, _graph_ddi_table_ext); - - if (arg3.type == ZE_GRAPH_ARGUMENT_TYPE_INPUT) { - _input_descriptors.push_back(ArgumentDescriptor{arg3, index}); - } else { - _output_descriptors.push_back(ArgumentDescriptor{arg3, index}); - } - } - - if (_graph_ddi_table_ext.version() < ZE_GRAPH_EXT_VERSION_1_8) { - initialize_graph_through_command_list(); - } else { - ze_graph_properties_2_t properties = {}; - properties.stype = ZE_STRUCTURE_TYPE_GRAPH_PROPERTIES; - _graph_ddi_table_ext.pfnGetProperties2(_graph, &properties); - - if (properties.initStageRequired & ZE_GRAPH_STAGE_INITIALIZE) { - OV_ITT_TASK_NEXT(ZERO_EXECUTOR_GRAPH, "pfnGraphInitialize"); - _graph_ddi_table_ext.pfnGraphInitialize(_graph); - } - - if (properties.initStageRequired & ZE_GRAPH_STAGE_COMMAND_LIST_INITIALIZE) { - initialize_graph_through_command_list(); - } - } - - if (config.has()) { - setWorkloadType(config.get()); - } -} - -void ZeroExecutor::initialize_graph_through_command_list() const { - OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_GRAPH, - itt::domains::LevelZeroBackend, - "Executor::ZeroExecutor", - "initialize_graph_through_command_list"); - - _logger.debug("ZeroExecutor::ZeroExecutor init start - create graph_command_list"); - OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Executor::ZeroExecutor"); - CommandList graph_command_list(_initStructs->getDevice(), - _initStructs->getContext(), - _graph_ddi_table_ext, - _config, - _group_ordinal); - _logger.debug("ZeroExecutor::ZeroExecutor - create graph_command_queue"); - CommandQueue graph_command_queue(_initStructs->getDevice(), - _initStructs->getContext(), - ZE_COMMAND_QUEUE_PRIORITY_NORMAL, - _initStructs->getCommandQueueDdiTable(), - _config, - _group_ordinal); - _logger.debug("ZeroExecutor::ZeroExecutor - create fence"); - Fence fence(graph_command_queue, _config); - - OV_ITT_TASK_NEXT(ZERO_EXECUTOR_GRAPH, "appendGraphInitialize"); - _logger.debug("ZeroExecutor::ZeroExecutor - performing appendGraphInitialize"); - graph_command_list.appendGraphInitialize(_graph); - _logger.debug("ZeroExecutor::ZeroExecutor - closing graph command list"); - graph_command_list.close(); - - OV_ITT_TASK_NEXT(ZERO_EXECUTOR_GRAPH, "queue_execute"); - _logger.debug("ZeroExecutor::ZeroExecutor - performing executeCommandList"); - graph_command_queue.executeCommandList(graph_command_list, fence); - _logger.debug("ZeroExecutor::ZeroExecutor - performing hostSynchronize"); - fence.hostSynchronize(); - _logger.debug("ZeroExecutor::ZeroExecutor - hostSynchronize completed"); -} - -void ZeroExecutor::setWorkloadType(const ov::WorkloadType workloadType) const { - ze_command_queue_workload_type_t zeWorkloadType; - switch (workloadType) { - case 
ov::WorkloadType::DEFAULT: - zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_DEFAULT; - break; - case ov::WorkloadType::EFFICIENT: - zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_BACKGROUND; - break; - default: - OPENVINO_THROW("Unknown value for WorkloadType!"); - } - - _command_queues->setWorkloadType(zeWorkloadType); -} - -void ZeroExecutor::setArgumentValue(uint32_t argi_, const void* argv_) const { - ze_result_t result = _graph_ddi_table_ext.pfnSetArgumentValue(_graph, argi_, argv_); - if (ZE_RESULT_SUCCESS != result) { - THROW_ON_FAIL_FOR_LEVELZERO_EXT("zeGraphSetArgumentValue", result, _graph_ddi_table_ext); - } -} - -void ZeroExecutor::mutexLock() const { - _mutex.lock(); -} - -void ZeroExecutor::mutexUnlock() const { - _mutex.unlock(); -} - -ZeroExecutor::~ZeroExecutor() { - _logger.debug("~ZeroExecutor() - pfnDestroy _graph "); - auto result = _graph_ddi_table_ext.pfnDestroy(_graph); - if (ZE_RESULT_SUCCESS != result) { - _logger.error("_graph_ddi_table_ext.pfnDestroy failed %#X", uint64_t(result)); - } -} diff --git a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp index dd2629372dc7d8..1c5ceecfac1961 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp @@ -31,8 +31,7 @@ constexpr bool OUTPUT = false; * @param ioDescriptor The OpenVINO API specific I/O descriptor which shall be compared. * @param zeDescriptor The Level Zero specific structure used for comparison. */ -void check_level_zero_attributes_match(const IODescriptor& ioDescriptor, - const ZeroExecutor::ArgumentDescriptor& zeDescriptor) { +void check_level_zero_attributes_match(const IODescriptor& ioDescriptor, const ArgumentDescriptor& zeDescriptor) { std::string zeDescriptorName = zeDescriptor.info.name; if (isStateInputName(zeDescriptorName)) { @@ -158,38 +157,35 @@ std::optional ZeroInferRequest::get_batch_size(const NetworkMetadata& me //------------------------------------------------------------------------------ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& initStructs, const std::shared_ptr& compiledModel, - const std::shared_ptr& executor, const Config& config) : SyncInferRequest(compiledModel, config), _initStructs(initStructs), - _executorPtr(executor), - _executor(static_cast(_executorPtr.get())), + _graph(compiledModel->get_graph()), _config(config), _logger("ZeroInferRequest", config.get()), _levelZeroInputTensors(_metadata.inputs.size(), std::vector>(1, nullptr)), _levelZeroOutputTensors(_metadata.outputs.size(), nullptr), _inputTensorsData(_metadata.inputs.size(), std::vector>(1, std::nullopt)), _outputTensorsData(_metadata.outputs.size(), std::nullopt), - _profilingPool(_executor->graph(), zeroProfiling::POOL_SIZE, _executor->getInitStructs()->getProfilingDdiTable()), - _profilingQuery(0, - _executor->getInitStructs()->getDevice(), - _executor->getInitStructs()->getProfilingDdiTable()) { + _profilingPool(static_cast(_graph->get_handle()), + zeroProfiling::POOL_SIZE, + _initStructs->getProfilingDdiTable()), + _profilingQuery(0, _initStructs->getDevice(), _initStructs->getProfilingDdiTable()) { _logger.debug("ZeroInferRequest::ZeroInferRequest - SyncInferRequest"); - const std::vector& executorInputDescriptors = _executor->get_input_descriptors(); - const std::vector& executorOutputDescriptors = - _executor->get_output_descriptors(); + const std::vector& executorInputDescriptors = 
_graph->get_input_descriptors(); + const std::vector& executorOutputDescriptors = _graph->get_output_descriptors(); auto proftype = config.get(); if (proftype == ov::intel_npu::ProfilingType::INFER) { _logger.debug("ZeroInferRequest::ZeroInferRequest - profiling type == ov::intel_npu::ProfilingType::INFER"); - _npuProfiling = std::make_shared(_executor->getInitStructs()->getContext(), - _executor->getInitStructs()->getDevice(), + _npuProfiling = std::make_shared(_initStructs->getContext(), + _initStructs->getDevice(), _config.get()); } _properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties", - zeDeviceGetProperties(_executor->getInitStructs()->getDevice(), &_properties)); + zeDeviceGetProperties(_initStructs->getDevice(), &_properties)); _outputAllocator = std::make_shared(_initStructs); _inputAllocator = @@ -278,17 +274,24 @@ void ZeroInferRequest::create_pipeline() { _levelZeroOutputTensors.at(outputIndex)->get_byte_size()}); } + // Find the corresponding command queue group. + _logger.debug("ZeroDevice::ZeroDevice - findGroupOrdinal"); + auto groupOrdinal = zeroUtils::findGroupOrdinal(_initStructs->getDevice(), _properties); + _logger.debug("ZeroDevice::ZeroDevice - init completed"); + _logger.debug("ZeroInferRequest::create_pipeline - constructing pipeline"); - // Construct pipeline + // Construct pipeline _pipeline = std::make_unique(_config, - _executorPtr, + _initStructs, + _graph, _profilingPool, _profilingQuery, _npuProfiling, _inputTensorsData, _outputTensorsData, - _numberOfCommandLists); + _numberOfCommandLists, + groupOrdinal); _logger.debug("ZeroInferRequest::create_pipeline - SyncInferRequest completed"); } @@ -338,8 +341,8 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr tensor OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "updateCommandList"); _pipeline->updateCommandList(*tensorsData, - isInput ? _executor->get_input_descriptors().at(index).idx - : _executor->get_output_descriptors().at(index).idx); + isInput ? _graph->get_input_descriptors().at(index).idx + : _graph->get_output_descriptors().at(index).idx); } } } @@ -370,9 +373,9 @@ void ZeroInferRequest::set_remote_tensor_data(const std::shared_ptrupdateCommandList(*tensorsData, - isInput ? _executor->get_input_descriptors().at(index).idx - : _executor->get_output_descriptors().at(index).idx); + _pipeline->updateCommandList( + *tensorsData, + isInput ? 
_graph->get_input_descriptors().at(index).idx : _graph->get_output_descriptors().at(index).idx); } } @@ -390,13 +393,17 @@ void ZeroInferRequest::set_tensor(const ov::Output& port, const if (foundPort.is_input()) { if (get_user_input(foundPort.idx)._ptr == tensor._ptr) { // Got set_tensor with the same object - do nothing + _logger.debug("ZeroInferRequest::set_tensor - got the same tensor, do nothing"); return; } if (is_batched_input(foundPort.idx)) { // resize vector size to 1 if set_tensor is called after set_tensors get_input_tensors_data(foundPort.idx).resize(1); + get_input_tensors_data(foundPort.idx).shrink_to_fit(); get_level_zero_inputs(foundPort.idx).resize(1); + get_level_zero_inputs(foundPort.idx).shrink_to_fit(); get_user_inputs(foundPort.idx).resize(1); + get_user_inputs(foundPort.idx).shrink_to_fit(); } get_user_input(foundPort.idx) = tensor; @@ -485,7 +492,7 @@ void ZeroInferRequest::set_tensors(const ov::Output& port, if (_pipelineIsCreated) { OV_ITT_TASK_NEXT(SET_TENSORS, "updateCommandList"); _pipeline->updateCommandList(*get_input_tensor_data(foundPort.idx, i), - _executor->get_input_descriptors().at(foundPort.idx).idx, + _graph->get_input_descriptors().at(foundPort.idx).idx, i); } } @@ -537,14 +544,16 @@ void ZeroInferRequest::infer_async() { _logger.debug("InferRequest::infer_async started"); OV_ITT_TASK_CHAIN(ZERO_INFER, itt::domains::LevelZeroBackend, "infer_async", "start"); - _executor->mutexLock(); - if (!_pipelineIsCreated) { - OV_ITT_TASK_NEXT(ZERO_INFER, "create_pipeline"); - create_pipeline(); + { + std::lock_guard lock(_graph->get_mutex()); - _pipelineIsCreated = true; + if (!_pipelineIsCreated) { + OV_ITT_TASK_NEXT(ZERO_INFER, "create_pipeline"); + create_pipeline(); + + _pipelineIsCreated = true; + } } - _executor->mutexUnlock(); size_t inputIndex = 0; for (const auto& userTensor : _userInputTensors) { @@ -740,12 +749,9 @@ std::vector ZeroInferRequest::get_profiling_info() const { if (compilerType == ov::intel_npu::CompilerType::MLIR) { // For plugin compiler retreive raw profiling data from backend and delegate // processing to the compiler - const auto& networkDesc = compiledModel.get_network_description(); - const auto& compiler = compiledModel.get_compiler(); - const auto& blob = networkDesc->compiledNetwork; auto profData = get_raw_profiling_data(); _logger.debug("InferRequest::get_profiling_info complete with compiler->process_profiling_output()."); - return compiler->process_profiling_output(profData, blob, compilerConfig); + return _graph->process_profiling_output(profData, compilerConfig); } else { auto proftype = _config.get(); if (proftype == ov::intel_npu::ProfilingType::INFER) { diff --git a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp index 009eee6541e8ef..34eb71eaf112f7 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp @@ -11,25 +11,25 @@ #include "intel_npu/prefix.hpp" #include "intel_npu/utils/logger/logger.hpp" #include "intel_npu/utils/zero/zero_api.hpp" -#include "zero_types.hpp" +#include "intel_npu/utils/zero/zero_types.hpp" namespace intel_npu { Pipeline::Pipeline(const Config& config, - const std::shared_ptr& executorPtr, + const std::shared_ptr& initStructs, + const std::shared_ptr& graph, zeroProfiling::ProfilingPool& profiling_pool, zeroProfiling::ProfilingQuery& profiling_query, std::shared_ptr npu_profiling, const std::vector>>& inputTensorsData, const std::vector>& 
outputTensorsData, - const size_t numberOfCommandLists) + size_t numberOfCommandLists, + uint32_t group_ordinal) : _config(config), - _executor(static_cast(executorPtr.get())), - _command_queue(*_executor->getCommandQueue()), - _event_pool{_executor->getInitStructs()->getDevice(), - _executor->getInitStructs()->getContext(), - numberOfCommandLists ? static_cast(numberOfCommandLists) : 1, - _config}, + _command_queue(graph->get_command_queue()), + _event_pool{initStructs->getDevice(), + initStructs->getContext(), + numberOfCommandLists ? static_cast(numberOfCommandLists) : 1}, _npu_profiling(std::move(npu_profiling)), _logger("Pipeline", _config.get()) { OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Zero_infer_request::Pipeline::Pipeline"); @@ -45,38 +45,37 @@ Pipeline::Pipeline(const Config& config, _logger.debug("Pipeline - emplace_back _event_pool and _command_queue"); for (size_t i = 0; i < numberOfCommandLists; i++) { _command_lists.emplace_back( - std::make_unique(_executor->getInitStructs()->getDevice(), - _executor->getInitStructs()->getContext(), - _executor->getInitStructs()->getGraphDdiTable(), - _config, - _executor->get_group_ordinal(), - _executor->getInitStructs()->getMutableCommandListVersion() ? true : false)); - _events.emplace_back(std::make_unique(_event_pool.handle(), static_cast(i), _config)); - _fences.emplace_back(std::make_unique(_command_queue, _config)); + std::make_unique(initStructs->getDevice(), + initStructs->getContext(), + initStructs->getGraphDdiTable(), + group_ordinal, + initStructs->getMutableCommandListVersion() ? true : false)); + _events.emplace_back(std::make_unique(_event_pool.handle(), static_cast(i))); + _fences.emplace_back(std::make_unique(*_command_queue)); } for (size_t i = 0; i < numberOfCommandLists; i++) { size_t ioIndex = 0; - for (const auto& desc : _executor->get_input_descriptors()) { + for (const auto& desc : graph->get_input_descriptors()) { if (inputTensorsData.at(ioIndex).size() > 1) { - _executor->setArgumentValue(desc.idx, inputTensorsData.at(ioIndex).at(i)->mem); + graph->set_argument_value(desc.idx, inputTensorsData.at(ioIndex).at(i)->mem); ++ioIndex; continue; } - _executor->setArgumentValue(desc.idx, - static_cast(inputTensorsData.at(ioIndex).at(0)->mem) + - (i * inputTensorsData.at(ioIndex).at(0)->size) / numberOfCommandLists); + graph->set_argument_value(desc.idx, + static_cast(inputTensorsData.at(ioIndex).at(0)->mem) + + (i * inputTensorsData.at(ioIndex).at(0)->size) / numberOfCommandLists); ++ioIndex; } ioIndex = 0; - for (const auto& desc : _executor->get_output_descriptors()) { - _executor->setArgumentValue(desc.idx, - static_cast(outputTensorsData.at(ioIndex)->mem) + - (i * outputTensorsData.at(ioIndex)->size) / numberOfCommandLists); + for (const auto& desc : graph->get_output_descriptors()) { + graph->set_argument_value(desc.idx, + static_cast(outputTensorsData.at(ioIndex)->mem) + + (i * outputTensorsData.at(ioIndex)->size) / numberOfCommandLists); ++ioIndex; } @@ -86,7 +85,8 @@ Pipeline::Pipeline(const Config& config, _command_lists.at(i)->appendNpuTimestamp(reinterpret_cast(_npu_profiling->npu_ts_infer_start)); } - _command_lists.at(i)->appendGraphExecute(_executor->graph(), profiling_query.getHandle()); + _command_lists.at(i)->appendGraphExecute(static_cast(graph->get_handle()), + profiling_query.getHandle()); /// append timestamp command if feature was activated if (_npu_profiling != nullptr) { @@ -108,11 +108,11 @@ void Pipeline::push() { _logger.debug("Pipeline - push() started"); for (size_t i = 0; i < 
_command_lists.size(); ++i) { - OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_PUSH, itt::domains::LevelZeroBackend, "Pipeline", "push"); + OV_ITT_TASK_CHAIN(ZERO_PIPELINE_IP_PUSH, itt::domains::LevelZeroBackend, "Pipeline", "push"); if (sync_output_with_fences_) { - _command_queue.executeCommandList(*_command_lists.at(i), *_fences.at(i)); + _command_queue->executeCommandList(*_command_lists.at(i), *_fences.at(i)); } else { - _command_queue.executeCommandList(*_command_lists.at(i)); + _command_queue->executeCommandList(*_command_lists.at(i)); } } @@ -121,7 +121,7 @@ void Pipeline::push() { void Pipeline::pull() { _logger.debug("Pipeline - pull() started"); - OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_PULL, itt::domains::LevelZeroBackend, "Pipeline", "pull"); + OV_ITT_TASK_CHAIN(ZERO_PIPELINE_IP_PULL, itt::domains::LevelZeroBackend, "Pipeline", "pull"); for (size_t i = 0; i < _command_lists.size(); ++i) { if (sync_output_with_fences_) { diff --git a/src/plugins/intel_npu/src/common/CMakeLists.txt b/src/plugins/intel_npu/src/common/CMakeLists.txt index 2d1f5d9cbb39ea..1aa93cce1bc291 100644 --- a/src/plugins/intel_npu/src/common/CMakeLists.txt +++ b/src/plugins/intel_npu/src/common/CMakeLists.txt @@ -20,7 +20,7 @@ target_link_libraries(${TARGET_NAME} PUBLIC openvino::npu_al openvino::npu_logger_utils - openvino::runtime::dev + openvino::npu_zero_utils ) set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiled_model.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiled_model.hpp index eb6a3de57e41fc..19023a1fca883f 100644 --- a/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiled_model.hpp +++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiled_model.hpp @@ -7,8 +7,8 @@ #include #include +#include "intel_npu/common/igraph.hpp" #include "intel_npu/config/common.hpp" -#include "intel_npu/icompiler.hpp" #include "openvino/runtime/icompiled_model.hpp" namespace intel_npu { @@ -17,17 +17,10 @@ class ICompiledModel : public ov::ICompiledModel { public: using ov::ICompiledModel::ICompiledModel; - virtual const std::shared_ptr& get_network_description() const = 0; + virtual const std::shared_ptr& get_graph() const = 0; virtual const Config& get_config() const = 0; - // Compiler is used for post-processing profiling data when using PERF_COUNT property - virtual const ov::SoPtr& get_compiler() const = 0; - - const NetworkMetadata& get_network_metadata() const { - return get_network_description()->metadata; - } - protected: std::shared_ptr shared_from_this() const { return std::dynamic_pointer_cast(ov::ICompiledModel::shared_from_this()); diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp new file mode 100644 index 00000000000000..51c4a4cf26eafd --- /dev/null +++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp @@ -0,0 +1,103 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "intel_npu/network_metadata.hpp" +#include "intel_npu/utils/zero/zero_utils.hpp" +#include "intel_npu/utils/zero/zero_wrappers.hpp" +#include "openvino/runtime/profiling_info.hpp" + +namespace intel_npu { + +class IGraph : public std::enable_shared_from_this { +public: + IGraph(ze_graph_handle_t handle, NetworkMetadata metadata, std::optional> blob) + : 
_handle(handle), + _metadata(std::move(metadata)) { + if (blob.has_value()) { + _blob = std::move(*blob); + } + } + + virtual void export_blob(std::ostream& stream) const = 0; + + virtual std::vector process_profiling_output(const std::vector& profData, + const Config& config) const = 0; + + virtual void set_argument_value(uint32_t argi, const void* argv) const = 0; + + virtual void initialize(const Config& config) = 0; + + virtual ~IGraph() = default; + + const NetworkMetadata& get_metadata() const { + return _metadata; + } + + ze_graph_handle_t get_handle() const { + return _handle; + } + + void update_network_name(std::string_view name) { + _metadata.name = name; + } + + inline const std::vector& get_input_descriptors() const { + return _input_descriptors; + } + + inline const std::vector& get_output_descriptors() const { + return _output_descriptors; + } + + inline const std::shared_ptr& get_command_queue() const { + return _command_queue; + } + + void set_workload_type(const ov::WorkloadType workloadType) const { + if (_command_queue == nullptr) { + return; + } + + ze_command_queue_workload_type_t zeWorkloadType; + switch (workloadType) { + case ov::WorkloadType::DEFAULT: + zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_DEFAULT; + break; + case ov::WorkloadType::EFFICIENT: + zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_BACKGROUND; + break; + default: + OPENVINO_THROW("Unknown value for WorkloadType!"); + } + + _command_queue->setWorkloadType(zeWorkloadType); + } + + std::mutex& get_mutex() { + return _mutex; + } + +protected: + ze_graph_handle_t _handle = nullptr; + NetworkMetadata _metadata; + + std::vector _input_descriptors; + std::vector _output_descriptors; + + std::shared_ptr _command_queue; + + // Used to protect zero pipeline creation in the graph. 
The pipeline should be created only once per graph when the + // first inference starts running + std::mutex _mutex; + + std::vector _blob; +}; + +} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp index 8c1eb57fe34fc3..b34f2deee6c61e 100644 --- a/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp +++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp @@ -7,9 +7,9 @@ #include #include "intel_npu/common/icompiled_model.hpp" +#include "intel_npu/common/igraph.hpp" #include "intel_npu/common/sync_infer_request.hpp" #include "intel_npu/config/config.hpp" -#include "intel_npu/icompiler.hpp" #include "openvino/runtime/intel_npu/remote_properties.hpp" #include "openvino/runtime/iremote_context.hpp" #include "openvino/runtime/properties.hpp" @@ -54,11 +54,14 @@ class IEngineBackend : public std::enable_shared_from_this { //------------------------------------------------------------------------------ -class IExecutor { +class ICompilerAdapter { public: - virtual ~IExecutor() = default; + virtual std::shared_ptr compile(const std::shared_ptr& model, + const Config& config) const = 0; + virtual std::shared_ptr parse(std::vector network, const Config& config) const = 0; + virtual ov::SupportedOpsMap query(const std::shared_ptr& model, const Config& config) const = 0; - virtual void setWorkloadType(const ov::WorkloadType workloadType) const = 0; + virtual ~ICompilerAdapter() = default; }; //------------------------------------------------------------------------------ @@ -67,10 +70,6 @@ class IDevice : public std::enable_shared_from_this { public: using Uuid = ov::device::UUID; - virtual std::shared_ptr createExecutor( - const std::shared_ptr& networkDescription, - const Config& config) = 0; - virtual std::string getName() const = 0; virtual std::string getFullDeviceName() const = 0; virtual Uuid getUuid() const; @@ -85,7 +84,6 @@ class IDevice : public std::enable_shared_from_this { virtual std::shared_ptr createInferRequest( const std::shared_ptr& compiledModel, - const std::shared_ptr& executor, const Config& config) = 0; virtual void updateInfo(const Config& config) = 0; diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/sync_infer_request.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/sync_infer_request.hpp index 99f9ce7cb0eb28..788ce87136a04d 100644 --- a/src/plugins/intel_npu/src/common/include/intel_npu/common/sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/sync_infer_request.hpp @@ -5,8 +5,9 @@ #pragma once #include "intel_npu/common/icompiled_model.hpp" +#include "intel_npu/common/igraph.hpp" #include "intel_npu/common/variable_state.hpp" -#include "intel_npu/icompiler.hpp" +#include "intel_npu/network_metadata.hpp" #include "openvino/runtime/iinfer_request.hpp" #include "openvino/runtime/iplugin.hpp" diff --git a/src/plugins/intel_npu/src/common/src/sync_infer_request.cpp b/src/plugins/intel_npu/src/common/src/sync_infer_request.cpp index 0ae0832fe29d72..0eeefccf43906d 100644 --- a/src/plugins/intel_npu/src/common/src/sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/common/src/sync_infer_request.cpp @@ -21,7 +21,7 @@ namespace intel_npu { SyncInferRequest::SyncInferRequest(const std::shared_ptr& compiledModel, const Config& config) : _compiledModel(compiledModel), - _metadata(compiledModel->get_network_metadata()), + 
_metadata(compiledModel->get_graph()->get_metadata()), _logger("SyncInferRequest", config.get()), _userInputTensors(_metadata.inputs.size(), std::vector>(1, {nullptr})), _userOutputTensors(_metadata.outputs.size(), {nullptr}) { diff --git a/src/plugins/intel_npu/src/compiler/include/driver_compiler_adapter.hpp b/src/plugins/intel_npu/src/compiler/include/driver_compiler_adapter.hpp deleted file mode 100644 index addd9ca5308c65..00000000000000 --- a/src/plugins/intel_npu/src/compiler/include/driver_compiler_adapter.hpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include - -#include "intel_npu/common/npu.hpp" -#include "intel_npu/icompiler.hpp" -#include "intel_npu/utils/logger/logger.hpp" - -namespace intel_npu { -namespace driverCompilerAdapter { - -/** - * @brief Adapter for Compiler in driver - * @details Wrap compiler in driver calls and do preliminary actions (like opset conversion) - */ -class LevelZeroCompilerAdapter final : public ICompiler { -public: - LevelZeroCompilerAdapter(std::shared_ptr iEngineBackend); - - uint32_t getSupportedOpsetVersion() const override final; - - NetworkDescription compile(const std::shared_ptr& model, - const Config& config) const override final; - - ov::SupportedOpsMap query(const std::shared_ptr& model, const Config& config) const override final; - - NetworkMetadata parse(const std::vector& network, const Config& config) const override final; - - std::vector process_profiling_output(const std::vector& profData, - const std::vector& network, - const Config& config) const override final; - - void release(std::shared_ptr networkDescription) override; - - CompiledNetwork getCompiledNetwork(const NetworkDescription& networkDescription) override; - -private: - /** - * @brief Separate externals calls to separate class - */ - std::shared_ptr apiAdapter; - Logger _logger; -}; - -} // namespace driverCompilerAdapter -} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/compiler/include/zero_compiler_in_driver.hpp b/src/plugins/intel_npu/src/compiler/include/zero_compiler_in_driver.hpp deleted file mode 100644 index 5641408dffcac0..00000000000000 --- a/src/plugins/intel_npu/src/compiler/include/zero_compiler_in_driver.hpp +++ /dev/null @@ -1,201 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include -#include - -#include -#include - -#include "intel_npu/icompiler.hpp" -#include "intel_npu/utils/logger/logger.hpp" -#include "intel_npu/utils/zero/zero_api.hpp" -#include "zero_executor.hpp" - -namespace intel_npu { -namespace driverCompilerAdapter { - -using SerializedIR = std::pair>; - -#define NotSupportQuery(T) (T == ZE_GRAPH_EXT_VERSION_1_2) - -// ext version == 1.3 && 1.4, support API (pfnQueryNetworkCreate, pfnQueryNetworkDestroy, -// pfnQueryNetworkGetSupportedLayers) -#define SupportAPIGraphQueryNetworkV1(T) (T == ZE_GRAPH_EXT_VERSION_1_3 || T == ZE_GRAPH_EXT_VERSION_1_4) - -// ext version >= 1.5, support API (pfnCreate2, pfnQueryNetworkCreate2, pfnQueryContextMemory) -#define SupportAPIGraphQueryNetworkV2(T) ((!NotSupportQuery(T) && !SupportAPIGraphQueryNetworkV1(T))) - -// For ext version >= 1.5, pfnCreate2 api is avaible -#define NotSupportGraph2(T) \ - (T == ZE_GRAPH_EXT_VERSION_1_2 || T == ZE_GRAPH_EXT_VERSION_1_3 || T == ZE_GRAPH_EXT_VERSION_1_4) - -// A bug inside the driver makes the "pfnGraphGetArgumentMetadata" call not safe for use prior to -// "ze_graph_dditable_ext_1_6_t". 
-// See: E#117498 -#define NotSupportArgumentMetadata(T) \ - (T == ZE_GRAPH_EXT_VERSION_1_2 || T == ZE_GRAPH_EXT_VERSION_1_3 || T == ZE_GRAPH_EXT_VERSION_1_4 || \ - T == ZE_GRAPH_EXT_VERSION_1_5) - -#define UseCopyForNativeBinary(T) \ - (T == ZE_GRAPH_EXT_VERSION_1_2 || T == ZE_GRAPH_EXT_VERSION_1_3 || T == ZE_GRAPH_EXT_VERSION_1_4 || \ - T == ZE_GRAPH_EXT_VERSION_1_5 || T == ZE_GRAPH_EXT_VERSION_1_6) - -/** - * Adapter to use CiD through ZeroAPI - */ -template -class LevelZeroCompilerInDriver final : public ICompiler { -public: - LevelZeroCompilerInDriver(ze_driver_handle_t driverHandle, - ze_device_handle_t deviceHandle, - ze_context_handle_t zeContext, - ze_graph_dditable_ext_curr_t& graph_ddi_table_ext); - LevelZeroCompilerInDriver(const LevelZeroCompilerInDriver&) = delete; - LevelZeroCompilerInDriver& operator=(const LevelZeroCompilerInDriver&) = delete; - ~LevelZeroCompilerInDriver() override; - - uint32_t getSupportedOpsetVersion() const override final; - - ov::SupportedOpsMap query(const std::shared_ptr& model, const Config& config) const override; - - NetworkDescription compile(const std::shared_ptr& model, - const Config& config) const override final; - - ze_result_t seriazlideIRModelAndCreateGraph(const std::shared_ptr& model, - const Config& config, - ze_device_graph_properties_t deviceGraphProperties, - ze_graph_handle_t& graphHandle) const; - - NetworkMetadata parse(const std::vector& network, const Config& config) const override final; - - std::vector process_profiling_output(const std::vector& profData, - const std::vector& network, - const Config& config) const override final { - OPENVINO_THROW("Profiling post-processing is not implemented."); - } - - template = true> - std::unordered_set getQueryResultFromSupportedLayers( - ze_result_t result, - ze_graph_query_network_handle_t& hGraphQueryNetwork) const; - - /** - * @brief Serialize input / output information to string format. - * @details Format: - * --inputs_precisions="0: [1:]" - * --inputs_layouts="0: [1:]" - * --outputs_precisions="0:" - * --outputs_layouts="0:" - * - * For older compiler versions, the name of the inputs/outputs may be used instead of their indices. - * - * Since the layout information is no longer an important part of the metadata values when using the 2.0 OV - * API, the layout fields shall be filled with default values in order to assure the backward compatibility - * with the driver. 
- */ - static std::string serializeIOInfo(const std::shared_ptr& model, const bool useIndices); - - void release(std::shared_ptr networkDescription) override; - - CompiledNetwork getCompiledNetwork(const NetworkDescription& networkDescription) override; - -private: - NetworkMetadata getNetworkMeta(ze_graph_handle_t graphHandle) const; - - SerializedIR serializeIR(const std::shared_ptr& model, - ze_graph_compiler_version_info_t compilerVersion) const; - std::string serializeConfig(const Config& config, ze_graph_compiler_version_info_t& compilerVersion) const; - - template = true> - void getMetadata(ze_graph_dditable_ext_curr_t& graphDdiTableExt, - ze_graph_handle_t graphHandle, - uint32_t index, - std::vector& inputs, - std::vector& outputs) const; - - template = true> - void getMetadata(ze_graph_dditable_ext_curr_t& graphDdiTableExt, - ze_graph_handle_t graphHandle, - uint32_t index, - std::vector& inputs, - std::vector& outputs) const; - - template = true> - void getNativeBinary(ze_graph_dditable_ext_curr_t& graphDdiTableExt, - ze_graph_handle_t graphHandle, - std::vector& blob, - const uint8_t*& blobPtr, - size_t& blobSize) const; - - template = true> - void getNativeBinary(ze_graph_dditable_ext_curr_t& graphDdiTableExt, - ze_graph_handle_t graphHandle, - std::vector& /* unusedBlob */, - const uint8_t*& blobPtr, - size_t& blobSize) const; - - template = true> - ze_result_t seriazlideIRModelAndQueryNetworkCreateV2(const std::shared_ptr& model, - const Config& config, - ze_device_graph_properties_t deviceGraphProperties, - const ze_device_handle_t& _deviceHandle, - ze_graph_query_network_handle_t& hGraphQueryNetwork) const; - - // ext version >= 1.5, support API (pfnCreate2, pfnQueryNetworkCreate2, pfnQueryContextMemory) - template = true> - std::unordered_set queryImpl(const std::shared_ptr& model, - const Config& config) const; - - template = true> - ze_result_t seriazlideIRModelAndQueryNetworkCreateV1(const std::shared_ptr& model, - const Config& config, - ze_device_graph_properties_t deviceGraphProperties, - const ze_device_handle_t& _deviceHandle, - ze_graph_query_network_handle_t& hGraphQueryNetwork) const; - - // ext version == 1.3 && 1.4, support API (pfnQueryNetworkCreate, pfnQueryNetworkDestroy, - // pfnQueryNetworkGetSupportedLayers) - template = true> - std::unordered_set queryImpl(const std::shared_ptr& model, - const Config& config) const; - - // For ext version < 1.3 - template = true> - std::unordered_set queryImpl(const std::shared_ptr& model, - const Config& config) const; - - template = true> - ze_result_t createGraph(const ze_graph_format_t& format, - const SerializedIR& serializedIR, - const std::string& buildFlags, - const uint32_t& flags, - ze_graph_handle_t* graph) const; - - template = true> - ze_result_t createGraph(const ze_graph_format_t& format, - const SerializedIR& serializedIR, - const std::string& buildFlags, - const uint32_t& flags, - ze_graph_handle_t* graph) const; - -private: - ze_driver_handle_t _driverHandle = nullptr; - ze_device_handle_t _deviceHandle = nullptr; - ze_context_handle_t _context = nullptr; - - ze_graph_dditable_ext_curr_t& _graphDdiTableExt; - Logger _logger; -}; - -} // namespace driverCompilerAdapter -} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/compiler/src/driver_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler/src/driver_compiler_adapter.cpp deleted file mode 100644 index 0406b375609044..00000000000000 --- a/src/plugins/intel_npu/src/compiler/src/driver_compiler_adapter.cpp +++ /dev/null @@ -1,130 +0,0 
@@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "driver_compiler_adapter.hpp" - -#include "graph_transformations.hpp" -#include "intel_npu/config/common.hpp" -#include "intel_npu/utils/zero/zero_api.hpp" -#include "intel_npu/utils/zero/zero_result.hpp" -#include "ze_intel_npu_uuid.h" -#include "zero_backend.hpp" -#include "zero_compiler_in_driver.hpp" -#include "zero_init.hpp" - -namespace intel_npu { -namespace driverCompilerAdapter { - -LevelZeroCompilerAdapter::LevelZeroCompilerAdapter(std::shared_ptr iEngineBackend) - : _logger("LevelZeroCompilerAdapter", Logger::global().level()) { - _logger.debug("initialize LevelZeroCompilerAdapter start"); - - std::shared_ptr zeroBackend = nullptr; - zeroBackend = std::dynamic_pointer_cast(iEngineBackend); - if (!zeroBackend) { - OPENVINO_THROW("LevelZeroCompilerAdapter init failed to cast zeroBackend, zeroBackend is a nullptr"); - } - - ze_context_handle_t zeContext = static_cast(zeroBackend->getContext()); - ze_driver_handle_t driverHandle = static_cast(zeroBackend->getDriverHandle()); - ze_device_handle_t deviceHandle = static_cast(zeroBackend->getDeviceHandle()); - ze_graph_dditable_ext_curr_t& graph_ddi_table_ext = zeroBackend->getGraphDdiTable(); - - uint32_t graphExtVersion = graph_ddi_table_ext.version(); - - if (driverHandle == nullptr) { - OPENVINO_THROW("LevelZeroCompilerAdapter failed to get properties about zeDriver"); - } - - _logger.info("LevelZeroCompilerAdapter creating adapter using graphExtVersion"); - - switch (graphExtVersion) { - case ZE_GRAPH_EXT_VERSION_1_3: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); - break; - case ZE_GRAPH_EXT_VERSION_1_4: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); - break; - case ZE_GRAPH_EXT_VERSION_1_5: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); - break; - case ZE_GRAPH_EXT_VERSION_1_6: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); - break; - case ZE_GRAPH_EXT_VERSION_1_7: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); - break; - case ZE_GRAPH_EXT_VERSION_1_8: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); - break; - default: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); - break; - } - - _logger.info("initialize LevelZeroCompilerAdapter complete, using graphExtVersion: %d.%d", - ZE_MAJOR_VERSION(graphExtVersion), - ZE_MINOR_VERSION(graphExtVersion)); -} - -uint32_t LevelZeroCompilerAdapter::getSupportedOpsetVersion() const { - return apiAdapter->getSupportedOpsetVersion(); -} - -NetworkDescription LevelZeroCompilerAdapter::compile(const std::shared_ptr& model, - const Config& config) const { - _logger.debug("compile start"); - return apiAdapter->compile(model, config); -} - -ov::SupportedOpsMap LevelZeroCompilerAdapter::query(const std::shared_ptr& model, - const Config& config) const { - _logger.debug("query start"); - return apiAdapter->query(model, config); -} - -NetworkMetadata LevelZeroCompilerAdapter::parse(const std::vector& network, const Config& config) const { - _logger.debug("parse start"); - return apiAdapter->parse(network, config); -} - -std::vector LevelZeroCompilerAdapter::process_profiling_output(const std::vector&, - const std::vector&, - const 
Config&) const { - OPENVINO_THROW("Profiling post-processing is not implemented."); -} - -void LevelZeroCompilerAdapter::release(std::shared_ptr networkDescription) { - _logger.info("release - using adapter to release networkDescription"); - apiAdapter->release(std::move(networkDescription)); -} - -CompiledNetwork LevelZeroCompilerAdapter::getCompiledNetwork(const NetworkDescription& networkDescription) { - _logger.info("getCompiledNetwork - using adapter to perform getCompiledNetwork(networkDescription)"); - return apiAdapter->getCompiledNetwork(networkDescription); -} - -} // namespace driverCompilerAdapter -} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp b/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp deleted file mode 100644 index 8f7ac4198bb0a4..00000000000000 --- a/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp +++ /dev/null @@ -1,1081 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "zero_compiler_in_driver.hpp" - -#include -#include - -#include "graph_transformations.hpp" -#include "intel_npu/common/itt.hpp" -#include "intel_npu/config/common.hpp" -#include "intel_npu/config/compiler.hpp" -#include "intel_npu/config/runtime.hpp" -#include "intel_npu/prefix.hpp" -#include "intel_npu/utils/zero/zero_result.hpp" -#include "openvino/core/model.hpp" - -namespace { - -constexpr std::string_view INPUTS_PRECISIONS_KEY = "--inputs_precisions"; -constexpr std::string_view INPUTS_LAYOUTS_KEY = "--inputs_layouts"; -constexpr std::string_view OUTPUTS_PRECISIONS_KEY = "--outputs_precisions"; -constexpr std::string_view OUTPUTS_LAYOUTS_KEY = "--outputs_layouts"; - -//