openvinotoolkit · dmitry-gorokhov · Dec 18, 2024 · Jun 26, 2024 · Aug 13, 2024 · Aug 16, 2024
diff --git a/src/common/low_precision_transformations/include/low_precision/mat_mul.hpp b/src/common/low_precision_transformations/include/low_precision/mat_mul.hpp
@@ -26,6 +26,9 @@ class LP_TRANSFORMATIONS_API MatMulTransformation : public LayerTransformation {
     bool transform(TransformationContext &context, ov::pass::pattern::Matcher &m) override;
     bool isPrecisionPreserved(std::shared_ptr<Node> layer) const noexcept override;
     bool canBeTransformed(const TransformationContext& context, std::shared_ptr<Node> layer) const override;
+
+protected:
+    virtual void handleDequantization(const std::shared_ptr<ov::opset1::Multiply>& dequantization) const {}
 };
 
 }  // namespace low_precision

diff --git a/...ommon/low_precision_transformations/include/low_precision/mat_mul_with_dequantization.hpp b/...ommon/low_precision_transformations/include/low_precision/mat_mul_with_dequantization.hpp
@@ -0,0 +1,33 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include "mat_mul.hpp"
+
+namespace ov {
+namespace pass {
+namespace low_precision {
+
+/**
+ * @ingroup ov_transformation_common_api
+ * @brief MatMulWithDequantizationTransformation propagates dequantization operations through the MatMul operation and keeps the dequantization as is.
+ *
+ * For more details about the transformation, refer to the
+ * [MatMulWithDequantizationTransformation](@ref openvino_docs_OV_UG_lpt_MatMulWithDequantizationTransformation) page
+ * in the OpenVINO Developer Guide.
+ */
+class LP_TRANSFORMATIONS_API MatMulWithDequantizationTransformation : public MatMulTransformation {
+public:
+    OPENVINO_RTTI("MatMulWithDequantizationTransformation", "0");
+    MatMulWithDequantizationTransformation(const Params& params = Params());  // params are passed on to MatMulTransformation
+    // Overrides the MatMulTransformation hook that is declared with an empty default body in the base class.
+protected:
+    void handleDequantization(const std::shared_ptr<ov::opset1::Multiply>& dequantization) const override;
+};
+
+}  // namespace low_precision
+}  // namespace pass
+}  // namespace ov
diff --git a/src/common/low_precision_transformations/src/low_precision.cpp b/src/common/low_precision_transformations/src/low_precision.cpp
@@ -52,7 +52,11 @@
 #include "low_precision/fake_quantize.hpp"
 #include "low_precision/group_convolution.hpp"
 #include "low_precision/interpolate.hpp"
+#ifdef OPENVINO_ARCH_ARM64
+#include "low_precision/mat_mul_with_dequantization.hpp"
+#else
 #include "low_precision/mat_mul.hpp"
+#endif
 #include "low_precision/max_pool.hpp"
 #include "low_precision/multiply_partial.hpp"
 #include "low_precision/mvn.hpp"
@@ -251,7 +255,11 @@ bool ov::pass::low_precision::LowPrecision::run_on_model(const std::shared_ptr<o
     ADD_MATCHER(common, FakeQuantizeTransformation, params)
     ADD_MATCHER(common, InterpolateTransformation, params)
     ADD_MATCHER(common, GroupConvolutionTransformation, params)
+#ifdef OPENVINO_ARCH_ARM64
+    ADD_MATCHER(common, MatMulWithDequantizationTransformation, params)
+#else
     ADD_MATCHER(common, MatMulTransformation, params)
+#endif
     ADD_MATCHER(common, MaxPoolTransformation, params)
     ADD_MATCHER(common, MultiplyPartialTransformation, params)
     ADD_MATCHER(common, MVNTransformation, params)

diff --git a/src/common/low_precision_transformations/src/mat_mul.cpp b/src/common/low_precision_transformations/src/mat_mul.cpp
@@ -12,6 +12,7 @@
 #include "openvino/pass/pattern/op/or.hpp"
 #include "openvino/pass/pattern/op/wrap_type.hpp"
 
+#include "low_precision/rt_info/bias_attribute.hpp"
 #include "low_precision/network_helper.hpp"
 #include "openvino/util/log.hpp"
 #include "itt.hpp"
@@ -176,6 +177,8 @@ bool MatMulTransformation::transform(TransformationContext &context, ov::pass::p
 
     updateOutput(context, newMultiply, newMatMul);
 
+    handleDequantization(newMultiply);
+
     OPENVINO_DEBUG("LPT: done: ", newMatMul);
     return true;
 }

diff --git a/src/common/low_precision_transformations/src/mat_mul_with_dequantization.cpp b/src/common/low_precision_transformations/src/mat_mul_with_dequantization.cpp
@@ -0,0 +1,26 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "low_precision/mat_mul_with_dequantization.hpp"
+
+#include <memory>
+#include "low_precision/rt_info/bias_attribute.hpp"
+
+using namespace ov;
+using namespace ov::pass;
+using namespace ov::pass::low_precision;
+
+MatMulWithDequantizationTransformation::MatMulWithDequantizationTransformation(const Params& params)
+    : MatMulTransformation(params) {}  // construction is fully delegated to the base transformation
+
+void MatMulWithDequantizationTransformation::handleDequantization(const std::shared_ptr<ov::opset1::Multiply>& dequantization) const {
+    // Look for the Constant operand of the Multiply: input 1 is tried first, input 0 is the fallback.
+    const size_t constant_port = is_type<opset1::Constant>(dequantization->get_input_node_shared_ptr(1)) ? 1 : 0;
+    const auto constant = as_type<opset1::Constant>(dequantization->get_input_node_ptr(constant_port));
+    // Mark the Multiply only when that operand really is a Constant holding exactly one element;
+    // in every other case the node is left untouched.
+    if ((constant != nullptr) && (ov::shape_size(constant->get_shape()) == 1ull)) {
+        ov::mark_as_bias(dequantization);
+    }
+}
@@ -17,6 +17,7 @@
 #include "low_precision/common/ie_lpt_exception.hpp"
 #include "low_precision/layer_transformation.hpp"
 #include "low_precision/network_helper.hpp"
+#include "low_precision/rt_info/bias_attribute.hpp"
 #include "low_precision/rt_info/intervals_alignment_attribute.hpp"
 #include "low_precision/rt_info/precision_preserved_attribute.hpp"
 #include "low_precision/rt_info/quantization_alignment_attribute.hpp"
@@ -1192,7 +1193,7 @@ FakeQuantizeDequantization NetworkHelper::getDequantization(const std::shared_pt
     const std::shared_ptr<ov::opset1::Multiply> multiply = ov::as_type_ptr<ov::opset1::Multiply>(dataNode.get_node_shared_ptr());
     std::shared_ptr<ov::opset1::Constant> multiplyConstant;
     if (multiply != nullptr) {
-        if (!FakeQuantizeDequantization::checkShape(multiply)) {
+        if (!FakeQuantizeDequantization::checkShape(multiply) || ov::marked_as_bias(multiply)) {
             return FakeQuantizeDequantization();
         }
 
@@ -1207,6 +1208,9 @@ FakeQuantizeDequantization NetworkHelper::getDequantization(const std::shared_pt
     std::shared_ptr<ov::opset1::Convert> subtractConvert;
     std::shared_ptr<ov::opset1::Constant> subtractConstant;
     if (subtract != nullptr) {
+        if (ov::marked_as_bias(subtract)) {
+            return FakeQuantizeDequantization();
+        }
         if (!FakeQuantizeDequantization::checkShape(subtract)) {
             return FakeQuantizeDequantization(dataNode, nullptr, nullptr, nullptr, nullptr, multiply, multiplyConstant);
         }
@@ -1220,6 +1224,9 @@ FakeQuantizeDequantization NetworkHelper::getDequantization(const std::shared_pt
 
     const std::shared_ptr<ov::opset1::Convert> convert = ov::as_type_ptr<ov::opset1::Convert>(dataNode.get_node_shared_ptr());
     if (convert != nullptr) {
+        if (ov::marked_as_bias(convert)) {
+            return FakeQuantizeDequantization();
+        }
         auto el_type = convert->input(0).get_element_type();
         auto foundIt = std::find(defaultPrecisions.begin(), defaultPrecisions.end(), el_type);
         if (foundIt == defaultPrecisions.end() &&

diff --git a/src/plugins/intel_cpu/src/cpu_memory.cpp b/src/plugins/intel_cpu/src/cpu_memory.cpp
@@ -403,6 +403,7 @@ void DnnlMemoryBlock::notifyUpdate() {
 
 StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const void* data, bool pads_zeroing) :
     m_eng(eng), m_pMemDesc(desc) {
+    OPENVINO_ASSERT(!desc->empty() || (desc->empty() && (data == nullptr)));
     if (desc->getPrecision() == element::string) {
         OPENVINO_THROW("[CPU] StaticMemory object cannot be created for string data.");
     }
@@ -412,7 +413,7 @@ StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const vo
 
     m_size = m_pMemDesc->getCurrentMemSize();
 
-    if (data) {
+    if (data || desc->empty()) {
         m_pMemBlock = std::make_shared<StaticMemoryBlock>(const_cast<void*>(data), m_size);
     } else {
         m_pMemBlock = std::make_shared<StaticMemoryBlock>(m_size);

@@ -38,9 +38,9 @@ static void initACLTensorParams(const MemoryPtr& memoryPtr,
     }
 }
 
-static std::shared_ptr<arm_compute::TensorInfo> initTensorInfo(const arm_compute::TensorShape& tensorShape,
-                              const arm_compute::DataType& dataType,
-                              const arm_compute::DataLayout& dataLayout) {
+std::shared_ptr<arm_compute::TensorInfo> ACLCommonExecutor::initTensorInfo(const arm_compute::TensorShape& tensorShape,
+                                                                           const arm_compute::DataType& dataType,
+                                                                           const arm_compute::DataLayout& dataLayout) {
     std::shared_ptr<arm_compute::TensorInfo> aclMemoryInfo = nullptr;
     if (dataType != arm_compute::DataType::UNKNOWN) {
         aclMemoryInfo = std::make_shared<arm_compute::TensorInfo>(
@@ -72,6 +72,9 @@ bool ACLCommonExecutor::update(const MemoryArgs &memory) {
     ACLTypes   aclDataType{};
     ACLLayouts aclDataLayout{};
     for (auto& cpu_mem_ptr : memory) {
+        if (cpu_mem_ptr.second->getSize() == 0) {
+            continue;
+        }
         const ACLArgs index = argConvert.at(cpu_mem_ptr.first);
         initACLTensorParams(cpu_mem_ptr.second, aclTensorAttrs,
                             aclMemoryShapes[index],

@@ -47,6 +47,11 @@ class ACLCommonExecutor : public Executor {
 
 protected:
     ACLTensorAttrs aclTensorAttrs;
+
+    virtual std::shared_ptr<arm_compute::TensorInfo> initTensorInfo(const arm_compute::TensorShape& tensorShape,
+                                   const arm_compute::DataType& dataType,
+                                   const arm_compute::DataLayout& dataLayout);
+
 private:
     ACLTensors aclMemoryTensors;
     ACLFunction iFunction = nullptr;