From 82124738cbfb54a384e9c9aa98f0a8a5077bff06 Mon Sep 17 00:00:00 2001 From: Edward Shogulin Date: Wed, 26 Jun 2024 01:21:15 +0100 Subject: [PATCH 1/5] [CPU] [ARM] FullyConnected: int8 support --- .../include/low_precision/mat_mul.hpp | 3 + .../mat_mul_with_dequantization.hpp | 33 ++++ .../src/low_precision.cpp | 8 + .../src/mat_mul.cpp | 3 + .../src/mat_mul_with_dequantization.cpp | 26 +++ .../src/network_helper.cpp | 9 +- src/plugins/intel_cpu/src/cpu_memory.cpp | 3 +- .../executors/acl/acl_common_executor.cpp | 9 +- .../executors/acl/acl_common_executor.hpp | 5 + .../executors/acl/acl_lowp_fullyconnected.cpp | 155 ++++++++++++++++++ .../executors/acl/acl_lowp_fullyconnected.hpp | 50 ++++++ .../src/nodes/executors/acl/acl_utils.hpp | 2 +- .../src/nodes/executors/debug_messages.hpp | 1 + .../fullyconnected_implementations.cpp | 36 ++++ .../aarch64/pass/snippets_mark_skipped.cpp | 6 + .../transformation_pipeline.cpp | 4 + .../intel_cpu/tests/functional/CMakeLists.txt | 4 +- .../fully_connected_transformation.cpp | 103 ++++++++++++ .../{ => x64}/add_transformation.cpp | 0 .../assign_and_read_value_transformation.cpp | 0 .../batch_to_space_transformation.cpp | 0 .../{ => x64}/clamp_transformation.cpp | 0 .../{ => x64}/concat_transformation.cpp | 0 .../concat_with_child_and_output.cpp | 0 ...t_with_different_precision_on_children.cpp | 0 ...oncat_with_intermediate_transformation.cpp | 0 .../concat_with_neighbors_transformation.cpp | 0 .../concat_with_split_transformation.cpp | 0 ...nvolution_backprop_data_transformation.cpp | 0 .../convolution_qdq_transformation.cpp | 0 .../{ => x64}/convolution_transformation.cpp | 0 .../depth_to_space_transformation.cpp | 0 ...ntwise_branch_selection_transformation.cpp | 0 ...eliminate_fake_quantize_transformation.cpp | 0 .../fq_and_avg_pool_transformation.cpp | 0 .../fq_and_max_pool_transformation.cpp | 0 ...d_two_output_branches_with_convolution.cpp | 0 .../fq_precision_selection_transformation.cpp | 0 .../{ => x64}/fq_transformation.cpp | 0 .../fq_with_dq_not_optimal_transformation.cpp | 0 .../fully_connected_transformation.cpp | 5 +- .../{ => x64}/fuse_convert_transformation.cpp | 0 .../fuse_dequantize_to_fq_transformation.cpp | 0 ...fuse_fq_and_scale_shift_transformation.cpp | 0 .../fuse_multiply_to_fq_transformation.cpp | 0 .../fuse_subtract_to_fq_transformation.cpp | 0 .../{ => x64}/gather_transformation.cpp | 0 .../{ => x64}/gemm_transformation.cpp | 0 .../group_convolution_transformation.cpp | 0 .../groupconvolution_qdq_transformation.cpp | 0 .../{ => x64}/interpolate_transformation.cpp | 0 .../{ => x64}/mat_mul_transformation.cpp | 0 .../mat_mul_with_constant_transformation.cpp | 0 .../mat_mul_with_optimized_constant_fq.cpp | 0 .../move_fake_quantize_transformation.cpp | 0 .../multiply_to_group_convolution.cpp | 0 .../{ => x64}/multiply_transformation.cpp | 0 .../{ => x64}/multiply_with_one_parent.cpp | 0 .../{ => x64}/mvn_transformation.cpp | 0 .../{ => x64}/normalize_transformation.cpp | 0 .../{ => x64}/output_layers.cpp | 0 .../{ => x64}/output_layers_concat.cpp | 0 .../output_layers_concat_multi_channel.cpp | 0 .../{ => x64}/pad_transformation.cpp | 0 .../{ => x64}/prelu_transformation.cpp | 0 .../pull_reshape_through_dequantization.cpp | 0 .../recurrent_cell_transformation.cpp | 0 .../{ => x64}/reduce_max_transformation.cpp | 0 .../{ => x64}/reduce_mean_transformation.cpp | 0 .../{ => x64}/reduce_min_transformation.cpp | 0 .../{ => x64}/reduce_sum_transformation.cpp | 0 .../{ => x64}/relu_transformation.cpp | 0 .../{ => 
x64}/reshape_transformation.cpp | 0 .../shuffle_channels_transformation.cpp | 0 .../space_to_batch_transformation.cpp | 0 .../{ => x64}/split_transformation.cpp | 0 .../{ => x64}/squeeze_transformation.cpp | 0 .../strided_slice_transformation.cpp | 0 .../subtract_multiply_to_multiply_add.cpp | 0 .../{ => x64}/subtract_transformation.cpp | 0 .../transpose_after_matmul_transformation.cpp | 0 .../{ => x64}/transpose_transformation.cpp | 0 .../{ => x64}/unsqueeze_transformation.cpp | 0 .../variadic_split_transformation.cpp | 0 .../skip_tests_config.cpp | 2 + .../fully_connected_transformation.cpp | 3 +- .../fully_connected_transformation.hpp | 17 +- .../mat_mul_transformation.hpp | 2 + .../fully_connected_transformation.cpp | 49 ++++-- .../mat_mul_transformation.cpp | 21 ++- .../layer_transformation.hpp | 2 + .../layer_transformation.cpp | 16 +- .../include/ov_lpt_models/mat_mul.hpp | 13 +- .../ov_helpers/ov_lpt_models/src/mat_mul.cpp | 143 +++++++++++++--- 94 files changed, 671 insertions(+), 62 deletions(-) create mode 100644 src/common/low_precision_transformations/include/low_precision/mat_mul_with_dequantization.hpp create mode 100644 src/common/low_precision_transformations/src/mat_mul_with_dequantization.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/fully_connected_transformation.cpp rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/add_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/assign_and_read_value_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/batch_to_space_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/clamp_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/concat_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/concat_with_child_and_output.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/concat_with_different_precision_on_children.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/concat_with_intermediate_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/concat_with_neighbors_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/concat_with_split_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/convolution_backprop_data_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/convolution_qdq_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/convolution_transformation.cpp (100%) rename 
src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/depth_to_space_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/elementwise_branch_selection_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/eliminate_fake_quantize_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/fq_and_avg_pool_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/fq_and_max_pool_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/fq_and_two_output_branches_with_convolution.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/fq_precision_selection_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/fq_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/fq_with_dq_not_optimal_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/fully_connected_transformation.cpp (86%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/fuse_convert_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/fuse_dequantize_to_fq_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/fuse_fq_and_scale_shift_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/fuse_multiply_to_fq_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/fuse_subtract_to_fq_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/gather_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/gemm_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/group_convolution_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/groupconvolution_qdq_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/interpolate_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/mat_mul_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/mat_mul_with_constant_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/mat_mul_with_optimized_constant_fq.cpp (100%) rename 
src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/move_fake_quantize_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/multiply_to_group_convolution.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/multiply_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/multiply_with_one_parent.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/mvn_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/normalize_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/output_layers.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/output_layers_concat.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/output_layers_concat_multi_channel.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/pad_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/prelu_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/pull_reshape_through_dequantization.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/recurrent_cell_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/reduce_max_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/reduce_mean_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/reduce_min_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/reduce_sum_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/relu_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/reshape_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/shuffle_channels_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/space_to_batch_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/split_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/squeeze_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/strided_slice_transformation.cpp (100%) rename 
src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/subtract_multiply_to_multiply_add.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/subtract_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/transpose_after_matmul_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/transpose_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/unsqueeze_transformation.cpp (100%) rename src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/{ => x64}/variadic_split_transformation.cpp (100%) diff --git a/src/common/low_precision_transformations/include/low_precision/mat_mul.hpp b/src/common/low_precision_transformations/include/low_precision/mat_mul.hpp index bc0077a716f701..1c1e32f1ee1a15 100644 --- a/src/common/low_precision_transformations/include/low_precision/mat_mul.hpp +++ b/src/common/low_precision_transformations/include/low_precision/mat_mul.hpp @@ -26,6 +26,9 @@ class LP_TRANSFORMATIONS_API MatMulTransformation : public LayerTransformation { bool transform(TransformationContext &context, ov::pass::pattern::Matcher &m) override; bool isPrecisionPreserved(std::shared_ptr<Node> layer) const noexcept override; bool canBeTransformed(const TransformationContext& context, std::shared_ptr<Node> layer) const override; + +protected: + virtual void handleDequantization(const std::shared_ptr& dequantization) const {} }; } // namespace low_precision diff --git a/src/common/low_precision_transformations/include/low_precision/mat_mul_with_dequantization.hpp b/src/common/low_precision_transformations/include/low_precision/mat_mul_with_dequantization.hpp new file mode 100644 index 00000000000000..4913d1cf1057f4 --- /dev/null +++ b/src/common/low_precision_transformations/include/low_precision/mat_mul_with_dequantization.hpp @@ -0,0 +1,33 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include <memory> +#include "mat_mul.hpp" + +namespace ov { +namespace pass { +namespace low_precision { + +/** + * @ingroup ov_transformation_common_api + * @brief MatMulWithDequantizationTransformation propagates dequantization operations through the MatMul operation and keeps the dequantization as is. + * + * For more details about the transformation, refer to + * [MatMulWithDequantizationTransformation](@ref openvino_docs_OV_UG_lpt_MatMulWithDequantizationTransformation) page + * in the OpenVINO Developer Guide. 
+ */ +class LP_TRANSFORMATIONS_API MatMulWithDequantizationTransformation : public MatMulTransformation { +public: + OPENVINO_RTTI("MatMulWithDequantizationTransformation", "0"); + MatMulWithDequantizationTransformation(const Params& params = Params()); + +protected: + void handleDequantization(const std::shared_ptr& dequantization) const override; +}; + +} // namespace low_precision +} // namespace pass +} // namespace ov diff --git a/src/common/low_precision_transformations/src/low_precision.cpp b/src/common/low_precision_transformations/src/low_precision.cpp index e58374ed3e2b1a..246f60b2783b6f 100644 --- a/src/common/low_precision_transformations/src/low_precision.cpp +++ b/src/common/low_precision_transformations/src/low_precision.cpp @@ -52,7 +52,11 @@ #include "low_precision/fake_quantize.hpp" #include "low_precision/group_convolution.hpp" #include "low_precision/interpolate.hpp" +#ifdef OPENVINO_ARCH_ARM64 +#include "low_precision/mat_mul_with_dequantization.hpp" +#else #include "low_precision/mat_mul.hpp" +#endif #include "low_precision/max_pool.hpp" #include "low_precision/multiply_partial.hpp" #include "low_precision/mvn.hpp" @@ -251,7 +255,11 @@ bool ov::pass::low_precision::LowPrecision::run_on_model(const std::shared_ptr +#include "low_precision/rt_info/bias_attribute.hpp" + +using namespace ov; +using namespace ov::pass; +using namespace ov::pass::low_precision; + +MatMulWithDequantizationTransformation::MatMulWithDequantizationTransformation(const Params& params) : MatMulTransformation(params) { +} + +void MatMulWithDequantizationTransformation::handleDequantization(const std::shared_ptr& dequantization) const { + const auto& dequantization_constant = is_type<ov::opset1::Constant>(dequantization->get_input_node_shared_ptr(1)) ? + as_type<ov::opset1::Constant>(dequantization->get_input_node_ptr(1)) : + as_type<ov::opset1::Constant>(dequantization->get_input_node_ptr(0)); + if ((dequantization_constant == nullptr) || (ov::shape_size(dequantization_constant->get_shape()) != 1ull)) { + return; + } + + ov::mark_as_bias(dequantization); +} diff --git a/src/common/low_precision_transformations/src/network_helper.cpp b/src/common/low_precision_transformations/src/network_helper.cpp index 1aebfb111d7892..c4afc48e4360f7 100644 --- a/src/common/low_precision_transformations/src/network_helper.cpp +++ b/src/common/low_precision_transformations/src/network_helper.cpp @@ -17,6 +17,7 @@ #include "low_precision/common/ie_lpt_exception.hpp" #include "low_precision/layer_transformation.hpp" #include "low_precision/network_helper.hpp" +#include "low_precision/rt_info/bias_attribute.hpp" #include "low_precision/rt_info/intervals_alignment_attribute.hpp" #include "low_precision/rt_info/precision_preserved_attribute.hpp" #include "low_precision/rt_info/quantization_alignment_attribute.hpp" @@ -1192,7 +1193,7 @@ FakeQuantizeDequantization NetworkHelper::getDequantization(const std::shared_pt const std::shared_ptr<ov::opset1::Multiply> multiply = ov::as_type_ptr<ov::opset1::Multiply>(dataNode.get_node_shared_ptr()); std::shared_ptr<ov::opset1::Constant> multiplyConstant; if (multiply != nullptr) { - if (!FakeQuantizeDequantization::checkShape(multiply)) { + if (!FakeQuantizeDequantization::checkShape(multiply) || ov::marked_as_bias(multiply)) { return FakeQuantizeDequantization(); } @@ -1207,6 +1208,9 @@ FakeQuantizeDequantization NetworkHelper::getDequantization(const std::shared_pt std::shared_ptr<ov::opset1::Convert> subtractConvert; std::shared_ptr<ov::opset1::Constant> subtractConstant; if (subtract != nullptr) { + if (ov::marked_as_bias(subtract)) { + return FakeQuantizeDequantization(); + } if (!FakeQuantizeDequantization::checkShape(subtract)) { return 
FakeQuantizeDequantization(dataNode, nullptr, nullptr, nullptr, nullptr, multiply, multiplyConstant); } @@ -1220,6 +1224,9 @@ FakeQuantizeDequantization NetworkHelper::getDequantization(const std::shared_pt const std::shared_ptr convert = ov::as_type_ptr(dataNode.get_node_shared_ptr()); if (convert != nullptr) { + if (ov::marked_as_bias(convert)) { + return FakeQuantizeDequantization(); + } auto el_type = convert->input(0).get_element_type(); auto foundIt = std::find(defaultPrecisions.begin(), defaultPrecisions.end(), el_type); if (foundIt == defaultPrecisions.end() && diff --git a/src/plugins/intel_cpu/src/cpu_memory.cpp b/src/plugins/intel_cpu/src/cpu_memory.cpp index 8e5fe8d72fd1f2..4fed8e2ef860b2 100644 --- a/src/plugins/intel_cpu/src/cpu_memory.cpp +++ b/src/plugins/intel_cpu/src/cpu_memory.cpp @@ -403,6 +403,7 @@ void DnnlMemoryBlock::notifyUpdate() { StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const void* data, bool pads_zeroing) : m_eng(eng), m_pMemDesc(desc) { + OPENVINO_ASSERT(!desc->empty() || (desc->empty() && (data == nullptr))); if (desc->getPrecision() == element::string) { OPENVINO_THROW("[CPU] StaticMemory object cannot be created for string data."); } @@ -412,7 +413,7 @@ StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const vo m_size = m_pMemDesc->getCurrentMemSize(); - if (data) { + if (data || desc->empty()) { m_pMemBlock = std::make_shared(const_cast(data), m_size); } else { m_pMemBlock = std::make_shared(m_size); diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp index 5779147a5b3352..32e6e29736ca1b 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp @@ -38,9 +38,9 @@ static void initACLTensorParams(const MemoryPtr& memoryPtr, } } -static std::shared_ptr initTensorInfo(const arm_compute::TensorShape& tensorShape, - const arm_compute::DataType& dataType, - const arm_compute::DataLayout& dataLayout) { +std::shared_ptr ACLCommonExecutor::initTensorInfo(const arm_compute::TensorShape& tensorShape, + const arm_compute::DataType& dataType, + const arm_compute::DataLayout& dataLayout) { std::shared_ptr aclMemoryInfo = nullptr; if (dataType != arm_compute::DataType::UNKNOWN) { aclMemoryInfo = std::make_shared( @@ -72,6 +72,9 @@ bool ACLCommonExecutor::update(const MemoryArgs &memory) { ACLTypes aclDataType{}; ACLLayouts aclDataLayout{}; for (auto& cpu_mem_ptr : memory) { + if (cpu_mem_ptr.second->getSize() == 0) { + continue; + } const ACLArgs index = argConvert.at(cpu_mem_ptr.first); initACLTensorParams(cpu_mem_ptr.second, aclTensorAttrs, aclMemoryShapes[index], diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp index 854130d6f884bb..97d632b8af1982 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp @@ -47,6 +47,11 @@ class ACLCommonExecutor : public Executor { protected: ACLTensorAttrs aclTensorAttrs; + + virtual std::shared_ptr initTensorInfo(const arm_compute::TensorShape& tensorShape, + const arm_compute::DataType& dataType, + const arm_compute::DataLayout& dataLayout); + private: ACLTensors aclMemoryTensors; ACLFunction iFunction = nullptr; diff --git 
a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp new file mode 100644 index 00000000000000..9a9a2832063be9 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp @@ -0,0 +1,155 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "acl_lowp_fullyconnected.hpp" + +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" + +#include "nodes/executors/acl/acl_utils.hpp" +#include "nodes/executors/executor.hpp" +#include "nodes/executors/memory_arguments.hpp" +#include "utils/debug_capabilities.h" +#include "nodes/executors/debug_messages.hpp" +#include "nodes/executors/implementation_utils.hpp" +#include "acl_weights.hpp" +#include "acl_utils.hpp" + +namespace ov { +namespace intel_cpu { + +static bool checkPostOps(const PostOps &postOps) { + if (postOps.empty()) { + return true; + } + + if (postOps.size() != 1) { + return false; + } + + const auto activation = std::dynamic_pointer_cast<ActivationPostOp>(postOps[0]); + return checkActivationLayerInfo(convertToEltwiseAlgorithm(activation->type())); +} + +void initFCAttrs(const FCAttrs &attrs, + ACLTensorAttrs& aclTensorAttrs, + ACLFCAttrs& aclfcAttrs, + const MemoryArgs &memory, + arm_compute::GEMMInfo& gemmInfo, + const PostOps &postOps) { + aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc); + // TODO: not completed + //fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr()); + aclfcAttrs.inputPrecision = memory.at(ARG_SRC)->getDescPtr()->getPrecision(); + // TODO: not completed + //fullyConnectedLayerInfo.transpose_weights = false; + gemmInfo.set_pretranspose_A(false); + gemmInfo.set_pretranspose_B(false); + aclfcAttrs.weightsNonTransposed = attrs.weightsNonTransposed; + + if (!postOps.empty()) { + auto activation = std::dynamic_pointer_cast<ActivationPostOp>(postOps[0]); + gemmInfo.set_activation_info(getActivationLayerInfo( + convertToEltwiseAlgorithm(activation->type()), + activation->alpha(), activation->beta(), activation->gamma())); + } + + if (memory.at(ARG_SRC)->getPrecision() != memory.at(ARG_WEI)->getPrecision()) { + aclfcAttrs.isConvertedWeights = true; + } +} + +ACLLowpFullyConnectedExecutor::ACLLowpFullyConnectedExecutor(const FCAttrs &attrs, + const PostOps &postOps, + const MemoryArgs &memory, + const ExecutorContext::CPtr& context) : dequantizationScales(attrs.dequantizationScales) { + initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, gemmInfo, postOps); + packedWeights = prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps); +} + +bool ACLLowpFullyConnectedExecutor::supports(const FCConfig &config) { + const auto src0 = srcType(config); + const auto src1 = weiType(config); + const auto dst = dstType(config); + if ((src0 != ov::element::i8) || (src1 != ov::element::i8) || (dst != ov::element::f32)) { + return false; + } + + VERIFY(checkPostOps(config.postOps), UNSUPPORTED_TYPE_OF_POSTOPS); + VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK); + VERIFY(one_of(weiRank(config), 2U, 3U, 4U), UNSUPPORTED_WEI_RANK); + VERIFY(static_cast(config.attrs).dequantizationScales.size() <= 1, UNSUPPORTED_PER_CHANNEL_QUANTIZATION); + return true; +} + +void ACLLowpFullyConnectedExecutor::updateTensorsShapes(ACLShapes& aclMemoryShapes) { + updateFCTensorsShapes(aclMemoryShapes); +} + +arm_compute::Status 
ACLLowpFullyConnectedExecutor::validateTensorsInfo(const ACLInfos & aclMemoryInfos) { + // TODO: debug only + //const auto src0 = aclMemoryInfos[ACLArgs::ACL_SRC_0].get(); + //const auto src1 = aclMemoryInfos[ACLArgs::ACL_WEI].get(); + //const auto dst = aclMemoryInfos[ACLArgs::ACL_DST].get(); + + auto &tensor_info = aclMemoryInfos[ACLArgs::ACL_SRC_0]; + if (dequantizationScales.empty()) { + tensor_info->set_quantization_info(arm_compute::QuantizationInfo(1.f)); + } else { + tensor_info->set_quantization_info(arm_compute::QuantizationInfo(dequantizationScales[0])); + } + + auto& tensor_info_weights = aclMemoryInfos[ACLArgs::ACL_WEI]; + tensor_info_weights->set_quantization_info(arm_compute::QuantizationInfo(1.f)); + + const auto matMulValid = arm_compute::NEGEMMLowpMatrixMultiplyCore::validate( + aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_WEI].get(), + nullptr, //aclMemoryInfos[ACLArgs::ACL_BIAS].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get(), + gemmInfo); + return matMulValid; +} + +ACLFunction ACLLowpFullyConnectedExecutor::configureFunction(const ACLTensors & aclMemoryTensors) { + auto gemm = std::make_unique(); + gemm->configure( + aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_WEI].get(), + aclMemoryTensors[ACLArgs::ACL_BIAS].get(), + aclMemoryTensors.at(ACLArgs::ACL_DST).get(), + gemmInfo); + + if (aclfcAttrs.isConvertedWeights || !aclfcAttrs.weightsNonTransposed) { + aclTensorAttrs.memoryUsageIndicator[ACLArgs::ACL_WEI] = false; + aclMemoryTensors[ACLArgs::ACL_WEI]->allocator()->import_memory(packedWeights->getData()); + } + return gemm; +} + +// TODO: move to ACLLowpExecutor +std::shared_ptr ACLLowpFullyConnectedExecutor::initTensorInfo( + const arm_compute::TensorShape& tensorShape, + const arm_compute::DataType& dataType, + const arm_compute::DataLayout& dataLayout) { + arm_compute::DataType result; + switch (dataType) { + case arm_compute::DataType::S8: { + result = arm_compute::DataType::QASYMM8_SIGNED; + break; + } + case arm_compute::DataType::U8: { + result = arm_compute::DataType::QASYMM8; + break; + } + default: { + result = dataType; + break; + } + } + + return ACLCommonExecutor::initTensorInfo(tensorShape, result, dataLayout); +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp new file mode 100644 index 00000000000000..4d58c7908e61c4 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp @@ -0,0 +1,50 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "acl_common_executor.hpp" +#include "nodes/executors/fullyconnected_config.hpp" +#include "acl_weights.hpp" + +namespace ov { +namespace intel_cpu { + +class ACLLowpFullyConnectedExecutor : public ACLCommonExecutor { +public: + ACLLowpFullyConnectedExecutor(const FCAttrs& attrs, + const PostOps& postOps, + const MemoryArgs& memory, + const ExecutorContext::CPtr& context); + + static bool supports(const FCConfig& config); + + void updateTensorsShapes(ACLShapes& aclMemoryShapes) override; + + arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; + + ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; + + impl_desc_type implType() const override { + return impl_desc_type::gemm_acl; + } + +protected: + std::shared_ptr initTensorInfo(const 
arm_compute::TensorShape& tensorShape, + const arm_compute::DataType& dataType, + const arm_compute::DataLayout& dataLayout) override; + +private: + arm_compute::GEMMInfo gemmInfo; + arm_compute::WeightsInfo weightsInfo; + + MemoryCPtr packedWeights; + ACLFCAttrs aclfcAttrs; + std::vector dequantizationScales; +}; + +using ACLLowpFullyConnectedExecutorPtr = std::shared_ptr; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp index a3d151192e601b..e20ba4f9283077 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp @@ -110,7 +110,7 @@ inline int axisCast(const std::size_t axis, const std::size_t shapeSize, ACLAxis * @param precision precision to be converted * @return ComputeLibrary DataType or UNKNOWN if precision is not mapped to DataType */ -inline arm_compute::DataType precisionToAclDataType(ov::element::Type precision) { +inline arm_compute::DataType precisionToAclDataType(const ov::element::Type& precision) { switch (precision) { case ov::element::i8: return arm_compute::DataType::S8; case ov::element::u8: return arm_compute::DataType::U8; diff --git a/src/plugins/intel_cpu/src/nodes/executors/debug_messages.hpp b/src/plugins/intel_cpu/src/nodes/executors/debug_messages.hpp index 26ae6ace59631b..206842014365a0 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/debug_messages.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/debug_messages.hpp @@ -18,6 +18,7 @@ #define UNSUPPORTED_DST_RANK " unsupported dst rank" #define UNSUPPORTED_DST_STRIDES " unsupported dst strides" #define HEURISTICS_MISMATCH " heuristics mismatch" +#define UNSUPPORTED_PER_CHANNEL_QUANTIZATION " unsupported per-channel quantization" #define VERIFY(condition, ...) 
\ do { \ diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp index 5a8b1ef78b6dbb..92b721726a9e2f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp @@ -30,6 +30,7 @@ #if defined(OV_CPU_WITH_ACL) #include "nodes/executors/acl/acl_fullyconnected.hpp" +#include "nodes/executors/acl/acl_lowp_fullyconnected.hpp" #endif #if defined(OV_CPU_WITH_SHL) @@ -87,6 +88,11 @@ static const TypeMapping aclFCTypeMapping { {{_any, _any, _any, _any}, pt(just(), just(), just(), just())} }; +static const TypeMapping aclLowpFCTypeMapping { + // {src, wei, bia, dst} pt + {{_i8, _i8, _any, _f32}, pt(just(), just(), just(), just())} +}; + static const MappingNotation dnnlConvolutionMappingNotation { ARG_SRC, ARG_WEI, ARG_BIAS, ARG_DST }; @@ -370,6 +376,36 @@ const std::vector>& getImplementations() { const ExecutorContext::CPtr context) { return std::make_shared(attrs, postOps, memory, context); }) + OV_CPU_INSTANCE_ACL( + "fullyconnected_acl_lowp", + ExecutorType::Acl, + OperationType::FullyConnected, + ShapeTolerance::Agnostic, + // supports + [](const FCConfig& config) -> bool { + VERIFY(noSparseDecompression(config), UNSUPPORTED_SPARSE_WEIGHTS); + VERIFY(noWeightsDecompression(config), UNSUPPORTED_WEIGHTS_DECOMPRESSION); + return ACLLowpFullyConnectedExecutor::supports(config); + }, + // requiresFallback + [](const FCConfig& config) -> ov::optional> { + return requiresFallbackCommon(config, + aclLowpFCTypeMapping, + aclFCLayoutConfig, + aclFullyConnectedMappingNotation); + }, + // acceptsShapes + [](const MemoryArgs& memory) -> bool { + // @todo create syntactic sugar (functor) for shape agnostic lambda + return true; + }, + // create + [](const FCAttrs& attrs, + const PostOps& postOps, + const MemoryArgs& memory, + const ExecutorContext::CPtr context) { + return std::make_shared(attrs, postOps, memory, context); + }) OV_CPU_INSTANCE_SHL( "fullyconnected_shl", ExecutorType::Shl, diff --git a/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp b/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp index c38d088ef95e7b..7343497e855f5b 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp @@ -7,6 +7,7 @@ #include "snippets/op/subgraph.hpp" #include "snippets/utils/utils.hpp" +#include "low_precision/rt_info/bias_attribute.hpp" #include "transformations/utils/utils.hpp" #include "transformations/utils.hpp" #include "utils/general_utils.h" @@ -227,6 +228,11 @@ bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr &m) { for (auto &node : m->get_ordered_ops()) { if (is_skipped_op(node)) continue; + + if (ov::marked_as_bias(node)) { + SetNodeFusingType(node, NodeFusingType::FusedWithMisc); + } + if (isSuitableConvolutionParent(node)) { // Initiate fusing chain SetNodeFusingType(node, NodeFusingType::FusedWithConvolution); diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 38649b2906e9e3..9215906925b46e 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ 
b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -725,7 +725,11 @@ void Transformations::Lpt(const std::vector& defaultPrecision }), PrecisionsRestriction::create({ {{0}, {ov::element::u8, ov::element::i8}}, +#if defined(OPENVINO_ARCH_ARM64) + {{1}, {ov::element::u8, ov::element::i8}} +#else {{1}, {ov::element::i8}} +#endif }), PrecisionsRestriction::create({ {{0, 1}, {ov::element::u8}} diff --git a/src/plugins/intel_cpu/tests/functional/CMakeLists.txt b/src/plugins/intel_cpu/tests/functional/CMakeLists.txt index 3092356e1189b6..c5ab0f6c5ede57 100644 --- a/src/plugins/intel_cpu/tests/functional/CMakeLists.txt +++ b/src/plugins/intel_cpu/tests/functional/CMakeLists.txt @@ -59,6 +59,7 @@ if(NOT (ARM OR AARCH64)) ${CMAKE_CURRENT_SOURCE_DIR}/custom/single_layer_tests/instances/arm ${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src/arm ${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/snippets/arm + ${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/low_precision_transformations/arm ${CMAKE_CURRENT_SOURCE_DIR}/utils/arm) else() # temporary disable all custom tests for ARM @@ -81,7 +82,8 @@ endif() if(NOT X86_64) list(APPEND EXCLUDED_SOURCE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/custom/single_layer_tests/instances/x64 - ${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src/x64) + ${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src/x64 + ${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/low_precision_transformations/x64) endif() ov_add_test_target( diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/fully_connected_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/fully_connected_transformation.cpp new file mode 100644 index 00000000000000..f764ea3ca1156e --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/fully_connected_transformation.cpp @@ -0,0 +1,103 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "low_precision_transformations/fully_connected_transformation.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; + +namespace { +const std::vector netPrecisions = { + ov::element::f32 +}; + +const std::vector shapes = { + { + ov::PartialShape{ 1, 16 }, + ov::PartialShape{ 16, 8 }, + false, + false + }, + { + ov::PartialShape{ 1, 1, 16 }, + ov::PartialShape{ 1, 16, 8 }, + false, + false + }, + { + ov::PartialShape{ 1, 16 }, + ov::PartialShape{ 8, 16 }, + false, + true + }, + { + ov::PartialShape{ 1, 1, 16 }, + ov::PartialShape{ 1, 8, 16 }, + false, + true + }, + { + ov::PartialShape{ 16, 1 }, + ov::PartialShape{ 16, 8 }, + true, + false + }, + { + ov::PartialShape{ 1, 16, 1 }, + ov::PartialShape{ 1, 16, 8 }, + true, + false + }, + { + ov::PartialShape{ 16, 1 }, + ov::PartialShape{ 8, 16 }, + true, + true + }, + { + ov::PartialShape{ 1, 16, 1 }, + ov::PartialShape{ 1, 8, 16 }, + true, + true + } +}; + +const std::vector trasformationParamValues = { + LayerTestsUtils::LayerTransformationParamsNGraphFactory::createParams() +}; + +const std::vector activations = { + { + true, // activation + false, // per-channel + true, // FQ + "fullyConnected,fullyConnected/DequantizationMultiply,relu" + }, + { + false, // activation + false, // per-channel + true, // FQ + "fullyConnected_original,fullyConnected" + }, + { + true, // activation + true, // per-channel + false, // FQ + 
"fullyConnected,relu_original" // dequantization is not supported for per-channel quantization + }, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_LPT, FullyConnectedTransformation, + ::testing::Combine( + ::testing::ValuesIn(netPrecisions), + ::testing::ValuesIn(shapes), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::ValuesIn(trasformationParamValues), + ::testing::ValuesIn({ov::element::i8 /*, ov::element::u8*/}), + ::testing::ValuesIn(activations), + ::testing::Values("gemm_acl_i8")), + FullyConnectedTransformation::getTestCaseName); +} // namespace diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/add_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/add_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/add_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/add_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/assign_and_read_value_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/assign_and_read_value_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/assign_and_read_value_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/assign_and_read_value_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/batch_to_space_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/batch_to_space_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/batch_to_space_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/batch_to_space_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/clamp_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/clamp_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/clamp_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/clamp_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/concat_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/concat_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/concat_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/concat_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/concat_with_child_and_output.cpp 
b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/concat_with_child_and_output.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/concat_with_child_and_output.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/concat_with_child_and_output.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/concat_with_different_precision_on_children.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/concat_with_different_precision_on_children.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/concat_with_different_precision_on_children.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/concat_with_different_precision_on_children.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/concat_with_intermediate_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/concat_with_intermediate_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/concat_with_intermediate_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/concat_with_intermediate_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/concat_with_neighbors_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/concat_with_neighbors_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/concat_with_neighbors_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/concat_with_neighbors_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/concat_with_split_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/concat_with_split_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/concat_with_split_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/concat_with_split_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/convolution_backprop_data_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/convolution_backprop_data_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/convolution_backprop_data_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/convolution_backprop_data_transformation.cpp diff --git 
a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/convolution_qdq_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/convolution_qdq_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/convolution_qdq_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/convolution_qdq_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/convolution_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/convolution_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/convolution_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/convolution_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/depth_to_space_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/depth_to_space_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/depth_to_space_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/depth_to_space_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/elementwise_branch_selection_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/elementwise_branch_selection_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/elementwise_branch_selection_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/elementwise_branch_selection_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/eliminate_fake_quantize_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/eliminate_fake_quantize_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/eliminate_fake_quantize_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/eliminate_fake_quantize_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_and_avg_pool_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fq_and_avg_pool_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_and_avg_pool_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fq_and_avg_pool_transformation.cpp diff --git 
a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_and_max_pool_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fq_and_max_pool_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_and_max_pool_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fq_and_max_pool_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_and_two_output_branches_with_convolution.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fq_and_two_output_branches_with_convolution.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_and_two_output_branches_with_convolution.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fq_and_two_output_branches_with_convolution.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_precision_selection_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fq_precision_selection_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_precision_selection_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fq_precision_selection_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fq_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fq_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_with_dq_not_optimal_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fq_with_dq_not_optimal_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_with_dq_not_optimal_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fq_with_dq_not_optimal_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fully_connected_transformation.cpp similarity index 86% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fully_connected_transformation.cpp index 0368215a5cf5a4..ed9b51d141b59c 100644 --- 
a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fully_connected_transformation.cpp @@ -44,6 +44,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_LPT, FullyConnectedTransformation, ::testing::ValuesIn(netPrecisions), ::testing::ValuesIn(shapes), ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::ValuesIn(trasformationParamValues)), + ::testing::ValuesIn(trasformationParamValues), + ::testing::ValuesIn({ov::element::i8, ov::element::u8}), + ::testing::ValuesIn({true, false}), + ::testing::Values("")), FullyConnectedTransformation::getTestCaseName); } // namespace diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fuse_convert_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fuse_convert_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fuse_convert_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fuse_convert_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fuse_dequantize_to_fq_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fuse_dequantize_to_fq_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fuse_dequantize_to_fq_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fuse_dequantize_to_fq_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fuse_fq_and_scale_shift_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fuse_fq_and_scale_shift_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fuse_fq_and_scale_shift_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fuse_fq_and_scale_shift_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fuse_multiply_to_fq_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fuse_multiply_to_fq_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fuse_multiply_to_fq_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fuse_multiply_to_fq_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fuse_subtract_to_fq_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fuse_subtract_to_fq_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fuse_subtract_to_fq_transformation.cpp 
rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fuse_subtract_to_fq_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/gather_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/gather_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/gather_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/gather_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/gemm_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/gemm_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/gemm_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/gemm_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/group_convolution_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/group_convolution_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/group_convolution_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/group_convolution_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/groupconvolution_qdq_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/groupconvolution_qdq_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/groupconvolution_qdq_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/groupconvolution_qdq_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/interpolate_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/interpolate_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/interpolate_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/interpolate_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/mat_mul_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/mat_mul_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/mat_mul_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/mat_mul_transformation.cpp diff --git 
a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/mat_mul_with_constant_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/mat_mul_with_constant_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/mat_mul_with_constant_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/mat_mul_with_constant_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fq.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/mat_mul_with_optimized_constant_fq.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fq.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/mat_mul_with_optimized_constant_fq.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/move_fake_quantize_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/move_fake_quantize_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/move_fake_quantize_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/move_fake_quantize_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/multiply_to_group_convolution.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/multiply_to_group_convolution.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/multiply_to_group_convolution.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/multiply_to_group_convolution.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/multiply_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/multiply_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/multiply_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/multiply_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/multiply_with_one_parent.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/multiply_with_one_parent.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/multiply_with_one_parent.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/multiply_with_one_parent.cpp diff --git 
a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/mvn_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/mvn_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/mvn_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/mvn_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/normalize_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/normalize_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/normalize_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/normalize_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/output_layers.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/output_layers.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/output_layers.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/output_layers.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/output_layers_concat.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/output_layers_concat.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/output_layers_concat.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/output_layers_concat.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/output_layers_concat_multi_channel.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/output_layers_concat_multi_channel.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/output_layers_concat_multi_channel.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/output_layers_concat_multi_channel.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/pad_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/pad_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/pad_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/pad_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/prelu_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/prelu_transformation.cpp similarity index 100% rename from 
src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/prelu_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/prelu_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/pull_reshape_through_dequantization.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/pull_reshape_through_dequantization.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/pull_reshape_through_dequantization.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/pull_reshape_through_dequantization.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/recurrent_cell_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/recurrent_cell_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/recurrent_cell_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/recurrent_cell_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/reduce_max_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/reduce_max_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/reduce_max_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/reduce_max_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/reduce_mean_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/reduce_mean_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/reduce_mean_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/reduce_mean_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/reduce_min_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/reduce_min_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/reduce_min_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/reduce_min_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/reduce_sum_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/reduce_sum_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/reduce_sum_transformation.cpp rename to 
src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/reduce_sum_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/relu_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/relu_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/relu_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/relu_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/reshape_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/reshape_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/reshape_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/reshape_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/shuffle_channels_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/shuffle_channels_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/shuffle_channels_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/shuffle_channels_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/space_to_batch_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/space_to_batch_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/space_to_batch_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/space_to_batch_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/split_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/split_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/split_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/split_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/squeeze_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/squeeze_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/squeeze_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/squeeze_transformation.cpp diff --git 
a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/strided_slice_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/strided_slice_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/strided_slice_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/strided_slice_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/subtract_multiply_to_multiply_add.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/subtract_multiply_to_multiply_add.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/subtract_multiply_to_multiply_add.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/subtract_multiply_to_multiply_add.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/subtract_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/subtract_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/subtract_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/subtract_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/transpose_after_matmul_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/transpose_after_matmul_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/transpose_after_matmul_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/transpose_after_matmul_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/transpose_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/transpose_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/transpose_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/transpose_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/unsqueeze_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/unsqueeze_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/unsqueeze_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/unsqueeze_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/variadic_split_transformation.cpp 
b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/variadic_split_transformation.cpp similarity index 100% rename from src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/variadic_split_transformation.cpp rename to src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/variadic_split_transformation.cpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index dd6cc034cf31b4..7eb30b70bc3f8b 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -479,8 +479,10 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(smoke_TestsDFT_(1|2|3|4)d/DFTLayerTest.Inference.*)"); // Issue 88764, 91647, 108802: accuracy issue retVector.emplace_back(R"(MultipleLSTMCellTest/MultipleLSTMCellTest.CompareWithRefs.*)"); +#if !defined(OPENVINO_ARCH_ARM64) // int8 / code-generation specific retVector.emplace_back(R"(smoke_LPT.*)"); +#endif // Compressed weights are not supported retVector.emplace_back(R"(smoke_MatMulCompressedWeights.*)"); retVector.emplace_back(R"(smoke_MatMulSharedCompressedWeights.*)"); diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp index 71978473696a0b..67f83c29968b51 100644 --- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp +++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp @@ -45,6 +45,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_LPT, FullyConnectedTransformation, ::testing::ValuesIn(netPrecisions), ::testing::ValuesIn(shapes), ::testing::Values(ov::test::utils::DEVICE_GPU), - ::testing::ValuesIn(trasformationParamValues)), + ::testing::ValuesIn(trasformationParamValues), + ::testing::ValuesIn({ov::element::i8, ov::element::u8})), FullyConnectedTransformation::getTestCaseName); } // namespace diff --git a/src/tests/functional/plugin/shared/include/low_precision_transformations/fully_connected_transformation.hpp b/src/tests/functional/plugin/shared/include/low_precision_transformations/fully_connected_transformation.hpp index 731ce44224e33b..232df8f7dd49c4 100644 --- a/src/tests/functional/plugin/shared/include/low_precision_transformations/fully_connected_transformation.hpp +++ b/src/tests/functional/plugin/shared/include/low_precision_transformations/fully_connected_transformation.hpp @@ -8,7 +8,7 @@ #include #include "shared_test_classes/base/low_precision_transformations/layer_transformation.hpp" -class MatMulShapes { +class FullyConnectedShapes { public: ov::PartialShape inputA; ov::PartialShape inputB; @@ -16,11 +16,22 @@ class MatMulShapes { bool transposeB; }; +class FullyConnectedParams { +public: + bool activation; + bool perChannelWeights; + bool fq; + std::string originalLayersNames; +}; + typedef std::tuple< ov::element::Type, - MatMulShapes, + FullyConnectedShapes, std::string, - ov::pass::low_precision::LayerTransformation::Params> FullyConnectedTransformationParams; + 
ov::pass::low_precision::LayerTransformation::Params, + ov::element::Type, + FullyConnectedParams, + std::string> FullyConnectedTransformationParams; namespace LayerTestsDefinitions { diff --git a/src/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_transformation.hpp b/src/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_transformation.hpp index 3d394c1e45674c..74dd7450130360 100644 --- a/src/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_transformation.hpp +++ b/src/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_transformation.hpp @@ -19,8 +19,10 @@ class MatMulTransformationTestValues { ov::builder::subgraph::FakeQuantizeOnData fqOnData1; ov::Shape inputShape2; ov::builder::subgraph::FakeQuantizeOnData fqOnData2; + // TODO: remove, not used std::string expectedKernelName; std::string expectedRuntimePrecision; + bool requantization; }; typedef std::tuple< diff --git a/src/tests/functional/plugin/shared/src/low_precision_transformations/fully_connected_transformation.cpp b/src/tests/functional/plugin/shared/src/low_precision_transformations/fully_connected_transformation.cpp index f72f6d90333613..071a9e3dd9b559 100644 --- a/src/tests/functional/plugin/shared/src/low_precision_transformations/fully_connected_transformation.cpp +++ b/src/tests/functional/plugin/shared/src/low_precision_transformations/fully_connected_transformation.cpp @@ -5,38 +5,51 @@ #include "low_precision_transformations/fully_connected_transformation.hpp" #include +#include #include #include -#include #include "common_test_utils/common_utils.hpp" +#include "openvino/util/common_util.hpp" #include "ov_lpt_models/mat_mul.hpp" namespace LayerTestsDefinitions { std::string FullyConnectedTransformation::getTestCaseName(const testing::TestParamInfo& obj) { ov::element::Type precision; - MatMulShapes shapes; + FullyConnectedShapes shapes; std::string targetDevice; ov::pass::low_precision::LayerTransformation::Params params; - std::tie(precision, shapes, targetDevice, params) = obj.param; + ov::element::Type weightsType; + FullyConnectedParams activation; + std::string expectedPrimitiveType; + std::tie(precision, shapes, targetDevice, params, weightsType, activation, expectedPrimitiveType) = obj.param; std::ostringstream result; result << - get_test_case_name_by_params(precision, shapes.inputA, targetDevice, params) << - shapes.inputB << "_" << - shapes.transposeA << "_" << - shapes.transposeB; + get_test_case_name_by_params(precision, shapes.inputA, targetDevice, params) << + shapes.inputB << "_" << + "transposeA=" << shapes.transposeA << "_" << + "transposeB=" << shapes.transposeB << "_" << + weightsType << "_" << + "Activation=" << activation.activation << "_" << + "perChannelWeights=" << activation.perChannelWeights << "_" << + "FQ=" << activation.fq << "_" << + activation.originalLayersNames << "_" << + expectedPrimitiveType; return result.str(); } void FullyConnectedTransformation::SetUp() { ov::element::Type precision; - MatMulShapes shapes; + FullyConnectedShapes shapes; ov::pass::low_precision::LayerTransformation::Params params; - std::tie(precision, shapes, targetDevice, params) = this->GetParam(); + ov::element::Type weightsType; + FullyConnectedParams activation; + std::string expectedPrimitiveType; + std::tie(precision, shapes, targetDevice, params, weightsType, activation, expectedPrimitiveType) = this->GetParam(); init_input_shapes({ shapes.inputA, shapes.inputB }); @@ -45,12 +58,28 @@ void 
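// Editor's sketch (not part of the patch): the FullyConnectedTransformationParams tuple now has
// seven positional elements, and the test code below pulls them out by index (std::get<4>,
// std::get<5>, std::get<6>). A tiny standalone illustration of that positional contract;
// ShapesStub/ParamsStub are hypothetical stand-ins, not the real classes.
#include <iostream>
#include <sstream>
#include <string>
#include <tuple>

// Positions: 0 net precision, 1 shapes, 2 device, 3 LPT params,
//            4 weights type, 5 FullyConnectedParams, 6 expected primitive type.
struct ShapesStub { bool transposeA = false, transposeB = true; };
struct ParamsStub { bool activation = true, perChannelWeights = false, fq = false; };

using TupleStub = std::tuple<std::string, ShapesStub, std::string, int,
                             std::string, ParamsStub, std::string>;

int main() {
    const TupleStub t{"f32", {}, "CPU", 0, "i8", {}, "gemm_acl_i8"};
    std::ostringstream name;
    name << std::get<0>(t) << "_" << std::get<2>(t)
         << "_weights=" << std::get<4>(t)
         << "_activation=" << std::get<5>(t).activation
         << "_" << std::get<6>(t);
    std::cout << name.str() << "\n";  // f32_CPU_weights=i8_activation=1_gemm_acl_i8
}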
FullyConnectedTransformation::SetUp() { shapes.inputA, shapes.inputB, shapes.transposeA, - shapes.transposeB); + shapes.transposeB, + weightsType == ov::element::i8, + activation.perChannelWeights, + activation.activation, + activation.fq); } TEST_P(FullyConnectedTransformation, CompareWithRefImpl) { SKIP_IF_CURRENT_TEST_IS_DISABLED(); run(); + + const auto& activation = std::get<5>(GetParam()); + const auto originalLayersNames = get_property_by_type("FullyConnected", "originalLayersNames"); + EXPECT_EQ(ov::util::to_lower(activation.originalLayersNames), originalLayersNames); + + const auto& actualPrecision = get_runtime_precision_by_type("FullyConnected"); + const auto expectedPrecision = std::get<4>(GetParam()); + EXPECT_EQ(actualPrecision, expectedPrecision.to_string()); + + const auto& expectedPrimitiveType = std::get<6>(GetParam()); + const std::string actualPrimitiveType = get_property_by_type("FullyConnected", "primitiveType"); + EXPECT_EQ(expectedPrimitiveType, actualPrimitiveType); }; } // namespace LayerTestsDefinitions diff --git a/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp b/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp index bc1ce628deb245..d1d4ea94c6b01b 100644 --- a/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp +++ b/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp @@ -27,10 +27,11 @@ std::string MatMulTransformation::getTestCaseName(const testing::TestParamInfo(GetParam()); - const auto actualType = get_runtime_precision(params.expectedKernelName); + const auto& actualType = get_runtime_precision_by_type("MatMul"); + const auto expected = std::get<3>(GetParam()); + EXPECT_EQ(expected.expectedRuntimePrecision, actualType); - EXPECT_EQ(actualType, params.expectedRuntimePrecision); + const auto& actualPrimitiveType = get_property_by_type("MatMul", "primitiveType"); + const auto expectedPrimitiveType = "gemm_acl_i8"; + EXPECT_EQ(expectedPrimitiveType, actualPrimitiveType); } TEST_P(MatMulTransformation, CompareWithRefImpl) { diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/base/low_precision_transformations/layer_transformation.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/base/low_precision_transformations/layer_transformation.hpp index 10a70f3bc04ee0..b9da9ff8af4833 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/base/low_precision_transformations/layer_transformation.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/base/low_precision_transformations/layer_transformation.hpp @@ -49,6 +49,8 @@ class LayerTransformation : virtual public ov::test::SubgraphBaseTest { // get runtime precision by operation type std::string get_runtime_precision_by_type(const std::string& layerType); + std::string get_property_by_type(const std::string& layerTypeName, const std::string& propertyName); + // get runtime precision by operation friendly name which can be fused std::string get_runtime_precision_by_fused_name(const std::string& layerName); diff --git a/src/tests/functional/shared_test_classes/src/base/low_precision_transformations/layer_transformation.cpp b/src/tests/functional/shared_test_classes/src/base/low_precision_transformations/layer_transformation.cpp index 49e7b0581cae76..0a5e59b9f23e66 100644 --- 
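// Editor's sketch (not part of the patch): the new assertions above compare rt_info entries of
// the compiled model's runtime graph ("layerType", "runtimePrecision", "primitiveType"), which is
// what the added get_property_by_type() helper walks. A hedged standalone version of that lookup;
// primitive_type_of() is a hypothetical name, and an already built ov::CompiledModel is assumed.
#include <string>
#include "openvino/runtime/core.hpp"

std::string primitive_type_of(const ov::CompiledModel& compiled, const std::string& layer_type) {
    const auto runtime_model = compiled.get_runtime_model();
    for (const auto& op : runtime_model->get_ops()) {
        const auto& rt = op->get_rt_info();
        const auto type_it = rt.find("layerType");
        if (type_it == rt.end())
            continue;
        ov::Any type_any = type_it->second;
        if (type_any.as<std::string>() != layer_type)
            continue;                                   // not the node type we are checking
        const auto prop_it = rt.find("primitiveType");
        if (prop_it == rt.end())
            return {};
        ov::Any prop_any = prop_it->second;
        return prop_any.as<std::string>();              // e.g. "gemm_acl_i8" on aarch64
    }
    return {};
}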
a/src/tests/functional/shared_test_classes/src/base/low_precision_transformations/layer_transformation.cpp +++ b/src/tests/functional/shared_test_classes/src/base/low_precision_transformations/layer_transformation.cpp @@ -7,6 +7,7 @@ #include #include +#include "openvino/util/common_util.hpp" namespace LayerTestsUtils { ov::pass::low_precision::LayerTransformation::Params LayerTransformationParamsNGraphFactory::createParamsU8I8AndI8() { @@ -60,14 +61,14 @@ std::string LayerTransformation::get_test_case_name_by_params( namespace { template -std::string find_node_by_runtime_precision(const ov::CompiledModel& execNet, IsNodeF is_node_f) { +std::string find_node_by_runtime_precision(const ov::CompiledModel& execNet, IsNodeF is_node_f, const std::string& propertyName = "runtimePrecision") { const std::shared_ptr& execFunction = execNet.get_runtime_model(); for (const auto& op : execFunction->get_ops()) { if (!is_node_f(op)) continue; const ov::RTMap& rtInfo = op->get_rt_info(); - const auto& it = rtInfo.find("runtimePrecision"); + const auto& it = rtInfo.find(propertyName); OPENVINO_ASSERT(it != rtInfo.end(), "Runtime precision is not found for node: ", op->get_friendly_name()); return it->second.as(); } @@ -94,6 +95,17 @@ std::string LayerTransformation::get_runtime_precision_by_type(const std::string return find_node_by_runtime_precision(compiledModel, is_node_f); } +std::string LayerTransformation::get_property_by_type(const std::string& layerTypeName, const std::string& propertyName) { + auto is_node_f = [&layerTypeName](const std::shared_ptr& op) { + const auto& rtInfo = op->get_rt_info(); + const auto& typeIt = rtInfo.find("layerType"); + + OPENVINO_ASSERT(typeIt != rtInfo.end(), "Layer is not found for type: ", layerTypeName); + return typeIt->second.as() == layerTypeName; + }; + return ov::util::to_lower(find_node_by_runtime_precision(compiledModel, is_node_f, propertyName)); +} + namespace { bool has_layer(const std::string& names, const std::string& layer_name) { size_t beginPosition = 0ul; diff --git a/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp b/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp index 787e1f6ebe8bd4..b4a304a58dd5d0 100644 --- a/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp +++ b/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp @@ -24,17 +24,22 @@ class MatMulFunction { static std::shared_ptr getOriginal( const ov::element::Type precision, - const ov::PartialShape inputShape1, - const ov::PartialShape inputShape2, + const ov::PartialShape& inputShape1, + const ov::PartialShape& inputShape2, const bool transpose1, - const bool transpose2); + const bool transpose2, + const bool signedWeights, + const bool perChannelWeights, + const bool relu, + const bool fq); static std::shared_ptr getOriginal( const ov::element::Type precision, const ov::Shape& inputShape1, const FakeQuantizeOnData& fqOnData1, const ov::Shape& inputShape2, - const FakeQuantizeOnData& fqOnData2); + const FakeQuantizeOnData& fqOnData2, + const bool requantization = false); static std::shared_ptr getOriginal(const ov::element::Type netPrecision, const ov::PartialShape& inputShape1, diff --git a/src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp b/src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp index 1b1351ef1b3399..e4a44b4b6a70ed 100644 --- a/src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp +++ b/src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp @@ -49,36 +49,108 @@ std::shared_ptr MatMulFunction::getOriginal( 
return function; } +namespace { +template +std::vector generate_values(const ov::Shape& shape, float delimiter = 1.f) { + std::vector values(ov::shape_size(shape)); + for (size_t i = 0; i < values.size(); ++i) { + values[i] = static_cast(static_cast(i) / delimiter); + } + return values; +} + +std::vector generate_dequantization_values( + const ov::Shape& shape, + const size_t levels, + const bool low) { + const auto shape_size = ov::shape_size(shape); + std::vector values(shape_size); + for (size_t i = 0; i < shape_size; ++i) { + values[i] = low ? -128.f / (static_cast(i) + 1.f) : 127.f / (static_cast(i) + 1.f); + } + return values; +} +} // namespace + std::shared_ptr MatMulFunction::getOriginal( - const ov::element::Type precision, - const ov::PartialShape inputShape1, - const ov::PartialShape inputShape2, - const bool transpose1, - const bool transpose2) { + const ov::element::Type precision, + const ov::PartialShape& inputShape1, + const ov::PartialShape& inputShape2, + const bool transpose1, + const bool transpose2, + const bool signedOnWeights, + const bool perChannelWeights, + const bool relu, + const bool fq) { const auto paramNode = std::make_shared(precision, inputShape1); const std::vector constShapes(inputShape1.rank().get_length(), 1ul); - const auto fakeQuantizeOnAcitvations = ov::test::utils::make_fake_quantize( - paramNode, precision, 256ul, constShapes, - { 0.f }, { 255.f / 4.f }, { 0.f }, { 255.f / 4.f }); + const auto fakeQuantizeOnAcitvations = signedOnWeights ? + ov::test::utils::make_fake_quantize( + paramNode, precision, 256ul, constShapes, + { -128.f / 4.f }, { 127.f / 4.f }, { -128.f / 4.f }, { 127.f / 4.f }) : + ov::test::utils::make_fake_quantize( + paramNode, precision, 256ul, constShapes, + { 0.f }, { 255.f / 4.f }, { 0.f }, { 255.f / 4.f }); fakeQuantizeOnAcitvations->set_friendly_name("fakeQuantizeOnAcitvations"); - auto weightsConst = std::make_shared( - precision, - inputShape2.to_shape(), - std::vector({ 1.f })); - const auto fakeQuantizeOnWeights = ov::test::utils::make_fake_quantize( - weightsConst, precision, 256ul, { 1ul, 1ul }, - { -128.f / 8.f }, { 127.f / 8.f }, { -128.f / 8.f }, { 127.f / 8.f }); - fakeQuantizeOnWeights->set_friendly_name("fakeQuantizeOnWeights"); + const size_t channel = inputShape2[inputShape2.size() - 2].get_length(); + + // fq + std::shared_ptr parentOnWeights; + if (fq) { + auto weightsConst = std::make_shared( + precision, + inputShape2.to_shape(), + generate_values(inputShape2.to_shape(), 10.f)); + + parentOnWeights = perChannelWeights ? + ov::test::utils::make_fake_quantize( + weightsConst, precision, 256ul, + Shape{channel, 1}, + generate_dequantization_values(Shape{channel, 1}, 256ul, true), + generate_dequantization_values(Shape{channel, 1}, 256ul, false), + generate_dequantization_values(Shape{channel, 1}, 256ul, true), + generate_dequantization_values(Shape{channel, 1}, 256ul, false)) : + ov::test::utils::make_fake_quantize( + weightsConst, precision, 256ul, {1ul, 1ul}, + {-128.f / 8.f}, {127.f / 8.f}, {-128.f / 8.f}, {127.f / 8.f}); + } else { + Shape shape = inputShape2.to_shape(); + if (transpose2) { + shape[shape.size() - 1ull] = 1; + } else { + shape[shape.size() - 2ull] = 1; + } + + auto weightsConst = std::make_shared( + signedOnWeights ? 
element::i8 : element::u8, + inputShape2.to_shape(), + generate_values(inputShape2.to_shape())); + + const auto convert = std::make_shared(weightsConst, precision); + + const auto multiplyConst = std::make_shared( + precision, + shape, + generate_values(shape)); + parentOnWeights = std::make_shared(convert, multiplyConst); + } - const std::shared_ptr fullyConnected = std::make_shared( + parentOnWeights->set_friendly_name("fakeQuantizeOnWeights"); + + std::shared_ptr parent = std::make_shared( fakeQuantizeOnAcitvations->output(0), - fakeQuantizeOnWeights->output(0), + parentOnWeights->output(0), transpose1, transpose2); - fullyConnected->set_friendly_name("fullyConnected"); + parent->set_friendly_name("fullyConnected"); - ov::ResultVector results{ std::make_shared(fullyConnected) }; + if (relu) { + parent = std::make_shared(parent); + parent->set_friendly_name("relu"); + } + + ov::ResultVector results{ std::make_shared(parent) }; std::shared_ptr function = std::make_shared( results, ov::ParameterVector{ paramNode }, @@ -93,21 +165,40 @@ std::shared_ptr MatMulFunction::getOriginal( const ov::Shape& inputShape1, const FakeQuantizeOnData& fqOnData1, const ov::Shape& inputShape2, - const FakeQuantizeOnData& fqOnData2) { + const FakeQuantizeOnData& fqOnData2, + const bool requantization) { const std::shared_ptr input1 = std::make_shared(precision, inputShape1); input1->set_friendly_name("input1"); const std::shared_ptr input2 = std::make_shared(precision, inputShape2); input2->set_friendly_name("input2"); - const std::shared_ptr matMul = std::make_shared( - makeFakeQuantize(input1, precision, fqOnData1), - makeFakeQuantize(input2, precision, fqOnData2), + std::shared_ptr parent1 = input1; + if (!fqOnData1.empty()) { + parent1 = makeFakeQuantize(parent1, precision, fqOnData1); + } + + std::shared_ptr parent2 = input2; + if (!fqOnData2.empty()) { + parent2 = makeFakeQuantize(parent2, precision, fqOnData2); + } + + std::shared_ptr parent = std::make_shared( + parent1, + parent2, false, false); - matMul->set_friendly_name("matMul"); + parent->set_friendly_name("matMul"); + + if (requantization) { + parent = makeFakeQuantize(parent, precision, fqOnData1); + parent = std::make_shared( + parent, + std::make_shared(ov::element::f32, Shape{1}, std::vector{0.f})); + parent->set_friendly_name("prelu"); + } - std::shared_ptr result = std::make_shared(matMul); + std::shared_ptr result = std::make_shared(parent); std::shared_ptr function = std::make_shared( ov::ResultVector{ result }, From 7dd226e16142a377439bedf972d01ddee1ddd747 Mon Sep 17 00:00:00 2001 From: Edward Shogulin Date: Tue, 13 Aug 2024 17:41:43 +0100 Subject: [PATCH 2/5] [CPU] [ACL] FullyConnected fp32 executor refactoring --- .../executors/acl/acl_fullyconnected.cpp | 249 +-------------- .../executors/acl/acl_fullyconnected.hpp | 58 +--- .../executors/acl/acl_lowp_fullyconnected.cpp | 23 +- .../src/nodes/executors/acl/acl_weights.cpp | 300 ++++++++++++++++++ .../src/nodes/executors/acl/acl_weights.hpp | 82 +++++ .../intel_cpu/tests/functional/CMakeLists.txt | 2 +- .../x64/fully_connected_transformation.cpp | 27 +- .../fully_connected_transformation.cpp | 29 +- .../fully_connected_transformation.cpp | 12 +- 9 files changed, 455 insertions(+), 327 deletions(-) create mode 100644 src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.hpp diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp 
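// Editor's sketch (not part of the patch): when `fq == false`, the updated
// MatMulFunction::getOriginal() above stores the weights as an int8 Constant followed by
// Convert and a dequantization Multiply. A minimal opset1 model with that same pattern;
// shapes, constant values, and the helper name are arbitrary illustrations.
#include <memory>
#include <vector>
#include "openvino/core/model.hpp"
#include "openvino/opsets/opset1.hpp"

std::shared_ptr<ov::Model> make_deq_matmul_sketch() {
    using namespace ov;
    auto input = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 4});
    // Quantized weights: int8 storage + f32 per-output-channel scale.
    auto weights = opset1::Constant::create(element::i8, Shape{8, 4}, std::vector<int8_t>(32, 1));
    auto convert = std::make_shared<opset1::Convert>(weights, element::f32);
    auto scale = opset1::Constant::create(element::f32, Shape{8, 1}, std::vector<float>(8, 0.1f));
    auto deq_weights = std::make_shared<opset1::Multiply>(convert, scale);
    // transpose_b = true: weights are laid out as [Co, Ci], matching the transpose2 branch.
    auto matmul = std::make_shared<opset1::MatMul>(input, deq_weights, false, true);
    auto relu = std::make_shared<opset1::Relu>(matmul);
    return std::make_shared<Model>(OutputVector{relu}, ParameterVector{input});
}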
b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp index 6f464abf33d036..c5f88803a252ad 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp @@ -15,146 +15,6 @@ namespace ov { namespace intel_cpu { -static VectorDims makeDummyInputDims(const Shape& inShape, const Shape& wShape) { - const auto& weightDims = wShape.getStaticDims(); - - auto inMinDims = inShape.getMinDims(); - auto inMaxDims = inShape.getMaxDims(); - inMinDims.back() = weightDims.back(); - inMaxDims.back() = weightDims.back(); - - return MemoryDescUtils::makeDummyShape(Shape(inMinDims, inMaxDims)).getStaticDims(); -} - -static VectorDims makeDummyOutputDims(const VectorDims& inShape, const VectorDims& wShape, const size_t out_rank) { - size_t activationRank = inShape.size(); - size_t channelRank = wShape.size() - 1; - // activation weight output_shape - // NCHW CoCHW NCo - // TNC CoC TNCo - // NC CoC NCo - VectorDims outputShape(out_rank, 1); - // set Co - outputShape.back() = wShape[0]; - // set batch dims - size_t batchRank = activationRank - channelRank; - size_t startIdx = out_rank - batchRank - 1; - for (size_t i = 0; i < batchRank; i++) { - outputShape[i + startIdx] = inShape[i]; - } - - return outputShape; -} - -static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, - const ExecutorContext::CPtr context, - const FCAttrs &attrs, - const ACLFCAttrs& aclfcAttrs, - const PostOps &postOps) { - DEBUG_LOG("ACLFullyConnectedExecutor: prepack weights"); - const auto& wgtDims = memory.at(ARG_WEI)->getStaticDims(); - const auto N = wgtDims[0]; - const auto K = wgtDims[1]; - - auto create = [&]() { - MemoryPtr final_ptr = memory.at(ARG_WEI); - // Convert weights precision - if (aclfcAttrs.isConvertedWeights) { - MemoryArgs memoryArgs; - memoryArgs[ARG_SRC_0] = memory.at(ARG_WEI); - memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), - memoryArgs[ARG_SRC_0]->getDescPtr()->cloneWithNewPrecision( - aclfcAttrs.inputPrecision)); - auto aclWeightsConverter = std::make_shared(); - if (aclWeightsConverter->update(memoryArgs)) { - aclWeightsConverter->execute(memoryArgs); - } else { - auto count_wei_elem = std::accumulate(memoryArgs[ARG_SRC_0]->getStaticDims().begin(), - memoryArgs[ARG_SRC_0]->getStaticDims().end(), - 1, - std::multiplies<>()); - cpu_convert(memoryArgs[ARG_SRC_0]->getData(), - memoryArgs[ARG_DST]->getData(), - memoryArgs[ARG_SRC_0]->getPrecision(), - memoryArgs[ARG_DST]->getPrecision(), - count_wei_elem); - } - final_ptr = memoryArgs[ARG_DST]; - } - // Packed weights - { - arm_compute::WeightFormat expectedWeightFormat; - bool isNeededReorder; - { - MemoryArgs memoryArgs; - memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS); - memoryArgs[ARG_WEI] = final_ptr; - if (memory.at(ARG_SRC_0)->getShape().isDynamic()) { - const auto& inShape = memory.at(ARG_SRC_0)->getShape(); - const auto& wShape = final_ptr->getShape(); - const auto& inDymmyDims = makeDummyInputDims(inShape, wShape); - const auto& outDymmyDims = makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank()); - memoryArgs[ARG_SRC_0] = std::make_shared(context->getEngine(), - memory.at(ARG_SRC_0)->getDescPtr()->cloneWithNewDims(inDymmyDims)); - memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), - memory.at(ARG_DST)->getDescPtr()->cloneWithNewDims(outDymmyDims)); - } else { - memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0); - memoryArgs[ARG_DST] = memory.at(ARG_DST); - } - auto 
aclWeightsRepack = std::make_shared(attrs, postOps, memoryArgs); - isNeededReorder = aclWeightsRepack->update(memoryArgs); - expectedWeightFormat = aclWeightsRepack->getOptImplWeightFormat(); - } - if (isNeededReorder) { - MemoryArgs memoryArgs; - memoryArgs[ARG_SRC_0] = final_ptr; - memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), - memoryArgs[ARG_SRC_0]->getDescPtr()->clone()); - auto aclWeightsReorder = std::make_shared( - arm_compute::WeightFormat::OHWI, expectedWeightFormat); - if (aclWeightsReorder->update(memoryArgs)) { - aclWeightsReorder->execute(memoryArgs); - final_ptr = memoryArgs[ARG_DST]; - } - } - } - // Transpose weights - if (!aclfcAttrs.weightsNonTransposed) { - auto reverse_weights_dims = memory.at(ARG_WEI)->getStaticDims(); - if (reverse_weights_dims.size() == 3) { - reverse_weights_dims = VectorDims( - {reverse_weights_dims[0] * reverse_weights_dims[1], reverse_weights_dims[2]}); - } - std::reverse(reverse_weights_dims.begin(), reverse_weights_dims.end()); - MemoryArgs memoryArgs; - memoryArgs[ARG_SRC_0] = final_ptr; - memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), - CpuBlockedMemoryDesc(final_ptr->getPrecision(), - intel_cpu::Shape(reverse_weights_dims))); - auto aclWeightsTranspose = std::make_shared(); - if (aclWeightsTranspose->update(memoryArgs)) { - aclWeightsTranspose->execute(memoryArgs); - final_ptr = memoryArgs[ARG_DST]; - } - } - DEBUG_LOG("ACLFullyConnectedExecutor: cache miss, perform packing"); - return final_ptr; - }; - - auto weightCache = context->getWeightsCache(); - if (weightCache != nullptr) { - std::string format = "fc_acl_" + std::to_string(N) + "_" + std::to_string(K); - const std::string string_hash = format + "_" + std::to_string(memory.at(ARG_WEI)->getSize()) + "_" + - std::to_string(reinterpret_cast(memory.at(ARG_WEI)->getData())); - DEBUG_LOG("ACLFullyConnectedExecutor: findOrCreate, string_hash: ", string_hash); - return *weightCache->findOrCreate(string_hash, create); - } - - DEBUG_LOG("ACLFullyConnectedExecutor: Weights cache is not available"); - return create(); -} - static bool checkPostOps(const PostOps &postOps) { if (postOps.empty()) { return true; @@ -199,7 +59,7 @@ ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, const MemoryArgs &memory, const ExecutorContext::CPtr context) { initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps); - packedWeights = prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps); + packedWeights = acl_fc_executor::prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps); } bool ACLFullyConnectedExecutor::supports(const FCConfig &config) { @@ -212,30 +72,8 @@ bool ACLFullyConnectedExecutor::supports(const FCConfig &config) { return true; } -static void updateFCTensorsShapes(ACLShapes& aclMemoryShapes) { - if (aclMemoryShapes[ACLArgs::ACL_WEI].num_dimensions() == 3U) { - aclMemoryShapes[ACLArgs::ACL_WEI] = arm_compute::TensorShape( - {aclMemoryShapes[ACLArgs::ACL_WEI][0] * aclMemoryShapes[ACLArgs::ACL_WEI][1], - aclMemoryShapes[ACLArgs::ACL_WEI][2]}); - } - - if (one_of(aclMemoryShapes[ACLArgs::ACL_SRC_0].num_dimensions(), 3U, 4U)) { - aclMemoryShapes[ACLArgs::ACL_SRC_0] = arm_compute::TensorShape({ - aclMemoryShapes[ACLArgs::ACL_WEI][0], - aclMemoryShapes[ACLArgs::ACL_SRC_0].total_size() / aclMemoryShapes[ACLArgs::ACL_WEI][0]}); - } - - if (one_of(aclMemoryShapes[ACLArgs::ACL_DST].num_dimensions(), 3U, 4U)) { - aclMemoryShapes[ACLArgs::ACL_DST] = arm_compute::TensorShape({ - 
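// Editor's sketch (not part of the patch): prepareWeightMemory() memoizes the packed weights
// through the context's weights cache via findOrCreate(), keyed by a string hash of N, K,
// byte size, and the data pointer. A generic, self-contained illustration of that memoization
// pattern; it is not the plugin's WeightsSharing class, and PackedWeights stands in for MemoryPtr.
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>

struct PackedWeights { std::string layout; };

class WeightsCacheSketch {
public:
    std::shared_ptr<PackedWeights> findOrCreate(
            const std::string& key,
            const std::function<std::shared_ptr<PackedWeights>()>& create) {
        auto it = cache_.find(key);
        if (it != cache_.end())
            return it->second;          // cache hit: skip repacking
        auto packed = create();         // cache miss: run the packing pipeline once
        cache_.emplace(key, packed);
        return packed;
    }
private:
    std::unordered_map<std::string, std::shared_ptr<PackedWeights>> cache_;
};

int main() {
    WeightsCacheSketch cache;
    const std::string key = "fc_acl_16_64_4096_0x7f00";  // N, K, byte size, data pointer (illustrative)
    auto first  = cache.findOrCreate(key, [] { return std::make_shared<PackedWeights>(PackedWeights{"ohwi"}); });
    auto second = cache.findOrCreate(key, [] { return std::make_shared<PackedWeights>(PackedWeights{"ohwi"}); });
    std::cout << (first == second) << "\n";  // 1: both executors share the packed blob
}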
aclMemoryShapes[ACLArgs::ACL_WEI][1], - aclMemoryShapes[ACLArgs::ACL_SRC_0][1]}); - } - - std::swap(aclMemoryShapes[ACLArgs::ACL_WEI][0], aclMemoryShapes[ACLArgs::ACL_WEI][1]); -} - void ACLFullyConnectedExecutor::updateTensorsShapes(ACLShapes& aclMemoryShapes) { - updateFCTensorsShapes(aclMemoryShapes); + acl_fc_executor::updateFCTensorsShapes(aclMemoryShapes); } arm_compute::Status ACLFullyConnectedExecutor::validateTensorsInfo(const ACLInfos & aclMemoryInfos) { @@ -268,88 +106,5 @@ ACLFunction ACLFullyConnectedExecutor::configureFunction(const ACLTensors & aclM return neFC; } -arm_compute::Status acl_fc_executor::ACLWeightsConverter::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { - return arm_compute::NECast::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get(), - arm_compute::ConvertPolicy::SATURATE); -} - -ACLFunction acl_fc_executor::ACLWeightsConverter::configureFunction(const ACLTensors &aclMemoryTensors) { - auto neCast = std::make_unique(); - neCast->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get(), - arm_compute::ConvertPolicy::SATURATE); - return neCast; -} - - -arm_compute::Status acl_fc_executor::ACLWeightsTranspose::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { - return arm_compute::NETranspose::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get()); -} - -ACLFunction acl_fc_executor::ACLWeightsTranspose::configureFunction(const ACLTensors &aclMemoryTensors) { - auto neTranspose = std::make_unique(); - neTranspose->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get()); - return neTranspose; -} - -acl_fc_executor::ACLWeightFormatGenerator::ACLWeightFormatGenerator(const FCAttrs &attrs, - const PostOps &postOps, - const MemoryArgs &memory) { - initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps); -} - -void acl_fc_executor::ACLWeightFormatGenerator::updateTensorsShapes(ACLShapes &aclMemoryShapes) { - updateFCTensorsShapes(aclMemoryShapes); -} - -arm_compute::Status acl_fc_executor::ACLWeightFormatGenerator::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { - if (aclfcAttrs.isConvertedWeights) { - aclMemoryInfos[ACLArgs::ACL_WEI]->set_data_type(aclMemoryInfos[ACLArgs::ACL_SRC_0]->data_type()); - } - return arm_compute::NEFullyConnectedLayer::has_opt_impl( - expectedWeightFormat, - aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_WEI].get(), - aclMemoryInfos[ACLArgs::ACL_BIAS].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get(), - fullyConnectedLayerInfo, - weightsInfo); -} - -ACLFunction acl_fc_executor::ACLWeightFormatGenerator::configureFunction(const ACLTensors &aclMemoryTensors) { - return std::make_unique(); -} - -arm_compute::Status acl_fc_executor::ACLWeightsReorder::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { -#if defined(OPENVINO_ARCH_ARM64) - return arm_compute::NEReorderLayer::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get(), - inWeightFormat, - outWeightFormat); -#else - return arm_compute::NECopy::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get()); -#endif -} - -ACLFunction acl_fc_executor::ACLWeightsReorder::configureFunction(const ACLTensors &aclMemoryTensors) { -#if defined(OPENVINO_ARCH_ARM64) - auto neReorderLayer = std::make_unique(); - neReorderLayer->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - 
aclMemoryTensors[ACLArgs::ACL_DST].get(), - inWeightFormat, - outWeightFormat); - return neReorderLayer; -#else - auto neCopy = std::make_unique(); - neCopy->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get()); - return neCopy; -#endif -} - } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp index 4d7f2e5ef91480..4fb4703ba2742b 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp @@ -6,67 +6,11 @@ #include "acl_common_executor.hpp" #include "nodes/executors/fullyconnected_config.hpp" +#include "acl_weights.hpp" namespace ov { namespace intel_cpu { -struct ACLFCAttrs { - ov::element::Type inputPrecision; - bool isConvertedWeights = false; - bool weightsNonTransposed; -}; - -namespace acl_fc_executor { - -class ACLWeightsConverter : public ACLCommonExecutor { -public: - ACLWeightsConverter() = default; - void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; -}; - -class ACLWeightsTranspose : public ACLCommonExecutor { -public: - ACLWeightsTranspose() = default; - void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; -}; - -class ACLWeightFormatGenerator : public ACLCommonExecutor { -public: - ACLWeightFormatGenerator(const FCAttrs& attrs, - const PostOps& postOps, - const MemoryArgs& memory); - void updateTensorsShapes(ACLShapes& aclMemoryShapes) override; - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; - arm_compute::WeightFormat getOptImplWeightFormat() { - return expectedWeightFormat; - } -private: - arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo; - arm_compute::WeightsInfo weightsInfo; - ACLFCAttrs aclfcAttrs; - arm_compute::WeightFormat expectedWeightFormat; -}; - -class ACLWeightsReorder : public ACLCommonExecutor { -public: - ACLWeightsReorder(arm_compute::WeightFormat inWeightFormat, - arm_compute::WeightFormat outWeightFormat) - : inWeightFormat(inWeightFormat), outWeightFormat(outWeightFormat) {} - void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; -private: - arm_compute::WeightFormat inWeightFormat; - arm_compute::WeightFormat outWeightFormat; -}; - -} // namespace acl_fc_executor - class ACLFullyConnectedExecutor : public ACLCommonExecutor { public: ACLFullyConnectedExecutor(const FCAttrs& attrs, diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp index 9a9a2832063be9..eb8af61898754d 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp @@ -15,6 +15,9 @@ #include "acl_weights.hpp" #include "acl_utils.hpp" +#include 
"nodes/common/cpu_convert.h" +#include "memory_desc/cpu_memory_desc_utils.h" + namespace ov { namespace intel_cpu { @@ -31,20 +34,16 @@ static bool checkPostOps(const PostOps &postOps) { return checkActivationLayerInfo(convertToEltwiseAlgorithm(activation->type())); } -void initFCAttrs(const FCAttrs &attrs, - ACLTensorAttrs& aclTensorAttrs, - ACLFCAttrs& aclfcAttrs, - const MemoryArgs &memory, - arm_compute::GEMMInfo& gemmInfo, - const PostOps &postOps) { +static void initFCAttrs(const FCAttrs &attrs, + ACLTensorAttrs& aclTensorAttrs, + ACLFCAttrs& aclfcAttrs, + const MemoryArgs &memory, + arm_compute::GEMMInfo& fullyConnectedLayerInfo, + const PostOps &postOps) { aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc); - // TODO: not completed //fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr()); aclfcAttrs.inputPrecision = memory.at(ARG_SRC)->getDescPtr()->getPrecision(); - // TODO: not completed //fullyConnectedLayerInfo.transpose_weights = false; - gemmInfo.set_pretranspose_A(false); - gemmInfo.set_pretranspose_B(false); aclfcAttrs.weightsNonTransposed = attrs.weightsNonTransposed; if (!postOps.empty()) { @@ -64,7 +63,7 @@ ACLLowpFullyConnectedExecutor::ACLLowpFullyConnectedExecutor(const FCAttrs &attr const MemoryArgs &memory, const ExecutorContext::CPtr& context) : dequantizationScales(attrs.dequantizationScales) { initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, gemmInfo, postOps); - packedWeights = prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps); + packedWeights = acl_fc_executor::prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps); } bool ACLLowpFullyConnectedExecutor::supports(const FCConfig &config) { @@ -83,7 +82,7 @@ bool ACLLowpFullyConnectedExecutor::supports(const FCConfig &config) { } void ACLLowpFullyConnectedExecutor::updateTensorsShapes(ACLShapes& aclMemoryShapes) { - updateFCTensorsShapes(aclMemoryShapes); + acl_fc_executor::updateFCTensorsShapes(aclMemoryShapes); } arm_compute::Status ACLLowpFullyConnectedExecutor::validateTensorsInfo(const ACLInfos & aclMemoryInfos) { diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.cpp new file mode 100644 index 00000000000000..819be7ef057dce --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.cpp @@ -0,0 +1,300 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "acl_fullyconnected.hpp" +#include "acl_utils.hpp" +#include "nodes/executors/executor.hpp" +#include "nodes/executors/memory_arguments.hpp" +#include "utils/debug_capabilities.h" +#include "nodes/executors/debug_messages.hpp" +#include "nodes/executors/implementation_utils.hpp" +#include "nodes/common/cpu_convert.h" +#include "memory_desc/cpu_memory_desc_utils.h" + +namespace ov { +namespace intel_cpu { + +VectorDims acl_fc_executor::makeDummyInputDims(const Shape& inShape, const Shape& wShape) { + const auto& weightDims = wShape.getStaticDims(); + + auto inMinDims = inShape.getMinDims(); + auto inMaxDims = inShape.getMaxDims(); + inMinDims.back() = weightDims.back(); + inMaxDims.back() = weightDims.back(); + + return MemoryDescUtils::makeDummyShape(Shape(inMinDims, inMaxDims)).getStaticDims(); +} + +VectorDims acl_fc_executor::makeDummyOutputDims(const VectorDims& inShape, const VectorDims& wShape, const size_t out_rank) { + size_t activationRank = inShape.size(); + size_t 
channelRank = wShape.size() - 1; + // activation weight output_shape + // NCHW CoCHW NCo + // TNC CoC TNCo + // NC CoC NCo + VectorDims outputShape(out_rank, 1); + // set Co + outputShape.back() = wShape[0]; + // set batch dims + size_t batchRank = activationRank - channelRank; + size_t startIdx = out_rank - batchRank - 1; + for (size_t i = 0; i < batchRank; i++) { + outputShape[i + startIdx] = inShape[i]; + } + + return outputShape; +} + +MemoryPtr acl_fc_executor::prepareWeightMemory(const MemoryArgs &memory, + const ExecutorContext::CPtr context, + const FCAttrs &attrs, + const ACLFCAttrs& aclfcAttrs, + const PostOps &postOps) { + DEBUG_LOG("ACLFullyConnectedExecutor: prepack weights"); + const auto& wgtDims = memory.at(ARG_WEI)->getStaticDims(); + const auto N = wgtDims[0]; + const auto K = wgtDims[1]; + + auto create = [&]() { + MemoryPtr final_ptr = memory.at(ARG_WEI); + // Convert weights precision + if (aclfcAttrs.isConvertedWeights) { + MemoryArgs memoryArgs; + memoryArgs[ARG_SRC_0] = memory.at(ARG_WEI); + memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), + memoryArgs[ARG_SRC_0]->getDescPtr()->cloneWithNewPrecision( + aclfcAttrs.inputPrecision)); + auto aclWeightsConverter = std::make_shared(); + if (aclWeightsConverter->update(memoryArgs)) { + aclWeightsConverter->execute(memoryArgs); + } else { + auto count_wei_elem = std::accumulate(memoryArgs[ARG_SRC_0]->getStaticDims().begin(), + memoryArgs[ARG_SRC_0]->getStaticDims().end(), + 1, + std::multiplies<>()); + cpu_convert(memoryArgs[ARG_SRC_0]->getData(), + memoryArgs[ARG_DST]->getData(), + memoryArgs[ARG_SRC_0]->getPrecision(), + memoryArgs[ARG_DST]->getPrecision(), + count_wei_elem); + } + final_ptr = memoryArgs[ARG_DST]; + } + // Packed weights + { + arm_compute::WeightFormat expectedWeightFormat; + bool isNeededReorder; + { + MemoryArgs memoryArgs; + memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS); + memoryArgs[ARG_WEI] = final_ptr; + if (memory.at(ARG_SRC_0)->getShape().isDynamic()) { + const auto& inShape = memory.at(ARG_SRC_0)->getShape(); + const auto& wShape = final_ptr->getShape(); + const auto& inDymmyDims = makeDummyInputDims(inShape, wShape); + const auto& outDymmyDims = makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank()); + memoryArgs[ARG_SRC_0] = std::make_shared(context->getEngine(), + memory.at(ARG_SRC_0)->getDescPtr()->cloneWithNewDims(inDymmyDims)); + memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), + memory.at(ARG_DST)->getDescPtr()->cloneWithNewDims(outDymmyDims)); + } else { + memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0); + memoryArgs[ARG_DST] = memory.at(ARG_DST); + } + auto aclWeightsRepack = std::make_shared(attrs, postOps, memoryArgs); + isNeededReorder = aclWeightsRepack->update(memoryArgs); + expectedWeightFormat = aclWeightsRepack->getOptImplWeightFormat(); + } + if (isNeededReorder) { + MemoryArgs memoryArgs; + memoryArgs[ARG_SRC_0] = final_ptr; + memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), + memoryArgs[ARG_SRC_0]->getDescPtr()->clone()); + auto aclWeightsReorder = std::make_shared( + arm_compute::WeightFormat::OHWI, expectedWeightFormat); + if (aclWeightsReorder->update(memoryArgs)) { + aclWeightsReorder->execute(memoryArgs); + final_ptr = memoryArgs[ARG_DST]; + } + } + } + // Transpose weights + if (!aclfcAttrs.weightsNonTransposed) { + auto reverse_weights_dims = memory.at(ARG_WEI)->getStaticDims(); + if (reverse_weights_dims.size() == 3) { + reverse_weights_dims = VectorDims( + {reverse_weights_dims[0] * 
reverse_weights_dims[1], reverse_weights_dims[2]}); + } + std::reverse(reverse_weights_dims.begin(), reverse_weights_dims.end()); + MemoryArgs memoryArgs; + memoryArgs[ARG_SRC_0] = final_ptr; + memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), + CpuBlockedMemoryDesc(final_ptr->getPrecision(), + intel_cpu::Shape(reverse_weights_dims))); + auto aclWeightsTranspose = std::make_shared(); + if (aclWeightsTranspose->update(memoryArgs)) { + aclWeightsTranspose->execute(memoryArgs); + final_ptr = memoryArgs[ARG_DST]; + } + } + DEBUG_LOG("ACLFullyConnectedExecutor: cache miss, perform packing"); + return final_ptr; + }; + + auto weightCache = context->getWeightsCache(); + if (weightCache != nullptr) { + std::string format = "fc_acl_" + std::to_string(N) + "_" + std::to_string(K); + const std::string string_hash = format + "_" + std::to_string(memory.at(ARG_WEI)->getSize()) + "_" + + std::to_string(reinterpret_cast(memory.at(ARG_WEI)->getData())); + DEBUG_LOG("ACLFullyConnectedExecutor: findOrCreate, string_hash: ", string_hash); + return *weightCache->findOrCreate(string_hash, create); + } + + DEBUG_LOG("ACLFullyConnectedExecutor: Weights cache is not available"); + return create(); +} + +static bool checkPostOps(const PostOps &postOps) { + // Add postops + if (!postOps.empty() && postOps.size() == 1) { + if (const auto activation = std::dynamic_pointer_cast(postOps[0])) { + if (checkActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()))) { + return true; + } + } + } + return false; +} + +static void initFCAttrs(const FCAttrs &attrs, + ACLTensorAttrs& aclTensorAttrs, + ACLFCAttrs& aclfcAttrs, + const MemoryArgs &memory, + arm_compute::FullyConnectedLayerInfo& fullyConnectedLayerInfo, + const PostOps &postOps) { + aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc); + fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr()); + aclfcAttrs.inputPrecision = memory.at(ARG_SRC)->getDescPtr()->getPrecision(); + fullyConnectedLayerInfo.transpose_weights = false; + aclfcAttrs.weightsNonTransposed = attrs.weightsNonTransposed; + + if (checkPostOps(postOps)) { + auto activation = std::dynamic_pointer_cast(postOps[0]); + fullyConnectedLayerInfo.activation_info = getActivationLayerInfo( + convertToEltwiseAlgorithm(activation->type()), + activation->alpha(), activation->beta(), activation->gamma()); + } + + if (memory.at(ARG_SRC)->getPrecision() != memory.at(ARG_WEI)->getPrecision()) { + aclfcAttrs.isConvertedWeights = true; + } +} + +void acl_fc_executor::updateFCTensorsShapes(ACLShapes& aclMemoryShapes) { + if (aclMemoryShapes[ACLArgs::ACL_WEI].num_dimensions() == 3U) { + aclMemoryShapes[ACLArgs::ACL_WEI] = arm_compute::TensorShape( + {aclMemoryShapes[ACLArgs::ACL_WEI][0] * aclMemoryShapes[ACLArgs::ACL_WEI][1], + aclMemoryShapes[ACLArgs::ACL_WEI][2]}); + } + + if (one_of(aclMemoryShapes[ACLArgs::ACL_SRC_0].num_dimensions(), 3U, 4U)) { + aclMemoryShapes[ACLArgs::ACL_SRC_0] = arm_compute::TensorShape({ + aclMemoryShapes[ACLArgs::ACL_WEI][0], + aclMemoryShapes[ACLArgs::ACL_SRC_0].total_size() / aclMemoryShapes[ACLArgs::ACL_WEI][0]}); + } + + if (one_of(aclMemoryShapes[ACLArgs::ACL_DST].num_dimensions(), 3U, 4U)) { + aclMemoryShapes[ACLArgs::ACL_DST] = arm_compute::TensorShape({ + aclMemoryShapes[ACLArgs::ACL_WEI][1], + aclMemoryShapes[ACLArgs::ACL_SRC_0][1]}); + } + + std::swap(aclMemoryShapes[ACLArgs::ACL_WEI][0], aclMemoryShapes[ACLArgs::ACL_WEI][1]); +} + +arm_compute::Status 
acl_fc_executor::ACLWeightsConverter::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { + return arm_compute::NECast::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get(), + arm_compute::ConvertPolicy::SATURATE); +} + +ACLFunction acl_fc_executor::ACLWeightsConverter::configureFunction(const ACLTensors &aclMemoryTensors) { + auto neCast = std::make_unique<arm_compute::NECast>(); + neCast->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_DST].get(), + arm_compute::ConvertPolicy::SATURATE); + return neCast; +} + + +arm_compute::Status acl_fc_executor::ACLWeightsTranspose::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { + return arm_compute::NETranspose::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get()); +} + +ACLFunction acl_fc_executor::ACLWeightsTranspose::configureFunction(const ACLTensors &aclMemoryTensors) { + auto neTranspose = std::make_unique<arm_compute::NETranspose>(); + neTranspose->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_DST].get()); + return neTranspose; +} + +acl_fc_executor::ACLWeightFormatGenerator::ACLWeightFormatGenerator(const FCAttrs &attrs, + const PostOps &postOps, + const MemoryArgs &memory) { + initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps); +} + +void acl_fc_executor::ACLWeightFormatGenerator::updateTensorsShapes(ACLShapes &aclMemoryShapes) { + updateFCTensorsShapes(aclMemoryShapes); +} + +arm_compute::Status acl_fc_executor::ACLWeightFormatGenerator::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { + if (aclfcAttrs.isConvertedWeights) { + aclMemoryInfos[ACLArgs::ACL_WEI]->set_data_type(aclMemoryInfos[ACLArgs::ACL_SRC_0]->data_type()); + } + return arm_compute::NEFullyConnectedLayer::has_opt_impl( + expectedWeightFormat, + aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_WEI].get(), + aclMemoryInfos[ACLArgs::ACL_BIAS].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get(), + fullyConnectedLayerInfo, + weightsInfo); +} + +ACLFunction acl_fc_executor::ACLWeightFormatGenerator::configureFunction(const ACLTensors &aclMemoryTensors) { + return std::make_unique<arm_compute::NEFullyConnectedLayer>(); +} + +arm_compute::Status acl_fc_executor::ACLWeightsReorder::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { +#if defined(OPENVINO_ARCH_ARM64) + return arm_compute::NEReorderLayer::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get(), + inWeightFormat, + outWeightFormat); +#else + return arm_compute::NECopy::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get()); +#endif +} + +ACLFunction acl_fc_executor::ACLWeightsReorder::configureFunction(const ACLTensors &aclMemoryTensors) { +#if defined(OPENVINO_ARCH_ARM64) + auto neReorderLayer = std::make_unique<arm_compute::NEReorderLayer>(); + neReorderLayer->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_DST].get(), + inWeightFormat, + outWeightFormat); + return neReorderLayer; +#else + auto neCopy = std::make_unique<arm_compute::NECopy>(); + neCopy->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_DST].get()); + return neCopy; +#endif +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.hpp new file mode 100644 index 00000000000000..6f558ecf053bca --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.hpp @@ -0,0 +1,82 @@
+// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "acl_common_executor.hpp" +#include "nodes/executors/fullyconnected_config.hpp" + +namespace ov { +namespace intel_cpu { + +struct ACLFCAttrs { + ov::element::Type inputPrecision; + bool isConvertedWeights = false; + bool weightsNonTransposed; +}; + +namespace acl_fc_executor { + +VectorDims makeDummyInputDims(const Shape& inShape, const Shape& wShape); + +VectorDims makeDummyOutputDims(const VectorDims& inShape, const VectorDims& wShape, const size_t out_rank); + +MemoryPtr prepareWeightMemory(const MemoryArgs &memory, + const ExecutorContext::CPtr context, + const FCAttrs &attrs, + const ACLFCAttrs& aclfcAttrs, + const PostOps &postOps); + +void updateFCTensorsShapes(ACLShapes& aclMemoryShapes); + +class ACLWeightsConverter : public ACLCommonExecutor { +public: + ACLWeightsConverter() = default; + void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} + arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; + ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; +}; + +class ACLWeightsTranspose : public ACLCommonExecutor { +public: + ACLWeightsTranspose() = default; + void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} + arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; + ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; +}; + +class ACLWeightFormatGenerator : public ACLCommonExecutor { +public: + ACLWeightFormatGenerator(const FCAttrs& attrs, + const PostOps& postOps, + const MemoryArgs& memory); + void updateTensorsShapes(ACLShapes& aclMemoryShapes) override; + arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; + ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; + arm_compute::WeightFormat getOptImplWeightFormat() { + return expectedWeightFormat; + } +private: + arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo; + arm_compute::WeightsInfo weightsInfo; + ACLFCAttrs aclfcAttrs; + arm_compute::WeightFormat expectedWeightFormat; +}; + +class ACLWeightsReorder : public ACLCommonExecutor { +public: + ACLWeightsReorder(arm_compute::WeightFormat inWeightFormat, + arm_compute::WeightFormat outWeightFormat) + : inWeightFormat(inWeightFormat), outWeightFormat(outWeightFormat) {} + void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} + arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; + ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; +private: + arm_compute::WeightFormat inWeightFormat; + arm_compute::WeightFormat outWeightFormat; +}; + +} // namespace acl_fc_executor +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/CMakeLists.txt b/src/plugins/intel_cpu/tests/functional/CMakeLists.txt index c5ab0f6c5ede57..40a4fc4a1739c4 100644 --- a/src/plugins/intel_cpu/tests/functional/CMakeLists.txt +++ b/src/plugins/intel_cpu/tests/functional/CMakeLists.txt @@ -59,7 +59,7 @@ if(NOT (ARM OR AARCH64)) ${CMAKE_CURRENT_SOURCE_DIR}/custom/single_layer_tests/instances/arm ${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src/arm ${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/snippets/arm - ${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/low_precision_transformations/arm + ${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/low_precision_transformations/aarch64 
${CMAKE_CURRENT_SOURCE_DIR}/utils/arm) else() # temporary disable all custom tests for ARM diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fully_connected_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fully_connected_transformation.cpp index ed9b51d141b59c..e351fb607c6e8b 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fully_connected_transformation.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/x64/fully_connected_transformation.cpp @@ -14,7 +14,7 @@ const std::vector netPrecisions = { ov::element::f32 }; -const std::vector shapes = { +const std::vector shapes = { { ov::PartialShape{ 1, 16 }, ov::PartialShape{ 16, 8 }, @@ -39,14 +39,35 @@ const std::vector trasform LayerTestsUtils::LayerTransformationParamsNGraphFactory::createParams() }; +const std::vector activations = { + { + true, // activation + false, // per-channel + true, // FQ + "fullyconnected,relu_original,relu" + }, + { + false, // activation + false, // per-channel + true, // FQ + "fullyConnected_original,fullyConnected" + }, + { + true, // activation + true, // per-channel + false, // FQ + "fullyconnected,relu_original,relu" + }, +}; + INSTANTIATE_TEST_SUITE_P(smoke_LPT, FullyConnectedTransformation, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::ValuesIn(shapes), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::ValuesIn(trasformationParamValues), - ::testing::ValuesIn({ov::element::i8, ov::element::u8}), - ::testing::ValuesIn({true, false}), + ::testing::ValuesIn({ov::element::i8/*, ov::element::u8*/}), + ::testing::ValuesIn(activations), ::testing::Values("")), FullyConnectedTransformation::getTestCaseName); } // namespace diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp index 67f83c29968b51..bebac98993450e 100644 --- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp +++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp @@ -15,7 +15,7 @@ const std::vector netPrecisions = { ov::element::f16 }; -const std::vector shapes = { +const std::vector shapes = { { { 1, 16 }, { 16, 8 }, @@ -40,12 +40,35 @@ const std::vector trasform LayerTestsUtils::LayerTransformationParamsNGraphFactory::createParams() }; +const std::vector activations = { + { + true, // activation + false, // per-channel + true, // FQ + "" + }, + { + false, // activation + false, // per-channel + true, // FQ + "" + }, + { + true, // activation + true, // per-channel + false, // FQ + "" + }, +}; + INSTANTIATE_TEST_SUITE_P(smoke_LPT, FullyConnectedTransformation, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::ValuesIn(shapes), - ::testing::Values(ov::test::utils::DEVICE_GPU), + ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::ValuesIn(trasformationParamValues), - ::testing::ValuesIn({ov::element::i8, ov::element::u8})), + ::testing::ValuesIn({ov::element::i8/*, ov::element::u8*/}), + ::testing::ValuesIn(activations), + ::testing::Values("")), FullyConnectedTransformation::getTestCaseName); } // 
namespace diff --git a/src/tests/functional/plugin/shared/src/low_precision_transformations/fully_connected_transformation.cpp b/src/tests/functional/plugin/shared/src/low_precision_transformations/fully_connected_transformation.cpp index 071a9e3dd9b559..5de4424019ae4e 100644 --- a/src/tests/functional/plugin/shared/src/low_precision_transformations/fully_connected_transformation.cpp +++ b/src/tests/functional/plugin/shared/src/low_precision_transformations/fully_connected_transformation.cpp @@ -70,16 +70,20 @@ TEST_P(FullyConnectedTransformation, CompareWithRefImpl) { run(); const auto& activation = std::get<5>(GetParam()); - const auto originalLayersNames = get_property_by_type("FullyConnected", "originalLayersNames"); - EXPECT_EQ(ov::util::to_lower(activation.originalLayersNames), originalLayersNames); + if (!activation.originalLayersNames.empty()) { + const auto originalLayersNames = get_property_by_type("FullyConnected", "originalLayersNames"); + EXPECT_EQ(ov::util::to_lower(activation.originalLayersNames), originalLayersNames); + } const auto& actualPrecision = get_runtime_precision_by_type("FullyConnected"); const auto expectedPrecision = std::get<4>(GetParam()); EXPECT_EQ(actualPrecision, expectedPrecision.to_string()); const auto& expectedPrimitiveType = std::get<6>(GetParam()); - const std::string actualPrimitiveType = get_property_by_type("FullyConnected", "primitiveType"); - EXPECT_EQ(expectedPrimitiveType, actualPrimitiveType); + if (!expectedPrimitiveType.empty()) { + const std::string actualPrimitiveType = get_property_by_type("FullyConnected", "primitiveType"); + EXPECT_EQ(expectedPrimitiveType, actualPrimitiveType); + } }; } // namespace LayerTestsDefinitions From d4c30c6058f847cccc25d2610079ffa71d776f9e Mon Sep 17 00:00:00 2001 From: eshoguli Date: Fri, 16 Aug 2024 10:23:58 +0100 Subject: [PATCH 3/5] cleanup and refactoring --- .../executors/acl/acl_fullyconnected.hpp | 2 +- .../executors/acl/acl_lowp_fullyconnected.cpp | 21 +++++-------------- .../src/nodes/executors/acl/acl_weights.cpp | 6 ++---- .../fullyconnected_implementations.cpp | 3 +-- .../transformation_pipeline.cpp | 4 ---- .../mat_mul_transformation.hpp | 2 -- .../mat_mul_transformation.cpp | 21 +++++++------------ 7 files changed, 17 insertions(+), 42 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp index 4fb4703ba2742b..7bf7a47c3a7b39 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2024 Intel Corporation +// Copyright (C) 2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp index eb8af61898754d..cfb94b8bafd2bd 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp @@ -4,19 +4,16 @@ #include "acl_lowp_fullyconnected.hpp" +#include "acl_weights.hpp" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" - +#include "memory_desc/cpu_memory_desc_utils.h" +#include "nodes/common/cpu_convert.h" #include "nodes/executors/acl/acl_utils.hpp" #include "nodes/executors/executor.hpp" #include "nodes/executors/memory_arguments.hpp" -#include 
"utils/debug_capabilities.h" #include "nodes/executors/debug_messages.hpp" #include "nodes/executors/implementation_utils.hpp" -#include "acl_weights.hpp" -#include "acl_utils.hpp" - -#include "nodes/common/cpu_convert.h" -#include "memory_desc/cpu_memory_desc_utils.h" +#include "utils/debug_capabilities.h" namespace ov { namespace intel_cpu { @@ -41,9 +38,7 @@ static void initFCAttrs(const FCAttrs &attrs, arm_compute::GEMMInfo& fullyConnectedLayerInfo, const PostOps &postOps) { aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc); - //fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr()); aclfcAttrs.inputPrecision = memory.at(ARG_SRC)->getDescPtr()->getPrecision(); - //fullyConnectedLayerInfo.transpose_weights = false; aclfcAttrs.weightsNonTransposed = attrs.weightsNonTransposed; if (!postOps.empty()) { @@ -86,11 +81,6 @@ void ACLLowpFullyConnectedExecutor::updateTensorsShapes(ACLShapes& aclMemoryShap } arm_compute::Status ACLLowpFullyConnectedExecutor::validateTensorsInfo(const ACLInfos & aclMemoryInfos) { - // TODO: debug only - //const auto src0 = aclMemoryInfos[ACLArgs::ACL_SRC_0].get(); - //const auto src1 = aclMemoryInfos[ACLArgs::ACL_WEI].get(); - //const auto dst = aclMemoryInfos[ACLArgs::ACL_DST].get(); - auto &tensor_info = aclMemoryInfos[ACLArgs::ACL_SRC_0]; if (dequantizationScales.empty()) { tensor_info->set_quantization_info(arm_compute::QuantizationInfo(1.f)); @@ -104,7 +94,7 @@ arm_compute::Status ACLLowpFullyConnectedExecutor::validateTensorsInfo(const ACL const auto matMulValid = arm_compute::NEGEMMLowpMatrixMultiplyCore::validate( aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), aclMemoryInfos[ACLArgs::ACL_WEI].get(), - nullptr, //aclMemoryInfos[ACLArgs::ACL_BIAS].get(), + aclMemoryInfos[ACLArgs::ACL_BIAS].get(), aclMemoryInfos[ACLArgs::ACL_DST].get(), gemmInfo); return matMulValid; @@ -126,7 +116,6 @@ ACLFunction ACLLowpFullyConnectedExecutor::configureFunction(const ACLTensors & return gemm; } -// TODO: move to ACLLowpExecutor std::shared_ptr ACLLowpFullyConnectedExecutor::initTensorInfo( const arm_compute::TensorShape& tensorShape, const arm_compute::DataType& dataType, diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.cpp index 819be7ef057dce..27e1ba2194f4de 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.cpp @@ -4,13 +4,11 @@ #include "acl_fullyconnected.hpp" #include "acl_utils.hpp" +#include "nodes/common/cpu_convert.h" #include "nodes/executors/executor.hpp" #include "nodes/executors/memory_arguments.hpp" -#include "utils/debug_capabilities.h" -#include "nodes/executors/debug_messages.hpp" -#include "nodes/executors/implementation_utils.hpp" -#include "nodes/common/cpu_convert.h" #include "memory_desc/cpu_memory_desc_utils.h" +#include "utils/debug_capabilities.h" namespace ov { namespace intel_cpu { diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp index 92b721726a9e2f..3280321ea722b0 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp @@ -90,7 +90,7 @@ static const TypeMapping aclFCTypeMapping { static const TypeMapping aclLowpFCTypeMapping { // {src, 
wei, bia, dst} pt - {{_i8, _i8, _any, _f32}, pt(just(), just(), just(), just())} + {{_i8, _i8, _any, _f32}, pt(just(), just(), just(), just())} }; static const MappingNotation dnnlConvolutionMappingNotation { @@ -396,7 +396,6 @@ const std::vector>& getImplementations() { }, // acceptsShapes [](const MemoryArgs& memory) -> bool { - // @todo create syntactic sugar (functor) for shape agnostic lambda return true; }, // create diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 9215906925b46e..38649b2906e9e3 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -725,11 +725,7 @@ void Transformations::Lpt(const std::vector& defaultPrecision }), PrecisionsRestriction::create({ {{0}, {ov::element::u8, ov::element::i8}}, -#if defined(OPENVINO_ARCH_ARM64) - {{1}, {ov::element::u8, ov::element::i8}} -#else {{1}, {ov::element::i8}} -#endif }), PrecisionsRestriction::create({ {{0, 1}, {ov::element::u8}} diff --git a/src/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_transformation.hpp b/src/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_transformation.hpp index 74dd7450130360..3d394c1e45674c 100644 --- a/src/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_transformation.hpp +++ b/src/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_transformation.hpp @@ -19,10 +19,8 @@ class MatMulTransformationTestValues { ov::builder::subgraph::FakeQuantizeOnData fqOnData1; ov::Shape inputShape2; ov::builder::subgraph::FakeQuantizeOnData fqOnData2; - // TODO: remove, not used std::string expectedKernelName; std::string expectedRuntimePrecision; - bool requantization; }; typedef std::tuple< diff --git a/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp b/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp index d1d4ea94c6b01b..bc1ce628deb245 100644 --- a/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp +++ b/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp @@ -27,11 +27,10 @@ std::string MatMulTransformation::getTestCaseName(const testing::TestParamInfo(GetParam()); - EXPECT_EQ(expected.expectedRuntimePrecision, actualType); + const auto params = std::get<3>(GetParam()); + const auto actualType = get_runtime_precision(params.expectedKernelName); - const auto& actualPrimitiveType = get_property_by_type("MatMul", "primitiveType"); - const auto expectedPrimitiveType = "gemm_acl_i8"; - EXPECT_EQ(expectedPrimitiveType, actualPrimitiveType); + EXPECT_EQ(actualType, params.expectedRuntimePrecision); } TEST_P(MatMulTransformation, CompareWithRefImpl) { From 5dbd319b518060f817e1f99d81d8734e66f063af Mon Sep 17 00:00:00 2001 From: Aleksandr Voron Date: Thu, 14 Nov 2024 17:12:52 +0100 Subject: [PATCH 4/5] address comments #1 --- .../src/nodes/executors/acl/acl_fullyconnected.hpp | 2 +- ...{acl_weights.cpp => acl_fullyconnected_utils.cpp} | 0 ...{acl_weights.hpp => acl_fullyconnected_utils.hpp} | 0 .../nodes/executors/acl/acl_lowp_fullyconnected.cpp | 2 +- .../nodes/executors/acl/acl_lowp_fullyconnected.hpp | 2 +- .../executors/fullyconnected_implementations.cpp | 2 +- .../fully_connected_transformation.cpp | 2 +- 
.../layer_transformation.cpp | 12 ++++++------ .../ov_lpt_models/include/ov_lpt_models/mat_mul.hpp | 2 +- src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp | 4 ++-- 10 files changed, 14 insertions(+), 14 deletions(-) rename src/plugins/intel_cpu/src/nodes/executors/acl/{acl_weights.cpp => acl_fullyconnected_utils.cpp} (100%) rename src/plugins/intel_cpu/src/nodes/executors/acl/{acl_weights.hpp => acl_fullyconnected_utils.hpp} (100%) diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp index 7bf7a47c3a7b39..2bdbd9f163bc50 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp @@ -6,7 +6,7 @@ #include "acl_common_executor.hpp" #include "nodes/executors/fullyconnected_config.hpp" -#include "acl_weights.hpp" +#include "acl_fullyconnected_utils.hpp" namespace ov { namespace intel_cpu { diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.cpp similarity index 100% rename from src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.cpp rename to src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.cpp diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.hpp similarity index 100% rename from src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.hpp rename to src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.hpp diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp index cfb94b8bafd2bd..07a288edd57cb3 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp @@ -4,7 +4,7 @@ #include "acl_lowp_fullyconnected.hpp" -#include "acl_weights.hpp" +#include "acl_fullyconnected_utils.hpp" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/common/cpu_convert.h" diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp index 4d58c7908e61c4..d69c6fc438896d 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp @@ -6,7 +6,7 @@ #include "acl_common_executor.hpp" #include "nodes/executors/fullyconnected_config.hpp" -#include "acl_weights.hpp" +#include "acl_fullyconnected_utils.hpp" namespace ov { namespace intel_cpu { diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp index 3280321ea722b0..fffb6c46173383 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp @@ -90,7 +90,7 @@ static const TypeMapping aclFCTypeMapping { static const TypeMapping aclLowpFCTypeMapping { // {src, wei, bia, dst} pt - {{_i8, _i8, _any, _f32}, pt(just(), just(), just(), just())} + {{_i8, _i8, _any, _f32}, pt(bypass(), bypass(), just(), bypass())} }; static const 
MappingNotation dnnlConvolutionMappingNotation { diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp index bebac98993450e..4146e370cd1165 100644 --- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp +++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp @@ -65,7 +65,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_LPT, FullyConnectedTransformation, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::ValuesIn(shapes), - ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(ov::test::utils::DEVICE_GPU), ::testing::ValuesIn(trasformationParamValues), ::testing::ValuesIn({ov::element::i8/*, ov::element::u8*/}), ::testing::ValuesIn(activations), diff --git a/src/tests/functional/shared_test_classes/src/base/low_precision_transformations/layer_transformation.cpp b/src/tests/functional/shared_test_classes/src/base/low_precision_transformations/layer_transformation.cpp index 0a5e59b9f23e66..a1781c5826e590 100644 --- a/src/tests/functional/shared_test_classes/src/base/low_precision_transformations/layer_transformation.cpp +++ b/src/tests/functional/shared_test_classes/src/base/low_precision_transformations/layer_transformation.cpp @@ -61,7 +61,7 @@ std::string LayerTransformation::get_test_case_name_by_params( namespace { template -std::string find_node_by_runtime_precision(const ov::CompiledModel& execNet, IsNodeF is_node_f, const std::string& propertyName = "runtimePrecision") { +std::string find_node_by_runtime_property(const ov::CompiledModel& execNet, IsNodeF is_node_f, const std::string& propertyName = "runtimePrecision") { const std::shared_ptr& execFunction = execNet.get_runtime_model(); for (const auto& op : execFunction->get_ops()) { @@ -69,7 +69,7 @@ std::string find_node_by_runtime_precision(const ov::CompiledModel& execNet, IsN continue; const ov::RTMap& rtInfo = op->get_rt_info(); const auto& it = rtInfo.find(propertyName); - OPENVINO_ASSERT(it != rtInfo.end(), "Runtime precision is not found for node: ", op->get_friendly_name()); + OPENVINO_ASSERT(it != rtInfo.end(), "Runtime property \"", propertyName, "\" is not found for node: ", op->get_friendly_name()); return it->second.as(); } @@ -81,7 +81,7 @@ std::string LayerTransformation::get_runtime_precision(const std::string& layerN auto is_node_f = [layerName](const std::shared_ptr& op) { return op->get_friendly_name() == layerName; }; - return find_node_by_runtime_precision(compiledModel, is_node_f); + return find_node_by_runtime_property(compiledModel, is_node_f); } std::string LayerTransformation::get_runtime_precision_by_type(const std::string& layerType) { @@ -92,7 +92,7 @@ std::string LayerTransformation::get_runtime_precision_by_type(const std::string OPENVINO_ASSERT(typeIt != rtInfo.end(), "Layer is not found for type: ", layerType); return typeIt->second.as() == layerType; }; - return find_node_by_runtime_precision(compiledModel, is_node_f); + return find_node_by_runtime_property(compiledModel, is_node_f); } std::string LayerTransformation::get_property_by_type(const std::string& layerTypeName, const std::string& propertyName) { @@ -103,7 +103,7 @@ std::string LayerTransformation::get_property_by_type(const std::string& layerTy 
OPENVINO_ASSERT(typeIt != rtInfo.end(), "Layer is not found for type: ", layerTypeName); return typeIt->second.as() == layerTypeName; }; - return ov::util::to_lower(find_node_by_runtime_precision(compiledModel, is_node_f, propertyName)); + return ov::util::to_lower(find_node_by_runtime_property(compiledModel, is_node_f, propertyName)); } namespace { @@ -128,7 +128,7 @@ std::string LayerTransformation::get_runtime_precision_by_fused_name(const std:: OPENVINO_ASSERT(nameIt != rtInfo.end(), "originalLayersNames is not found for node: ", layerName); return has_layer(nameIt->second.as(), layerName); }; - return find_node_by_runtime_precision(compiledModel, is_node_f); + return find_node_by_runtime_property(compiledModel, is_node_f); } bool LayerTransformation::check_execution_order(const std::vector& orderedOpsTypes) { diff --git a/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp b/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp index b4a304a58dd5d0..41a22f50858f2e 100644 --- a/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp +++ b/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp @@ -29,7 +29,7 @@ class MatMulFunction { const bool transpose1, const bool transpose2, const bool signedWeights, - const bool perChannelWeights, + const bool perChannelWeightsDequantization , const bool relu, const bool fq); diff --git a/src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp b/src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp index e4a44b4b6a70ed..5f8d6086e98849 100644 --- a/src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp +++ b/src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp @@ -79,7 +79,7 @@ std::shared_ptr MatMulFunction::getOriginal( const bool transpose1, const bool transpose2, const bool signedOnWeights, - const bool perChannelWeights, + const bool perChannelWeightsDequantization, const bool relu, const bool fq) { const auto paramNode = std::make_shared(precision, inputShape1); @@ -103,7 +103,7 @@ std::shared_ptr MatMulFunction::getOriginal( inputShape2.to_shape(), generate_values(inputShape2.to_shape(), 10.f)); - parentOnWeights = perChannelWeights ? + parentOnWeights = perChannelWeightsDequantization ? 
ov::test::utils::make_fake_quantize( weightsConst, precision, 256ul, Shape{channel, 1}, From e6d4880e489d4beb4635b09599542f5e8b90cd7a Mon Sep 17 00:00:00 2001 From: Aleksandr Voron Date: Thu, 14 Nov 2024 20:52:31 +0100 Subject: [PATCH 5/5] fixed build --- .../executors/acl/acl_fullyconnected.cpp | 284 +-------------- .../executors/acl/acl_fullyconnected.hpp | 36 -- .../acl/acl_fullyconnected_utils.cpp | 330 +++++++++++++++++- .../acl/acl_fullyconnected_utils.hpp | 101 +++++- .../executors/acl/acl_lowp_fullyconnected.cpp | 2 +- .../executors/acl/acl_lowp_fullyconnected.hpp | 3 +- .../fullyconnected_implementations.cpp | 2 +- .../include/ov_lpt_models/mat_mul.hpp | 2 +- 8 files changed, 433 insertions(+), 327 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp index ea5cdc5507a726..124e972e9fbdca 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp @@ -7,6 +7,7 @@ #include "ov_optional.hpp" #include "acl_fullyconnected.hpp" +#include "acl_fullyconnected_utils.hpp" #include "acl_utils.hpp" #include "nodes/executors/executor.hpp" #include "nodes/executors/memory_arguments.hpp" @@ -22,232 +23,6 @@ namespace ov { namespace intel_cpu { -static VectorDims makeDummyInputDims(const Shape& inShape, const Shape& wShape) { - const auto& weightDims = wShape.getStaticDims(); - - auto inMinDims = inShape.getMinDims(); - auto inMaxDims = inShape.getMaxDims(); - inMinDims.back() = weightDims.back(); - inMaxDims.back() = weightDims.back(); - - return MemoryDescUtils::makeDummyShape(Shape(inMinDims, inMaxDims)).getStaticDims(); -} - -static VectorDims makeDummyOutputDims(const VectorDims& inShape, const VectorDims& wShape, const size_t out_rank) { - size_t activationRank = inShape.size(); - size_t channelRank = wShape.size() - 1; - // activation weight output_shape - // NCHW CoCHW NCo - // TNC CoC TNCo - // NC CoC NCo - VectorDims outputShape(out_rank, 1); - // set Co - outputShape.back() = wShape[0]; - // set batch dims - size_t batchRank = activationRank - channelRank; - size_t startIdx = out_rank - batchRank - 1; - for (size_t i = 0; i < batchRank; i++) { - outputShape[i + startIdx] = inShape[i]; - } - - return outputShape; -} - -static DnnlMemoryDescPtr makeTransposedWeightDescriptor(const DnnlMemoryDescPtr srcDesc, - const DnnlMemoryDescPtr dstDesc) { - const auto& weiDesc = srcDesc->getDnnlDesc(); - const auto reorderedWeiDesc = dnnl::memory::desc{weiDesc.get_dims(), weiDesc.get_data_type(), dnnl::memory::format_tag::ba}; - const auto transposedWeiDesc = reorderedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims()); - - return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc); -} - -static ov::optional convertWeightPrecision(MemoryPtr input, MemoryPtr output, ov::element::Type weightPrecision) { - MemoryArgs memoryArgs; - memoryArgs[ARG_SRC] = input; - memoryArgs[ARG_DST] = output; - - auto aclWeightsConverter = std::make_shared(); - if (aclWeightsConverter->update(memoryArgs)) { - aclWeightsConverter->execute(memoryArgs); - return ov::optional(memoryArgs.at(ARG_DST)); - } - - if (!node::Convert::isSupportedDesc(input->getDesc()) || - !node::Convert::isSupportedDesc(output->getDesc())) { - return {}; - } - - auto data = static_cast(input->getData()); - std::vector tmpBuff; - tmpBuff.resize(output->getSize()); - cpu_convert(data, tmpBuff.data(), 
DnnlExtensionUtils::DataTypeToElementType(input->getDataType()), - weightPrecision, input->getSize() / input->getDesc().getPrecision().size()); - - return ov::optional(std::make_shared(output->getPrimitive().get_engine(), - output->getDesc().cloneWithNewPrecision(weightPrecision), - tmpBuff.data())); -} - -static ov::optional reorderDataFallback(MemoryPtr input, MemoryPtr output, ExecutorContext::CPtr context) { - if (output->getDataType() == input->getDataType()) { - return {}; - } - const auto inPrc = DnnlExtensionUtils::DataTypeToElementType(input->getDataType()); - auto convertedDstMemoryDesc = output->getDesc().cloneWithNewPrecision(inPrc); - dnnl::reorder reorderWithoutConvert = getReorderPrim(context->getRuntimeCache(), - output->getPrimitive().get_engine(), - input->getPrimitive().get_desc(), - MemoryDescUtils::convertToDnnlMemoryDesc(convertedDstMemoryDesc)->getDnnlDesc()); - - if (reorderWithoutConvert && parse_impl_name(reorderWithoutConvert.get_primitive_desc()->impl()->name()) != ref_any) { - auto convertOutput = convertWeightPrecision(input, output, inPrc); - if (!convertOutput) { - return {}; - } - input = *convertOutput; - - if (reorderWithoutConvert) { - dnnl::stream loc_stream(output->getPrimitive().get_engine(), dnnl::stream::flags::in_order); - reorderWithoutConvert.execute(loc_stream, {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}}); - return ov::optional(output); - } - } - return {}; -} - -static MemoryPtr reorderData(DnnlMemoryDescPtr srcWeightDesc, - DnnlMemoryDescPtr dstWeightDesc, - MemoryCPtr weightsMem, - ExecutorContext::CPtr context) { - MemoryPtr input = std::make_shared(context->getEngine(), srcWeightDesc, weightsMem->getData()); - MemoryPtr output = std::make_shared(context->getEngine(), dstWeightDesc); - if (!input->getDesc().isDefined() || !output->getDesc().isDefined()) - OPENVINO_THROW("Can't reorder data with dynamic shapes"); - - if (input->getShape().hasZeroDims() || output->getShape().hasZeroDims()) { - return output; - } - - if (input->getDesc().isCompatible(output->getDesc())) { - auto srcPtr = static_cast(input->getData()); - auto dstPtr = static_cast(output->getData()); - auto copySize = output->getSize(); - cpu_memcpy(dstPtr, srcPtr, copySize); - return output; - } - - // try directly reorder - auto engine = output->getPrimitive().get_engine(); - dnnl::reorder directReorder = getReorderPrim(context->getRuntimeCache(), - engine, - input->getPrimitive().get_desc(), - output->getPrimitive().get_desc()); - - if (!directReorder || parse_impl_name(directReorder.get_primitive_desc()->impl()->name()) == ref_any) { - // try precision conversion then do the reorder - auto fallbackOutput = reorderDataFallback(input, output, context); - if (fallbackOutput) { - return *fallbackOutput; - } - } - // if precision conversion does not work then do direct reference reorder - if (directReorder) { - dnnl::stream loc_stream(engine, dnnl::stream::flags::in_order); - directReorder.execute(loc_stream, {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}}); - } else { - OPENVINO_THROW("Could not make onednn reorder."); - } - return output; -} - -static MemoryPtr reorderWeights(const MemoryArgs &memory, - const ExecutorContext::CPtr context, - ACLFCAttrs& aclfcAttrs, - DnnlMemoryDescPtr dnnlSrcDesc, - DnnlMemoryDescPtr dnnlDstDesc) { - auto create = [&]() { - MemoryPtr weightsMemory = memory.at(ARG_WEI); - if (aclfcAttrs.isWeightsRepacked || aclfcAttrs.isConvertedWeights) { - weightsMemory = 
reorderData(dnnlSrcDesc, dnnlDstDesc, memory.at(ARG_WEI), context); - DEBUG_LOG("ACLFullyConnectedExecutor: cache miss, perform packing"); - } - return weightsMemory; - }; - - auto weightCache = context->getWeightsCache(); - if (weightCache != nullptr) { - const auto& wgtDims = memory.at(ARG_WEI)->getStaticDims(); - const auto N = wgtDims[0]; - const auto K = wgtDims[1]; - std::string format = "fc_acl_" + std::to_string(N) + "_" + std::to_string(K); - const std::string string_hash = format + "_" + std::to_string(memory.at(ARG_WEI)->getSize()) + "_" + - std::to_string(reinterpret_cast(memory.at(ARG_WEI)->getData())); - DEBUG_LOG("ACLFullyConnectedExecutor: findOrCreate, string_hash: ", string_hash); - return *weightCache->findOrCreate(string_hash, create); - } - - DEBUG_LOG("ACLFullyConnectedExecutor: Weights cache is not available"); - return create(); -} - -static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, - const ExecutorContext::CPtr context, - const FCAttrs &attrs, - ACLFCAttrs& aclfcAttrs, - const PostOps &postOps, - arm_compute::WeightFormat& expectedWeightFormat, - arm_compute::TensorInfo& weiTensorInfo) { - MemoryArgs memoryArgs; - memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS); - memoryArgs[ARG_WEI] = memory.at(ARG_WEI); - if (memory.at(ARG_SRC_0)->getShape().isDynamic()) { - const auto& inShape = memory.at(ARG_SRC_0)->getShape(); - const auto& wShape = memory.at(ARG_WEI)->getShape(); - const auto& inDymmyDims = makeDummyInputDims(inShape, wShape); - const auto& outDymmyDims = makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank()); - memoryArgs[ARG_SRC_0] = std::make_shared(context->getEngine(), - memory.at(ARG_SRC_0)->getDescPtr()->cloneWithNewDims(inDymmyDims)); - memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), - memory.at(ARG_DST)->getDescPtr()->cloneWithNewDims(outDymmyDims)); - } else { - memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0); - memoryArgs[ARG_DST] = memory.at(ARG_DST); - } - // TODO: ACLWeightFormatGenerator should be replaced with Reorder executor - // that calls ACL NEReorder + NETranspose or dnnl::reorder depending on backend availability - auto aclWeightsRepack = std::make_shared(attrs, postOps, memoryArgs); - bool isNeededReorder = aclWeightsRepack->update(memoryArgs); - expectedWeightFormat = isNeededReorder ? 
aclWeightsRepack->getOptImplWeightFormat() : arm_compute::WeightFormat::UNSPECIFIED; - weiTensorInfo = aclWeightsRepack->getTensorInfo(ACLArgs::ACL_WEI); - - MemoryPtr dstMemPtr = std::make_shared(context->getEngine(), - memory.at(ARG_WEI)->getDescPtr()->cloneWithNewPrecision(aclfcAttrs.inputPrecision)); - auto dstDesc = dstMemPtr->getDescPtr(); - auto dnnlDstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(dstDesc); - auto weiDesc = memory.at(ARG_WEI)->getDescPtr(); - auto dnnlSrcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(weiDesc); - - if (isNeededReorder) { - dnnl::impl::dim_t o_dim = 0; - dnnl::impl::dim_t inner_dim = 1; - std::vector remaining_dims = {}; - auto weights_md_ = dnnlDstDesc->getDnnlDesc().get(); - dnnl::impl::cpu::acl::acl_utils::reorder_to_weight_format(weiTensorInfo, *weights_md_, expectedWeightFormat, - inner_dim, o_dim, remaining_dims, {}); - if (aclfcAttrs.weightsNonTransposed) { - dnnlSrcDesc = makeTransposedWeightDescriptor(dnnlSrcDesc, dnnlDstDesc); - } - aclfcAttrs.isWeightsRepacked = true; - return reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc); - } - if (!aclfcAttrs.weightsNonTransposed) { - dnnlDstDesc = makeTransposedWeightDescriptor(dnnlDstDesc, dnnlSrcDesc); - aclfcAttrs.isWeightsRepacked = true; - } - return reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc); -} - static bool checkPostOps(const PostOps &postOps) { if (postOps.empty()) { return true; @@ -292,7 +67,7 @@ ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, const MemoryArgs &memory, const ExecutorContext::CPtr context) { initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps); - packedWeights = prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps, expectedWeightFormat, weiTensorInfo); + packedWeights = acl_fc_executor::prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps, expectedWeightFormat, weiTensorInfo); } bool ACLFullyConnectedExecutor::supports(const FCConfig &config) { @@ -305,18 +80,6 @@ bool ACLFullyConnectedExecutor::supports(const FCConfig &config) { return true; } -static arm_compute::TensorShape normalizeDimsTo2D(const arm_compute::TensorShape shape) { - size_t norm_dim = std::accumulate(shape.begin() + 1, shape.end(), 1, std::multiplies()); - return arm_compute::TensorShape(shape[0], norm_dim); -} - -static void updateFCTensorsShapes(ACLShapes& aclMemoryShapes) { - aclMemoryShapes[ACLArgs::ACL_WEI] = normalizeDimsTo2D(aclMemoryShapes[ACLArgs::ACL_WEI]); - aclMemoryShapes[ACLArgs::ACL_SRC_0] = normalizeDimsTo2D(aclMemoryShapes[ACLArgs::ACL_SRC_0]); - aclMemoryShapes[ACLArgs::ACL_DST] = normalizeDimsTo2D(aclMemoryShapes[ACLArgs::ACL_DST]); - std::swap(aclMemoryShapes[ACLArgs::ACL_WEI][0], aclMemoryShapes[ACLArgs::ACL_WEI][1]); -} - void ACLFullyConnectedExecutor::updateTensorsShapes(ACLShapes& aclMemoryShapes) { acl_fc_executor::updateFCTensorsShapes(aclMemoryShapes); } @@ -358,48 +121,5 @@ ACLFunction ACLFullyConnectedExecutor::configureFunction(const ACLTensors & aclM return neFC; } -arm_compute::Status acl_fc_executor::ACLWeightsConverter::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { - return arm_compute::NECast::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get(), - arm_compute::ConvertPolicy::SATURATE); -} - -ACLFunction acl_fc_executor::ACLWeightsConverter::configureFunction(const ACLTensors &aclMemoryTensors) { - auto neCast = std::make_unique(); - 
neCast->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get(), - arm_compute::ConvertPolicy::SATURATE); - return neCast; -} - -acl_fc_executor::ACLWeightFormatGenerator::ACLWeightFormatGenerator(const FCAttrs &attrs, - const PostOps &postOps, - const MemoryArgs &memory) { - initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps); -} - -void acl_fc_executor::ACLWeightFormatGenerator::updateTensorsShapes(ACLShapes &aclMemoryShapes) { - updateFCTensorsShapes(aclMemoryShapes); -} - -arm_compute::Status acl_fc_executor::ACLWeightFormatGenerator::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { - if (aclfcAttrs.isConvertedWeights) { - aclMemoryInfos[ACLArgs::ACL_WEI]->set_data_type(aclMemoryInfos[ACLArgs::ACL_SRC_0]->data_type()); - } - int icTotal = aclMemoryInfos[ACLArgs::ACL_SRC_0]->dimension(0); - return arm_compute::NEFullyConnectedLayer::has_opt_impl( - expectedWeightFormat, - aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_WEI].get(), - aclMemoryInfos[ACLArgs::ACL_BIAS].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get(), - fullyConnectedLayerInfo, - arm_compute::WeightsInfo(false, 1, 1, icTotal, false, arm_compute::WeightFormat::ANY)); -} - -ACLFunction acl_fc_executor::ACLWeightFormatGenerator::configureFunction(const ACLTensors &aclMemoryTensors) { - return std::make_unique(); -} - } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp index 8d03cf50d48ff3..4db9b95031c803 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp @@ -11,42 +11,6 @@ namespace ov { namespace intel_cpu { -struct ACLFCAttrs { - ov::element::Type inputPrecision; - bool isConvertedWeights = false; - bool isWeightsRepacked = false; - bool weightsNonTransposed; -}; - -namespace acl_fc_executor { - -class ACLWeightsConverter : public ACLCommonExecutor { -public: - ACLWeightsConverter() = default; - void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; -}; - -class ACLWeightFormatGenerator : public ACLCommonExecutor { -public: - ACLWeightFormatGenerator(const FCAttrs& attrs, - const PostOps& postOps, - const MemoryArgs& memory); - void updateTensorsShapes(ACLShapes& aclMemoryShapes) override; - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; - arm_compute::WeightFormat getOptImplWeightFormat() { - return expectedWeightFormat; - } -private: - arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo; - ACLFCAttrs aclfcAttrs; - arm_compute::WeightFormat expectedWeightFormat; -}; - -} // namespace acl_fc_executor - class ACLFullyConnectedExecutor : public ACLCommonExecutor { public: ACLFullyConnectedExecutor(const FCAttrs& attrs, diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.cpp index f2f48238996dec..2df8c449b00c8f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.cpp 
@@ -1,18 +1,249 @@
 // Copyright (C) 2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-
 #include "acl_fullyconnected.hpp"
 #include "acl_utils.hpp"
+#include "nodes/convert.h"
 #include "nodes/common/cpu_convert.h"
+#include "nodes/common/cpu_memcpy.h"
+#include "nodes/common/reorder_prim.h"
 #include "nodes/executors/executor.hpp"
 #include "nodes/executors/memory_arguments.hpp"
 #include "memory_desc/cpu_memory_desc_utils.h"
 #include "utils/debug_capabilities.h"
+#include
+#include
+
 namespace ov {
 namespace intel_cpu {
+VectorDims acl_fc_executor::makeDummyInputDims(const Shape& inShape, const Shape& wShape) {
+    const auto& weightDims = wShape.getStaticDims();
+
+    auto inMinDims = inShape.getMinDims();
+    auto inMaxDims = inShape.getMaxDims();
+    inMinDims.back() = weightDims.back();
+    inMaxDims.back() = weightDims.back();
+
+    return MemoryDescUtils::makeDummyShape(Shape(inMinDims, inMaxDims)).getStaticDims();
+}
+
+VectorDims acl_fc_executor::makeDummyOutputDims(const VectorDims& inShape, const VectorDims& wShape, const size_t out_rank) {
+    size_t activationRank = inShape.size();
+    size_t channelRank = wShape.size() - 1;
+    // activation   weight    output_shape
+    // NCHW         CoCHW     NCo
+    // TNC          CoC       TNCo
+    // NC           CoC       NCo
+    VectorDims outputShape(out_rank, 1);
+    // set Co
+    outputShape.back() = wShape[0];
+    // set batch dims
+    size_t batchRank = activationRank - channelRank;
+    size_t startIdx = out_rank - batchRank - 1;
+    for (size_t i = 0; i < batchRank; i++) {
+        outputShape[i + startIdx] = inShape[i];
+    }
+
+    return outputShape;
+}
+
+DnnlMemoryDescPtr acl_fc_executor::makeTransposedWeightDescriptor(const DnnlMemoryDescPtr srcDesc,
+                                                                  const DnnlMemoryDescPtr dstDesc) {
+    const auto& weiDesc = srcDesc->getDnnlDesc();
+    const auto reorderedWeiDesc = dnnl::memory::desc{weiDesc.get_dims(), weiDesc.get_data_type(), dnnl::memory::format_tag::ba};
+    const auto transposedWeiDesc = reorderedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims());
+
+    return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc);
+}
+
+ov::optional<MemoryPtr> acl_fc_executor::convertWeightPrecision(MemoryPtr input, MemoryPtr output, ov::element::Type weightPrecision) {
+    MemoryArgs memoryArgs;
+    memoryArgs[ARG_SRC] = input;
+    memoryArgs[ARG_DST] = output;
+
+    auto aclWeightsConverter = std::make_shared<ACLWeightsConverter>();
+    if (aclWeightsConverter->update(memoryArgs)) {
+        aclWeightsConverter->execute(memoryArgs);
+        return ov::optional<MemoryPtr>(memoryArgs.at(ARG_DST));
+    }
+
+    if (!node::Convert::isSupportedDesc(input->getDesc()) ||
+        !node::Convert::isSupportedDesc(output->getDesc())) {
+        return {};
+    }
+
+    auto data = static_cast<const uint8_t*>(input->getData());
+    std::vector<uint8_t> tmpBuff;
+    tmpBuff.resize(output->getSize());
+    cpu_convert(data, tmpBuff.data(), DnnlExtensionUtils::DataTypeToElementType(input->getDataType()),
+                weightPrecision, input->getSize() / input->getDesc().getPrecision().size());
+
+    return ov::optional<MemoryPtr>(std::make_shared<Memory>(output->getPrimitive().get_engine(),
+                                                            output->getDesc().cloneWithNewPrecision(weightPrecision),
+                                                            tmpBuff.data()));
+}
+
+ov::optional<MemoryPtr> acl_fc_executor::reorderDataFallback(MemoryPtr input, MemoryPtr output, ExecutorContext::CPtr context) {
+    if (output->getDataType() == input->getDataType()) {
+        return {};
+    }
+    const auto inPrc = DnnlExtensionUtils::DataTypeToElementType(input->getDataType());
+    auto convertedDstMemoryDesc = output->getDesc().cloneWithNewPrecision(inPrc);
+    dnnl::reorder reorderWithoutConvert = getReorderPrim(context->getRuntimeCache(),
+                                                         output->getPrimitive().get_engine(),
+                                                         input->getPrimitive().get_desc(),
+                                                         MemoryDescUtils::convertToDnnlMemoryDesc(convertedDstMemoryDesc)->getDnnlDesc());
+
+    if (reorderWithoutConvert && parse_impl_name(reorderWithoutConvert.get_primitive_desc()->impl()->name()) != ref_any) {
+        auto convertOutput = convertWeightPrecision(input, output, inPrc);
+        if (!convertOutput) {
+            return {};
+        }
+        input = *convertOutput;
+
+        if (reorderWithoutConvert) {
+            dnnl::stream loc_stream(output->getPrimitive().get_engine(), dnnl::stream::flags::in_order);
+            reorderWithoutConvert.execute(loc_stream, {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}});
+            return ov::optional<MemoryPtr>(output);
+        }
+    }
+    return {};
+}
+
+MemoryPtr acl_fc_executor::reorderData(DnnlMemoryDescPtr srcWeightDesc,
+                                       DnnlMemoryDescPtr dstWeightDesc,
+                                       MemoryCPtr weightsMem,
+                                       ExecutorContext::CPtr context) {
+    MemoryPtr input = std::make_shared<Memory>(context->getEngine(), srcWeightDesc, weightsMem->getData());
+    MemoryPtr output = std::make_shared<Memory>(context->getEngine(), dstWeightDesc);
+    if (!input->getDesc().isDefined() || !output->getDesc().isDefined())
+        OPENVINO_THROW("Can't reorder data with dynamic shapes");
+
+    if (input->getShape().hasZeroDims() || output->getShape().hasZeroDims()) {
+        return output;
+    }
+
+    if (input->getDesc().isCompatible(output->getDesc())) {
+        auto srcPtr = static_cast<uint8_t*>(input->getData());
+        auto dstPtr = static_cast<uint8_t*>(output->getData());
+        auto copySize = output->getSize();
+        cpu_memcpy(dstPtr, srcPtr, copySize);
+        return output;
+    }
+
+    // try a direct reorder first
+    auto engine = output->getPrimitive().get_engine();
+    dnnl::reorder directReorder = getReorderPrim(context->getRuntimeCache(),
+                                                 engine,
+                                                 input->getPrimitive().get_desc(),
+                                                 output->getPrimitive().get_desc());
+
+    if (!directReorder || parse_impl_name(directReorder.get_primitive_desc()->impl()->name()) == ref_any) {
+        // try precision conversion first, then do the reorder
+        auto fallbackOutput = reorderDataFallback(input, output, context);
+        if (fallbackOutput) {
+            return *fallbackOutput;
+        }
+    }
+    // if precision conversion does not work, fall back to a direct (reference) reorder
+    if (directReorder) {
+        dnnl::stream loc_stream(engine, dnnl::stream::flags::in_order);
+        directReorder.execute(loc_stream, {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}});
+    } else {
+        OPENVINO_THROW("Could not make onednn reorder.");
+    }
+    return output;
+}
+
+MemoryPtr acl_fc_executor::reorderWeights(const MemoryArgs &memory,
+                                          const ExecutorContext::CPtr context,
+                                          ACLFCAttrs& aclfcAttrs,
+                                          DnnlMemoryDescPtr dnnlSrcDesc,
+                                          DnnlMemoryDescPtr dnnlDstDesc) {
+    auto create = [&]() {
+        MemoryPtr weightsMemory = memory.at(ARG_WEI);
+        if (aclfcAttrs.isWeightsRepacked || aclfcAttrs.isConvertedWeights) {
+            weightsMemory = reorderData(dnnlSrcDesc, dnnlDstDesc, memory.at(ARG_WEI), context);
+            DEBUG_LOG("ACLFullyConnectedExecutor: cache miss, perform packing");
+        }
+        return weightsMemory;
+    };
+
+    auto weightCache = context->getWeightsCache();
+    if (weightCache != nullptr) {
+        const auto& wgtDims = memory.at(ARG_WEI)->getStaticDims();
+        const auto N = wgtDims[0];
+        const auto K = wgtDims[1];
+        std::string format = "fc_acl_" + std::to_string(N) + "_" + std::to_string(K);
+        const std::string string_hash = format + "_" + std::to_string(memory.at(ARG_WEI)->getSize()) + "_" +
+                                        std::to_string(reinterpret_cast<uint64_t>(memory.at(ARG_WEI)->getData()));
+        DEBUG_LOG("ACLFullyConnectedExecutor: findOrCreate, string_hash: ", string_hash);
+        return *weightCache->findOrCreate(string_hash, create);
+    }
+
+    DEBUG_LOG("ACLFullyConnectedExecutor: 
Weights cache is not available");
+    return create();
+}
+
+MemoryPtr acl_fc_executor::prepareWeightMemory(const MemoryArgs &memory,
+                                               const ExecutorContext::CPtr context,
+                                               const FCAttrs &attrs,
+                                               ACLFCAttrs& aclfcAttrs,
+                                               const PostOps &postOps,
+                                               arm_compute::WeightFormat& expectedWeightFormat,
+                                               arm_compute::TensorInfo& weiTensorInfo) {
+    MemoryArgs memoryArgs;
+    memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS);
+    memoryArgs[ARG_WEI] = memory.at(ARG_WEI);
+    if (memory.at(ARG_SRC_0)->getShape().isDynamic()) {
+        const auto& inShape = memory.at(ARG_SRC_0)->getShape();
+        const auto& wShape = memory.at(ARG_WEI)->getShape();
+        const auto& inDummyDims = makeDummyInputDims(inShape, wShape);
+        const auto& outDummyDims = makeDummyOutputDims(inDummyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank());
+        memoryArgs[ARG_SRC_0] = std::make_shared<Memory>(context->getEngine(),
+                                                         memory.at(ARG_SRC_0)->getDescPtr()->cloneWithNewDims(inDummyDims));
+        memoryArgs[ARG_DST] = std::make_shared<Memory>(context->getEngine(),
+                                                       memory.at(ARG_DST)->getDescPtr()->cloneWithNewDims(outDummyDims));
+    } else {
+        memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0);
+        memoryArgs[ARG_DST] = memory.at(ARG_DST);
+    }
+    // TODO: ACLWeightFormatGenerator should be replaced with Reorder executor
+    // that calls ACL NEReorder + NETranspose or dnnl::reorder depending on backend availability
+    auto aclWeightsRepack = std::make_shared<ACLWeightFormatGenerator>(attrs, postOps, memoryArgs);
+    bool isNeededReorder = aclWeightsRepack->update(memoryArgs);
+    expectedWeightFormat = isNeededReorder ? aclWeightsRepack->getOptImplWeightFormat() : arm_compute::WeightFormat::UNSPECIFIED;
+    weiTensorInfo = aclWeightsRepack->getTensorInfo(ACLArgs::ACL_WEI);
+
+    MemoryPtr dstMemPtr = std::make_shared<Memory>(context->getEngine(),
+                                                   memory.at(ARG_WEI)->getDescPtr()->cloneWithNewPrecision(aclfcAttrs.inputPrecision));
+    auto dstDesc = dstMemPtr->getDescPtr();
+    auto dnnlDstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(dstDesc);
+    auto weiDesc = memory.at(ARG_WEI)->getDescPtr();
+    auto dnnlSrcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(weiDesc);
+
+    if (isNeededReorder) {
+        dnnl::impl::dim_t o_dim = 0;
+        dnnl::impl::dim_t inner_dim = 1;
+        std::vector<dnnl::impl::dim_t> remaining_dims = {};
+        auto weights_md_ = dnnlDstDesc->getDnnlDesc().get();
+        dnnl::impl::cpu::acl::acl_utils::reorder_to_weight_format(weiTensorInfo, *weights_md_, expectedWeightFormat,
+                                                                  inner_dim, o_dim, remaining_dims, {});
+        if (aclfcAttrs.weightsNonTransposed) {
+            dnnlSrcDesc = makeTransposedWeightDescriptor(dnnlSrcDesc, dnnlDstDesc);
+        }
+        aclfcAttrs.isWeightsRepacked = true;
+        return reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc);
+    }
+    if (!aclfcAttrs.weightsNonTransposed) {
+        dnnlDstDesc = makeTransposedWeightDescriptor(dnnlDstDesc, dnnlSrcDesc);
+        aclfcAttrs.isWeightsRepacked = true;
+    }
+    return reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc);
+}
+
 static bool checkPostOps(const PostOps &postOps) {
     // Add postops
     if (!postOps.empty() && postOps.size() == 1) {
@@ -49,5 +280,100 @@ static void initFCAttrs(const FCAttrs &attrs,
     }
 }
+arm_compute::TensorShape acl_fc_executor::normalizeDimsTo2D(const arm_compute::TensorShape shape) {
+    size_t norm_dim = std::accumulate(shape.begin() + 1, shape.end(), 1, std::multiplies<size_t>());
+    return arm_compute::TensorShape(shape[0], norm_dim);
+}
+
+void acl_fc_executor::updateFCTensorsShapes(ACLShapes& aclMemoryShapes) {
+    aclMemoryShapes[ACLArgs::ACL_WEI] = normalizeDimsTo2D(aclMemoryShapes[ACLArgs::ACL_WEI]);
+    aclMemoryShapes[ACLArgs::ACL_SRC_0] = normalizeDimsTo2D(aclMemoryShapes[ACLArgs::ACL_SRC_0]);
+    aclMemoryShapes[ACLArgs::ACL_DST] = normalizeDimsTo2D(aclMemoryShapes[ACLArgs::ACL_DST]);
+    std::swap(aclMemoryShapes[ACLArgs::ACL_WEI][0], aclMemoryShapes[ACLArgs::ACL_WEI][1]);
+}
+
+arm_compute::Status acl_fc_executor::ACLWeightsConverter::validateTensorsInfo(const ACLInfos &aclMemoryInfos) {
+    return arm_compute::NECast::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(),
+                                         aclMemoryInfos[ACLArgs::ACL_DST].get(),
+                                         arm_compute::ConvertPolicy::SATURATE);
+}
+
+ACLFunction acl_fc_executor::ACLWeightsConverter::configureFunction(const ACLTensors &aclMemoryTensors) {
+    auto neCast = std::make_unique<arm_compute::NECast>();
+    neCast->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(),
+                      aclMemoryTensors[ACLArgs::ACL_DST].get(),
+                      arm_compute::ConvertPolicy::SATURATE);
+    return neCast;
+}
+
+
+arm_compute::Status acl_fc_executor::ACLWeightsTranspose::validateTensorsInfo(const ACLInfos &aclMemoryInfos) {
+    return arm_compute::NETranspose::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(),
+                                              aclMemoryInfos[ACLArgs::ACL_DST].get());
+}
+
+ACLFunction acl_fc_executor::ACLWeightsTranspose::configureFunction(const ACLTensors &aclMemoryTensors) {
+    auto neTranspose = std::make_unique<arm_compute::NETranspose>();
+    neTranspose->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(),
+                           aclMemoryTensors[ACLArgs::ACL_DST].get());
+    return neTranspose;
+}
+
+acl_fc_executor::ACLWeightFormatGenerator::ACLWeightFormatGenerator(const FCAttrs &attrs,
+                                                                    const PostOps &postOps,
+                                                                    const MemoryArgs &memory) {
+    initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps);
+}
+
+void acl_fc_executor::ACLWeightFormatGenerator::updateTensorsShapes(ACLShapes &aclMemoryShapes) {
+    updateFCTensorsShapes(aclMemoryShapes);
+}
+
+arm_compute::Status acl_fc_executor::ACLWeightFormatGenerator::validateTensorsInfo(const ACLInfos &aclMemoryInfos) {
+    if (aclfcAttrs.isConvertedWeights) {
+        aclMemoryInfos[ACLArgs::ACL_WEI]->set_data_type(aclMemoryInfos[ACLArgs::ACL_SRC_0]->data_type());
+    }
+    return arm_compute::NEFullyConnectedLayer::has_opt_impl(
+        expectedWeightFormat,
+        aclMemoryInfos[ACLArgs::ACL_SRC_0].get(),
+        aclMemoryInfos[ACLArgs::ACL_WEI].get(),
+        aclMemoryInfos[ACLArgs::ACL_BIAS].get(),
+        aclMemoryInfos[ACLArgs::ACL_DST].get(),
+        fullyConnectedLayerInfo,
+        weightsInfo);
+}
+
+ACLFunction acl_fc_executor::ACLWeightFormatGenerator::configureFunction(const ACLTensors &aclMemoryTensors) {
+    return std::make_unique<arm_compute::NEFullyConnectedLayer>();
+}
+
+arm_compute::Status acl_fc_executor::ACLWeightsReorder::validateTensorsInfo(const ACLInfos &aclMemoryInfos) {
+#if defined(OPENVINO_ARCH_ARM64)
+    return arm_compute::NEReorderLayer::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(),
+                                                 aclMemoryInfos[ACLArgs::ACL_DST].get(),
+                                                 inWeightFormat,
+                                                 outWeightFormat);
+#else
+    return arm_compute::NECopy::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(),
+                                         aclMemoryInfos[ACLArgs::ACL_DST].get());
+#endif
+}
+
+ACLFunction acl_fc_executor::ACLWeightsReorder::configureFunction(const ACLTensors &aclMemoryTensors) {
+#if defined(OPENVINO_ARCH_ARM64)
+    auto neReorderLayer = std::make_unique<arm_compute::NEReorderLayer>();
+    neReorderLayer->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(),
                              aclMemoryTensors[ACLArgs::ACL_DST].get(),
                              inWeightFormat,
                              outWeightFormat);
+    return neReorderLayer;
+#else
+    auto neCopy = std::make_unique<arm_compute::NECopy>();
+    neCopy->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(),
                      aclMemoryTensors[ACLArgs::ACL_DST].get());
+    return neCopy;
+#endif
+}
+
 }  // namespace intel_cpu
-}  // namespace ov
+}  // namespace ov
\ No newline
at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.hpp index 0e1ec9dc8e95c4..63905ffa29a740 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.hpp @@ -1,13 +1,108 @@ // Copyright (C) 2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // - #pragma once - #include "acl_common_executor.hpp" +#include "ov_optional.hpp" #include "nodes/executors/fullyconnected_config.hpp" namespace ov { namespace intel_cpu { + +struct ACLFCAttrs { + ov::element::Type inputPrecision; + bool isConvertedWeights = false; + bool isWeightsRepacked = false; + bool weightsNonTransposed; +}; + +namespace acl_fc_executor { + +VectorDims makeDummyInputDims(const Shape& inShape, const Shape& wShape); + +VectorDims makeDummyOutputDims(const VectorDims& inShape, const VectorDims& wShape, const size_t out_rank); + +DnnlMemoryDescPtr makeTransposedWeightDescriptor(const DnnlMemoryDescPtr srcDesc, + const DnnlMemoryDescPtr dstDesc); + +ov::optional convertWeightPrecision(MemoryPtr input, + MemoryPtr output, + ov::element::Type weightPrecision); + +ov::optional reorderDataFallback(MemoryPtr input, + MemoryPtr output, + ExecutorContext::CPtr context); + +MemoryPtr reorderData(DnnlMemoryDescPtr srcWeightDesc, + DnnlMemoryDescPtr dstWeightDesc, + MemoryCPtr weightsMem, + ExecutorContext::CPtr context); + +MemoryPtr reorderWeights(const MemoryArgs &memory, + const ExecutorContext::CPtr context, + ACLFCAttrs& aclfcAttrs, + DnnlMemoryDescPtr dnnlSrcDesc, + DnnlMemoryDescPtr dnnlDstDesc); + +MemoryPtr prepareWeightMemory(const MemoryArgs &memory, + const ExecutorContext::CPtr context, + const FCAttrs &attrs, + ACLFCAttrs& aclfcAttrs, + const PostOps &postOps, + arm_compute::WeightFormat& expectedWeightFormat, + arm_compute::TensorInfo& weiTensorInfo); + +arm_compute::TensorShape normalizeDimsTo2D(const arm_compute::TensorShape shape); + +void updateFCTensorsShapes(ACLShapes& aclMemoryShapes); + +class ACLWeightsConverter : public ACLCommonExecutor { +public: + ACLWeightsConverter() = default; + void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} + arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; + ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; +}; + +class ACLWeightsTranspose : public ACLCommonExecutor { +public: + ACLWeightsTranspose() = default; + void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} + arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; + ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; +}; + +class ACLWeightFormatGenerator : public ACLCommonExecutor { +public: + ACLWeightFormatGenerator(const FCAttrs& attrs, + const PostOps& postOps, + const MemoryArgs& memory); + void updateTensorsShapes(ACLShapes& aclMemoryShapes) override; + arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; + ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; + arm_compute::WeightFormat getOptImplWeightFormat() { + return expectedWeightFormat; + } +private: + arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo; + arm_compute::WeightsInfo weightsInfo; + ACLFCAttrs aclfcAttrs; + arm_compute::WeightFormat expectedWeightFormat; +}; + +class ACLWeightsReorder : public ACLCommonExecutor { 
+public: + ACLWeightsReorder(arm_compute::WeightFormat inWeightFormat, + arm_compute::WeightFormat outWeightFormat) + : inWeightFormat(inWeightFormat), outWeightFormat(outWeightFormat) {} + void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} + arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; + ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; +private: + arm_compute::WeightFormat inWeightFormat; + arm_compute::WeightFormat outWeightFormat; +}; + +} // namespace acl_fc_executor } // namespace intel_cpu -} // namespace ov +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp index 07a288edd57cb3..099fee648532cf 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp @@ -58,7 +58,7 @@ ACLLowpFullyConnectedExecutor::ACLLowpFullyConnectedExecutor(const FCAttrs &attr const MemoryArgs &memory, const ExecutorContext::CPtr& context) : dequantizationScales(attrs.dequantizationScales) { initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, gemmInfo, postOps); - packedWeights = acl_fc_executor::prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps); + packedWeights = acl_fc_executor::prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps, expectedWeightFormat, weiTensorInfo); } bool ACLLowpFullyConnectedExecutor::supports(const FCConfig &config) { diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp index d69c6fc438896d..96dac857c47441 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp @@ -37,7 +37,8 @@ class ACLLowpFullyConnectedExecutor : public ACLCommonExecutor { private: arm_compute::GEMMInfo gemmInfo; - arm_compute::WeightsInfo weightsInfo; + arm_compute::WeightFormat expectedWeightFormat; + arm_compute::TensorInfo weiTensorInfo; MemoryCPtr packedWeights; ACLFCAttrs aclfcAttrs; diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp index fffb6c46173383..ff965bc3fdc858 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp @@ -90,7 +90,7 @@ static const TypeMapping aclFCTypeMapping { static const TypeMapping aclLowpFCTypeMapping { // {src, wei, bia, dst} pt - {{_i8, _i8, _any, _f32}, pt(bypass(), bypass(), just(), bypass())} + {{_i8, _i8, _any, _f32}, pt(bypass(), bypass(), just(), bypass())} }; static const MappingNotation dnnlConvolutionMappingNotation { diff --git a/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp b/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp index 41a22f50858f2e..9a4006b917bec6 100644 --- a/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp +++ b/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp @@ -29,7 +29,7 @@ class MatMulFunction { const bool transpose1, const bool transpose2, const bool signedWeights, - const bool perChannelWeightsDequantization , + const bool 
perChannelWeightsDequantization, const bool relu, const bool fq);
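
Reviewer note: the new aclLowpFCTypeMapping row above ({_i8, _i8, _any, _f32}) describes the contract of the low-precision path: int8 activations and weights in, f32 out, with the result rescaled by the dequantization scales the executor copies from FCAttrs (dequantizationScales in the ACLLowpFullyConnectedExecutor constructor). The scalar sketch below is only a conceptual reference for that contract, assuming symmetric quantization with no zero points and no post-ops; it is not the code path ACL executes, and the function name int8_fc_reference is illustrative.

    #include <cstdint>
    #include <cstddef>
    #include <vector>

    // Reference semantics of an int8 x int8 -> f32 fully connected layer:
    // accumulate in int32, then rescale with a scalar or per-output-channel scale.
    std::vector<float> int8_fc_reference(const std::vector<int8_t>& src,      // [M x K], row-major
                                         const std::vector<int8_t>& wei,      // [N x K], row-major
                                         const std::vector<float>& dqScales,  // size 1 or N
                                         size_t M, size_t N, size_t K) {
        std::vector<float> dst(M * N, 0.0f);
        for (size_t m = 0; m < M; ++m) {
            for (size_t n = 0; n < N; ++n) {
                int32_t acc = 0;  // widen to int32 so the int8 products cannot overflow
                for (size_t k = 0; k < K; ++k) {
                    acc += static_cast<int32_t>(src[m * K + k]) * static_cast<int32_t>(wei[n * K + k]);
                }
                const float scale = (dqScales.size() == 1) ? dqScales[0] : dqScales[n];
                dst[m * N + n] = static_cast<float>(acc) * scale;
            }
        }
        return dst;
    }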