From 47faadd09f1aa7a6a07a0ece7b279c81b0830944 Mon Sep 17 00:00:00 2001
From: Anton Voronov
Date: Thu, 15 Dec 2022 04:21:29 -0800
Subject: [PATCH 1/2] [CPU] FullyConnected: fixed primitive caching for sparse decompression case

[CPU][oneDNN] sparsity: some fixes and removed unused code
[CPU][TESTS] FullyConnected: sparsity weights decompression tests
[CPU] FullyConnected: removed min sparse rate = 0.5 limitation
[CPU] fixed property CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE
[CPU][TESTS] added CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE test
[CPU][DOC] doc fixes
---
 docs/OV_Runtime_UG/supported_plugins/CPU.md   |   3 +-
 .../tests/test_runtime/test_properties.py     |   2 +-
 .../openvino/runtime/intel_cpu/properties.hpp |  16 +-
 .../intel_cpu/src/nodes/fullyconnected.cpp    |  10 +-
 .../intel_cpu/src/nodes/fullyconnected.h      |   2 -
 .../ov_executable_network/get_metric.cpp      |   8 +
 .../single_layer_tests/matmul_sparse.cpp      | 343 ++++++++++++++++++
 src/plugins/intel_cpu/thirdparty/onednn       |   2 +-
 8 files changed, 371 insertions(+), 15 deletions(-)
 create mode 100644 src/plugins/intel_cpu/tests/functional/single_layer_tests/matmul_sparse.cpp

diff --git a/docs/OV_Runtime_UG/supported_plugins/CPU.md b/docs/OV_Runtime_UG/supported_plugins/CPU.md
index fee0b285ba294c..c9a6a0d1ccb0db 100644
--- a/docs/OV_Runtime_UG/supported_plugins/CPU.md
+++ b/docs/OV_Runtime_UG/supported_plugins/CPU.md
@@ -281,7 +281,7 @@ To enable denormals optimization in the application, the `denormals_optimization
 
 The `sparse weights decompression feature` allows packing weights for Matrix Multiplication operations directly in the CPU plugin at the model compilation stage and storing non-zero values in a special packed format. Then, during the execution of the model, the weights are unpacked and used in the computational kernel. Since the weights are loaded from DDR/L3 cache in the packed format, this significantly decreases memory consumption and, as a consequence, improves inference performance.
 
-To use this feature, the user is provided with property `sparse_weights_decompression_rate`, which can take values from the interval \[0.5, 1\] (values from \[0, 0.5\] are not supported in current implementation, see limitations below). `sparse_weights_decompression_rate` defines sparse rate threashold: only operations with higher sparse rate will be executed using `sparse weights decompression feature`. The default value is `1`, which means the option is disabled.
+To use this feature, the user is provided with the property `sparse_weights_decompression_rate`, which can take values from the interval \[0, 1\]. `sparse_weights_decompression_rate` defines the sparse rate threshold: only operations with a higher sparse rate will be executed using the `sparse weights decompression feature`. The default value is `1`, which means the option is disabled.
 
 > **NOTE**: The `sparse weights decompression feature` is disabled by default, since the overall speed-up highly depends on the particular workload, and in some cases the feature may introduce performance degradation.
 
@@ -315,7 +315,6 @@ Currently, the `sparse weights decompression feature` is supported with the foll
 2. Feature is only supported for Matrix Multiplication operations.
 3. HW target must have Intel AMX extension support (e.g., Intel® 4th Generation Xeon® processors (code name Sapphire Rapids)).
 4. The number of input and output channels of the weights must be a multiple of 64.
-5. Current feature implementation supports only sparse rate higher than 0.5.
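+
+For illustration, here is a minimal usage sketch; the `"CPU"` device string is real, while `model.xml` and the `0.8` threshold are example values only, not defaults:
+
+```cpp
+#include <openvino/runtime/core.hpp>
+#include <openvino/runtime/intel_cpu/properties.hpp>
+
+int main() {
+    ov::Core core;
+    // Request sparse weights decompression for operations whose weights have
+    // a sparse rate of at least 0.8; less sparse operations keep the dense path.
+    core.set_property("CPU", ov::intel_cpu::sparse_weights_decompression_rate(0.8f));
+    auto model = core.read_model("model.xml");  // example model path
+    auto compiled_model = core.compile_model(model, "CPU");
+    return 0;
+}
+```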
 
 ## Additional Resources
 * [Supported Devices](Supported_Devices.md)
diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py
index 08d98a4282fda1..d5c253a101baf1 100644
--- a/src/bindings/python/tests/test_runtime/test_properties.py
+++ b/src/bindings/python/tests/test_runtime/test_properties.py
@@ -195,7 +195,7 @@ def test_properties_ro(ov_property_ro, expected_value):
     ),
     (
         properties.intel_cpu.sparse_weights_decompression_rate,
-        "SPARSE_WEIGHTS_DECOMPRESSION_RATE",
+        "CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE",
         (
             (0.1, np.float32(0.1)),
             (2.0, 2.0),
diff --git a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
index e1d0a6eefab1ed..a3253785078a40 100644
--- a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
+++ b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
@@ -47,7 +47,21 @@ namespace intel_cpu {
  */
 static constexpr Property<bool> denormals_optimization{"CPU_DENORMALS_OPTIMIZATION"};
 
-static constexpr Property<float> sparse_weights_decompression_rate{"SPARSE_WEIGHTS_DECOMPRESSION_RATE"};
+/**
+ * @brief This property defines the threshold for the sparse weights decompression feature activation
+ * @ingroup ov_runtime_cpu_prop_cpp_api
+ *
+ * The sparse weights decompression feature allows packing weights for Matrix Multiplication operations directly in the CPU plugin
+ * at the model compilation stage and storing non-zero values in a special packed format. Then, during the execution of the model,
+ * the weights are unpacked and used in the computational kernel. Since the weights are loaded from DDR/L3 cache in the packed
+ * format, this significantly decreases memory consumption and, as a consequence, improves inference performance.
+ * The following code shows how to set the sparse rate value.
+ *
+ * @code
+ * core.set_property(ov::intel_cpu::sparse_weights_decompression_rate(0.8));
+ * @endcode
+ */
+static constexpr Property<float> sparse_weights_decompression_rate{"CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE"};
 
 }  // namespace intel_cpu
 }  // namespace ov
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
index 62d483b0536ebd..36f431bcecec1e 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -591,7 +591,7 @@ void FullyConnected::createDescriptorInternal(const dnnl::memory::desc &inputDes
     dnnl::memory::desc wgh_candidate;
     if (useSparseWeights) {
         wgh_candidate = { DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(WEIGHTS_ID).getStaticDims()),
-                        wdt, memory::desc::packed(nnzCount) };
+                        wdt, memory::desc::packed() };
     } else {
         wgh_candidate = { DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(WEIGHTS_ID).getStaticDims()),
                         wdt, dnnl::memory::format_tag::any };
@@ -930,18 +930,12 @@ bool FullyConnected::useSparseWeightsDecompression() {
             zerosCounts++;
         }
     }
-    nnzCount = elementsCount - zerosCounts;
 
     DEBUG_LOG(getName(), ", weightsData.size() = ", elementsCount, ", zerosCounts = ",
-              zerosCounts, ", nnzCount = ", nnzCount);
+              zerosCounts, ", nnzCount = ", elementsCount - zerosCounts);
 
     weiSparseRate = static_cast<float>(zerosCounts) / static_cast<float>(elementsCount);
 
-    // [av] WA: there is no point in using sparse decompression when the sparse rate is low
-    // todo: add heuristic
-    if (minSparseRate < 0.5)
-        minSparseRate = 0.5;
-
     DEBUG_LOG(getName(), " | sparse rate = ", weiSparseRate * 100, "%, min sparse rate = ",
               minSparseRate * 100, "%, use sparse weights = ", weiSparseRate >= minSparseRate);
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
index 91431365d59976..371d9d731ed9c9 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
@@ -42,7 +42,6 @@ class FullyConnected : public Node {
     void initSupportedPrimitiveDescriptors() override;
     void initOptimalPrimitiveDescriptor() override;
-    // void createPrimitive() override;
     std::shared_ptr<MemoryDesc> getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
    std::shared_ptr<MemoryDesc> getDstMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
@@ -112,7 +111,6 @@ class FullyConnected : public Node {
     // sparse weights
     bool useSparseWeights = false;
-    int nnzCount = -1;
     float minSparseRate = 1.f;
     float weiSparseRate = 0.f;
     bool useSparseWeightsDecompression();
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_executable_network/get_metric.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_executable_network/get_metric.cpp
index b9d7028a477bef..2122a7b70720fc 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_executable_network/get_metric.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_executable_network/get_metric.cpp
@@ -9,6 +9,7 @@
 #include "openvino/runtime/core.hpp"
 #include "openvino/runtime/compiled_model.hpp"
 #include "openvino/runtime/properties.hpp"
+#include "openvino/runtime/intel_cpu/properties.hpp"
 
 #include <gtest/gtest.h>
 
@@ -113,6 +114,13 @@ TEST_F(OVClassConfigTestCPU, smoke_CheckModelStreamsHasHigherPriorityThanThrough
     ASSERT_EQ(streams, value);
 }
 
+TEST_F(OVClassConfigTestCPU, smoke_CheckSparseWeightsDecompressionRate) {
+    ov::Core core;
+
+    core.set_property(deviceName, ov::intel_cpu::sparse_weights_decompression_rate(0.8));
+    ASSERT_NO_THROW(ov::CompiledModel compiledModel = core.compile_model(model, deviceName));
+}
+
 const std::vector<ov::AnyMap> multiDevicePriorityConfigs = {
         {ov::device::priorities(CommonTestUtils::DEVICE_CPU)}};
 
diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/matmul_sparse.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/matmul_sparse.cpp
new file mode 100644
index 00000000000000..435f407bbba3ff
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/matmul_sparse.cpp
@@ -0,0 +1,343 @@
+// Copyright (C) 2022-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "shared_test_classes/single_layer/mat_mul.hpp"
+#include "shared_test_classes/base/ov_subgraph.hpp"
+#include "ie_precision.hpp"
+#include "test_utils/fusing_test_utils.hpp"
+#include "ngraph_functions/builders.hpp"
+#include <string>
+#include <random>
+#include "shared_test_classes/base/utils/generate_inputs.hpp"
+#include "cpu/cpu_config.hpp"
+
+using namespace ngraph;
+using namespace InferenceEngine;
+using namespace CPUTestUtils;
+using namespace ov::test;
+
+namespace CPULayerTestsDefinitions {
+
+struct ShapeRelatedParams {
+    std::vector<InputShape> inputShapes;
+    std::pair<bool, bool> transpose;
+};
+
+typedef std::tuple<
+        ShapeRelatedParams,
+        ElementType,                        // Input precision
+        ElementType,                        // Weights precision
+        ElementType,                        // Output precision
+        fusingSpecificParams,
+        CPUSpecificParams,
+        std::map<std::string, std::string>, // Additional config
+        float                               // Weights sparse rate
+> MatMulSparseParamSet;
+
+class MatMulSparseCPUTest : public testing::WithParamInterface<MatMulSparseParamSet>,
+                            virtual public SubgraphBaseTest, public CpuTestWithFusing {
+public:
+    static std::string getTestCaseName(const testing::TestParamInfo<MatMulSparseParamSet>& obj) {
+        ShapeRelatedParams shapeRelatedParams;
+        ElementType inType, weiType, outType;
+        fusingSpecificParams fusingParams;
+        CPUSpecificParams cpuParams;
+        std::map<std::string, std::string> additionalConfig;
+        float weiSparseRate;
+        std::tie(shapeRelatedParams, inType, weiType, outType, fusingParams, cpuParams, additionalConfig,
+                weiSparseRate) = obj.param;
+
+        std::ostringstream result;
+        result << "IS=";
+        for (const auto& shape : shapeRelatedParams.inputShapes) {
+            result << CommonTestUtils::partialShape2str({shape.first}) << "_";
+        }
+        result << "TS=";
+        for (const auto& shape : shapeRelatedParams.inputShapes) {
+            result << "(";
+            if (!shape.second.empty()) {
+                auto itr = shape.second.begin();
+                do {
+                    result << CommonTestUtils::vec2str(*itr);
+                } while (++itr != shape.second.end() && result << "_");
+            }
+            result << ")_";
+        }
+        result << "transpose_a=" << shapeRelatedParams.transpose.first << "_";
+        result << "transpose_b=" << shapeRelatedParams.transpose.second << "_";
+        result << "inType=" << inType << "_";
+        result << "weiType=" << weiType << "_";
+        result << "outType=" << outType << "_";
+        result << CpuTestWithFusing::getTestCaseName(fusingParams);
+        result << CPUTestsBase::getTestCaseName(cpuParams);
+
+        if (!additionalConfig.empty()) {
+            result << "_PluginConf";
+            for (auto& item : additionalConfig) {
+                result << "_" << item.first << "=" << item.second;
+            }
+        }
+        result << "_weiSparseRate=" << weiSparseRate;
+
+        return result.str();
+    }
+
+protected:
+    std::string cpuNodeType;
+
+    template<typename T>
+    void transpose(T& shape) {
+        IE_ASSERT(shape.size() > 1);
+        std::swap(*(shape.end() - 1), *(shape.end() - 2));
+    }
+
+    std::vector<int8_t> inline generateSparseVector(size_t vec_len,
+                float sparseRate = 0.0f,
+                int8_t upTo = 10,
+                int8_t startFrom = 1,
+                int32_t seed = 1) {
+        std::vector<int8_t> res(vec_len);
+        std::mt19937 gen(seed);
+        std::uniform_int_distribution<int32_t> dist(static_cast<int32_t>(startFrom), static_cast<int32_t>(upTo));
+
+        std::mt19937 gen_f(123);
+        std::uniform_real_distribution<float> dist_f(0.f, 1.f);
+
+        int countZero = 0;
+
+        res[0] = startFrom;
+        res[vec_len - 1] = upTo;
+        for (size_t i = 1; i < vec_len - 1; i++) {
+            if (dist_f(gen_f) > sparseRate) {
+                res[i] = static_cast<int8_t>(dist(gen));
+            } else {
+                res[i] = 0;
+                countZero++;
+            }
+        }
+
+        std::cout << "Sparse rate = " << countZero * 100 / vec_len << "%" << std::endl;
+
+        return res;
+    }
+
+    std::shared_ptr<Node> makeMatMulRelaxed(const Output<Node>& A,
+                                            const ov::PartialShape& inShapeB,
+                                            ElementType weiType,
+                                            bool transpose_a,
+                                            bool transpose_b,
+                                            const std::vector<int8_t>& weiData) {
+        using namespace ngraph;
+        auto inputParamsFP32 = builder::makeDynamicParams(element::f32, {A.get_partial_shape()});
+        auto matrixBFP32 = builder::makeDynamicInputLayer(element::f32, helpers::InputLayerType::CONSTANT, inShapeB);
+
+        auto matMulRelaxed = std::make_shared<ov::op::TypeRelaxed<opset3::MatMul>>(
+            *as_type_ptr<opset3::MatMul>(builder::makeMatMul(inputParamsFP32[0], matrixBFP32, transpose_a, transpose_b)),
+            element::f32);
+
+        auto matrixB = ngraph::builder::makeConstant(weiType, inShapeB.get_shape(), weiData);
+
+        auto matMul = matMulRelaxed->copy_with_new_inputs({A, matrixB});
+
+        return matMul;
+    }
+
+    void SetUp() override {
+        abs_threshold = 0.5f;
+        using ngraph::pass::ConvertPrecision;
+
+        ShapeRelatedParams shapeRelatedParams;
+        ElementType inType, weiType, outType;
+        fusingSpecificParams fusingParams;
+        CPUSpecificParams cpuParams;
+        std::map<std::string, std::string> additionalConfig;
+        float weiSparseRate;
+
+        std::tie(shapeRelatedParams, inType, weiType, outType, fusingParams, cpuParams, additionalConfig,
+                weiSparseRate) = this->GetParam();
+        std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
+
+        configuration.insert(additionalConfig.begin(), additionalConfig.end());
+
+        init_input_shapes(shapeRelatedParams.inputShapes);
+
+        bool transpA = shapeRelatedParams.transpose.first;
+        bool transpB = shapeRelatedParams.transpose.second;
+
+        if (transpA) {
+            transpose(inputDynamicShapes[0]);
+            for (auto& shapes : targetStaticShapes) {
+                transpose(shapes[0]);
+            }
+        }
+        if (transpB) {
+            transpose(inputDynamicShapes[1]);
+            for (auto& shapes : targetStaticShapes) {
+                transpose(shapes[1]);
+            }
+        }
+
+        const auto& inShapeA = inputDynamicShapes[0];
+        const auto& inShapeB = inputDynamicShapes[1];
+
+        std::tie(postOpMgrPtr, fusedOps) = fusingParams;
+
+        configuration.insert(additionalConfig.begin(), additionalConfig.end());
+
+        cpuNodeType = "FullyConnected";
+        selectedType = makeSelectedTypeStr(selectedType, element::i8);
+
+        auto params = builder::makeDynamicParams(inType, {inShapeA});
+        auto paramOuts = helpers::convert2OutputVector(helpers::castOps2Nodes(params));
+
+        auto matrixB = builder::makeDynamicInputLayer(element::f32, helpers::InputLayerType::CONSTANT, inShapeB);
+
+        auto weiData = generateSparseVector(ngraph::shape_size(inShapeB.get_shape()), weiSparseRate);
+        auto matMul = makeMatMulRelaxed(paramOuts[0], inShapeB, weiType, transpA, transpB, weiData);
+
+        function = makeNgraphFunction(element::f32, params, matMul, cpuNodeType);
+
+        checkFusingPosition = false;
+
+        targetDevice = CommonTestUtils::DEVICE_CPU;
+
+        functionRefs = ov::clone_model(*function);
+        ngraph::pass::ConvertPrecision<ngraph::element::Type_t::i8, ngraph::element::Type_t::f32>().run_on_function(functionRefs);
+        ngraph::pass::ConvertPrecision<ngraph::element::Type_t::u8, ngraph::element::Type_t::f32>().run_on_function(functionRefs);
+        functionRefs->validate_nodes_and_infer_types();
+    }
+};
+
+TEST_P(MatMulSparseCPUTest, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+
+    run();
+    CheckPluginRelatedResults(compiledModel, cpuNodeType);
+}
+
+namespace {
+
+/* ============= Common params ============= */
+
+std::vector<CPUSpecificParams> filterSpecificParams(bool sparseExpected) {
+    std::vector<CPUSpecificParams> specificParams;
+    if (with_cpu_x86_avx512_core_amx()) {
+        if (sparseExpected) {
+            specificParams.push_back(CPUSpecificParams{{}, {}, {"brgemm_avx512_amx"}, "brgemm_avx512_amx_sparse"});
+        } else {
+            specificParams.push_back(CPUSpecificParams{{}, {}, {"brgemm_avx512_amx"}, "brgemm_avx512_amx"});
+        }
+    }
+
+    return specificParams;
+}
+
+/* ============= FullyConnected ============= */
+namespace fullyConnected {
+
+// cpu (sparse) configs
+const std::map<std::string, std::string> emptyConfig = {};
+const std::map<std::string, std::string> SparseRate50 = {{CPUConfigParams::KEY_CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE, "0.5"}};
+const std::map<std::string, std::string> SparseRate80 = {{CPUConfigParams::KEY_CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE, "0.8"}};
+
+const std::vector<ShapeRelatedParams> IS2D_sparse_smoke = {
+    {static_shapes_to_test_representation({{64, 64}, {64, 64}}), {false, true}},
+    {static_shapes_to_test_representation({{71, 64}, {64, 64}}), {false, true}},
+    {static_shapes_to_test_representation({{3, 128}, {128, 64}}), {false, true}},
+    {static_shapes_to_test_representation({{71, 64}, {64, 128}}), {false, true}},
+
+    {
+        {
+            {{-1, -1}, {{20, 64}, {20, 64}}},
+            {{64, 128}, {{64, 128}, {64, 128}}}
+        },
+        {false, true}
+    },
+
+    {
+        {
+            {{{0, 100}, {0, 64}}, {{20, 64}, {14, 64}, {20, 64}, {14, 64}}},
+            {{64, 128}, {{64, 128}, {64, 128}, {64, 128}, {64, 128}}}
+        },
+        {false, true}
+    },
+};
+
+const auto testParams2D_i8_smoke = ::testing::Combine(::testing::ValuesIn(IS2D_sparse_smoke),
+                                                      ::testing::Values(ElementType::i8, ElementType::u8),
+                                                      ::testing::Values(ElementType::i8),
+                                                      ::testing::Values(ElementType::f32),
+                                                      ::testing::Values(emptyFusingSpec),
+                                                      ::testing::ValuesIn(filterSpecificParams(false)),
+                                                      ::testing::Values(emptyConfig, SparseRate80),
+                                                      ::testing::Values(0.7));
+
+INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_I8, MatMulSparseCPUTest, testParams2D_i8_smoke,
+    MatMulSparseCPUTest::getTestCaseName);
+
+const auto testParams2D_i8_sparse_smoke = ::testing::Combine(::testing::ValuesIn(IS2D_sparse_smoke),
+                                                             ::testing::Values(ElementType::i8, ElementType::u8),
+                                                             ::testing::Values(ElementType::i8),
+                                                             ::testing::Values(ElementType::f32),
+                                                             ::testing::Values(emptyFusingSpec),
+                                                             ::testing::ValuesIn(filterSpecificParams(true)),
+                                                             ::testing::Values(SparseRate50),
+                                                             ::testing::Values(0.7));
+
+INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_I8_sparse, MatMulSparseCPUTest, testParams2D_i8_sparse_smoke,
+    MatMulSparseCPUTest::getTestCaseName);
+
+const std::vector<ShapeRelatedParams> IS3D_sparse_smoke = {
+    {static_shapes_to_test_representation({{1, 64, 64}, {64, 64}}), {false, true}},
+    {static_shapes_to_test_representation({{3, 71, 64}, {64, 64}}), {false, true}},
+    {static_shapes_to_test_representation({{3, 5, 128}, {128, 64}}), {false, true}},
+    {static_shapes_to_test_representation({{1, 71, 64}, {64, 128}}), {false, true}},
+
+    {
+        {
+            {{-1, -1, 64}, {{1, 5, 64}, {1, 10, 64}, {1, 5, 64}, {1, 10, 64}}},
+            {{64, 128}, {{64, 128}, {64, 128}}}
+        },
+        {false, true}
+    },
+
+    // todo: [av] investigate "Primitive descriptor was not found" error for this case
+    // {
+    //     {
+    //         {{{0, 60}, {0, 60}, {0, 64}}, {{1, 3, 64}, {1, 7, 64}}},
+    //         {{64, 64}, {{64, 64}, {64, 64}}}
+    //     },
+    //     {false, true}
+    // },
+};
+
+const auto testParams3D_i8_smoke = ::testing::Combine(::testing::ValuesIn(IS3D_sparse_smoke),
+                                                      ::testing::Values(ElementType::i8, ElementType::u8),
+                                                      ::testing::Values(ElementType::i8),
+                                                      ::testing::Values(ElementType::f32),
+                                                      ::testing::Values(emptyFusingSpec),
+                                                      ::testing::ValuesIn(filterSpecificParams(false)),
+                                                      ::testing::Values(emptyConfig, SparseRate80),
+                                                      ::testing::Values(0.7));
+
+INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_I8, MatMulSparseCPUTest, testParams3D_i8_smoke,
+    MatMulSparseCPUTest::getTestCaseName);
+
+const auto testParams3D_i8_sparse_smoke = ::testing::Combine(::testing::ValuesIn(IS3D_sparse_smoke),
+                                                             ::testing::Values(ElementType::i8, ElementType::u8),
+                                                             ::testing::Values(ElementType::i8),
+                                                             ::testing::Values(ElementType::f32),
+                                                             ::testing::Values(emptyFusingSpec),
+                                                             ::testing::ValuesIn(filterSpecificParams(true)),
+                                                             ::testing::Values(SparseRate50),
+                                                             ::testing::Values(0.7));
+
+INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_I8_sparse, MatMulSparseCPUTest, testParams3D_i8_sparse_smoke,
+    MatMulSparseCPUTest::getTestCaseName);
+
+} // namespace fullyConnected
+
+} // namespace
+
+} // namespace CPULayerTestsDefinitions
diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn
index 26ad0022000d8d..0134954ed3f6b6 160000
--- a/src/plugins/intel_cpu/thirdparty/onednn
+++ b/src/plugins/intel_cpu/thirdparty/onednn
@@ -1 +1 @@
-Subproject commit 26ad0022000d8de10a683efb175c2acbf064e81b
+Subproject commit 0134954ed3f6b6b90eab687224b584f3c0c531df

From 9c199a12b2fab159ce51b1736b2678a9121a0908 Mon Sep 17 00:00:00 2001
From: Anton Voronov
Date: Thu, 5 Jan 2023 14:57:15 +0000
Subject: [PATCH 2/2] code style fix

---
 .../include/openvino/runtime/intel_cpu/properties.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
index a3253785078a40..9d63a0e078bdef 100644
--- a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
+++ b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
@@ -51,11 +51,11 @@ static constexpr Property<bool> denormals_optimization{"CPU_DENORMALS_OPTIMIZATI
  * @brief This property defines the threshold for the sparse weights decompression feature activation
  * @ingroup ov_runtime_cpu_prop_cpp_api
  *
- * The sparse weights decompression feature allows packing weights for Matrix Multiplication operations directly in the CPU plugin
- * at the model compilation stage and storing non-zero values in a special packed format. Then, during the execution of the model,
- * the weights are unpacked and used in the computational kernel. Since the weights are loaded from DDR/L3 cache in the packed
- * format, this significantly decreases memory consumption and, as a consequence, improves inference performance.
- * The following code shows how to set the sparse rate value.
+ * The sparse weights decompression feature allows packing weights for Matrix Multiplication operations directly in the
+ * CPU plugin at the model compilation stage and storing non-zero values in a special packed format. Then, during the
+ * execution of the model, the weights are unpacked and used in the computational kernel. Since the weights are loaded
+ * from DDR/L3 cache in the packed format, this significantly decreases memory consumption and, as a consequence,
+ * improves inference performance. The following code shows how to set the sparse rate value.
 *
 * @code
 * core.set_property(ov::intel_cpu::sparse_weights_decompression_rate(0.8));
 * @endcode
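
Reviewer note: a minimal standalone sketch (an illustration under stated assumptions, not the plugin code) of the gate that `FullyConnected::useSparseWeightsDecompression()` implements after patch 1: compute the zero-weight fraction and compare it against the configured threshold, with the former hard 0.5 floor removed. The function name `useSparseDecompression`, the 64x64 buffer, and the threshold values are hypothetical:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Sketch of the decision: enable sparse decompression only when the fraction
// of zero weights reaches the user-configured minimum sparse rate.
bool useSparseDecompression(const std::vector<int8_t>& weights, float minSparseRate) {
    size_t zeros = 0;
    for (int8_t w : weights)
        if (w == 0)
            ++zeros;
    const float sparseRate = static_cast<float>(zeros) / static_cast<float>(weights.size());
    return sparseRate >= minSparseRate;  // minSparseRate is no longer clamped to 0.5
}

int main() {
    std::vector<int8_t> weights(64 * 64, 0);
    for (size_t i = 0; i < weights.size(); i += 5)
        weights[i] = 1;  // roughly 80% of the elements stay zero
    std::cout << std::boolalpha
              << useSparseDecompression(weights, 0.5f) << "\n"   // true
              << useSparseDecompression(weights, 0.9f) << "\n";  // false
    return 0;
}
```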