From 47faadd09f1aa7a6a07a0ece7b279c81b0830944 Mon Sep 17 00:00:00 2001
From: Anton Voronov
Date: Thu, 15 Dec 2022 04:21:29 -0800
Subject: [PATCH 1/2] [CPU] FullyConnected: fixed primitive caching for sparse decompression case

[CPU][oneDNN] sparsity: some fixes and removed unused code
[CPU][TESTS] FullyConnected: sparsity weights decompression tests
[CPU] FullyConnected: removed min sparse rate = 0.5 limitation
[CPU] fixed property CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE
[CPU][TESTS] added CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE test
[CPU][DOC] doc fixes
---
 docs/OV_Runtime_UG/supported_plugins/CPU.md   |   3 +-
 .../tests/test_runtime/test_properties.py     |   2 +-
 .../openvino/runtime/intel_cpu/properties.hpp |  16 +-
 .../intel_cpu/src/nodes/fullyconnected.cpp    |  10 +-
 .../intel_cpu/src/nodes/fullyconnected.h      |   2 -
 .../ov_executable_network/get_metric.cpp      |   8 +
 .../single_layer_tests/matmul_sparse.cpp      | 343 ++++++++++++++++++
 src/plugins/intel_cpu/thirdparty/onednn       |   2 +-
 8 files changed, 371 insertions(+), 15 deletions(-)
 create mode 100644 src/plugins/intel_cpu/tests/functional/single_layer_tests/matmul_sparse.cpp

diff --git a/docs/OV_Runtime_UG/supported_plugins/CPU.md b/docs/OV_Runtime_UG/supported_plugins/CPU.md
index fee0b285ba294c..c9a6a0d1ccb0db 100644
--- a/docs/OV_Runtime_UG/supported_plugins/CPU.md
+++ b/docs/OV_Runtime_UG/supported_plugins/CPU.md
@@ -281,7 +281,7 @@ To enable denormals optimization in the application, the `denormals_optimization
 
 The `sparse weights decompression feature` allows packing weights for Matrix Multiplication operations directly in the CPU plugin at the model compilation stage and storing non-zero values in a special packed format. Then, during the execution of the model, the weights are unpacked and used in the computational kernel. Since the weights are loaded from DDR/L3 cache in the packed format, this significantly decreases memory consumption and, as a consequence, improves inference performance.
 
-To use this feature, the user is provided with property `sparse_weights_decompression_rate`, which can take values from the interval \[0.5, 1\] (values from \[0, 0.5\] are not supported in current implementation, see limitations below). `sparse_weights_decompression_rate` defines sparse rate threashold: only operations with higher sparse rate will be executed using `sparse weights decompression feature`. The default value is `1`, which means the option is disabled.
+To use this feature, the user is provided with the property `sparse_weights_decompression_rate`, which can take values from the interval \[0, 1\]. `sparse_weights_decompression_rate` defines the sparse rate threshold: only operations with a higher sparse rate will be executed using the `sparse weights decompression feature`. The default value is `1`, which means the option is disabled.
 
 > **NOTE**: The `sparse weights decompression feature` is disabled by default, since the overall speed-up highly depends on the particular workload, and in some cases the feature may introduce performance degradation.
 
@@ -315,7 +315,6 @@ Currently, the `sparse weights decompression feature` is supported with the foll
 2. Feature is only supported for Matrix Multiplication operations.
 3. HW target must have Intel AMX extension support (e.g., Intel® 4th Generation Xeon® processors (code name Sapphire Rapids)).
 4. The number of input and output channels of the weights must be a multiple of 64.
-5. Current feature implementation supports only sparse rate higher than 0.5.
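+
+For illustration, here is a minimal usage sketch; the `"CPU"` device string is real, while `model.xml` and the `0.8` threshold are example values only, not defaults:
+
+```cpp
+#include <openvino/runtime/core.hpp>
+#include <openvino/runtime/intel_cpu/properties.hpp>
+
+int main() {
+    ov::Core core;
+    // Request sparse weights decompression for operations whose weights have
+    // a sparse rate of at least 0.8; less sparse operations keep the dense path.
+    core.set_property("CPU", ov::intel_cpu::sparse_weights_decompression_rate(0.8f));
+    auto model = core.read_model("model.xml");  // example model path
+    auto compiled_model = core.compile_model(model, "CPU");
+    return 0;
+}
+```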
 
 ## Additional Resources
 * [Supported Devices](Supported_Devices.md)
diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py
index 08d98a4282fda1..d5c253a101baf1 100644
--- a/src/bindings/python/tests/test_runtime/test_properties.py
+++ b/src/bindings/python/tests/test_runtime/test_properties.py
@@ -195,7 +195,7 @@ def test_properties_ro(ov_property_ro, expected_value):
     ),
     (
         properties.intel_cpu.sparse_weights_decompression_rate,
-        "SPARSE_WEIGHTS_DECOMPRESSION_RATE",
+        "CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE",
         (
             (0.1, np.float32(0.1)),
             (2.0, 2.0),
diff --git a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
index e1d0a6eefab1ed..a3253785078a40 100644
--- a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
+++ b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
@@ -47,7 +47,21 @@ namespace intel_cpu {
  */
 static constexpr Property<bool> denormals_optimization{"CPU_DENORMALS_OPTIMIZATION"};
 
-static constexpr Property<float> sparse_weights_decompression_rate{"SPARSE_WEIGHTS_DECOMPRESSION_RATE"};
+/**
+ * @brief This property defines the threshold for the sparse weights decompression feature activation
+ * @ingroup ov_runtime_cpu_prop_cpp_api
+ *
+ * The sparse weights decompression feature allows packing weights for Matrix Multiplication operations directly in the CPU plugin
+ * at the model compilation stage and storing non-zero values in a special packed format. Then, during the execution of the model,
+ * the weights are unpacked and used in the computational kernel. Since the weights are loaded from DDR/L3 cache in the packed
+ * format, this significantly decreases memory consumption and, as a consequence, improves inference performance.
+ * The following code shows how to set the sparse rate value.
+ *
+ * @code
+ * core.set_property(ov::intel_cpu::sparse_weights_decompression_rate(0.8));
+ * @endcode
+ */
+static constexpr Property<float> sparse_weights_decompression_rate{"CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE"};
 
 }  // namespace intel_cpu
 }  // namespace ov
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
index 62d483b0536ebd..36f431bcecec1e 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -591,7 +591,7 @@ void FullyConnected::createDescriptorInternal(const dnnl::memory::desc &inputDes
     dnnl::memory::desc wgh_candidate;
     if (useSparseWeights) {
         wgh_candidate = { DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(WEIGHTS_ID).getStaticDims()),
-                        wdt, memory::desc::packed(nnzCount) };
+                        wdt, memory::desc::packed() };
     } else {
         wgh_candidate = { DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(WEIGHTS_ID).getStaticDims()),
                         wdt, dnnl::memory::format_tag::any };
@@ -930,18 +930,12 @@ bool FullyConnected::useSparseWeightsDecompression() {
             zerosCounts++;
         }
     }
-    nnzCount = elementsCount - zerosCounts;
 
     DEBUG_LOG(getName(), ", weightsData.size() = ", elementsCount, ", zerosCounts = ",
-              zerosCounts, ", nnzCount = ", nnzCount);
+              zerosCounts, ", nnzCount = ", elementsCount - zerosCounts);
 
     weiSparseRate = static_cast<float>(zerosCounts) / static_cast<float>(elementsCount);
 
-    // [av] WA: there is no point in using sparse decompression when the sparse rate is low
-    // todo: add heuristic
-    if (minSparseRate < 0.5)
-        minSparseRate = 0.5;
-
     DEBUG_LOG(getName(), " | sparse rate = ", weiSparseRate * 100, "%, min sparse rate = ",
               minSparseRate * 100, "%, use sparse weights = ", weiSparseRate >= minSparseRate);
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
index 91431365d59976..371d9d731ed9c9 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
@@ -42,7 +42,6 @@ class FullyConnected : public Node {
     void initSupportedPrimitiveDescriptors() override;
     void initOptimalPrimitiveDescriptor() override;
-    // void createPrimitive() override;
     std::shared_ptr<MemoryDesc> getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
    std::shared_ptr<MemoryDesc> getDstMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
@@ -112,7 +111,6 @@ class FullyConnected : public Node {
     // sparse weights
     bool useSparseWeights = false;
-    int nnzCount = -1;
     float minSparseRate = 1.f;
     float weiSparseRate = 0.f;
     bool useSparseWeightsDecompression();
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_executable_network/get_metric.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_executable_network/get_metric.cpp
index b9d7028a477bef..2122a7b70720fc 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_executable_network/get_metric.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_executable_network/get_metric.cpp
@@ -9,6 +9,7 @@
 #include "openvino/runtime/core.hpp"
 #include "openvino/runtime/compiled_model.hpp"
 #include "openvino/runtime/properties.hpp"
+#include "openvino/runtime/intel_cpu/properties.hpp"
 
 #include <gtest/gtest.h>
 
@@ -113,6 +114,13 @@ TEST_F(OVClassConfigTestCPU, smoke_CheckModelStreamsHasHigherPriorityThanThrough
     ASSERT_EQ(streams, value);
 }
 
+TEST_F(OVClassConfigTestCPU, smoke_CheckSparseWeightsDecompressionRate) {
+    ov::Core core;
+
+    core.set_property(deviceName, ov::intel_cpu::sparse_weights_decompression_rate(0.8));
+    ASSERT_NO_THROW(ov::CompiledModel compiledModel = core.compile_model(model, deviceName));
+}
+
 const std::vector<ov::AnyMap> multiDevicePriorityConfigs = {
         {ov::device::priorities(CommonTestUtils::DEVICE_CPU)}};
 
diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/matmul_sparse.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/matmul_sparse.cpp
new file mode 100644
index 00000000000000..435f407bbba3ff
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/matmul_sparse.cpp
@@ -0,0 +1,343 @@
+// Copyright (C) 2022-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "shared_test_classes/single_layer/mat_mul.hpp"
+#include "shared_test_classes/base/ov_subgraph.hpp"
+#include "ie_precision.hpp"
+#include "test_utils/fusing_test_utils.hpp"
+#include "ngraph_functions/builders.hpp"
+#include <string>
+#include <random>
+#include "shared_test_classes/base/utils/generate_inputs.hpp"
+#include "cpu/cpu_config.hpp"
+
+using namespace ngraph;
+using namespace InferenceEngine;
+using namespace CPUTestUtils;
+using namespace ov::test;
+
+namespace CPULayerTestsDefinitions {
+
+struct ShapeRelatedParams {
+    std::vector<InputShape> inputShapes;
+    std::pair<bool, bool> transpose;
+};
+
+typedef std::tuple<
+        ShapeRelatedParams,
+        ElementType,                        // Input precision
+        ElementType,                        // Weights precision
+        ElementType,                        // Output precision
+        fusingSpecificParams,
+        CPUSpecificParams,
+        std::map<std::string, std::string>, // Additional config
+        float                               // Weights sparse rate
+> MatMulSparseParamSet;
+
+class MatMulSparseCPUTest : public testing::WithParamInterface<MatMulSparseParamSet>,
+                            virtual public SubgraphBaseTest, public CpuTestWithFusing {
+public:
+    static std::string getTestCaseName(const testing::TestParamInfo<MatMulSparseParamSet>& obj) {
+        ShapeRelatedParams shapeRelatedParams;
+        ElementType inType, weiType, outType;
+        fusingSpecificParams fusingParams;
+        CPUSpecificParams cpuParams;
+        std::map<std::string, std::string> additionalConfig;
+        float weiSparseRate;
+        std::tie(shapeRelatedParams, inType, weiType, outType, fusingParams, cpuParams, additionalConfig,
+                weiSparseRate) = obj.param;
+
+        std::ostringstream result;
+        result << "IS=";
+        for (const auto& shape : shapeRelatedParams.inputShapes) {
+            result << CommonTestUtils::partialShape2str({shape.first}) << "_";
+        }
+        result << "TS=";
+        for (const auto& shape : shapeRelatedParams.inputShapes) {
+            result << "(";
+            if (!shape.second.empty()) {
+                auto itr = shape.second.begin();
+                do {
+                    result << CommonTestUtils::vec2str(*itr);
+                } while (++itr != shape.second.end() && result << "_");
+            }
+            result << ")_";
+        }
+        result << "transpose_a=" << shapeRelatedParams.transpose.first << "_";
+        result << "transpose_b=" << shapeRelatedParams.transpose.second << "_";
+        result << "inType=" << inType << "_";
+        result << "weiType=" << weiType << "_";
+        result << "outType=" << outType << "_";
+        result << CpuTestWithFusing::getTestCaseName(fusingParams);
+        result << CPUTestsBase::getTestCaseName(cpuParams);
+
+        if (!additionalConfig.empty()) {
+            result << "_PluginConf";
+            for (auto& item : additionalConfig) {
+                result << "_" << item.first << "=" << item.second;
+            }
+        }
+        result << "_weiSparseRate=" << weiSparseRate;
+
+        return result.str();
+    }
+
+protected:
+    std::string cpuNodeType;
+
+    template<typename T>
+    void transpose(T& shape) {
+        IE_ASSERT(shape.size() > 1);
+        std::swap(*(shape.end() - 1), *(shape.end() - 2));
+    }
+
+    std::vector<int8_t> inline generateSparseVector(size_t vec_len,
+                float sparseRate = 0.0f,
+                int8_t upTo = 10,
+                int8_t startFrom = 1,
+                int32_t seed = 1) {
+        std::vector<int8_t> res(vec_len);
+        std::mt19937 gen(seed);
+        std::uniform_int_distribution<int32_t> dist(static_cast<int32_t>(startFrom), static_cast<int32_t>(upTo));
+
+        std::mt19937 gen_f(123);
+        std::uniform_real_distribution<float> dist_f(0.f, 1.f);
+
+        int countZero = 0;
+
+        res[0] = startFrom;
+        res[vec_len - 1] = upTo;
+        for (size_t i = 1; i < vec_len - 1; i++) {
+            if (dist_f(gen_f) > sparseRate) {
+                res[i] = static_cast<int8_t>(dist(gen));
+            } else {
+                res[i] = 0;
+                countZero++;
+            }
+        }
+
+        std::cout << "Sparse rate = " << countZero * 100 / vec_len << "%" << std::endl;
+
+        return res;
+    }
+
+    std::shared_ptr<Node> makeMatMulRelaxed(const Output<Node>& A,
+                                            const ov::PartialShape& inShapeB,
+                                            ElementType weiType,
+                                            bool transpose_a,
+                                            bool transpose_b,
+                                            const std::vector<int8_t>& weiData) {
+        using namespace ngraph;
+        auto inputParamsFP32 = builder::makeDynamicParams(element::f32, {A.get_partial_shape()});
+        auto matrixBFP32 = builder::makeDynamicInputLayer(element::f32, helpers::InputLayerType::CONSTANT, inShapeB);
+
+        auto matMulRelaxed = std::make_shared<ov::op::TypeRelaxed<opset3::MatMul>>(
+            *as_type_ptr<opset3::MatMul>(builder::makeMatMul(inputParamsFP32[0], matrixBFP32, transpose_a, transpose_b)),
+            element::f32);
+
+        auto matrixB = ngraph::builder::makeConstant(weiType, inShapeB.get_shape(), weiData);
+
+        auto matMul = matMulRelaxed->copy_with_new_inputs({A, matrixB});
+
+        return matMul;
+    }
+
+    void SetUp() override {
+        abs_threshold = 0.5f;
+        using ngraph::pass::ConvertPrecision;
+
+        ShapeRelatedParams shapeRelatedParams;
+        ElementType inType, weiType, outType;
+        fusingSpecificParams fusingParams;
+        CPUSpecificParams cpuParams;
+        std::map<std::string, std::string> additionalConfig;
+        float weiSparseRate;
+
+        std::tie(shapeRelatedParams, inType, weiType, outType, fusingParams, cpuParams, additionalConfig,
+                weiSparseRate) = this->GetParam();
+        std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
+
+        configuration.insert(additionalConfig.begin(), additionalConfig.end());
+
+        init_input_shapes(shapeRelatedParams.inputShapes);
+
+        bool transpA = shapeRelatedParams.transpose.first;
+        bool transpB = shapeRelatedParams.transpose.second;
+
+        if (transpA) {
+            transpose(inputDynamicShapes[0]);
+            for (auto& shapes : targetStaticShapes) {
+                transpose(shapes[0]);
+            }
+        }
+        if (transpB) {
+            transpose(inputDynamicShapes[1]);
+            for (auto& shapes : targetStaticShapes) {
+                transpose(shapes[1]);
+            }
+        }
+
+        const auto& inShapeA = inputDynamicShapes[0];
+        const auto& inShapeB = inputDynamicShapes[1];
+
+        std::tie(postOpMgrPtr, fusedOps) = fusingParams;
+
+        configuration.insert(additionalConfig.begin(), additionalConfig.end());
+
+        cpuNodeType = "FullyConnected";
+        selectedType = makeSelectedTypeStr(selectedType, element::i8);
+
+        auto params = builder::makeDynamicParams(inType, {inShapeA});
+        auto paramOuts = helpers::convert2OutputVector(helpers::castOps2Nodes(params));
+
+        auto matrixB = builder::makeDynamicInputLayer(element::f32, helpers::InputLayerType::CONSTANT, inShapeB);
+
+        auto weiData = generateSparseVector(ngraph::shape_size(inShapeB.get_shape()), weiSparseRate);
+        auto matMul = makeMatMulRelaxed(paramOuts[0], inShapeB, weiType, transpA, transpB, weiData);
+
+        function = makeNgraphFunction(element::f32, params, matMul, cpuNodeType);
+
+        checkFusingPosition = false;
+
+        targetDevice = CommonTestUtils::DEVICE_CPU;
+
+        functionRefs = ov::clone_model(*function);
+        ngraph::pass::ConvertPrecision<ngraph::element::Type_t::i8, ngraph::element::Type_t::f32>().run_on_function(functionRefs);
+        ngraph::pass::ConvertPrecision<ngraph::element::Type_t::u8, ngraph::element::Type_t::f32>().run_on_function(functionRefs);
+        functionRefs->validate_nodes_and_infer_types();
+    }
+};
+
+TEST_P(MatMulSparseCPUTest, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+
+    run();
+    CheckPluginRelatedResults(compiledModel, cpuNodeType);
+}
+
+namespace {
+
+/* ============= Common params ============= */
+
+std::vector<CPUSpecificParams> filterSpecificParams(bool sparseExpected) {
+    std::vector<CPUSpecificParams> specificParams;
+    if (with_cpu_x86_avx512_core_amx()) {
+        if (sparseExpected) {
+            specificParams.push_back(CPUSpecificParams{{}, {}, {"brgemm_avx512_amx"}, "brgemm_avx512_amx_sparse"});
+        } else {
+            specificParams.push_back(CPUSpecificParams{{}, {}, {"brgemm_avx512_amx"}, "brgemm_avx512_amx"});
+        }
+    }
+
+    return specificParams;
+}
+
+/* ============= FullyConnected ============= */
+namespace fullyConnected {
+
+// cpu (sparse) configs
+const std::map<std::string, std::string> emptyConfig = {};
+const std::map<std::string, std::string> SparseRate50 = {{CPUConfigParams::KEY_CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE, "0.5"}};
+const std::map<std::string, std::string> SparseRate80 = {{CPUConfigParams::KEY_CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE, "0.8"}};
+
+const std::vector<ShapeRelatedParams> IS2D_sparse_smoke = {
+    {static_shapes_to_test_representation({{64, 64}, {64, 64}}), {false, true}},
+    {static_shapes_to_test_representation({{71, 64}, {64, 64}}), {false, true}},
+    {static_shapes_to_test_representation({{3, 128}, {128, 64}}), {false, true}},
+    {static_shapes_to_test_representation({{71, 64}, {64, 128}}), {false, true}},
+
+    {
+        {
+            {{-1, -1}, {{20, 64}, {20, 64}}},
+            {{64, 128}, {{64, 128}, {64, 128}}}
+        },
+        {false, true}
+    },
+
+    {
+        {
+            {{{0, 100}, {0, 64}}, {{20, 64}, {14, 64}, {20, 64}, {14, 64}}},
+            {{64, 128}, {{64, 128}, {64, 128}, {64, 128}, {64, 128}}}
+        },
+        {false, true}
+    },
+};
+
+const auto testParams2D_i8_smoke = ::testing::Combine(::testing::ValuesIn(IS2D_sparse_smoke),
+                                                      ::testing::Values(ElementType::i8, ElementType::u8),
+                                                      ::testing::Values(ElementType::i8),
+                                                      ::testing::Values(ElementType::f32),
+                                                      ::testing::Values(emptyFusingSpec),
+                                                      ::testing::ValuesIn(filterSpecificParams(false)),
+                                                      ::testing::Values(emptyConfig, SparseRate80),
+                                                      ::testing::Values(0.7));
+
+INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_I8, MatMulSparseCPUTest, testParams2D_i8_smoke,
+    MatMulSparseCPUTest::getTestCaseName);
+
+const auto testParams2D_i8_sparse_smoke = ::testing::Combine(::testing::ValuesIn(IS2D_sparse_smoke),
+                                                             ::testing::Values(ElementType::i8, ElementType::u8),
+                                                             ::testing::Values(ElementType::i8),
+                                                             ::testing::Values(ElementType::f32),
+                                                             ::testing::Values(emptyFusingSpec),
+                                                             ::testing::ValuesIn(filterSpecificParams(true)),
+                                                             ::testing::Values(SparseRate50),
+                                                             ::testing::Values(0.7));
+
+INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_I8_sparse, MatMulSparseCPUTest, testParams2D_i8_sparse_smoke,
+    MatMulSparseCPUTest::getTestCaseName);
+
+const std::vector<ShapeRelatedParams> IS3D_sparse_smoke = {
+    {static_shapes_to_test_representation({{1, 64, 64}, {64, 64}}), {false, true}},
+    {static_shapes_to_test_representation({{3, 71, 64}, {64, 64}}), {false, true}},
+    {static_shapes_to_test_representation({{3, 5, 128}, {128, 64}}), {false, true}},
+    {static_shapes_to_test_representation({{1, 71, 64}, {64, 128}}), {false, true}},
+
+    {
+        {
+            {{-1, -1, 64}, {{1, 5, 64}, {1, 10, 64}, {1, 5, 64}, {1, 10, 64}}},
+            {{64, 128}, {{64, 128}, {64, 128}}}
+        },
+        {false, true}
+    },
+
+    // todo: [av] investigate "Primitive descriptor was not found" error for this case
+    // {
+    //     {
+    //         {{{0, 60}, {0, 60}, {0, 64}}, {{1, 3, 64}, {1, 7, 64}}},
+    //         {{64, 64}, {{64, 64}, {64, 64}}}
+    //     },
+    //     {false, true}
+    // },
+};
+
+const auto testParams3D_i8_smoke = ::testing::Combine(::testing::ValuesIn(IS3D_sparse_smoke),
+                                                      ::testing::Values(ElementType::i8, ElementType::u8),
+                                                      ::testing::Values(ElementType::i8),
+                                                      ::testing::Values(ElementType::f32),
+                                                      ::testing::Values(emptyFusingSpec),
+                                                      ::testing::ValuesIn(filterSpecificParams(false)),
+                                                      ::testing::Values(emptyConfig, SparseRate80),
+                                                      ::testing::Values(0.7));
+
+INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_I8, MatMulSparseCPUTest, testParams3D_i8_smoke,
+    MatMulSparseCPUTest::getTestCaseName);
+
+const auto testParams3D_i8_sparse_smoke = ::testing::Combine(::testing::ValuesIn(IS3D_sparse_smoke),
+                                                             ::testing::Values(ElementType::i8, ElementType::u8),
+                                                             ::testing::Values(ElementType::i8),
+                                                             ::testing::Values(ElementType::f32),
+                                                             ::testing::Values(emptyFusingSpec),
+                                                             ::testing::ValuesIn(filterSpecificParams(true)),
+                                                             ::testing::Values(SparseRate50),
+                                                             ::testing::Values(0.7));
+
+INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_I8_sparse, MatMulSparseCPUTest, testParams3D_i8_sparse_smoke,
+    MatMulSparseCPUTest::getTestCaseName);
+
+} // namespace fullyConnected
+
+} // namespace
+
+} // namespace CPULayerTestsDefinitions
diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn
index 26ad0022000d8d..0134954ed3f6b6 160000
--- a/src/plugins/intel_cpu/thirdparty/onednn
+++ b/src/plugins/intel_cpu/thirdparty/onednn
@@ -1 +1 @@
-Subproject commit 26ad0022000d8de10a683efb175c2acbf064e81b
+Subproject commit 0134954ed3f6b6b90eab687224b584f3c0c531df

From 9c199a12b2fab159ce51b1736b2678a9121a0908 Mon Sep 17 00:00:00 2001
From: Anton Voronov
Date: Thu, 5 Jan 2023 14:57:15 +0000
Subject: [PATCH 2/2] code style fix

---
 .../include/openvino/runtime/intel_cpu/properties.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
index a3253785078a40..9d63a0e078bdef 100644
--- a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
+++ b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
@@ -51,11 +51,11 @@ static constexpr Property<bool> denormals_optimization{"CPU_DENORMALS_OPTIMIZATI
  * @brief This property defines the threshold for the sparse weights decompression feature activation
  * @ingroup ov_runtime_cpu_prop_cpp_api
  *
- * The sparse weights decompression feature allows packing weights for Matrix Multiplication operations directly in the CPU plugin
- * at the model compilation stage and storing non-zero values in a special packed format. Then, during the execution of the model,
- * the weights are unpacked and used in the computational kernel. Since the weights are loaded from DDR/L3 cache in the packed
- * format, this significantly decreases memory consumption and, as a consequence, improves inference performance.
- * The following code shows how to set the sparse rate value.
+ * The sparse weights decompression feature allows packing weights for Matrix Multiplication operations directly in the
+ * CPU plugin at the model compilation stage and storing non-zero values in a special packed format. Then, during the
+ * execution of the model, the weights are unpacked and used in the computational kernel. Since the weights are loaded
+ * from DDR/L3 cache in the packed format, this significantly decreases memory consumption and, as a consequence,
+ * improves inference performance. The following code shows how to set the sparse rate value.
 *
 * @code
 * core.set_property(ov::intel_cpu::sparse_weights_decompression_rate(0.8));
 * @endcode
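
Reviewer note: a minimal standalone sketch (an illustration under stated assumptions, not the plugin code) of the gate that `FullyConnected::useSparseWeightsDecompression()` implements after patch 1: compute the zero-weight fraction and compare it against the configured threshold, with the former hard 0.5 floor removed. The function name `useSparseDecompression`, the 64x64 buffer, and the threshold values are hypothetical:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Sketch of the decision: enable sparse decompression only when the fraction
// of zero weights reaches the user-configured minimum sparse rate.
bool useSparseDecompression(const std::vector<int8_t>& weights, float minSparseRate) {
    size_t zeros = 0;
    for (int8_t w : weights)
        if (w == 0)
            ++zeros;
    const float sparseRate = static_cast<float>(zeros) / static_cast<float>(weights.size());
    return sparseRate >= minSparseRate;  // minSparseRate is no longer clamped to 0.5
}

int main() {
    std::vector<int8_t> weights(64 * 64, 0);
    for (size_t i = 0; i < weights.size(); i += 5)
        weights[i] = 1;  // roughly 80% of the elements stay zero
    std::cout << std::boolalpha
              << useSparseDecompression(weights, 0.5f) << "\n"   // true
              << useSparseDecompression(weights, 0.9f) << "\n";  // false
    return 0;
}
```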