From 9bb8a83ae564776c9108aa83d6109c87fe33eb60 Mon Sep 17 00:00:00 2001
From: Anton Voronov
Date: Tue, 22 Nov 2022 05:37:38 -0800
Subject: [PATCH] [CPU][EXPERIMENTAL] FullyConnected: enabled sparsity weights decompression

Adds an experimental CPU plugin option, SPARSE_WEIGHTS_DECOMPRESSION_RATE,
that lets FullyConnected nodes describe their int8 weights to oneDNN with
a packed (sparse) memory descriptor.

The option is a float in [0.0, 1.0] and defaults to 1.0, which keeps the
feature off. For any smaller value Graph::Replicate() pushes the rate to
every FullyConnected node, and a node opts in only when all of the
following hold:
  * avx512_core_amx is available;
  * the weights are 2D and both dimensions are divisible by 64;
  * the input precision is U8 or I8 and the weights precision is I8;
  * the weights come from a constant Input node;
  * the share of zero weights is at least max(rate, 0.5).

When the sparse path is taken and the selected kernel is
brgemm_avx512_amx, the node reports its implementation type as
brgemm_sparse_avx512_amx.
---
Review changes relative to the first submission:
  - properties.hpp: the property key now carries the CPU_ prefix, so it is
    the same string as KEY_CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE matched in
    Config::readProperties() (same scheme as CPU_DENORMALS_OPTIMIZATION);
  - config.cpp, fullyconnected.cpp: range / threshold checks are written
    in negated form so that NaN is rejected;
  - fullyconnected.cpp: the zero-counting loop uses size_t, the narrowing
    into nnzCount is explicit, the duplicated #include "input.h" is gone;
  - fullyconnected.h: the commented-out createPrimitive() declaration is
    gone;
  - template arguments are spelled out on all added lines; doc comments
    were added for the new keys, functions and members.
Context lines and index hashes are kept as received.

 .../pyopenvino/core/properties/properties.cpp |  3 +
 src/inference/include/ie/cpu/cpu_config.hpp   |  6 ++
 .../openvino/runtime/intel_cpu/properties.hpp |  6 ++
 src/plugins/intel_cpu/src/config.cpp          | 14 ++++
 src/plugins/intel_cpu/src/config.h            |  1 +
 src/plugins/intel_cpu/src/graph.cpp           | 13 +++
 src/plugins/intel_cpu/src/graph.h             |  1 +
 src/plugins/intel_cpu/src/node.cpp            |  1 +
 .../intel_cpu/src/nodes/fullyconnected.cpp    | 87 +++++++++++++++++--
 .../intel_cpu/src/nodes/fullyconnected.h      |  9 ++
 .../intel_cpu/src/onednn/iml_type_mapper.cpp  |  2 +
 .../intel_cpu/src/onednn/iml_type_mapper.h    |  3 +
 src/plugins/intel_cpu/thirdparty/onednn       |  2 +-
 13 files changed, 143 insertions(+), 5 deletions(-)

diff --git a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
index 6b86f5abccce9c..5f86edb7258ef1 100644
--- a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
+++ b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
@@ -71,6 +71,9 @@ void regmodule_properties(py::module m) {
 
     // Submodule intel_cpu property
     wrap_property_RW(m_intel_cpu, ov::intel_cpu::denormals_optimization, "denormals_optimization");
+    wrap_property_RW(m_intel_cpu,
+                     ov::intel_cpu::sparse_weights_decompression_rate,
+                     "sparse_weights_decompression_rate");
 
     // Submodule device
     py::module m_device =
diff --git a/src/inference/include/ie/cpu/cpu_config.hpp b/src/inference/include/ie/cpu/cpu_config.hpp
index 18244c2e746fc0..e26f25b540ecd2 100644
--- a/src/inference/include/ie/cpu/cpu_config.hpp
+++ b/src/inference/include/ie/cpu/cpu_config.hpp
@@ -40,5 +40,11 @@ namespace CPUConfigParams {
  */
 DECLARE_CPU_CONFIG_KEY(DENORMALS_OPTIMIZATION);
 
+/**
+ * @brief Minimal share of zero elements, as a float in range [0.0, 1.0], that FullyConnected weights
+ * must reach before the CPU plugin tries sparse weights decompression. The default 1.0 keeps it off.
+ */
+DECLARE_CPU_CONFIG_KEY(SPARSE_WEIGHTS_DECOMPRESSION_RATE);
+
 }  // namespace CPUConfigParams
 }  // namespace InferenceEngine
diff --git a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
index 815fde5ebc35ff..e1d0a6eefab1ed 100644
--- a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
+++ b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
@@ -47,5 +47,11 @@ namespace intel_cpu {
  */
 static constexpr Property denormals_optimization{"CPU_DENORMALS_OPTIMIZATION"};
 
+/**
+ * @brief Minimal share of zero elements, as a float in range [0.0, 1.0], that FullyConnected weights
+ * must reach before the CPU plugin tries sparse weights decompression. The default 1.0 keeps it off.
+ */
+static constexpr Property<float> sparse_weights_decompression_rate{"CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE"};
+
 }  // namespace intel_cpu
 }  // namespace ov
diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp
index 139466fa3f8c87..eea069eff7aa35 100644
--- a/src/plugins/intel_cpu/src/config.cpp
+++ b/src/plugins/intel_cpu/src/config.cpp
@@ -74,6 +74,20 @@ void Config::readProperties(const std::map &prop) {
             // zero and any negative value will be treated
             // as default batch size
             batchLimit = std::max(val_i, 0);
+        } else if (key == CPUConfigParams::KEY_CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE) {
+            float val_f = 0.0f;
+            try {
+                val_f = std::stof(val);
+            } catch (const std::exception&) {
+                IE_THROW() << "Wrong value for property key " << CPUConfigParams::KEY_CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE
+                           << ". Expected only float numbers";
+            }
+            if (!(val_f >= 0.f && val_f <= 1.f)) {  // negated in-range test: also rejects NaN, which std::stof accepts
+                IE_THROW() << "Wrong value for property key " << CPUConfigParams::KEY_CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE
+                           << ". Sparse rate must be in range [0.0f,1.0f]";
+            } else {
+                fcSparseWeiDecompressionRate = val_f;
+            }
         } else if (key == PluginConfigParams::KEY_PERF_COUNT) {
             if (val == PluginConfigParams::YES) collectPerfCounters = true;
             else if (val == PluginConfigParams::NO) collectPerfCounters = false;
diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h
index 1bef54cad35126..f9ac17e84751c9 100644
--- a/src/plugins/intel_cpu/src/config.h
+++ b/src/plugins/intel_cpu/src/config.h
@@ -34,6 +34,7 @@ struct Config {
     bool enableDynamicBatch = false;
     std::string dumpToDot = "";
     int batchLimit = 0;
+    float fcSparseWeiDecompressionRate = 1.0f;  // 1.0f: sparse weights decompression is off
     size_t rtCacheCapacity = 5000ul;
     InferenceEngine::IStreamsExecutor::Config streamExecutorConfig;
     InferenceEngine::PerfHintsConfig perfHintsConfig;
diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp
index c7a37c16643f74..731a97ae395967 100644
--- a/src/plugins/intel_cpu/src/graph.cpp
+++ b/src/plugins/intel_cpu/src/graph.cpp
@@ -26,6 +26,7 @@
 #include 
 #include "nodes/convert.h"
 #include "nodes/subgraph.h"
+#include "nodes/fullyconnected.h"
 
 #include 
 #include 
@@ -341,6 +342,9 @@ void Graph::Replicate(const CNNNetwork &network, const ExtensionManager::Ptr& ex
     if (config.enforceBF16)
         EnforceBF16();
 
+    if (config.fcSparseWeiDecompressionRate < 1.0f)
+        setMinSparseRate(config.fcSparseWeiDecompressionRate);
+
     auto hasSubgraphConsumers = [] (const NodePtr& node) -> bool {
         const auto & childEdges = node->getChildEdges();
         return std::any_of(childEdges.begin(), childEdges.end(),
@@ -1454,6 +1458,15 @@ void Graph::EnforceBF16() {
     }
 }
 
+// Hands the minimal weights sparse rate to every FullyConnected node of the graph; other nodes are skipped
+void Graph::setMinSparseRate(float minSparseRate) {
+    for (const auto &node : graphNodes) {
+        if (auto fcNodePtr = std::dynamic_pointer_cast<node::FullyConnected>(node)) {
+            fcNodePtr->setMinSparseRate(minSparseRate);
+        }
+    }
+}
+
 std::shared_ptr Graph::dump() const {
     return dump_graph_as_ie_ngraph_net(*this);
 }
diff --git a/src/plugins/intel_cpu/src/graph.h b/src/plugins/intel_cpu/src/graph.h
index 726d308dcd020b..46817c184bbe0b 100644
--- a/src/plugins/intel_cpu/src/graph.h
+++ b/src/plugins/intel_cpu/src/graph.h
@@ -268,6 +268,7 @@ class Graph {
     DnnlScratchPadPtr rtScratchPad;
 
     void EnforceBF16();
+    void setMinSparseRate(float minSparseRate);
 };
 
 }   // namespace intel_cpu
diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp
index 9cf8747b49263f..3ceeae194c929b 100644
--- a/src/plugins/intel_cpu/src/node.cpp
+++ b/src/plugins/intel_cpu/src/node.cpp
@@ -418,6 +418,7 @@ std::string Node::getPrimitiveDescriptorType() {
     SEARCH_TYPE(uni);
 
     SEARCH_TYPE(winograd);
+    SEARCH_TYPE(sparse);
     SEARCH_TYPE(_dw);
     SEARCH_TYPE(_1x1);
 
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
index 6d2b47ddcf1c3c..7c44af61509065 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include "onednn/dnnl.h"
+#include "cpu/x64/cpu_isa_traits.hpp"
 
 using namespace dnnl;
 using namespace InferenceEngine;
@@ -172,6 +173,8 @@ void FullyConnected::getSupportedDescriptors() {
     if (getChildEdges().empty())
         IE_THROW()<< errorPrefix << " has incorrect number of output edges";
 
+    useSparseWeights = useSparseWeightsDecompression();
+
     auto inputDataType = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(DATA_ID));
     outputDataType = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalOutputPrecisionAtPort(DATA_ID));
 
@@ -360,6 +363,10 @@ void FullyConnected::prepareParams() {
     }
     // changed shapes may also cause the kernel type changed
     selected_pd->setImplementationType(execPtr->getImplementationType());
+    // WA: We update implType to know whether weights decompression was used inside the kernel
+    if (selected_pd->getImplementationType() == ov::intel_cpu::brgemm_avx512_amx && useSparseWeights) {
+        selected_pd->setImplementationType(ov::intel_cpu::brgemm_sparse_avx512_amx);
+    }
     // maybe expected 1x1 conv is not created, update the flag depends on the real type
     useConv1x1 = execPtr->getImplementationType() == brgconv_avx512_1x1;
 
@@ -503,6 +510,7 @@ bool FullyConnected::created() const {
 const std::vector& FullyConnected::getPrimitivesPriority() {
     std::vector priorities = {
             impl_desc_type::unknown,
+            impl_desc_type::brgemm_sparse_avx512_amx,
             impl_desc_type::brgemm_avx512_amx,
             impl_desc_type::brgemm_avx512,
             impl_desc_type::gemm_blas,
@@ -578,9 +586,15 @@ void FullyConnected::createDescriptorInternal(const dnnl::memory::desc &inputDes
                                  DnnlExtensionUtils::GetPlainFormatByRank(normalizedOutDims.size()));
     }
 
-    dnnl::memory::desc wgh_candidate(DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(WEIGHTS_ID).getStaticDims()),
-                                     wdt, dnnl::memory::format_tag::any);
-
+    // We need to explicitly specify the memory descriptor to use sparse weights decompression
+    dnnl::memory::desc wgh_candidate;
+    if (useSparseWeights) {
+        wgh_candidate = { DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(WEIGHTS_ID).getStaticDims()),
+                          wdt, memory::desc::packed(nnzCount) };
+    } else {
+        wgh_candidate = { DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(WEIGHTS_ID).getStaticDims()),
+                          wdt, dnnl::memory::format_tag::any };
+    }
     if (withBiases) {
         dnnl::memory::desc bias_candidate(DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(BIAS_ID).getStaticDims()),
                                           bdt, dnnl::memory::format_tag::any);
@@ -634,7 +648,7 @@ void FullyConnected::initSupportedPrimitiveDescriptors() {
                 portConfig.inPlace(-1);
                 portConfig.constant(false);
                 auto desc = getSrcMemDesc(itpd, i);
-                if (supportsUndefStridesAndOffset()) {
+                if (supportsUndefStridesAndOffset() && !(i == WEIGHTS_ID && useSparseWeights)) {
                     portConfig.setMemDesc(std::dynamic_pointer_cast(desc), BLOCKED_DESC_EMPTY_MASK);
                 } else {
                     portConfig.setMemDesc(desc);
@@ -868,6 +882,71 @@ MemoryPtr FullyConnected::prepareWeightMemory(DnnlMemoryDescPtr weightDesc) {
     return ptr;
 }
 
+// Decides whether this node may use sparse weights decompression.
+// Returns false unless the feature is enabled (minSparseRate != 1), avx512_core_amx is available,
+// the weights are 2D I8 with both dims divisible by 64, the input is U8/I8, and the weights come
+// from a constant Input node whose share of zero elements reaches max(minSparseRate, 0.5).
+// Side effects: fills nnzCount and weiSparseRate, and raises minSparseRate to 0.5 if it was lower.
+// Throws if the constant Input node has no memory attached.
+bool FullyConnected::useSparseWeightsDecompression() {
+    // minSparseRate == 1 means that sparse feature is switched off
+    if (minSparseRate == 1.f) {
+        return false;
+    }
+
+    if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core_amx))
+        return false;
+
+    auto weiDims = getInputShapeAtPort(WEIGHTS_ID).getStaticDims();
+    if (weiDims.size() != 2 || weiDims[0] % 64 != 0 || weiDims[1] % 64 != 0) {
+        return false;
+    }
+
+    auto inputPrecision = getOriginalInputPrecisionAtPort(DATA_ID);
+    auto weightsPrecision = getOriginalInputPrecisionAtPort(WEIGHTS_ID);
+    if (!one_of(inputPrecision , Precision::U8, Precision::I8) || weightsPrecision != Precision::I8) {
+        return false;
+    }
+
+    // calculate sparse rate
+    const auto constNode = std::dynamic_pointer_cast<node::Input>(getParentEdgeAt(WEIGHTS_ID)->getParent());
+    if (!constNode) {
+        return false;
+    }
+    auto blb = constNode->getMemoryPtr();
+    if (blb == nullptr)
+        IE_THROW() << "Cannot get const blob for node " << getName() << ".";
+
+    auto weightsData = reinterpret_cast<const int8_t*>(blb->GetPtr());
+    auto elementsCount = blb->GetDescWithType<BlockedMemoryDesc>()->getPaddedElementsCount();
+    size_t zerosCounts = 0;
+    for (size_t i = 0; i < elementsCount; i++) {
+        if (weightsData[i] == 0) {
+            zerosCounts++;
+        }
+    }
+    nnzCount = static_cast<int>(elementsCount - zerosCounts);
+
+    DEBUG_LOG(getName(), ", weightsData.size() = ", elementsCount, ", zerosCounts = ",
+        zerosCounts, ", nnzCount = ", nnzCount);
+
+    weiSparseRate = static_cast<float>(zerosCounts) / static_cast<float>(elementsCount);
+
+    // [av] WA: there is no point in using sparse decompression when the sparse rate is low
+    // todo: add heuristic
+    if (minSparseRate < 0.5f)
+        minSparseRate = 0.5f;
+
+    DEBUG_LOG(getName(), " | sparse rate = ", weiSparseRate * 100, "%, min sparse rate = ",
+        minSparseRate * 100, "%, use sparse weights = ", weiSparseRate >= minSparseRate);
+
+    if (!(weiSparseRate >= minSparseRate)) {  // negated form: a NaN rate (no elements) must not enable the feature
+        return false;
+    }
+
+    return true;
+}
+
 }   // namespace node
 }   // namespace intel_cpu
 }   // namespace ov
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
index 5c99c2aa00e644..91431365d59976 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
@@ -58,6 +58,8 @@ class FullyConnected : public Node {
 
     void setDynamicBatchLim(int lim) override;
 
+    void setMinSparseRate(float sparseRate) { minSparseRate = sparseRate; }
+
 private:
     void createDescriptorInternal(const dnnl::memory::desc &inputDesc,
                                   const dnnl::memory::desc &outputDesc);
@@ -106,6 +108,13 @@ class FullyConnected : public Node {
 
     bool canBeExecutedInConv1x1() const;
     MemoryPtr prepareWeightMemory(const DnnlMemoryDescPtr weightDesc);
+
+    // sparse weights
+    bool useSparseWeights = false;  // result of useSparseWeightsDecompression(), set in getSupportedDescriptors()
+    int nnzCount = -1;              // number of non-zero weight elements; -1 until it is computed
+    float minSparseRate = 1.f;      // minimal share of zero weights required; 1.f keeps the feature off
+    float weiSparseRate = 0.f;      // measured share of zero weight elements
+    bool useSparseWeightsDecompression();
 };
 
 }   // namespace node
diff --git a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp
index ff86544296f70e..ae8495d251673e 100644
--- a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp
+++ b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp
@@ -37,6 +37,7 @@ impl_desc_type parse_impl_name(std::string impl_desc_name) {
     SEARCH_WORD(_1x1);
     SEARCH_WORD(_dw);
     SEARCH_WORD(reorder);
+    SEARCH_WORD(sparse);
     if ((res & impl_desc_type::avx2) != impl_desc_type::avx2 &&
         (res & impl_desc_type::avx512) != impl_desc_type::avx512)
         SEARCH_WORD(avx);
@@ -108,6 +109,7 @@ const char* impl_type_to_string(impl_desc_type type) {
     CASE(brgemm_sse42);
     CASE(brgemm_uni);
     CASE(brgemm_avx512_amx);
+    CASE(brgemm_sparse_avx512_amx);
 
 #undef CASE
     return "unknown";
diff --git a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h
index 9ae0518fd457c7..0cb658c86723da 100644
--- a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h
+++ b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h
@@ -35,6 +35,8 @@ enum impl_desc_type {
     reorder = 1<<22,
     // winograd
     winograd = 1<<23,
+    // sparse
+    sparse = 1<<24,
 
     // real types
     ref_any = ref | any,
@@ -90,6 +92,7 @@ enum impl_desc_type {
     brgemm_sse42 = brgemm | sse42,
     brgemm_uni = brgemm | uni,
     brgemm_avx512_amx = brgemm | avx512 | amx,
+    brgemm_sparse_avx512_amx = brgemm | sparse | avx512 | amx,
 };
 
 const char * impl_type_to_string(impl_desc_type type);
diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn
index dbe732c9102fa6..be8d5d21994bf8 160000
--- a/src/plugins/intel_cpu/thirdparty/onednn
+++ b/src/plugins/intel_cpu/thirdparty/onednn
@@ -1 +1 @@
-Subproject commit dbe732c9102fa61f5eae58028215cfc571904baf
+Subproject commit be8d5d21994bf8495a04ee58da3f0d566e695db5