diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp
index 6b7f4042c589ec..765b6a1ddd1629 100644
--- a/src/plugins/intel_cpu/src/graph_optimizer.cpp
+++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -91,6 +91,10 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph &graph) {
     FuseFCAndConvertOnWeights(graph);
     graph.RemoveDroppedNodes();
 
+    OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseFCAndTransposeOnWeights");
+    FuseFCAndTransposeOnWeights(graph);
+    graph.RemoveDroppedNodes();
+
     OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseDeconvolutionAndSimpleOperation");
     FuseDeconvolutionAndSimpleOperation(graph);
     graph.RemoveDroppedNodes();
@@ -792,13 +796,13 @@ void GraphOptimizer::MergeConvertAndScaleShift(Graph& graph) {
 }
 
 void GraphOptimizer::FuseFCAndConvertOnWeights(Graph& graph) {
-    // This optimization fuses Convert (fp16 -> bf16/fp32) on weights directly to FC input to allow precision conversion handling based on internal logic
+    // This optimization fuses Convert (fp16/u8 -> bf16/fp32) on weights directly to FC input to allow precision conversion handling based on internal logic
     // (e.g. fuse conversion with weights reordering)
     auto& graphNodes = graph.GetNodes();
 
     for (auto parent : graphNodes) {
         if (parent->getType() == Type::Convert && parent->isConstant() && parent->getChildEdgeAt(0)->getChild()->getType() == Type::FullyConnected
-            && parent->getOriginalInputPrecisionAtPort(0) == Precision::FP16
+            && one_of(parent->getOriginalInputPrecisionAtPort(0), Precision::FP16, Precision::U8)
             && one_of(parent->getOriginalOutputPrecisionAtPort(0), Precision::FP32, Precision::BF16)) {
             auto childNode = parent->getChildEdgeAt(0)->getChild();
             // set correct weight precision
@@ -808,6 +812,21 @@ void GraphOptimizer::FuseFCAndConvertOnWeights(Graph& graph) {
     }
 }
 
+void GraphOptimizer::FuseFCAndTransposeOnWeights(Graph& graph) {
+    // This optimization lets us skip executing a constant Transpose on the weights and instead transpose them directly along with the reordering in the FC node
+    auto& graphNodes = graph.GetNodes();
+
+    for (auto parent : graphNodes) {
+        if (parent->getType() == Type::Transpose && parent->isConstant() && parent->getChildEdgeAt(0)->getChild()->getType() == Type::FullyConnected
+            && parent->getOutputShapeAtPort(0).getRank() == 2) {
+            auto fcNode = std::dynamic_pointer_cast<FullyConnected>(parent->getChildEdgeAt(0)->getChild());
+            fcNode->setTransposeWeights(true);
+            auto transposeNode = std::dynamic_pointer_cast<Transpose>(parent);
+            transposeNode->setFakeTranspose(true);
+        }
+    }
+}
+
 void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) {
     auto& graphNodes = graph.GetNodes();
 
diff --git a/src/plugins/intel_cpu/src/graph_optimizer.h b/src/plugins/intel_cpu/src/graph_optimizer.h
index fe4991deb86fe6..bb6494758f38be 100644
--- a/src/plugins/intel_cpu/src/graph_optimizer.h
+++ b/src/plugins/intel_cpu/src/graph_optimizer.h
@@ -27,6 +27,7 @@ class GraphOptimizer {
     void FuseMultiplyAndAdd(Graph &graph);
     void MergeConvertAndScaleShift(Graph& graph);
     void FuseFCAndConvertOnWeights(Graph& graph);
+    void FuseFCAndTransposeOnWeights(Graph& graph);
     void FuseFullyConnectedAndSimpleOperation(Graph &graph);
     void FuseMatMulAndSimpleOperation(Graph &graph);
     void FuseConvolutionAndSimpleOperationThroughMaxPool(Graph &graph);
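Note on the fusion above: FC weights get reordered into the executor's preferred layout anyway, so a constant 2D Transpose feeding the weights input can be folded into that reorder instead of being executed as a separate node. A minimal standalone sketch of the idea (plain C++, not plugin code): reading the source buffer with swapped strides while copying produces the transposed layout in the same single pass.

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int K = 2, N = 3;
    std::vector<float> w = {1, 2, 3,
                            4, 5, 6};      // row-major [K x N] source
    std::vector<float> wt(N * K);          // destination: [N x K]
    // single pass: the "reorder" (copy) and the transpose happen together
    for (int n = 0; n < N; ++n)
        for (int k = 0; k < K; ++k)
            wt[n * K + k] = w[k * N + n];
    for (int n = 0; n < N; ++n, std::printf("\n"))
        for (int k = 0; k < K; ++k)
            std::printf("%g ", wt[n * K + k]);  // prints: 1 4 / 2 5 / 3 6
}
```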
diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h
index 4cfe9c7d708660..e2ceba0d0eadd3 100644
--- a/src/plugins/intel_cpu/src/node.h
+++ b/src/plugins/intel_cpu/src/node.h
@@ -694,6 +694,14 @@ class Node {
     std::shared_ptr<IShapeInfer> shapeInference;
 
+    // we cannot rely on the per-NUMA weightCache for caching weights because:
+    // 1. it may not exist (in a single-stream configuration)
+    // 2. it only holds weak references; the life-cycle of a cached item
+    //    is still controlled by strong references outside of the cache.
+    // privateWeightCache holds strong references to constant weight
+    // copies of the same content with different layouts.
+    std::unordered_map<std::string, MemoryPtr> privateWeightCache;
+
 private:
     std::vector<EdgeWeakPtr> parentEdges;
     std::vector<EdgeWeakPtr> childEdges;
@@ -723,13 +731,6 @@ class Node {
     ConstantType checkConstant(LOOK look, std::vector<NodePtr>& checkNodes);
     // Hold output scales
     std::vector<float> DQScales;
-    // we cannot rely on the per-NUMA weightCache for caching weights because:
-    // 1. it may not exist (in a single-stream configuration)
-    // 2. it only holds weak references; the life-cycle of a cached item
-    //    is still controlled by strong references outside of the cache.
-    // privateWeightCache holds strong references to constant weight
-    // copies of the same content with different layouts.
-    std::unordered_map<std::string, MemoryPtr> privateWeightCache;
 
     CPU_DEBUG_CAP_ENABLE(friend class Verbose);
 };
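The relocated comment is the key design note here: the shared weight cache only stores weak references, so it cannot by itself keep a reordered weight copy alive, and the member moves from the private to the protected section so derived nodes (FullyConnected below) can use it directly. A small self-contained sketch of that life-cycle difference, with made-up names and an int payload standing in for weight memory:

```cpp
#include <cassert>
#include <memory>
#include <string>
#include <unordered_map>

int main() {
    // A weak_ptr cache (like the shared weightCache) cannot keep entries alive:
    std::unordered_map<std::string, std::weak_ptr<int>> sharedCache;
    {
        auto weights = std::make_shared<int>(42);
        sharedCache["fc1_weights"] = weights;
    }                                       // last strong reference dies here
    assert(sharedCache["fc1_weights"].expired());

    // Holding a strong reference alongside keeps the cached copy valid:
    std::unordered_map<std::string, std::shared_ptr<int>> privateCache;
    privateCache["fc1_weights"] = std::make_shared<int>(42);
    assert(privateCache["fc1_weights"] != nullptr);
}
```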
"F" : "T", N, K, ldb, weightPtr, prepackedDst); return _ptr; }; @@ -513,7 +515,11 @@ void FullyConnected::prepareParams() { } if (!prevExecPtr || !execPtr->getWeightDesc()->isCompatible(*(prevExecPtr->getWeightDesc()))) { - primArgs[DNNL_ARG_WEIGHTS] = prepareWeightMemory(execPtr->getWeightDesc())->getPrimitive(); + if (transposeWeights) { + primArgs[DNNL_ARG_WEIGHTS] = prepareTransposedWeightMemory(execPtr->getWeightDesc())->getPrimitive(); + } else { + primArgs[DNNL_ARG_WEIGHTS] = prepareWeightMemory(execPtr->getWeightDesc())->getPrimitive(); + } } // changed shapes may also cause the kernel type changed selected_pd->setImplementationType(execPtr->getImplementationType()); @@ -1106,6 +1112,50 @@ void FullyConnected::fuseDecompressionConstant(const NodePtr& constData, std::ve Precision::FP32, elementsCount); } + +MemoryPtr FullyConnected::prepareTransposedWeightMemory(DnnlMemoryDescPtr weightDesc) { + if (!getParentEdgeAt(1)->getParent()->isConstant()) + IE_THROW() << "Weight input is not const for node " << getName() << "."; + auto edgeMem = getParentEdgeAt(1)->getMemoryPtr(); + if (!edgeMem) + IE_THROW() << "Cannot get const weights edgeMem for node " << getName() << "."; + + auto constDnnlMemOutDesc = edgeMem->getDescWithType(); + auto weightSrcDesc = constDnnlMemOutDesc->getDnnlDesc(); + weightSrcDesc = {weightSrcDesc.get_dims(), weightSrcDesc.get_data_type(), memory::format_tag::ba}; + weightSrcDesc = weightSrcDesc.reshape(weightDesc->getDnnlDesc().get_dims()); + + auto create = [&] () { + auto newSrcDesc = DnnlExtensionUtils::makeDescriptor(weightSrcDesc); + + Memory srcMemory{ getEngine(), newSrcDesc, edgeMem->getData() }; + MemoryPtr _ptr = std::make_shared(getEngine(), weightDesc); + node::Reorder::reorderData(srcMemory, *_ptr, context->getParamsCache()); + + return _ptr; + }; + + MemoryPtr ptr; + const auto& format = weightDesc->serializeFormat(); + auto itr = privateWeightCache.find(format); + if (privateWeightCache.end() != itr) { + ptr = itr->second; + } else { + auto weightCache = context->getWeightsCache(); + if (weightCache != nullptr) { + const std::string string_hash = getName() + "_" + format + + "_" + std::to_string(edgeMem->getSize()) + + "_" + std::to_string(reinterpret_cast(edgeMem->getData())); + + ptr = *weightCache->findOrCreate(string_hash, create); + } else { + ptr = create(); + } + privateWeightCache[format] = ptr; + } + + return ptr; +} } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h index 2f9a11e5b57005..70ace5e4308bdb 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h @@ -56,6 +56,9 @@ class FullyConnected : public Node { void prepareParams() override; void executeDynamicImpl(dnnl::stream strm) override; bool canBeExecutedInInt8() const override; + void setTransposeWeights(bool transpose) { + transposeWeights = transpose; + } void fuseDecompressionMultiply(const NodePtr& constData); const std::vector& getDecompressionMultiply() const { return decompressionMultiply; } @@ -117,6 +120,11 @@ class FullyConnected : public Node { std::vector decompressionSubtract; std::vector decompressionMultiply; + + // FC with transposed weights + bool transposeWeights = false; + // this method is using to prepare transposed memory + MemoryPtr prepareTransposedWeightMemory(DnnlMemoryDescPtr weightDesc); }; } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/transpose.cpp 
diff --git a/src/plugins/intel_cpu/src/nodes/transpose.cpp b/src/plugins/intel_cpu/src/nodes/transpose.cpp
index 5bc502410163dc..835115ff337044 100644
--- a/src/plugins/intel_cpu/src/nodes/transpose.cpp
+++ b/src/plugins/intel_cpu/src/nodes/transpose.cpp
@@ -140,7 +140,7 @@ void Transpose::initSupportedPrimitiveDescriptors() {
         config.inConfs[INPUT_ORDER_IDX].constant(isInputOrderConst);
         config.inConfs[INPUT_ORDER_IDX].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(
             Precision::I32, getInputShapeAtPort(INPUT_ORDER_IDX)));
-        config.outConfs[0].inPlace(-1);
+        config.outConfs[0].inPlace(fakeTranspose ? 0 : -1);
         config.outConfs[0].constant(false);
         transpose_context = std::make_shared<ExecutorContext>(context, getImplPriority());
@@ -287,6 +287,8 @@ void Transpose::createPrimitive() {
 }
 
 void Transpose::execute(dnnl::stream strm) {
+    if (fakeTranspose)
+        return;
     if (prim) {
         prim.execute(strm, primArgs);
     } else if (execPtr) {
diff --git a/src/plugins/intel_cpu/src/nodes/transpose.h b/src/plugins/intel_cpu/src/nodes/transpose.h
index 5fb7e9f76570bf..a09c929d58c7c9 100644
--- a/src/plugins/intel_cpu/src/nodes/transpose.h
+++ b/src/plugins/intel_cpu/src/nodes/transpose.h
@@ -38,6 +38,10 @@ class Transpose : public Node {
     bool needPrepareParams() const override;
     void prepareParams() override;
 
+    void setFakeTranspose(bool fake) {
+        fakeTranspose = fake;
+    }
+
 protected:
     void executeDynamicImpl(dnnl::stream strm) override;
     std::shared_ptr<ExecutorContext> transpose_context;
@@ -56,6 +60,7 @@ class Transpose : public Node {
     static constexpr size_t INPUT_ORDER_IDX = 1lu;
 
     bool performAsReorder = false;
+    bool fakeTranspose = false;
 };
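After these two changes, a Transpose that was fused into the downstream FC becomes a pure pass-through: its output is declared in-place with input 0 and `execute()` returns immediately, so consumers read the original weight buffer while the actual transposition happens during FC weight reordering. A toy illustration of that contract (simplified, hypothetical type, not plugin code):

```cpp
#include <cassert>
#include <vector>

// Toy stand-in for an in-place, no-op node: output memory is the input memory.
struct PassThroughNode {
    const std::vector<float>* input = nullptr;
    const std::vector<float>* output() const { return input; }  // like inPlace(0)
    void execute() { /* no-op, like Transpose::execute with fakeTranspose */ }
};

int main() {
    std::vector<float> weights = {1, 2, 3, 4};
    PassThroughNode node;
    node.input = &weights;
    node.execute();
    assert(node.output() == &weights);  // same buffer, zero copies
}
```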
diff --git a/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/matmul_decompress_convert.cpp b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/matmul_decompress_convert.cpp
index 81a7def84adaa6..cf6a97e3f7fb71 100644
--- a/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/matmul_decompress_convert.cpp
+++ b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/matmul_decompress_convert.cpp
@@ -14,6 +14,7 @@ using namespace ov::test;
 
 namespace SubgraphTestsDefinitions {
 
+// The test is extended to cover u8 weights decompression and fusing of a constant Transpose on the weights into FC.
 /* This test checks that the ConvertMatMulToFC transformation should work and the MatMul node is converted to the FC node.
  * The Convert node should be removed on the CPU plugin side.
@@ -49,9 +50,11 @@ namespace SubgraphTestsDefinitions {
 */
 
 using MatMulDecompressConvertParams = std::tuple<
-    std::vector<InputShape>,            // input shapes
-    std::pair<bool, bool>,              // transposeA, transposeB
-    std::map<std::string, std::string>  // additional config
+    std::vector<InputShape>,             // input shapes
+    std::pair<bool, bool>,               // transposeA, transposeB
+    ElementType,                         // weights precision
+    std::map<std::string, std::string>,  // additional config
+    CPUSpecificParams
 >;
 
 class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDecompressConvertParams>,
@@ -60,9 +63,11 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDecompressConvertParams>,
     static std::string getTestCaseName(testing::TestParamInfo<MatMulDecompressConvertParams> obj) {
         std::vector<InputShape> inputShapes;
         std::pair<bool, bool> transpose;
+        ElementType weiElemType;
         std::map<std::string, std::string> additionalConfig;
+        CPUSpecificParams cpuParams;
 
-        std::tie(inputShapes, transpose, additionalConfig) = obj.param;
+        std::tie(inputShapes, transpose, weiElemType, additionalConfig, cpuParams) = obj.param;
 
         std::ostringstream result;
         for (const auto& shape : inputShapes) {
@@ -82,12 +87,16 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDecompressConvertParams>,
         result << ")";
+        result << "_weiElemType=" << weiElemType;
+
+        result << CPUTestsBase::getTestCaseName(cpuParams);
 
         return result.str();
     }
 
 protected:
-    void CheckConstFP16() const {
+    void CheckFCWeightsPrecision() const {
         auto getExecValue = [](const ov::Node::RTMap& rtInfo, const std::string &paramName) -> std::string {
             auto it = rtInfo.find(paramName);
             IE_ASSERT(rtInfo.end() != it);
@@ -110,8 +119,8 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDecompressConvertParams>,
         for (const auto &fcNode : execFunction->get_ops()) {
             if (getExecValue(fcNode->get_rt_info(), ExecGraphInfoSerialization::LAYER_TYPE) == "FullyConnected") {
                 const auto &constNode = fcNode->get_input_node_shared_ptr(1);
-                ASSERT_EQ(getExecValue(constNode->get_rt_info(), ExecGraphInfoSerialization::LAYER_TYPE), "Const");
-                ASSERT_EQ(getExecValue(constNode->get_rt_info(), ExecGraphInfoSerialization::OUTPUT_PRECISIONS), "FP16");
+                element::Type expectedType(getExecValue(constNode->get_rt_info(), ExecGraphInfoSerialization::OUTPUT_PRECISIONS));
+                ASSERT_EQ(expectedType, weiConstElemType);
             }
         }
     }
@@ -122,14 +131,19 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDecompressConvertParams>,
     void SetUp() override {
         std::vector<InputShape> inputShapes;
         std::pair<bool, bool> transpose;
        std::map<std::string, std::string> additionalConfig;
+        CPUSpecificParams cpuParams;
 
-        std::tie(inputShapes, transpose, additionalConfig) = this->GetParam();
+        std::tie(inputShapes, transpose, weiConstElemType, additionalConfig, cpuParams) = this->GetParam();
+        std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
 
         init_input_shapes(inputShapes);
 
         bool transpA = transpose.first;
         bool transpB = transpose.second;
 
+        if (transpA) transposesCount++;
+        if (!transpB) transposesCount++;
+
         if (transpA) {
             transposeShape(inputDynamicShapes[0]);
             for (auto& shapes : targetStaticShapes) {
@@ -148,32 +162,47 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDecompressConvertParams>,
         ElementType netType = ElementType::f32;
+        ElementType convertOutType = ElementType::f32;
         if (additionalConfig[PluginConfigParams::KEY_ENFORCE_BF16] == PluginConfigParams::YES) {
-            inType = outType = netType = ElementType::bf16;
+            convertOutType = inType = outType = netType = ElementType::bf16;
+            weiConstElemType = (weiConstElemType != ElementType::f32) ? weiConstElemType : netType;
         } else {
             inType = outType = netType;
         }
 
         std::string cpuNodeType = "FullyConnected";
+        selectedType = makeSelectedTypeStr(selectedType, outType);
 
         auto paramOuts = helpers::convert2OutputVector(helpers::castOps2Nodes<opset1::Parameter>(params));
-
-        auto matrixB = ngraph::builder::makeConstant(element::f16, inShapeB.get_shape(), {}, true);
-        auto convert = std::make_shared<opset1::Convert>(matrixB, inType);
-        mark_as_decompression(convert);
-        auto matMul = builder::makeMatMul(paramOuts[0], convert, transpA, transpB);
+        std::shared_ptr<Node> inputB = builder::makeConstant(weiConstElemType, inShapeB.get_shape(), {}, true);
+        if (weiConstElemType == ElementType::f16 || weiConstElemType == ElementType::u8) {
+            inputB = std::make_shared<opset1::Convert>(inputB, convertOutType);
+            mark_as_decompression(inputB);
+        }
+        auto matMul = builder::makeMatMul(paramOuts[0], inputB, transpA, transpB);
 
         function = CPUTestsBase::makeNgraphFunction(netType, params, matMul, cpuNodeType);
     }
+
+    void CheckExecutionGraph() {
+        CheckPluginRelatedResults(compiledModel, "FullyConnected");
+        CheckNumberOfNodesWithType(compiledModel, "FullyConnected", 1);
+        CheckNumberOfNodesWithType(compiledModel, "Transpose", transposesCount);
+        CheckNumberOfNodesWithType(compiledModel, "Convert", 0);
+        CheckNumberOfNodesWithType(compiledModel, "Reorder", 0);
+        CheckFCWeightsPrecision();
+    }
+
+    size_t transposesCount = 0;
+    ElementType weiConstElemType = ElementType::f32;
 };
 
 TEST_P(MatMulDecompressConvertTest, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED();
     run();
-    CheckNumberOfNodesWithType(compiledModel, "FullyConnected", 1);
-    CheckNumberOfNodesWithType(compiledModel, "Convert", 0);
-    CheckNumberOfNodesWithType(compiledModel, "Reorder", 0);
-    CheckConstFP16();
+    CheckExecutionGraph();
 }
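A note on `transposesCount` in `SetUp()` above: it is the number of Transpose nodes expected to remain in the execution graph. `transpose_a` requires a real Transpose on the activations, while `transpose_b == false` means ConvertMatMulToFC must insert a Transpose on the weights, which this patch turns into a no-op node that is nevertheless still counted. Restated as a tiny helper (illustrative only, not part of the test):

```cpp
#include <cstddef>

// Expected number of Transpose nodes left in the execution graph,
// mirroring the counting in SetUp().
std::size_t expectedTransposes(bool transpA, bool transpB) {
    return (transpA ? 1 : 0) + (!transpB ? 1 : 0);
}
```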
 
 namespace {
 
@@ -185,32 +214,148 @@ const std::vector<std::pair<bool, bool>> transposeParams = {
     {true, true},
 };
 
+const std::vector<std::vector<InputShape>> inputShapes2D = {
+    static_shapes_to_test_representation({{2, 3}, {3, 4}}),
+    {
+        {{-1, -1}, {{2, 3}, {5, 3}}},
+        {{3, 4}, {{3, 4}, {3, 4}}}
+    },
+};
+
+const std::vector<std::vector<InputShape>> inputShapes3D = {
+    static_shapes_to_test_representation({{2, 2, 3}, {3, 4}}),
+    static_shapes_to_test_representation({{2, 3}, {1, 3, 4}}),
+    static_shapes_to_test_representation({{1, 2, 3}, {1, 3, 4}}),
+    {
+        {{-1, -1, -1}, {{2, 2, 3}, {3, 5, 3}}},
+        {{3, 4}, {{3, 4}, {3, 4}}}
+    },
+    {
+        {{-1, -1}, {{2, 3}, {5, 3}}},
+        {{1, 3, 4}, {{1, 3, 4}, {1, 3, 4}}}
+    },
+    {
+        {{-1, -1, -1}, {{1, 2, 3}, {1, 5, 3}}},
+        {{1, 3, 4}, {{1, 3, 4}, {1, 3, 4}}}
+    },
+};
+
+
 std::vector<std::map<std::string, std::string>> filterAdditionalConfig() {
     std::vector<std::map<std::string, std::string>> additionalConfig;
+#ifndef OV_CPU_WITH_MLAS
     additionalConfig.push_back(std::map<std::string, std::string>{/* empty config */});
+#endif
+    return additionalConfig;
+}
+
+std::vector<std::map<std::string, std::string>> filterAdditionalConfig_BF16() {
+    std::vector<std::map<std::string, std::string>> additionalConfig;
     if (with_cpu_x86_avx512_core()) {
         additionalConfig.push_back({{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::YES}});
     }
+    return additionalConfig;
+}
 
+std::vector<std::map<std::string, std::string>> filterAdditionalConfig_MLAS() {
+    std::vector<std::map<std::string, std::string>> additionalConfig;
+    additionalConfig.push_back(std::map<std::string, std::string>{/* empty config */});
     return additionalConfig;
 }
 
+std::vector<CPUSpecificParams> filterSpecificParams() {
+    std::vector<CPUSpecificParams> specificParams;
+    if (with_cpu_x86_avx512_core()) {
+        specificParams.push_back(CPUSpecificParams{{}, {}, {"brgemm_avx512"}, "brgemm_avx512"});
+    } else if (with_cpu_x86_avx2()) {
+        specificParams.push_back(CPUSpecificParams{{}, {}, {"brgemm_avx2"}, "brgemm_avx2"});
+    }
+    return specificParams;
+}
+
+
+std::vector<CPUSpecificParams> filterSpecificParams_BF16() {
+    std::vector<CPUSpecificParams> specificParams;
+    specificParams.push_back(CPUSpecificParams{{}, {}, {"jit_gemm"}, "jit_gemm"});
+    return specificParams;
+}
+
+
+std::vector<CPUSpecificParams> filterSpecificParams_MLAS() {
+    std::vector<CPUSpecificParams> specificParams;
+    specificParams.push_back(CPUSpecificParams{{}, {}, {"gemm_mlas"}, "gemm_mlas"});
+    return specificParams;
+}
+
+
+#ifdef OV_CPU_WITH_MLAS
+const auto testParams2D_MLAS_smoke = ::testing::Combine(
+    ::testing::ValuesIn(inputShapes2D),
+    ::testing::ValuesIn(transposeParams),
+    ::testing::Values(ElementType::f32),
+    ::testing::ValuesIn(filterAdditionalConfig_MLAS()),
+    ::testing::ValuesIn(filterSpecificParams_MLAS()));
+
+INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_MLAS, MatMulDecompressConvertTest, testParams2D_MLAS_smoke,
+                         MatMulDecompressConvertTest::getTestCaseName);
+#endif
+
+
 const auto testParams2D_smoke = ::testing::Combine(
-    ::testing::Values(static_shapes_to_test_representation({{2, 3}, {3, 4}})),
-    ::testing::ValuesIn(transposeParams),
-    ::testing::ValuesIn(filterAdditionalConfig()));
+    ::testing::ValuesIn(inputShapes2D),
+    ::testing::ValuesIn(transposeParams),
+    ::testing::Values(ElementType::f32, ElementType::f16, ElementType::u8),
+    ::testing::ValuesIn(filterAdditionalConfig()),
+    ::testing::ValuesIn(filterSpecificParams()));
 
 INSTANTIATE_TEST_SUITE_P(smoke_FC_2D, MatMulDecompressConvertTest, testParams2D_smoke,
-    MatMulDecompressConvertTest::getTestCaseName);
+                         MatMulDecompressConvertTest::getTestCaseName);
+
+
+const auto testParams2D_BF16_smoke = ::testing::Combine(
+    ::testing::ValuesIn(inputShapes2D),
+    ::testing::ValuesIn(transposeParams),
+    ::testing::Values(ElementType::f32, ElementType::f16, ElementType::u8),
+    ::testing::ValuesIn(filterAdditionalConfig_BF16()),
+    ::testing::ValuesIn(filterSpecificParams_BF16()));
+
+INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_BF16, MatMulDecompressConvertTest, testParams2D_BF16_smoke,
+                         MatMulDecompressConvertTest::getTestCaseName);
+
+
+#ifdef OV_CPU_WITH_MLAS
+const auto testParams3D_MLAS_smoke = ::testing::Combine(
+    ::testing::ValuesIn(inputShapes3D),
+    ::testing::ValuesIn(transposeParams),
+    ::testing::Values(ElementType::f32),
+    ::testing::ValuesIn(filterAdditionalConfig_MLAS()),
+    ::testing::ValuesIn(filterSpecificParams_MLAS()));
+
+INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_MLAS, MatMulDecompressConvertTest, testParams3D_MLAS_smoke,
+                         MatMulDecompressConvertTest::getTestCaseName);
+#endif
+
 
 const auto testParams3D_smoke = ::testing::Combine(
-    ::testing::Values(static_shapes_to_test_representation({{1, 2, 3}, {3, 4}}),
-                      static_shapes_to_test_representation({{2, 3}, {1, 3, 4}})),
-    ::testing::ValuesIn(transposeParams),
-    ::testing::ValuesIn(filterAdditionalConfig()));
+    ::testing::ValuesIn(inputShapes3D),
+    ::testing::ValuesIn(transposeParams),
+    ::testing::Values(ElementType::f32, ElementType::f16, ElementType::u8),
+    ::testing::ValuesIn(filterAdditionalConfig()),
+    ::testing::ValuesIn(filterSpecificParams()));
 
 INSTANTIATE_TEST_SUITE_P(smoke_FC_3D, MatMulDecompressConvertTest, testParams3D_smoke,
-    MatMulDecompressConvertTest::getTestCaseName);
+                         MatMulDecompressConvertTest::getTestCaseName);
+
+
+const auto testParams3D_BF16_smoke = ::testing::Combine(
+    ::testing::ValuesIn(inputShapes3D),
+    ::testing::ValuesIn(transposeParams),
+    ::testing::Values(ElementType::f32, ElementType::f16, ElementType::u8),
+    ::testing::ValuesIn(filterAdditionalConfig_BF16()),
+    ::testing::ValuesIn(filterSpecificParams_BF16()));
+
+INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_BF16, MatMulDecompressConvertTest, testParams3D_BF16_smoke,
+                         MatMulDecompressConvertTest::getTestCaseName);
 
 } // namespace