From d089473926faddbc8ce1c9fe3296a656a0582f33 Mon Sep 17 00:00:00 2001 From: Edward Shogulin Date: Wed, 31 Jul 2024 01:21:21 +0100 Subject: [PATCH] [CPU] [ARM] INT8 FullyConnected --- .../src/nodes/executors/acl/acl_gemm.cpp | 1 + .../src/nodes/executors/acl/acl_gemm.hpp | 2 +- .../executors/acl/acl_lowp_fullyconnected.cpp | 133 ++++++++++++++++++ .../executors/acl/acl_lowp_fullyconnected.hpp | 49 +++++++ .../fullyconnected_implementations.cpp | 36 +++++ .../pass/snippets_mark_skipped_base.cpp | 1 - .../fully_connected_transformation.cpp | 52 +++++++ 7 files changed, 272 insertions(+), 2 deletions(-) create mode 100644 src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/fully_connected_transformation.cpp diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.cpp index 003310b652c35f..020be8b106cf81 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.cpp @@ -75,6 +75,7 @@ ACLFunction ACLGEMMExecutor::configureFunction(const ACLMemoryTensors & aclMemor return gemm; } +// TODO: move to ACLLowpExecutor ACLInfo ACLGEMMExecutor::initTensorInfo(const arm_compute::TensorShape& tensorShape, const arm_compute::DataType& dataType, const arm_compute::DataLayout& dataLayout) { diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.hpp index a1a537da7b6a1d..ea3fe5715c697e 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2024 Intel Corporation +// Copyright (C) 2024 Intel Corporation // 
SPDX-License-Identifier: Apache-2.0 // diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp new file mode 100644 index 00000000000000..5ca6edde088130 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp @@ -0,0 +1,133 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "acl_lowp_fullyconnected.hpp" + +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" + +#include "nodes/executors/executor.hpp" +#include "nodes/executors/memory_arguments.hpp" +#include "utils/debug_capabilities.h" +#include "nodes/executors/debug_messages.hpp" +#include "nodes/executors/implementation_utils.hpp" +#include "acl_weights.hpp" +#include "acl_utils.hpp" + +namespace ov { +namespace intel_cpu { + +static void initFCAttrs(const FCAttrs &attrs, + ACLTensorAttrs& aclTensorAttrs, + ACLFCAttrs& aclfcAttrs, + const MemoryArgs &memory, + arm_compute::GEMMInfo& gemmInfo, + const PostOps &postOps) { + aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc); + //fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr()); + aclfcAttrs.inputPrecision = memory.at(ARG_SRC)->getDescPtr()->getPrecision(); + //fullyConnectedLayerInfo.transpose_weights = false; + // ?? 
+    gemmInfo.set_pretranspose_B(false);
+    aclfcAttrs.weightsNonTransposed = attrs.weightsNonTransposed;
+
+    // Add postops
+    if (!postOps.empty() && postOps.size() == 1) {
+        if (const auto activation = std::dynamic_pointer_cast<ActivationPostOp>(postOps[0])) {
+            gemmInfo.set_activation_info(getActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()),
+                                                                activation->alpha(),
+                                                                activation->beta(),
+                                                                activation->gamma()));
+        }
+    }
+
+    if (memory.at(ARG_SRC)->getPrecision() != memory.at(ARG_WEI)->getPrecision()) {
+        aclfcAttrs.isConvertedWeights = true;
+    }
+}
+
+ACLLowpFullyConnectedExecutor::ACLLowpFullyConnectedExecutor(const FCAttrs &attrs,
+                                                             const PostOps &postOps,
+                                                             const MemoryArgs &memory,
+                                                             const ExecutorContext::CPtr context) {
+    initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, gemmInfo, postOps);
+    packedWeights = acl_fc_executor::prepareWeightMemory(memory, context, attrs, aclTensorAttrs, aclfcAttrs, postOps);
+}
+
+bool ACLLowpFullyConnectedExecutor::supports(const FCConfig &config) {
+    // TODO: check weights layout
+    const auto attrs = static_cast<FCAttrs>(config.attrs);
+    if (std::any_of(
+            attrs.dequantizationScales.begin(),
+            attrs.dequantizationScales.end(),
+            [](float value) { return value != 1.f;})) {
+        return false;
+    }
+
+    const auto src1_dims = std::dynamic_pointer_cast<BlockedMemoryDesc>(config.descs.at(ARG_SRC))->getBlockDims();
+    const auto src2_dims = std::dynamic_pointer_cast<BlockedMemoryDesc>(config.descs.at(ARG_WEI))->getBlockDims();
+
+    const auto precision = srcType(config);
+    VERIFY(one_of(precision, ov::element::i8, ov::element::u8), UNSUPPORTED_SRC_PRECISIONS);
+    VERIFY(postOpsNumbers(config) == 0, UNSUPPORTED_NUMBER_OF_POSTOPS);
+    VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK);
+    VERIFY(one_of(weiRank(config), 2U, 3U, 4U), UNSUPPORTED_WEI_RANK);
+    VERIFY(static_cast<FCAttrs>(config.attrs).dequantizationScales.size() <= 1, UNSUPPORTED_PER_CHANNEL_QUANTIZATION);
+    return true;
+}
+
+void ACLLowpFullyConnectedExecutor::updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) {
+    
acl_fc_executor::updateFCTensorsShapes(aclMemoryShapes);
+}
+
+arm_compute::Status ACLLowpFullyConnectedExecutor::validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) {
+    const auto matMulValid = arm_compute::NEGEMMLowpMatrixMultiplyCore::validate(
+        aclMemoryInfos[ACLArgs::ACL_SRC_0].get(),
+        aclMemoryInfos[ACLArgs::ACL_WEI].get(),
+        nullptr, //aclMemoryInfos[ACLArgs::ACL_BIAS].get(),
+        aclMemoryInfos[ACLArgs::ACL_DST].get(),
+        gemmInfo);
+    return matMulValid;
+}
+
+ACLFunction ACLLowpFullyConnectedExecutor::configureFunction(const ACLMemoryTensors & aclMemoryTensors) {
+    auto gemm = std::make_unique<arm_compute::NEGEMMLowpMatrixMultiplyCore>();
+    gemm->configure(
+        aclMemoryTensors[ACLArgs::ACL_SRC_0].get(),
+        aclMemoryTensors[ACLArgs::ACL_WEI].get(),
+        nullptr, //aclMemoryTensors[ACLArgs::ACL_BIAS].get(),
+        aclMemoryTensors.at(ACLArgs::ACL_DST).get(),
+        gemmInfo);
+
+    if (aclfcAttrs.isConvertedWeights || !aclfcAttrs.weightsNonTransposed) {
+        aclTensorAttrs.memoryUsageIndicator[ACLArgs::ACL_WEI] = false;
+        aclMemoryTensors[ACLArgs::ACL_WEI]->allocator()->import_memory(packedWeights->getData());
+    }
+    return gemm;
+}
+
+// TODO: move to ACLLowpExecutor
+ACLInfo ACLLowpFullyConnectedExecutor::initTensorInfo(const arm_compute::TensorShape& tensorShape,
+                                                      const arm_compute::DataType& dataType,
+                                                      const arm_compute::DataLayout& dataLayout) {
+    arm_compute::DataType result;
+    switch (dataType) {
+        case arm_compute::DataType::S8: {
+            result = arm_compute::DataType::QASYMM8_SIGNED;
+            break;
+        }
+        case arm_compute::DataType::U8: {
+            result = arm_compute::DataType::QASYMM8;
+            break;
+        }
+        default: {
+            result = dataType;
+            break;
+        }
+    }
+
+    return ACLCommonExecutor::initTensorInfo(tensorShape, result, dataLayout);
+}
+
+} // namespace intel_cpu
+} // namespace ov
diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp
new file mode 100644
index 00000000000000..2e246a65ceb78b
--- /dev/null
+++ 
b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp
@@ -0,0 +1,49 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "acl_common_executor.hpp"
+#include "nodes/executors/fullyconnected_config.hpp"
+#include "acl_weights.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+class ACLLowpFullyConnectedExecutor : public ACLCommonExecutor {
+public:
+    ACLLowpFullyConnectedExecutor(const FCAttrs& attrs,
+                                  const PostOps& postOps,
+                                  const MemoryArgs& memory,
+                                  const ExecutorContext::CPtr context);
+
+    static bool supports(const FCConfig& config);
+
+    void updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) override;
+
+    arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override;
+
+    ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override;
+
+    impl_desc_type implType() const override {
+        return impl_desc_type::gemm_acl;
+    }
+
+protected:
+    ACLInfo initTensorInfo(const arm_compute::TensorShape& tensorShape,
+                           const arm_compute::DataType& dataType,
+                           const arm_compute::DataLayout& dataLayout) override;
+
+private:
+    arm_compute::GEMMInfo gemmInfo;
+    arm_compute::WeightsInfo weightsInfo;
+
+    MemoryCPtr packedWeights;
+    ACLFCAttrs aclfcAttrs;
+};
+
+using ACLLowpFullyConnectedExecutorPtr = std::shared_ptr<ACLLowpFullyConnectedExecutor>;
+
+} // namespace intel_cpu
+} // namespace ov
diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp
index 0be6dd9be5f568..6e445a2c6020cc 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp
@@ -28,6 +28,7 @@
 
 #if defined(OV_CPU_WITH_ACL)
 #include "nodes/executors/acl/acl_fullyconnected.hpp"
+#include "nodes/executors/acl/acl_lowp_fullyconnected.hpp"
 #endif
 
 #if defined(OV_CPU_WITH_SHL)
@@ -82,6 +83,11 @@ 
static const TypeMapping dnnlFCTypeMapping { static const TypeMapping aclFCTypeMapping { // {src, wei, bia, dst} pt {{_f32 | _f16, _f32 | _f16, _any, _any}, pt(bypass(), bypass(), use<0>(), use<0>())}, + {{_any, _any, _any, _any}, pt(just(), just(), just(), just())} +}; + +static const TypeMapping aclLowpFCTypeMapping { + // {src, wei, bia, dst} pt {{_i8, _i8, _any, _any}, pt(just(), just(), just(), just())}, {{_any, _any, _any, _any}, pt(just(), just(), just(), just())} }; @@ -352,6 +358,36 @@ const std::vector>& getImplementations() { const ExecutorContext::CPtr context) { return std::make_shared(attrs, postOps, memory, context); }) + OV_CPU_INSTANCE_ACL( + "fullyconnected_acl_lowp", + ExecutorType::Acl, + OperationType::FullyConnected, + ShapeTolerance::Agnostic, + // supports + [](const FCConfig& config) -> bool { + VERIFY(noSparseDecompression(config), UNSUPPORTED_SPARSE_WEIGHTS); + VERIFY(noWeightsDecompression(config), UNSUPPORTED_WEIGHTS_DECOMPRESSION); + return ACLLowpFullyConnectedExecutor::supports(config); + }, + // requiresFallback + [](const FCConfig& config) -> ov::optional> { + return requiresFallbackCommon(config, + aclLowpFCTypeMapping, + aclFCLayoutConfig, + aclFullyConnectedMappingNotation); + }, + // acceptsShapes + [](const MemoryArgs& memory) -> bool { + // @todo create syntactic sugar (functor) for shape agnostic lambda + return true; + }, + // create + [](const FCAttrs& attrs, + const PostOps& postOps, + const MemoryArgs& memory, + const ExecutorContext::CPtr context) { + return std::make_shared(attrs, postOps, memory, context); + }) OV_CPU_INSTANCE_SHL( "fullyconnected_shl", ExecutorType::Shl, diff --git a/src/plugins/intel_cpu/src/transformations/snippets/common/pass/snippets_mark_skipped_base.cpp b/src/plugins/intel_cpu/src/transformations/snippets/common/pass/snippets_mark_skipped_base.cpp index e02f48d5a94909..4c892b00a50cfd 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/common/pass/snippets_mark_skipped_base.cpp +++ 
b/src/plugins/intel_cpu/src/transformations/snippets/common/pass/snippets_mark_skipped_base.cpp
@@ -5,7 +5,6 @@
 
 #include "snippets/pass/tokenization.hpp"
 #include "snippets/op/subgraph.hpp"
-#include "snippets/utils.hpp"
 
 #include "transformations/utils/utils.hpp"
 #include "transformations/utils.hpp"
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/fully_connected_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/fully_connected_transformation.cpp
new file mode 100644
index 00000000000000..2cbda1aa2582d7
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/fully_connected_transformation.cpp
@@ -0,0 +1,52 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+
+#include "low_precision_transformations/fully_connected_transformation.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+const std::vector<ov::element::Type> netPrecisions = {
+    ov::element::f32
+};
+
+const std::vector<MatMulShapes> shapes = {
+    {
+        ov::PartialShape{ 1, 16 },
+        ov::PartialShape{ 16, 8 },
+        false,
+        false
+    },
+//    {
+//        ov::PartialShape{ 1, 16 },
+//        ov::PartialShape{ 8, 16 },
+//        false,
+//        true
+//    },
+//    {
+//        ov::PartialShape{ 16, 1 },
+//        ov::PartialShape{ 16, 8 },
+//        true,
+//        false
+//    },
+};
+
+const std::vector<ov::pass::low_precision::LayerTransformation::Params> transformationParamValues = {
+    LayerTestsUtils::LayerTransformationParamsNGraphFactory::createParams()
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_LPT, FullyConnectedTransformation,
+    ::testing::Combine(
+        ::testing::ValuesIn(netPrecisions),
+        ::testing::ValuesIn(shapes),
+        ::testing::Values(ov::test::utils::DEVICE_CPU),
+        ::testing::ValuesIn(transformationParamValues),
+        ::testing::ValuesIn({ov::element::i8 /*, ov::element::u8*/}),
+        ::testing::ValuesIn({/*true,*/ false}),
+        
::testing::Values("gemm_acl_i8")), + FullyConnectedTransformation::getTestCaseName); +} // namespace