From c2d4099d3be1787a8ed0e8e586f9620d811fe874 Mon Sep 17 00:00:00 2001
From: Edward Shogulin
Date: Wed, 26 Jun 2024 01:21:15 +0100
Subject: [PATCH] [CPU] [ARM] FullyConnected: int8 support

---
 .../src/nodes/executors/acl/acl_executor.cpp  | 27 ++++++++++--
 .../src/nodes/executors/acl/acl_executor.hpp  |  6 ++-
 .../executors/acl/acl_fullyconnected.cpp      |  2 +-
 .../src/nodes/executors/acl/acl_utils.hpp     | 41 +++++++++++++++++--
 .../fullyconnected_implementations.cpp        |  1 +
 .../fully_connected_transformation.cpp        |  3 +-
 .../fully_connected_transformation.hpp        |  3 +-
 .../fully_connected_transformation.cpp        | 20 ++++++---
 .../include/ov_lpt_models/mat_mul.hpp         |  3 +-
 .../ov_helpers/ov_lpt_models/src/mat_mul.cpp  | 13 ++++--
 10 files changed, 97 insertions(+), 22 deletions(-)

diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_executor.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_executor.cpp
index 00dee4bda245c3..2316e57a1d8406 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_executor.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_executor.cpp
@@ -3,15 +3,17 @@
 //
 
 #include "acl_executor.hpp"
-#include "acl_utils.hpp"
 #include "nodes/executors/memory_arguments.hpp"
 #include "utils/debug_capabilities.h"
 
 namespace ov {
 namespace intel_cpu {
 
-ACLMemoryInfo ACLCommonExecutor::initTensorInfo(const MemoryPtr& memoryPtr, ACLTensorAttrs attrs) {
-    auto acl_tensor_type = precisionToAclDataType(memoryPtr->getPrecision());
+ACLMemoryInfo ACLCommonExecutor::initTensorInfo(
+    const MemoryPtr& memoryPtr,
+    const ACLTensorAttrs attrs,
+    const QuantizedDataType quantized) {
+    auto acl_tensor_type = precisionToAclDataType(memoryPtr->getPrecision(), quantized);
     auto acl_tensor_layout = getAclDataLayoutByMemoryDesc(memoryPtr->getDescPtr());
 
     ACLMemoryInfo aclMemoryInfo = nullptr;
@@ -40,8 +42,25 @@ ACLMemory ACLCommonExecutor::initTensor(const ACLMemoryInfo& aclMemoryInfo) {
 
 bool ACLCommonExecutor::update(const MemoryArgs &memory) {
     for (auto& cpu_mem_ptr : memory) {
+        auto aclPrecision = precisionToAclDataType(cpu_mem_ptr.second->getPrecision());
+        QuantizedDataType quantized;
+        switch (aclPrecision) {
+            case arm_compute::DataType::S8: {
+                quantized = QuantizedDataType::QASYMM;
+                break;
+            }
+            case arm_compute::DataType::U8: {
+                quantized = QuantizedDataType::QSYMM;
+                break;
+            }
+            default: {
+                quantized = QuantizedDataType::NONE;
+                break;
+            }
+        }
+
         // Initialize arm_compute::TensorInfo object
-        auto aclTensorInfo = initTensorInfo(cpu_mem_ptr.second, aclTensorAttrs);
+        auto aclTensorInfo = initTensorInfo(cpu_mem_ptr.second, aclTensorAttrs, quantized);
         // Initialize arm_compute::Tensor object
         aclMemoryMap[cpu_mem_ptr.first] = initTensor(aclTensorInfo);
     }
diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_executor.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_executor.hpp
index 1b608a8d06115e..aa024007f2f1ee 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_executor.hpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_executor.hpp
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include "acl_utils.hpp"
 #include "cpu_memory.h"
 #include "nodes/executors/executor.hpp"
 #include "arm_compute/runtime/NEON/NEFunctions.h"
@@ -42,7 +43,10 @@ class ACLCommonExecutor : public Executor {
 private:
     ACLMemoryMap aclMemoryMap;
     ACLFunction iFunction = nullptr;
-    static ACLMemoryInfo initTensorInfo(const MemoryPtr& memoryPtr, ACLTensorAttrs attrs);
+    static ACLMemoryInfo initTensorInfo(
+        const MemoryPtr& memoryPtr,
+        const ACLTensorAttrs attrs,
+        const QuantizedDataType quantized);
     static ACLMemory initTensor(const ACLMemoryInfo& aclMemoryInfo);
 };
 
diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp
index eb628cdeb19c96..b7e3a8453681be 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp
@@ -35,7 +35,7 @@ ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, const
 }
 
 bool ACLFullyConnectedExecutor::supports(const FCConfig &config) {
-    VERIFY(one_of(srcType(config), ov::element::f16, ov::element::f32), UNSUPPORTED_SRC_PRECISIONS);
+    VERIFY(one_of(srcType(config), ov::element::f16, ov::element::f32, ov::element::i8), UNSUPPORTED_SRC_PRECISIONS);
     VERIFY(postOpsNumbers(config) < 2, UNSUPPORTED_NUMBER_OF_POSTOPS);
     VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK);
     VERIFY(one_of(weiRank(config), 2U, 3U), UNSUPPORTED_SRC_RANK);
diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp
index de9eed5a96bcb5..499974af8e111a 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp
@@ -96,15 +96,50 @@ inline int axisCast(const std::size_t axis, const std::size_t shapeSize, ACLAxis
     }
 }
 
+enum class QuantizedDataType {
+    NONE,   // not quantized
+    QSYMM,  // quantized, symmetric
+    QASYMM  // quantized, asymmetric
+};
+
 /**
 * @brief Return ComputeLibrary DataType that corresponds to the given precision
 * @param precision precision to be converted
 * @return ComputeLibrary DataType or UNKNOWN if precision is not mapped to DataType
 */
-inline arm_compute::DataType precisionToAclDataType(ov::element::Type precision) {
+inline arm_compute::DataType precisionToAclDataType(
+    const ov::element::Type& precision,
+    const QuantizedDataType quantized = QuantizedDataType::NONE) {
     switch (precision) {
-        case ov::element::i8: return arm_compute::DataType::S8;
-        case ov::element::u8: return arm_compute::DataType::U8;
+        case ov::element::i8: {
+            switch (quantized) {
+                case QuantizedDataType::QASYMM: {
+                    return arm_compute::DataType::QASYMM8_SIGNED;
+                }
+                case QuantizedDataType::NONE: {
+                    return arm_compute::DataType::S8;
+                }
+                default: {
+                    return arm_compute::DataType::UNKNOWN;
+                }
+            }
+        }
+        case ov::element::u8: {
+            switch (quantized) {
+                case QuantizedDataType::QSYMM: {
+                    return arm_compute::DataType::QSYMM8;
+                }
+                case QuantizedDataType::QASYMM: {
+                    return arm_compute::DataType::QASYMM8;
+                }
+                case QuantizedDataType::NONE: {
+                    return arm_compute::DataType::U8;
+                }
+                default: {
+                    return arm_compute::DataType::UNKNOWN;
+                }
+            }
+        }
         case ov::element::i16: return arm_compute::DataType::S16;
         case ov::element::u16: return arm_compute::DataType::U16;
         case ov::element::i32: return arm_compute::DataType::S32;
diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp
index 36b653baf803f2..18dadaed4ca3e2 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp
@@ -78,6 +78,7 @@ static const TypeMapping dnnlFCTypeMapping {
 static const TypeMapping aclFCTypeMapping {
     // {src, wei, bia, dst}                  pt
     {{_f32 | _f16, _any, _any, _any},        pt(bypass(), use<0>(), use<0>(), use<0>())},
+    {{_i8, _i8, _any, _any},                 pt(just(), just(), bypass(), just())},
     {{_any, _any, _any, _any},               pt(just(), just(), just(), just())}
 };
 
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp
index 0368215a5cf5a4..abf7bb7319d31c 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp
@@ -44,6 +44,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_LPT, FullyConnectedTransformation,
         ::testing::ValuesIn(netPrecisions),
         ::testing::ValuesIn(shapes),
         ::testing::Values(ov::test::utils::DEVICE_CPU),
-        ::testing::ValuesIn(trasformationParamValues)),
+        ::testing::ValuesIn(trasformationParamValues),
+        ::testing::ValuesIn({ov::element::i8, ov::element::u8})),
     FullyConnectedTransformation::getTestCaseName);
 }  // namespace
diff --git a/src/tests/functional/plugin/shared/include/low_precision_transformations/fully_connected_transformation.hpp b/src/tests/functional/plugin/shared/include/low_precision_transformations/fully_connected_transformation.hpp
index 731ce44224e33b..8a8bd12a91c0ad 100644
--- a/src/tests/functional/plugin/shared/include/low_precision_transformations/fully_connected_transformation.hpp
+++ b/src/tests/functional/plugin/shared/include/low_precision_transformations/fully_connected_transformation.hpp
@@ -20,7 +20,8 @@ typedef std::tuple<
     ov::element::Type,
     MatMulShapes,
     std::string,
-    ov::pass::low_precision::LayerTransformation::Params> FullyConnectedTransformationParams;
+    ov::pass::low_precision::LayerTransformation::Params,
+    ov::element::Type> FullyConnectedTransformationParams;
 
 namespace LayerTestsDefinitions {
 
diff --git a/src/tests/functional/plugin/shared/src/low_precision_transformations/fully_connected_transformation.cpp b/src/tests/functional/plugin/shared/src/low_precision_transformations/fully_connected_transformation.cpp
index f72f6d90333613..37c469513f02e6 100644
--- a/src/tests/functional/plugin/shared/src/low_precision_transformations/fully_connected_transformation.cpp
+++ b/src/tests/functional/plugin/shared/src/low_precision_transformations/fully_connected_transformation.cpp
@@ -20,14 +20,16 @@ std::string FullyConnectedTransformation::getTestCaseName(const testing::TestPar
     MatMulShapes shapes;
     std::string targetDevice;
     ov::pass::low_precision::LayerTransformation::Params params;
-    std::tie(precision, shapes, targetDevice, params) = obj.param;
+    ov::element::Type weightsType;
+    std::tie(precision, shapes, targetDevice, params, weightsType) = obj.param;
 
     std::ostringstream result;
     result <<
-        get_test_case_name_by_params(precision, shapes.inputA, targetDevice, params) <<
-        shapes.inputB << "_" <<
+        get_test_case_name_by_params(precision, shapes.inputA, targetDevice, params) <<
+        shapes.inputB << "_" <<
         shapes.transposeA << "_" <<
-        shapes.transposeB;
+        shapes.transposeB << "_" <<
+        weightsType;
 
     return result.str();
 }
@@ -36,7 +38,8 @@ void FullyConnectedTransformation::SetUp() {
     ov::element::Type precision;
     MatMulShapes shapes;
    ov::pass::low_precision::LayerTransformation::Params params;
-    std::tie(precision, shapes, targetDevice, params) = this->GetParam();
+    ov::element::Type weightsType;
+    std::tie(precision, shapes, targetDevice, params, weightsType) = this->GetParam();
 
     init_input_shapes({ shapes.inputA, shapes.inputB });
 
@@ -45,12 +48,17 @@ void FullyConnectedTransformation::SetUp() {
         shapes.inputA,
         shapes.inputB,
        shapes.transposeA,
-        shapes.transposeB);
+        shapes.transposeB,
+        weightsType == ov::element::i8);
 }
 
 TEST_P(FullyConnectedTransformation, CompareWithRefImpl) {
     SKIP_IF_CURRENT_TEST_IS_DISABLED();
     run();
+
+    const auto actualPrecision = get_runtime_precision_by_type("FullyConnected");
+    const auto weightsType = std::get<4>(GetParam());
+    EXPECT_EQ(actualPrecision, weightsType.to_string());
 };
 
 }  // namespace LayerTestsDefinitions
diff --git a/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp b/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp
index 787e1f6ebe8bd4..b36e5178b88857 100644
--- a/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp
+++ b/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp
@@ -27,7 +27,8 @@ class MatMulFunction {
         const ov::PartialShape inputShape1,
         const ov::PartialShape inputShape2,
         const bool transpose1,
-        const bool transpose2);
+        const bool transpose2,
+        const bool signedOnWeights = false);
 
     static std::shared_ptr getOriginal(
         const ov::element::Type precision,
diff --git a/src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp b/src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp
index 1b1351ef1b3399..f45d51fffcb930 100644
--- a/src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp
+++ b/src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp
@@ -54,12 +54,17 @@ std::shared_ptr MatMulFunction::getOriginal(
     const ov::PartialShape inputShape1,
     const ov::PartialShape inputShape2,
     const bool transpose1,
-    const bool transpose2) {
+    const bool transpose2,
+    const bool signedOnWeights) {
     const auto paramNode = std::make_shared(precision, inputShape1);
     const std::vector constShapes(inputShape1.rank().get_length(), 1ul);
-    const auto fakeQuantizeOnAcitvations = ov::test::utils::make_fake_quantize(
-        paramNode, precision, 256ul, constShapes,
-        { 0.f }, { 255.f / 4.f }, { 0.f }, { 255.f / 4.f });
+    const auto fakeQuantizeOnAcitvations = signedOnWeights ?
+        ov::test::utils::make_fake_quantize(
+            paramNode, precision, 256ul, constShapes,
+            { -128.f / 4.f }, { 127.f / 4.f }, { -128.f / 4.f }, { 127.f / 4.f }) :
+        ov::test::utils::make_fake_quantize(
+            paramNode, precision, 256ul, constShapes,
+            { 0.f }, { 255.f / 4.f }, { 0.f }, { 255.f / 4.f });
    fakeQuantizeOnAcitvations->set_friendly_name("fakeQuantizeOnAcitvations");
 
     auto weightsConst = std::make_shared(
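
Reviewer note (not part of the patch): the snippet below is a minimal standalone sanity check of the precision-to-ACL-type mapping that the new precisionToAclDataType overload in acl_utils.hpp introduces. It assumes the patched acl_utils.hpp and the Arm Compute Library headers are reachable on the include path (the plugin's src directory); the main() harness and the asserts are illustrative only and are not added by this patch.

#include <cassert>

#include "nodes/executors/acl/acl_utils.hpp"

int main() {
    using namespace ov::intel_cpu;
    using arm_compute::DataType;

    // i8: asymmetric quantization maps to QASYMM8_SIGNED, the non-quantized default stays S8,
    // and the symmetric case falls through to UNKNOWN.
    assert(precisionToAclDataType(ov::element::i8, QuantizedDataType::QASYMM) == DataType::QASYMM8_SIGNED);
    assert(precisionToAclDataType(ov::element::i8) == DataType::S8);
    assert(precisionToAclDataType(ov::element::i8, QuantizedDataType::QSYMM) == DataType::UNKNOWN);

    // u8: symmetric maps to QSYMM8, asymmetric to QASYMM8, non-quantized stays U8.
    assert(precisionToAclDataType(ov::element::u8, QuantizedDataType::QSYMM) == DataType::QSYMM8);
    assert(precisionToAclDataType(ov::element::u8, QuantizedDataType::QASYMM) == DataType::QASYMM8);
    assert(precisionToAclDataType(ov::element::u8) == DataType::U8);

    return 0;
}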