Skip to content

Commit

Permalink
[CPU] [ARM] FullyConnected: int8 support
Browse files Browse the repository at this point in the history
  • Loading branch information
eshoguli committed Jun 26, 2024
1 parent 3a13983 commit c2d4099
Show file tree
Hide file tree
Showing 10 changed files with 97 additions and 22 deletions.
27 changes: 23 additions & 4 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,17 @@
//

#include "acl_executor.hpp"
#include "acl_utils.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {

ACLMemoryInfo ACLCommonExecutor::initTensorInfo(const MemoryPtr& memoryPtr, ACLTensorAttrs attrs) {
auto acl_tensor_type = precisionToAclDataType(memoryPtr->getPrecision());
ACLMemoryInfo ACLCommonExecutor::initTensorInfo(
const MemoryPtr& memoryPtr,
const ACLTensorAttrs attrs,
const QuantizedDataType quantized) {
auto acl_tensor_type = precisionToAclDataType(memoryPtr->getPrecision(), quantized);
auto acl_tensor_layout = getAclDataLayoutByMemoryDesc(memoryPtr->getDescPtr());

ACLMemoryInfo aclMemoryInfo = nullptr;
Expand Down Expand Up @@ -40,8 +42,25 @@ ACLMemory ACLCommonExecutor::initTensor(const ACLMemoryInfo& aclMemoryInfo) {

bool ACLCommonExecutor::update(const MemoryArgs &memory) {
for (auto& cpu_mem_ptr : memory) {
auto aclPrecision = precisionToAclDataType(cpu_mem_ptr.second->getPrecision());
QuantizedDataType quantized;
switch (aclPrecision) {
case arm_compute::DataType::S8: {
quantized = QuantizedDataType::QASYMM;
break;
}
case arm_compute::DataType::U8: {
quantized = QuantizedDataType::QSYMM;
break;
}
default: {
quantized = QuantizedDataType::NONE;
break;
}
}

// Initialize arm_compute::TensorInfo object
auto aclTensorInfo = initTensorInfo(cpu_mem_ptr.second, aclTensorAttrs);
auto aclTensorInfo = initTensorInfo(cpu_mem_ptr.second, aclTensorAttrs, quantized);
// Initialize arm_compute::Tensor object
aclMemoryMap[cpu_mem_ptr.first] = initTensor(aclTensorInfo);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#pragma once

#include "acl_utils.hpp"
#include "cpu_memory.h"
#include "nodes/executors/executor.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"
Expand Down Expand Up @@ -42,7 +43,10 @@ class ACLCommonExecutor : public Executor {
private:
ACLMemoryMap aclMemoryMap;
ACLFunction iFunction = nullptr;
static ACLMemoryInfo initTensorInfo(const MemoryPtr& memoryPtr, ACLTensorAttrs attrs);
static ACLMemoryInfo initTensorInfo(
const MemoryPtr& memoryPtr,
const ACLTensorAttrs attrs,
const QuantizedDataType quantized);
static ACLMemory initTensor(const ACLMemoryInfo& aclMemoryInfo);
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, const
}

bool ACLFullyConnectedExecutor::supports(const FCConfig &config) {
VERIFY(one_of(srcType(config), ov::element::f16, ov::element::f32), UNSUPPORTED_SRC_PRECISIONS);
VERIFY(one_of(srcType(config), ov::element::f16, ov::element::f32, ov::element::i8), UNSUPPORTED_SRC_PRECISIONS);
VERIFY(postOpsNumbers(config) < 2, UNSUPPORTED_NUMBER_OF_POSTOPS);
VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK);
VERIFY(one_of(weiRank(config), 2U, 3U), UNSUPPORTED_SRC_RANK);
Expand Down
41 changes: 38 additions & 3 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,15 +96,50 @@ inline int axisCast(const std::size_t axis, const std::size_t shapeSize, ACLAxis
}
}

// Quantization selector used to refine an ov::element integer precision into a
// concrete ACL DataType in precisionToAclDataType(): with NONE the plain
// integer type is kept (i8 -> S8, u8 -> U8); with QASYMM an asymmetric
// quantized type is chosen (i8 -> QASYMM8_SIGNED, u8 -> QASYMM8); with QSYMM a
// symmetric quantized type is chosen (u8 -> QSYMM8). Combinations without a
// mapping (e.g. i8 + QSYMM) resolve to arm_compute::DataType::UNKNOWN.
enum class QuantizedDataType {
NONE, // not quantized: keep the plain integer DataType
QSYMM, // quantized, symmetric (no zero-point offset)
QASYMM // quantized, asymmetric (with zero-point offset)
};

/**
* @brief Return ComputeLibrary DataType that corresponds to the given precision
* @param precision precision to be converted
* @return ComputeLibrary DataType or UNKNOWN if precision is not mapped to DataType
*/
inline arm_compute::DataType precisionToAclDataType(ov::element::Type precision) {
inline arm_compute::DataType precisionToAclDataType(
const ov::element::Type& precision,
const QuantizedDataType quantized = QuantizedDataType::NONE) {
switch (precision) {
case ov::element::i8: return arm_compute::DataType::S8;
case ov::element::u8: return arm_compute::DataType::U8;
case ov::element::i8: {
switch (quantized) {
case QuantizedDataType::QASYMM: {
return arm_compute::DataType::QASYMM8_SIGNED;
}
case QuantizedDataType::NONE: {
return arm_compute::DataType::S8;
}
default: {
return arm_compute::DataType::UNKNOWN;
}
}
}
case ov::element::u8: {
switch (quantized) {
case QuantizedDataType::QSYMM: {
return arm_compute::DataType::QSYMM8;
}
case QuantizedDataType::QASYMM: {
return arm_compute::DataType::QASYMM8;
}
case QuantizedDataType::NONE: {
return arm_compute::DataType::U8;
}
default: {
return arm_compute::DataType::UNKNOWN;
}
}
}
case ov::element::i16: return arm_compute::DataType::S16;
case ov::element::u16: return arm_compute::DataType::U16;
case ov::element::i32: return arm_compute::DataType::S32;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ static const TypeMapping dnnlFCTypeMapping {
static const TypeMapping aclFCTypeMapping {
// {src, wei, bia, dst} pt<src, wei, bias, dst>
{{_f32 | _f16, _any, _any, _any}, pt(bypass(), use<0>(), use<0>(), use<0>())},
{{_i8, _i8, _any, _any}, pt(just<i8>(), just<i8>(), bypass(), just<i32>())},
{{_any, _any, _any, _any}, pt(just<f32>(), just<f32>(), just<f32>(), just<f32>())}
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_LPT, FullyConnectedTransformation,
::testing::ValuesIn(netPrecisions),
::testing::ValuesIn(shapes),
::testing::Values(ov::test::utils::DEVICE_CPU),
::testing::ValuesIn(trasformationParamValues)),
::testing::ValuesIn(trasformationParamValues),
::testing::ValuesIn({ov::element::i8, ov::element::u8})),
FullyConnectedTransformation::getTestCaseName);
} // namespace
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ typedef std::tuple<
ov::element::Type,
MatMulShapes,
std::string,
ov::pass::low_precision::LayerTransformation::Params> FullyConnectedTransformationParams;
ov::pass::low_precision::LayerTransformation::Params,
ov::element::Type> FullyConnectedTransformationParams;

namespace LayerTestsDefinitions {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,16 @@ std::string FullyConnectedTransformation::getTestCaseName(const testing::TestPar
MatMulShapes shapes;
std::string targetDevice;
ov::pass::low_precision::LayerTransformation::Params params;
std::tie(precision, shapes, targetDevice, params) = obj.param;
ov::element::Type weightsType;
std::tie(precision, shapes, targetDevice, params, weightsType) = obj.param;

std::ostringstream result;
result <<
get_test_case_name_by_params(precision, shapes.inputA, targetDevice, params) <<
shapes.inputB << "_" <<
get_test_case_name_by_params(precision, shapes.inputA, targetDevice, params) <<
shapes.inputB << "_" <<
shapes.transposeA << "_" <<
shapes.transposeB;
shapes.transposeB << "_" <<
weightsType;

return result.str();
}
Expand All @@ -36,7 +38,8 @@ void FullyConnectedTransformation::SetUp() {
ov::element::Type precision;
MatMulShapes shapes;
ov::pass::low_precision::LayerTransformation::Params params;
std::tie(precision, shapes, targetDevice, params) = this->GetParam();
ov::element::Type weightsType;
std::tie(precision, shapes, targetDevice, params, weightsType) = this->GetParam();

init_input_shapes({ shapes.inputA, shapes.inputB });

Expand All @@ -45,12 +48,17 @@ void FullyConnectedTransformation::SetUp() {
shapes.inputA,
shapes.inputB,
shapes.transposeA,
shapes.transposeB);
shapes.transposeB,
weightsType == ov::element::i8);
}

TEST_P(FullyConnectedTransformation, CompareWithRefImpl) {
SKIP_IF_CURRENT_TEST_IS_DISABLED();
run();

const auto actualPrecision = get_runtime_precision_by_type("FullyConnected");
const auto weightsType = std::get<4>(GetParam());
EXPECT_EQ(actualPrecision, weightsType.to_string());
};

} // namespace LayerTestsDefinitions
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ class MatMulFunction {
const ov::PartialShape inputShape1,
const ov::PartialShape inputShape2,
const bool transpose1,
const bool transpose2);
const bool transpose2,
const bool signedOnWeights = false);

static std::shared_ptr<ov::Model> getOriginal(
const ov::element::Type precision,
Expand Down
13 changes: 9 additions & 4 deletions src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,17 @@ std::shared_ptr<ov::Model> MatMulFunction::getOriginal(
const ov::PartialShape inputShape1,
const ov::PartialShape inputShape2,
const bool transpose1,
const bool transpose2) {
const bool transpose2,
const bool signedOnWeights) {
const auto paramNode = std::make_shared<ov::opset1::Parameter>(precision, inputShape1);
const std::vector<size_t> constShapes(inputShape1.rank().get_length(), 1ul);
const auto fakeQuantizeOnAcitvations = ov::test::utils::make_fake_quantize(
paramNode, precision, 256ul, constShapes,
{ 0.f }, { 255.f / 4.f }, { 0.f }, { 255.f / 4.f });
const auto fakeQuantizeOnAcitvations = signedOnWeights ?
ov::test::utils::make_fake_quantize(
paramNode, precision, 256ul, constShapes,
{ -128.f / 4.f }, { 127.f / 4.f }, { -128.f / 4.f }, { 127.f / 4.f }) :
ov::test::utils::make_fake_quantize(
paramNode, precision, 256ul, constShapes,
{ 0.f }, { 255.f / 4.f }, { 0.f }, { 255.f / 4.f });
fakeQuantizeOnAcitvations->set_friendly_name("fakeQuantizeOnAcitvations");

auto weightsConst = std::make_shared<ov::op::v0::Constant>(
Expand Down

0 comments on commit c2d4099

Please sign in to comment.