[CPU] FullyConnected: fixed primitive caching for sparse decompression case

[CPU][oneDNN] sparsity: some fixes and removed unused code

[CPU][TESTS] FullyConnected: sparsity weights decompression tests

[CPU] FullyConnected: removed min sparse rate = 0.5 limitation

[CPU] fixed property CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE
antonvor committed Jan 3, 2023
1 parent af3c789 commit 92ed322
Showing 6 changed files with 351 additions and 12 deletions.
@@ -47,7 +47,7 @@ namespace intel_cpu {
  */
 static constexpr Property<bool> denormals_optimization{"CPU_DENORMALS_OPTIMIZATION"};
 
-static constexpr Property<float> sparse_weights_decompression_rate{"SPARSE_WEIGHTS_DECOMPRESSION_RATE"};
+static constexpr Property<float> sparse_weights_decompression_rate{"CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE"};
 
 }   // namespace intel_cpu
 }   // namespace ov
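
For reference, a minimal usage sketch (not part of this commit) of how the renamed property can be passed when compiling a model on CPU; the model path is a placeholder and the exact header layout may differ between OpenVINO releases:

// Hedged sketch: asks the CPU plugin to use sparse weights decompression for
// FullyConnected layers whose weights are at least 80% zeros.
// "model.xml" is a hypothetical path.
#include <openvino/runtime/core.hpp>
#include <openvino/runtime/intel_cpu/properties.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");
    auto compiled = core.compile_model(model, "CPU",
        ov::intel_cpu::sparse_weights_decompression_rate(0.8f));
    return 0;
}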
10 changes: 2 additions & 8 deletions src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -591,7 +591,7 @@ void FullyConnected::createDescriptorInternal(const dnnl::memory::desc &inputDesc
     dnnl::memory::desc wgh_candidate;
     if (useSparseWeights) {
         wgh_candidate = { DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(WEIGHTS_ID).getStaticDims()),
-                          wdt, memory::desc::packed(nnzCount) };
+                          wdt, memory::desc::packed() };
     } else {
         wgh_candidate = { DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(WEIGHTS_ID).getStaticDims()),
                           wdt, dnnl::memory::format_tag::any };
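
A plausible reading of the caching fix in this hunk: embedding the per-tensor non-zero count in the weights descriptor made descriptors, and therefore primitive-cache keys, differ for equally shaped weights with different sparsity. A hedged fragment illustrating the distinction, assuming the oneDNN fork bundled with OpenVINO (experimental sparse support) and reusing the dims/wdt names from the code above:

// Hedged fragment, not standalone: packed() no longer encodes the tensor's
// non-zero count, so equally shaped weight tensors map to the same descriptor
// and hence hit the same primitive-cache entry.
dnnl::memory::desc sparse_md{wgh_dims, wdt, dnnl::memory::desc::packed()};
// Dense fallback: let oneDNN choose the layout.
dnnl::memory::desc dense_md{wgh_dims, wdt, dnnl::memory::format_tag::any};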
@@ -930,18 +930,12 @@ bool FullyConnected::useSparseWeightsDecompression() {
             zerosCounts++;
         }
     }
-    nnzCount = elementsCount - zerosCounts;
 
     DEBUG_LOG(getName(), ", weightsData.size() = ", elementsCount, ", zerosCounts = ",
-              zerosCounts, ", nnzCount = ", nnzCount);
+              zerosCounts, ", nnzCount = ", elementsCount - zerosCounts);
 
     weiSparseRate = static_cast<float>(zerosCounts) / static_cast<float>(elementsCount);
 
-    // [av] WA: there is no point in using sparse decompression when the sparse rate is low
-    // todo: add heuristic
-    if (minSparseRate < 0.5)
-        minSparseRate = 0.5;
-
     DEBUG_LOG(getName(), " | sparse rate = ", weiSparseRate * 100, "%, min sparse rate = ",
               minSparseRate * 100, "%, use sparse weights = ", weiSparseRate >= minSparseRate);
 
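With the 0.5 floor removed above, the decision reduces to comparing the measured zero rate against the user-configured threshold. A minimal self-contained sketch of that remaining logic (the function name and int8 weight type are assumptions for illustration):

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical standalone version of the check above: count zeros, compute
// the sparse rate, and honor whatever threshold the user configured.
bool shouldUseSparseWeights(const std::vector<int8_t>& weights, float minSparseRate) {
    std::size_t zerosCounts = 0;
    for (int8_t w : weights) {
        if (w == 0)
            zerosCounts++;
    }
    const float weiSparseRate =
        static_cast<float>(zerosCounts) / static_cast<float>(weights.size());
    return weiSparseRate >= minSparseRate;
}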
2 changes: 0 additions & 2 deletions src/plugins/intel_cpu/src/nodes/fullyconnected.h
@@ -42,7 +42,6 @@ class FullyConnected : public Node {
 
     void initSupportedPrimitiveDescriptors() override;
     void initOptimalPrimitiveDescriptor() override;
-    // void createPrimitive() override;
     std::shared_ptr<MemoryDesc> getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
     std::shared_ptr<MemoryDesc> getDstMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
 
@@ -112,7 +111,6 @@
 
     // sparse weights
     bool useSparseWeights = false;
-    int nnzCount = -1;
     float minSparseRate = 1.f;
     float weiSparseRate = 0.f;
     bool useSparseWeightsDecompression();
@@ -0,0 +1,343 @@
// Copyright (C) 2022-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "shared_test_classes/single_layer/mat_mul.hpp"
#include "shared_test_classes/base/ov_subgraph.hpp"
#include "ie_precision.hpp"
#include "test_utils/fusing_test_utils.hpp"
#include "ngraph_functions/builders.hpp"
#include <string>
#include <ov_ops/type_relaxed.hpp>
#include "shared_test_classes/base/utils/generate_inputs.hpp"
#include "cpu/cpu_config.hpp"

using namespace ngraph;
using namespace InferenceEngine;
using namespace CPUTestUtils;
using namespace ov::test;

namespace CPULayerTestsDefinitions {

struct ShapeRelatedParams {
    std::vector<InputShape> inputShapes;
    std::pair<bool, bool> transpose;
};

typedef std::tuple<
        ShapeRelatedParams,
        ElementType,                        // Input precision
        ElementType,                        // Weights precision
        ElementType,                        // Output precision
        fusingSpecificParams,
        CPUSpecificParams,
        std::map<std::string, std::string>, // Additional config
        float                               // Weights sparse rate
> MatMulSparseParamSet;

class MatMulSparseCPUTest : public testing::WithParamInterface<MatMulSparseParamSet>,
                            virtual public SubgraphBaseTest, public CpuTestWithFusing {
public:
    static std::string getTestCaseName(const testing::TestParamInfo<MatMulSparseParamSet>& obj) {
        ShapeRelatedParams shapeRelatedParams;
        ElementType inType, weiType, outType;
        fusingSpecificParams fusingParams;
        CPUSpecificParams cpuParams;
        std::map<std::string, std::string> additionalConfig;
        float weiSparseRate;
        std::tie(shapeRelatedParams, inType, weiType, outType, fusingParams, cpuParams, additionalConfig,
                 weiSparseRate) = obj.param;

        std::ostringstream result;
        result << "IS=";
        for (const auto& shape : shapeRelatedParams.inputShapes) {
            result << CommonTestUtils::partialShape2str({shape.first}) << "_";
        }
        result << "TS=";
        for (const auto& shape : shapeRelatedParams.inputShapes) {
            result << "(";
            if (!shape.second.empty()) {
                auto itr = shape.second.begin();
                do {
                    result << CommonTestUtils::vec2str(*itr);
                } while (++itr != shape.second.end() && result << "_");
            }
            result << ")_";
        }
        result << "transpose_a=" << shapeRelatedParams.transpose.first << "_";
        result << "transpose_b=" << shapeRelatedParams.transpose.second << "_";
        result << "inType=" << inType << "_";
        result << "weiType=" << weiType << "_";
        result << "outType=" << outType << "_";
        result << CpuTestWithFusing::getTestCaseName(fusingParams);
        result << CPUTestsBase::getTestCaseName(cpuParams);

        if (!additionalConfig.empty()) {
            result << "_PluginConf";
            for (auto& item : additionalConfig) {
                result << "_" << item.first << "=" << item.second;
            }
        }
        result << "_weiSparseRate=" << weiSparseRate;

        return result.str();
    }

protected:
    std::string cpuNodeType;

    template<typename T>
    void transpose(T& shape) {
        IE_ASSERT(shape.size() > 1);
        std::swap(*(shape.end() - 1), *(shape.end() - 2));
    }

    std::vector<int8_t> inline generateSparseVector(size_t vec_len,
                                                    float sparseRate = 0.0f,
                                                    int8_t upTo = 10,
                                                    int8_t startFrom = 1,
                                                    int32_t seed = 1) {
        std::vector<int8_t> res(vec_len);
        std::mt19937 gen(seed);
        std::uniform_int_distribution<long> dist(static_cast<long>(startFrom), static_cast<long>(upTo));

        std::mt19937 gen_f(123);
        std::uniform_real_distribution<float> dist_f(0.f, 1.f);

        int countZero = 0;

        res[0] = startFrom;
        res[vec_len - 1] = upTo;
        for (size_t i = 1; i < vec_len - 1; i++) {
            if (dist_f(gen_f) > sparseRate) {
                res[i] = static_cast<int8_t>(dist(gen));
            } else {
                res[i] = 0;
                countZero++;
            }
        }

        std::cout << "Sparse rate = " << countZero * 100 / vec_len << "%" << std::endl;

        return res;
    }

    std::shared_ptr<Node> makeMatMulRelaxed(const Output<Node>& A,
                                            const ov::PartialShape& inShapeB,
                                            ElementType weiType,
                                            bool transpose_a,
                                            bool transpose_b,
                                            const std::vector<int8_t>& weiData) {
        using namespace ngraph;
        auto inputParamsFP32 = builder::makeDynamicParams(element::f32, {A.get_partial_shape()});
        auto matrixBFP32 = builder::makeDynamicInputLayer(element::f32, helpers::InputLayerType::CONSTANT, inShapeB);

        auto matMulRelaxed = std::make_shared<op::TypeRelaxed<opset3::MatMul>>(
            *as_type_ptr<opset3::MatMul>(builder::makeMatMul(inputParamsFP32[0], matrixBFP32, transpose_a, transpose_b)),
            element::f32);

        auto matrixB = ngraph::builder::makeConstant<int8_t>(weiType, inShapeB.get_shape(), weiData);

        auto matMul = matMulRelaxed->copy_with_new_inputs({A, matrixB});

        return matMul;
    }

    void SetUp() override {
        abs_threshold = 0.5f;
        using ngraph::pass::ConvertPrecision;

        ShapeRelatedParams shapeRelatedParams;
        ElementType inType, weiType, outType;
        fusingSpecificParams fusingParams;
        CPUSpecificParams cpuParams;
        std::map<std::string, std::string> additionalConfig;
        float weiSparseRate;

        std::tie(shapeRelatedParams, inType, weiType, outType, fusingParams, cpuParams, additionalConfig,
                 weiSparseRate) = this->GetParam();
        std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;

        configuration.insert(additionalConfig.begin(), additionalConfig.end());

        init_input_shapes(shapeRelatedParams.inputShapes);

        bool transpA = shapeRelatedParams.transpose.first;
        bool transpB = shapeRelatedParams.transpose.second;

        if (transpA) {
            transpose(inputDynamicShapes[0]);
            for (auto& shapes : targetStaticShapes) {
                transpose(shapes[0]);
            }
        }
        if (transpB) {
            transpose(inputDynamicShapes[1]);
            for (auto& shapes : targetStaticShapes) {
                transpose(shapes[1]);
            }
        }

        const auto& inShapeA = inputDynamicShapes[0];
        const auto& inShapeB = inputDynamicShapes[1];

        std::tie(postOpMgrPtr, fusedOps) = fusingParams;

        configuration.insert(additionalConfig.begin(), additionalConfig.end());

        cpuNodeType = "FullyConnected";
        selectedType = makeSelectedTypeStr(selectedType, element::i8);

        auto params = builder::makeDynamicParams(inType, {inShapeA});
        auto paramOuts = helpers::convert2OutputVector(helpers::castOps2Nodes<opset1::Parameter>(params));

        auto matrixB = builder::makeDynamicInputLayer(element::f32, helpers::InputLayerType::CONSTANT, inShapeB);

        auto weiData = generateSparseVector(ngraph::shape_size(inShapeB.get_shape()), weiSparseRate);
        auto matMul = makeMatMulRelaxed(paramOuts[0], inShapeB, weiType, transpA, transpB, weiData);

        function = makeNgraphFunction(element::f32, params, matMul, cpuNodeType);

        checkFusingPosition = false;

        targetDevice = CommonTestUtils::DEVICE_CPU;

        functionRefs = ov::clone_model(*function);
        ngraph::pass::ConvertPrecision<ngraph::element::Type_t::i8, ngraph::element::Type_t::f32>().run_on_function(functionRefs);
        ngraph::pass::ConvertPrecision<ngraph::element::Type_t::u8, ngraph::element::Type_t::f32>().run_on_function(functionRefs);
        functionRefs->validate_nodes_and_infer_types();
    }
};

TEST_P(MatMulSparseCPUTest, CompareWithRefs) {
    SKIP_IF_CURRENT_TEST_IS_DISABLED()

    run();
    CheckPluginRelatedResults(compiledModel, cpuNodeType);
}

namespace {

/* ============= Common params ============= */

std::vector<CPUSpecificParams> filterSpecificParams(bool sparseExpected) {
    std::vector<CPUSpecificParams> specificParams;
    if (with_cpu_x86_avx512_core_amx()) {
        if (sparseExpected) {
            specificParams.push_back(CPUSpecificParams{{}, {}, {"brgemm_avx512_amx"}, "brgemm_avx512_amx_sparse"});
        } else {
            specificParams.push_back(CPUSpecificParams{{}, {}, {"brgemm_avx512_amx"}, "brgemm_avx512_amx"});
        }
    }

    return specificParams;
}

/* ============= FullyConnected ============= */
namespace fullyConnected {

// cpu (sparse) configs
const std::map<std::string, std::string> emptyConfig = {};
const std::map<std::string, std::string> SparseRate50 = {{CPUConfigParams::KEY_CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE, "0.5"}};
const std::map<std::string, std::string> SparseRate80 = {{CPUConfigParams::KEY_CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE, "0.8"}};

const std::vector<ShapeRelatedParams> IS2D_sparse_smoke = {
    {static_shapes_to_test_representation({{64, 64}, {64, 64}}), {false, true}},
    {static_shapes_to_test_representation({{71, 64}, {64, 64}}), {false, true}},
    {static_shapes_to_test_representation({{3, 128}, {128, 64}}), {false, true}},
    {static_shapes_to_test_representation({{71, 64}, {64, 128}}), {false, true}},

    {
        {
            {{-1, -1}, {{20, 64}, {20, 64}}},
            {{64, 128}, {{64, 128}, {64, 128}}}
        },
        {false, true}
    },

    {
        {
            {{{0, 100}, {0, 64}}, {{20, 64}, {14, 64}, {20, 64}, {14, 64}}},
            {{64, 128}, {{64, 128}, {64, 128}, {64, 128}, {64, 128}}}
        },
        {false, true}
    },
};

const auto testParams2D_i8_smoke = ::testing::Combine(::testing::ValuesIn(IS2D_sparse_smoke),
                                                      ::testing::Values(ElementType::i8, ElementType::u8),
                                                      ::testing::Values(ElementType::i8),
                                                      ::testing::Values(ElementType::f32),
                                                      ::testing::Values(emptyFusingSpec),
                                                      ::testing::ValuesIn(filterSpecificParams(false)),
                                                      ::testing::Values(emptyConfig, SparseRate80),
                                                      ::testing::Values(0.7));

INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_I8, MatMulSparseCPUTest, testParams2D_i8_smoke,
                         MatMulSparseCPUTest::getTestCaseName);

const auto testParams2D_i8_sparse_smoke = ::testing::Combine(::testing::ValuesIn(IS2D_sparse_smoke),
                                                             ::testing::Values(ElementType::i8, ElementType::u8),
                                                             ::testing::Values(ElementType::i8),
                                                             ::testing::Values(ElementType::f32),
                                                             ::testing::Values(emptyFusingSpec),
                                                             ::testing::ValuesIn(filterSpecificParams(true)),
                                                             ::testing::Values(SparseRate50),
                                                             ::testing::Values(0.7));

INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_I8_sparse, MatMulSparseCPUTest, testParams2D_i8_sparse_smoke,
                         MatMulSparseCPUTest::getTestCaseName);

const std::vector<ShapeRelatedParams> IS3D_sparse_smoke = {
    {static_shapes_to_test_representation({{1, 64, 64}, {64, 64}}), {false, true}},
    {static_shapes_to_test_representation({{3, 71, 64}, {64, 64}}), {false, true}},
    {static_shapes_to_test_representation({{3, 5, 128}, {128, 64}}), {false, true}},
    {static_shapes_to_test_representation({{1, 71, 64}, {64, 128}}), {false, true}},

    {
        {
            {{-1, -1, 64}, {{1, 5, 64}, {1, 10, 64}, {1, 5, 64}, {1, 10, 64}}},
            {{64, 128}, {{64, 128}, {64, 128}}}
        },
        {false, true}
    },

    // todo: [av] investigate "Primitive descriptor was not found" error for this case
    // {
    //     {
    //         {{{0, 60}, {0, 60}, {0, 64}}}, {{1, 3, 64}, {1, 7, 64}}},
    //         {{64, 64}, {{64, 64}, {64, 64}}}
    //     },
    //     {false, true}
    // },
};

const auto testParams3D_i8_smoke = ::testing::Combine(::testing::ValuesIn(IS3D_sparse_smoke),
                                                      ::testing::Values(ElementType::i8, ElementType::u8),
                                                      ::testing::Values(ElementType::i8),
                                                      ::testing::Values(ElementType::f32),
                                                      ::testing::Values(emptyFusingSpec),
                                                      ::testing::ValuesIn(filterSpecificParams(false)),
                                                      ::testing::Values(emptyConfig, SparseRate80),
                                                      ::testing::Values(0.7));

INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_I8, MatMulSparseCPUTest, testParams3D_i8_smoke,
                         MatMulSparseCPUTest::getTestCaseName);

const auto testParams3D_i8_sparse_smoke = ::testing::Combine(::testing::ValuesIn(IS3D_sparse_smoke),
                                                             ::testing::Values(ElementType::i8, ElementType::u8),
                                                             ::testing::Values(ElementType::i8),
                                                             ::testing::Values(ElementType::f32),
                                                             ::testing::Values(emptyFusingSpec),
                                                             ::testing::ValuesIn(filterSpecificParams(true)),
                                                             ::testing::Values(SparseRate50),
                                                             ::testing::Values(0.7));

INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_I8_sparse, MatMulSparseCPUTest, testParams3D_i8_sparse_smoke,
                         MatMulSparseCPUTest::getTestCaseName);

} // namespace fullyConnected

} // namespace

} // namespace CPULayerTestsDefinitions