diff --git a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp
index 1cef0551d1eb08..1ca992bfc0f0cd 100644
--- a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp
+++ b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp
@@ -36,6 +36,9 @@ uint8_t DnnlExtensionUtils::sizeOfDataType(dnnl::memory::data_type dataType) {
         return 2;
     case dnnl::memory::data_type::undef:
         return 0;
+    // todo:
+    case dnnl::memory::data_type::f64:
+        return 8;
     default:
         IE_THROW() << "Unsupported data type.";
     }
@@ -60,6 +63,8 @@ memory::data_type DnnlExtensionUtils::IEPrecisionToDataType(const InferenceEngin
         return memory::data_type::f16;
     case InferenceEngine::Precision::UNSPECIFIED:
         return memory::data_type::undef;
+    case InferenceEngine::Precision::FP64:
+        return memory::data_type::f64;
     default: {
         IE_THROW() << "The plugin does not support " << prec.name();
     }
@@ -84,6 +89,9 @@ InferenceEngine::Precision DnnlExtensionUtils::DataTypeToIEPrecision(memory::dat
         return InferenceEngine::Precision::FP16;
     case memory::data_type::undef:
         return InferenceEngine::Precision::UNSPECIFIED;
+    // todo: needed for graph dumping, remove later
+    case memory::data_type::f64:
+        return InferenceEngine::Precision::FP64;
     default: {
         IE_THROW() << "Unsupported data type.";
     }
diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp
index ed18c01c848af1..c356150694caab 100644
--- a/src/plugins/intel_cpu/src/graph.cpp
+++ b/src/plugins/intel_cpu/src/graph.cpp
@@ -415,9 +415,9 @@ void Graph::InitDescriptors() {
         DEBUG_LOG("Init supported primitive descriptors for node: ", node->getName());
         node->initSupportedPrimitiveDescriptors();
 
-        OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, node->profiling.filterSupportedPrimitiveDescriptors);
-        DEBUG_LOG("Filter supported primitive descriptors for node: ", node->getName());
-        node->filterSupportedPrimitiveDescriptors();
+        // OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, node->profiling.filterSupportedPrimitiveDescriptors);
+        // DEBUG_LOG("Filter supported primitive descriptors for node: ", node->getName());
+        // node->filterSupportedPrimitiveDescriptors();
 
 #ifdef CPU_DEBUG_CAPS
         const auto& SPDs = node->getSupportedPrimitiveDescriptors();
@@ -599,6 +599,11 @@ void Graph::InitEdges() {
         auto reorderStatus = graphEdges[i]->needReorder();
         DEBUG_LOG(graphEdges[i]->name(), " reorderStatus = ", reorderStatus);
         if (reorderStatus == Edge::ReorderStatus::Regular) {
+            if (edge->getParent()->getName() == "TRANSPOSE_1") {
+                insertReorder(edge, false);
+                updateEdge(i);
+                continue;
+            }
             Edge::ReorderStatus reorderStatusInternal = Edge::ReorderStatus::Regular;
             // Check if there is a reorder that needs the precision conversion
             if (edge->getInputDesc().getPrecision() != edge->getOutputDesc().getPrecision() &&
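Note: the graph.cpp change above forces a single plain Reorder right after the node named TRANSPOSE_1, instead of letting the Regular-status path split the edge into a reorder plus a precision-converting reorder. That single reorder therefore has to carry the f32 -> f64 conversion itself; throughout the changes below this is delegated to the plugin's elementwise cpu_convert helper (declared in nodes/common/cpu_convert.h). A minimal sketch of the call shape, assuming that declaration; the wrapper function here is illustrative only, not plugin code:

    #include <vector>
    #include "nodes/common/cpu_convert.h"  // cpu_convert(src, dst, srcPrc, dstPrc, count)

    // Convert an f32 buffer to f64 the same way the new executors do it,
    // except over the whole buffer at once rather than one element per call.
    static void convertF32ToF64(const std::vector<float>& src, std::vector<double>& dst) {
        dst.resize(src.size());
        ov::intel_cpu::cpu_convert(src.data(), dst.data(),
                                   InferenceEngine::Precision::FP32,
                                   InferenceEngine::Precision::FP64,
                                   src.size());
    }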
diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp
index 791ff04021737c..5d5ef16385f269 100644
--- a/src/plugins/intel_cpu/src/graph_optimizer.cpp
+++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -99,6 +99,10 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph &graph) {
     FuseFCAndTransposeOnWeights(graph);
     graph.RemoveDroppedNodes();
 
+    OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "RemoveConvert");
+    RemoveConvert(graph);
+    graph.RemoveDroppedNodes();
+
     OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseDeconvolutionAndSimpleOperation");
     FuseDeconvolutionAndSimpleOperation(graph);
     graph.RemoveDroppedNodes();
@@ -193,8 +197,8 @@ void GraphOptimizer::ApplyImplSpecificGraphOptimizations(Graph &graph) {
     DropDoubleReorders(graph);
     graph.RemoveDroppedNodes();
 
-    MergeTransposeAndReorder(graph);
-    graph.RemoveDroppedNodes();
+    // MergeTransposeAndReorder(graph);
+    // graph.RemoveDroppedNodes();
 
     graph.RemoveDroppedEdges();
 }
@@ -848,6 +852,23 @@ void GraphOptimizer::FuseFCAndConvertOnWeights(Graph& graph) {
     }
 }
 
+void GraphOptimizer::RemoveConvert(Graph& graph) {
+    auto& graphNodes = graph.GetNodes();
+
+    auto isSuitablePattern = [](NodePtr parent) {
+        bool res = true && parent->getType() == Type::Convert
+                        && parent->getParentEdgeAt(0)->getParent()->getName() == "TRANSPOSE_1";
+        return res;
+    };
+
+    for (auto parent : graphNodes) {
+        if (isSuitablePattern(parent)) {
+            CPU_GRAPH_OPTIMIZER_SCOPE(RemoveConvert);
+            graph.DropNode(parent);
+        }
+    }
+}
+
 void GraphOptimizer::FuseFCAndTransposeOnWeights(Graph& graph) {
     // This optimization allows us to avoid transposing the weights in Transpose node and do it directly along with reordering in FC node
     auto& graphNodes = graph.GetNodes();
diff --git a/src/plugins/intel_cpu/src/graph_optimizer.h b/src/plugins/intel_cpu/src/graph_optimizer.h
index bb6494758f38be..f844679b1ca2b5 100644
--- a/src/plugins/intel_cpu/src/graph_optimizer.h
+++ b/src/plugins/intel_cpu/src/graph_optimizer.h
@@ -27,6 +27,7 @@ class GraphOptimizer {
     void FuseMultiplyAndAdd(Graph &graph);
     void MergeConvertAndScaleShift(Graph& graph);
    void FuseFCAndConvertOnWeights(Graph& graph);
+    void RemoveConvert(Graph &graph);
     void FuseFCAndTransposeOnWeights(Graph& graph);
     void FuseFullyConnectedAndSimpleOperation(Graph &graph);
     void FuseMatMulAndSimpleOperation(Graph &graph);
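Note: the reorder.cpp changes below read layouts through BlockedMemoryDesc. The convention assumed by the examples in these notes is that getOrder() names, for every position of getBlockDims(), the logical dimension that position belongs to, so a dimension split into an outer part plus an inner block appears in the order twice. That repetition is all the isBlocked() lambda in Reorder::execute() checks, though it hard-codes the channel dimension (index 1). A self-contained sketch:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // A dimension is blocked iff its index appears more than once in the order.
    static bool isDimBlocked(const std::vector<size_t>& order, size_t dim) {
        return std::count(order.begin(), order.end(), dim) > 1;
    }

    int main() {
        // nChw16c for {N=2, C=48, H=5, W=6}: channels split into 48/16 = 3 outer x 16 inner.
        const std::vector<size_t> order     = {0, 1, 2, 3, 1};
        const std::vector<size_t> blockDims = {2, 3, 5, 6, 16};
        return isDimBlocked(order, 1) ? 0 : 1;  // returns 0: channels are blocked
    }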
diff --git a/src/plugins/intel_cpu/src/nodes/reorder.cpp b/src/plugins/intel_cpu/src/nodes/reorder.cpp
index 85b58413273693..b30ee69d8ab1d4 100644
--- a/src/plugins/intel_cpu/src/nodes/reorder.cpp
+++ b/src/plugins/intel_cpu/src/nodes/reorder.cpp
@@ -43,6 +43,9 @@ void Reorder::getSupportedDescriptors() {
         IE_THROW() << "Incorrect number of input edges for layer " << getName();
     if (getChildEdges().empty())
         IE_THROW() << "Incorrect number of output edges for layer " << getName();
+    if (getParentEdgeAt(0)->getParent()->getName() == "TRANSPOSE_1") {
+        testReorder = true;
+    }
 }
 
 void Reorder::initSupportedPrimitiveDescriptors() {
@@ -252,6 +255,9 @@ void Reorder::createReorderPrimitive(const dnnl::memory::desc& srcDesc,
                                      void* srcPtr,
                                      const dnnl::memory::desc& dstDesc,
                                      void* dstPtr) {
+    if (testReorder) {
+        return;
+    }
     auto selectedPD = getSelectedPrimitiveDescriptor();
     if (!selectedPD)
         IE_THROW() << "Preferable primitive descriptor is not set.";
@@ -393,6 +399,175 @@ void Reorder::optimizedNspc2Ncsp() {
 }
 
 void Reorder::execute(dnnl::stream strm) {
+    if (false && testReorder) {
+        auto parentEdge = getParentEdgeAt(0);
+        auto childEdge = getChildEdgeAt(0);
+
+        // parent:
+        auto parentBlockedDesc = parentEdge->getMemoryPtr()->getDescWithType<BlockedMemoryDesc>();
+        auto parentDataType = parentEdge->getMemoryPtr()->getDataType();
+        // child:
+        auto childBlockedDesc = childEdge->getMemoryPtr()->getDescWithType<BlockedMemoryDesc>();
+        auto childDataType = childEdge->getMemoryPtr()->getDataType();
+
+        // auto cpuParentBlockedDesc = parentEdge->getMemoryPtr()->getDescWithType<CpuBlockedMemoryDesc>();
+        // auto cpuChildBlockedDesc = childEdge->getMemoryPtr()->getDescWithType<CpuBlockedMemoryDesc>();
+
+        // blockedDims == 5 (limit)
+        // c1 = 1, c2 = 1
+        // c1 = 2, c2 = 1
+        // c1 = 2, c2 = 2 (16 - 16, 8 - 8, 16 - 8)
+        // do conversion ?
+
+        // main
+
+        // tail (if output is blocked)
+        // only planar, 5d maximum
+        size_t blockSize = 1;
+        size_t maxDims = 1;
+
+        auto parentBlockedDims = parentBlockedDesc->getBlockDims();
+        auto parentStrides = parentBlockedDesc->getStrides();
+        auto parentOrder = parentBlockedDesc->getOrder();
+
+        auto childBlockedDims = childBlockedDesc->getBlockDims();
+        auto childStrides = childBlockedDesc->getStrides();
+        auto childOrder = childBlockedDesc->getOrder();
+
+        size_t ndims = parentBlockedDesc->getBlockDims().size(); // 4
+
+        std::vector<size_t> blockedDims = childBlockedDims;
+        std::vector<size_t> dstOffsets = childStrides;
+        std::vector<size_t> srcOffsets(ndims);
+
+        // blocked
+        // change to VectorDims?
+        auto isBlocked = [](std::vector<size_t>& order) {
+            // todo: add more checks
+            size_t count = std::count(order.begin(), order.end(), 1);
+            return count > 1;
+        };
+
+        bool isSrcBlocked = isBlocked(parentOrder);
+        bool isDstBlocked = isBlocked(childOrder);
+        bool areBothBlocked = isSrcBlocked && isDstBlocked;
+
+        PermuteParams permuteParams;
+        permuteParams.src_block_dims = parentBlockedDims;
+        permuteParams.dst_block_dims = childBlockedDims;
+        permuteParams.src_block_order = parentOrder;
+        permuteParams.dst_block_order = childOrder;
+        permuteParams.order = {0, 1, 2, 3}; // todo:
+        permuteParams.data_size = 1; // todo:
+
+        MemoryCPtr srcMemory = parentEdge->getMemoryPtr();
+        MemoryPtr dstMemory = childEdge->getMemoryPtr();
+
+        auto srcData = reinterpret_cast<const uint8_t*>(parentEdge->getMemoryPtr()->getData());
+        auto dstData = reinterpret_cast<uint8_t*>(childEdge->getMemoryPtr()->getData());
+
+        auto getOffset4 = [](std::vector<size_t> &offsets, size_t i0, size_t i1, size_t i2, size_t i3) {
+            return offsets[0] * i0 + offsets[1] * i1 + offsets[2] * i2 + offsets[3] * i3;
+        };
+
+        auto getOffset5 = [](std::vector<size_t> &offsets, size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) {
+            return offsets[0] * i0 + offsets[1] * i1 + offsets[2] * i2 + offsets[3] * i3 + offsets[4] * i4;
+        };
+
+        if (areBothBlocked) {
+            // todo:
+            auto transposeParams = TransposeExecutor::prepareParams(permuteParams);
+            const auto &dst_block_dims = transposeParams.dst_block_dims;
+            std::vector<size_t> src_strides = transposeParams.src_strides;
+            std::vector<size_t> dst_strides = transposeParams.dst_strides;
+
+            for (size_t i0 = 0; i0 < dst_block_dims[0]; i0++) {
+                for (size_t i1 = 0; i1 < dst_block_dims[1]; i1++) {
+                    for (size_t i2 = 0; i2 < dst_block_dims[2]; i2++) {
+                        for (size_t i3 = 0; i3 < dst_block_dims[3]; i3++) {
+                            for (size_t i4 = 0; i4 < dst_block_dims[4]; i4++) {
+                                auto srcOff = getOffset5(src_strides, i0, i1, i2, i3, i4);
+                                auto dstOff = getOffset5(dst_strides, i0, i1, i2, i3, i4);
+                                ov::intel_cpu::cpu_convert(
+                                    srcData + srcOff * parentBlockedDesc->getPrecision().size(),
+                                    dstData + dstOff * childBlockedDesc->getPrecision().size(),
+                                    parentBlockedDesc->getPrecision(),
+                                    childBlockedDesc->getPrecision(),
+                                    1);
+                            }
+                        }
+                    }
+                }
+            }
+        } else if (isSrcBlocked || isDstBlocked) {
+            auto transposeParams = TransposeExecutor::prepareParams(permuteParams);
+            const auto &dst_block_dims = transposeParams.dst_block_dims;
+            std::vector<size_t> src_strides = transposeParams.src_strides;
+            std::vector<size_t> dst_strides = transposeParams.dst_strides;
+
+            for (size_t i0 = 0; i0 < dst_block_dims[0]; i0++) {
+                for (size_t i1 = 0; i1 < dst_block_dims[1]; i1++) {
+                    for (size_t i2 = 0; i2 < dst_block_dims[2]; i2++) {
+                        for (size_t i3 = 0; i3 < dst_block_dims[3]; i3++) {
+                            for (size_t i4 = 0; i4 < dst_block_dims[4]; i4++) {
+                                auto srcOff = getOffset5(src_strides, i0, i1, i2, i3, i4);
+                                auto dstOff = getOffset5(dst_strides, i0, i1, i2, i3, i4);
+                                ov::intel_cpu::cpu_convert(
+                                    srcData + srcOff * parentBlockedDesc->getPrecision().size(),
+                                    dstData + dstOff * childBlockedDesc->getPrecision().size(),
+                                    parentBlockedDesc->getPrecision(),
+                                    childBlockedDesc->getPrecision(),
+                                    1);
+                            }
+                        }
+                    }
+                }
+            }
+        } else {
+            // todo: can we use TransposeExecutor::prepareParams(permuteParams)? I think yes
+            // prepare srcOffsets
+            for (size_t i = 0; i < ndims; i++) {
+                srcOffsets[i] = parentStrides[std::distance(parentOrder.begin(), std::find(parentOrder.begin(), parentOrder.end(), childOrder[i]))];
+            }
+
+            for (size_t i0 = 0; i0 < childBlockedDims[0]; i0++) {
+                for (size_t i1 = 0; i1 < childBlockedDims[1]; i1++) {
+                    for (size_t i2 = 0; i2 < childBlockedDims[2]; i2++) {
+                        for (size_t i3 = 0; i3 < childBlockedDims[3]; i3++) {
+                            auto srcOff = getOffset4(srcOffsets, i0, i1, i2, i3);
+                            auto dstOff = getOffset4(dstOffsets, i0, i1, i2, i3);
+                            ov::intel_cpu::cpu_convert(
+                                srcData + srcOff * parentBlockedDesc->getPrecision().size(),
+                                dstData + dstOff * childBlockedDesc->getPrecision().size(),
+                                parentBlockedDesc->getPrecision(),
+                                childBlockedDesc->getPrecision(),
+                                1);
+                        }
+                    }
+                }
+            }
+        }
+
+        // auto srcData = reinterpret_cast<const uint8_t*>(parentEdge->getMemoryPtr()->getData()); // todo:
+        // auto dstData = reinterpret_cast<uint8_t*>(childEdge->getMemoryPtr()->getData()); // todo:
+
+        // get offsets
+        // change to VectorDims?
+        // generic 5D + 3 blocks
+    } else if (false && testReorder) {
+        execPtr = std::make_shared<Blocked5Dx3ReorderExecutor>();
+        auto srcMemPtr = getParentEdgeAt(0)->getMemoryPtr();
+        auto dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
+        execPtr->exec({srcMemPtr}, {dstMemPtr});
+        return;
+        // generic
+    } else if (true && testReorder) {
+        execPtr = std::make_shared<GenericReorderExecutor>();
+        auto srcMemPtr = getParentEdgeAt(0)->getMemoryPtr();
+        auto dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
+        execPtr->exec({srcMemPtr}, {dstMemPtr});
+        return;
+    }
 #if defined(OV_CPU_ARM_ENABLE_FP16)
     if (transposeExecutor) {
         auto dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
@@ -493,6 +668,277 @@ void Reorder::reorderData(const IMemory &input, const IMemory &output, MultiCach
     }
 }
 
+void Blocked5Dx3ReorderExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst) {
+    constexpr size_t maxShapeRank = 5;
+    constexpr size_t maxInnerBlocks = 3;
+
+    // auto shape = getInputShapeAtPort(0);
+    // if (shape.getRank() > maxShapeRank) {
+    //     IE_THROW() << "Reorder node with name " << getName() << " doesn't support shape rank " << shape.getRank()
+    //                << " (maxRank = " << maxShapeRank << ")";
+    // }
+
+    auto staticDims = src[0]->getStaticDims();
+    auto ndims = staticDims.size();
+
+    // parent:
+    auto parentBlockedDesc = src[0]->getDescWithType<BlockedMemoryDesc>();
+    auto parentDataType = src[0]->getDataType();
+    auto parentBlockedDims = parentBlockedDesc->getBlockDims();
+    auto parentStrides = parentBlockedDesc->getStrides();
+    auto parentOrder = parentBlockedDesc->getOrder();
+    // child:
+    auto childBlockedDesc = dst[0]->getDescWithType<BlockedMemoryDesc>();
+    auto childDataType = dst[0]->getDataType();
+    auto childBlockedDims = childBlockedDesc->getBlockDims();
+    auto childStrides = childBlockedDesc->getStrides();
+    auto childOrder = childBlockedDesc->getOrder();
+
+    // todo: choose better name
+    using ByOrderStruct = std::vector<std::vector<size_t>>;
+
+    auto getEmptyStruct = [&](size_t initValue) {
+        ByOrderStruct s(maxShapeRank);
+        for (size_t i = 0; i < maxShapeRank; i++) {
+            s[i] = std::vector<size_t>(maxInnerBlocks, initValue);
+        }
+        return s;
+    };
+
+    ByOrderStruct srcDims = getEmptyStruct(1);
+    ByOrderStruct dstDims = getEmptyStruct(1);
+    ByOrderStruct srcOffsets = getEmptyStruct(0);
+    ByOrderStruct dstOffsets = getEmptyStruct(0);
+
+    auto prepareStructures = [&](const std::shared_ptr<BlockedMemoryDesc> &desc, ByOrderStruct& dims, ByOrderStruct& offsets) {
+        auto blockDims = desc->getBlockDims();
+        auto strides = desc->getStrides();
+        auto order = desc->getOrder();
+
+        for (size_t i = 0; i < staticDims.size(); i++) {
+            auto cnt = std::count(order.begin(), order.end(), i);
+            // if (cnt > maxInnerBlocks) {
+            //     IE_THROW() << "Reorder node with name " << getName() << " doesn't support this blocked descriptor";
+            // }
+            size_t tmpIndex = maxInnerBlocks - cnt;
+            for (size_t j = 0; j < order.size(); j++) {
+                if (order[j] == i) {
+                    dims[maxShapeRank - staticDims.size() + i][tmpIndex] = blockDims[j];
+                    offsets[maxShapeRank - staticDims.size() + i][tmpIndex] = strides[j];
+                    tmpIndex++;
+                }
+            }
+        }
+    };
+
+    prepareStructures(parentBlockedDesc, srcDims, srcOffsets);
+    prepareStructures(childBlockedDesc, dstDims, dstOffsets);
+
+    size_t DIM0 = ndims < 5 ? 1 : staticDims[ndims - 5];
+    size_t DIM1 = ndims < 4 ? 1 : staticDims[ndims - 4];
+    size_t DIM2 = ndims < 3 ? 1 : staticDims[ndims - 3];
+    size_t DIM3 = ndims < 2 ? 1 : staticDims[ndims - 2];
+    size_t DIM4 = ndims < 1 ? 1 : staticDims[ndims - 1];
+
+    auto init = [](std::vector<size_t>& curDims, size_t& offset) {
+        for (size_t i = 0; i < maxInnerBlocks; i++) {
+            curDims[i] = 0;
+        }
+        offset = 0;
+    };
+
+    auto inc = [](std::vector<size_t>& curDims, const std::vector<size_t>& dims,
+                  size_t& offset, const std::vector<size_t>& strides) {
+        offset += strides[2];
+        if (++curDims[2] == dims[2]) {
+            curDims[2] = 0;
+            offset += strides[1] - strides[2] * dims[2];
+            if (++curDims[1] == dims[1]) {
+                curDims[1] = 0;
+                offset += strides[0] - strides[1] * dims[1];
+            }
+        }
+    };
+
+    auto inc2 = [](std::vector<size_t>& curDims, const std::vector<size_t>& dims,
+                   size_t& offset, const std::vector<size_t>& strides) {
+        offset += strides[dims.size() - 1];
+        for (size_t i = dims.size() - 1; i > 0; i--) {
+            if (++curDims[i] == dims[i]) {
+                curDims[i] = 0;
+                offset += strides[i - 1] - strides[i] * dims[i];
+            } else {
+                break;
+            }
+        }
+    };
+
+    auto srcData = reinterpret_cast<const uint8_t*>(src[0]->getData());
+    auto dstData = reinterpret_cast<uint8_t*>(dst[0]->getData());
+
+    ByOrderStruct curSrcDims = getEmptyStruct(0);
+    ByOrderStruct curDstDims = getEmptyStruct(0);
+    std::vector<size_t> curSrcOffsets(maxShapeRank);
+    std::vector<size_t> curDstOffsets(maxShapeRank);
+    init(curSrcDims[0], curSrcOffsets[0]);
+    init(curDstDims[0], curDstOffsets[0]);
+    for (size_t i0 = 0; i0 < DIM0; i0++) {
+        init(curSrcDims[1], curSrcOffsets[1]);
+        init(curDstDims[1], curDstOffsets[1]);
+        for (size_t i1 = 0; i1 < DIM1; i1++) {
+            init(curSrcDims[2], curSrcOffsets[2]);
+            init(curDstDims[2], curDstOffsets[2]);
+            for (size_t i2 = 0; i2 < DIM2; i2++) {
+                init(curSrcDims[3], curSrcOffsets[3]);
+                init(curDstDims[3], curDstOffsets[3]);
+                for (size_t i3 = 0; i3 < DIM3; i3++) {
+                    init(curSrcDims[4], curSrcOffsets[4]);
+                    init(curDstDims[4], curDstOffsets[4]);
+                    for (size_t i4 = 0; i4 < DIM4; i4++) {
+                        auto srcOff = (curSrcOffsets[0] + curSrcOffsets[1] + curSrcOffsets[2] + curSrcOffsets[3] + curSrcOffsets[4]);
+                        auto dstOff = (curDstOffsets[0] + curDstOffsets[1] + curDstOffsets[2] + curDstOffsets[3] + curDstOffsets[4]);
+
+                        ov::intel_cpu::cpu_convert(
+                            srcData + srcOff * parentBlockedDesc->getPrecision().size(),
+                            dstData + dstOff * childBlockedDesc->getPrecision().size(),
+                            parentBlockedDesc->getPrecision(),
+                            childBlockedDesc->getPrecision(),
+                            1);
+
+                        inc(curSrcDims[4], srcDims[4], curSrcOffsets[4], srcOffsets[4]);
+                        inc(curDstDims[4], dstDims[4], curDstOffsets[4], dstOffsets[4]);
+                    }
+                    inc(curSrcDims[3], srcDims[3], curSrcOffsets[3], srcOffsets[3]);
+                    inc(curDstDims[3], dstDims[3], curDstOffsets[3], dstOffsets[3]);
+                }
+                inc(curSrcDims[2], srcDims[2], curSrcOffsets[2], srcOffsets[2]);
+                inc(curDstDims[2], dstDims[2], curDstOffsets[2], dstOffsets[2]);
+            }
+            inc(curSrcDims[1], srcDims[1], curSrcOffsets[1], srcOffsets[1]);
+            inc(curDstDims[1], dstDims[1], curDstOffsets[1], dstOffsets[1]);
+        }
+        inc(curSrcDims[0], srcDims[0], curSrcOffsets[0], srcOffsets[0]);
+        inc(curDstDims[0], dstDims[0], curDstOffsets[0], dstOffsets[0]);
+    }
+}
+
+void GenericReorderExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst) {
+    auto staticDims = src[0]->getStaticDims();
+    auto ndims = staticDims.size();
+
+    // parent:
+    auto parentBlockedDesc = src[0]->getDescWithType<BlockedMemoryDesc>();
+    auto parentDataType = src[0]->getDataType();
+    auto parentBlockedDims = parentBlockedDesc->getBlockDims();
+    auto parentStrides = parentBlockedDesc->getStrides();
+    auto parentOrder = parentBlockedDesc->getOrder();
+    // child:
+    auto childBlockedDesc = dst[0]->getDescWithType<BlockedMemoryDesc>();
+    auto childDataType = dst[0]->getDataType();
+    auto childBlockedDims = childBlockedDesc->getBlockDims();
+    auto childStrides = childBlockedDesc->getStrides();
+    auto childOrder = childBlockedDesc->getOrder();
+
+    // todo: choose better name
+    using ByOrderStruct = std::vector<std::vector<size_t>>;
+
+    ByOrderStruct srcDims(ndims);
+    ByOrderStruct dstDims(ndims);
+    ByOrderStruct srcOffsets(ndims);
+    ByOrderStruct dstOffsets(ndims);
+
+    auto prepareStructures = [&](const std::shared_ptr<BlockedMemoryDesc> &desc, ByOrderStruct& dims, ByOrderStruct& offsets) {
+        auto blockDims = desc->getBlockDims();
+        auto strides = desc->getStrides();
+        auto order = desc->getOrder();
+
+        for (size_t i = 0; i < staticDims.size(); i++) {
+            auto cnt = std::count(order.begin(), order.end(), i);
+            dims[i].resize(cnt);
+            offsets[i].resize(cnt);
+
+            size_t tmpIndex = 0;
+            for (size_t j = 0; j < order.size(); j++) {
+                if (order[j] == i) {
+                    dims[i][tmpIndex] = blockDims[j];
+                    offsets[i][tmpIndex] = strides[j];
+                    tmpIndex++;
+                }
+            }
+        }
+    };
+
+    prepareStructures(parentBlockedDesc, srcDims, srcOffsets);
+    prepareStructures(childBlockedDesc, dstDims, dstOffsets);
+
+    auto init = [](std::vector<size_t>& curDims, size_t& offset) {
+        for (size_t i = 0; i < curDims.size(); i++) {
+            curDims[i] = 0;
+        }
+        offset = 0;
+    };
+
+    auto srcData = reinterpret_cast<const uint8_t*>(src[0]->getData());
+    auto dstData = reinterpret_cast<uint8_t*>(dst[0]->getData());
+
+    auto initDims = [](ByOrderStruct &startDims, const ByOrderStruct &dims) {
+        for (int i = 0; i < dims.size(); i++) {
+            startDims[i].resize(dims[i].size());
+        }
+    };
+
+    ByOrderStruct curSrcDims(ndims);
+    ByOrderStruct curDstDims(ndims);
+    std::vector<size_t> curSrcOffsets(ndims);
+    std::vector<size_t> curDstOffsets(ndims);
+    initDims(curSrcDims, srcDims);
+    initDims(curDstDims, dstDims);
+
+    size_t staticDimsCount = std::accumulate(staticDims.begin(), staticDims.end(), 1, std::multiplies<size_t>());
+
+    auto inc2 = [](std::vector<size_t>& curDims, const std::vector<size_t>& dims,
+                   size_t& offset, const std::vector<size_t>& strides) {
+        offset += strides[dims.size() - 1];
+        for (int i = dims.size() - 1; i > 0; i--) {
+            if (++curDims[i] == dims[i]) {
+                curDims[i] = 0;
+                offset += strides[i - 1] - strides[i] * dims[i];
+            } else {
+                break;
+            }
+        }
+    };
+
+    auto inc3 = [&](std::vector<size_t>& curDims, const std::vector<size_t>& dims) {
+        for (int i = dims.size() - 1; i >= 0; i--) {
+            inc2(curSrcDims[i], srcDims[i], curSrcOffsets[i], srcOffsets[i]);
+            inc2(curDstDims[i], dstDims[i], curDstOffsets[i], dstOffsets[i]);
+            if (++curDims[i] == dims[i]) {
+                init(curSrcDims[i], curSrcOffsets[i]);
+                init(curDstDims[i], curDstOffsets[i]);
+                curDims[i] = 0;
+            } else {
+                break;
+            }
+        }
+    };
+
+    std::vector<size_t> curDims(staticDims.size());
+    for (size_t i = 0; i < staticDimsCount; i++) {
+        auto srcOff = std::accumulate(curSrcOffsets.begin(), curSrcOffsets.end(), 0);
+        auto dstOff = std::accumulate(curDstOffsets.begin(), curDstOffsets.end(), 0);
+
+        ov::intel_cpu::cpu_convert(
+            srcData + srcOff * parentBlockedDesc->getPrecision().size(),
+            dstData + dstOff * childBlockedDesc->getPrecision().size(),
+            parentBlockedDesc->getPrecision(),
+            childBlockedDesc->getPrecision(),
+            1);
+
+        inc3(curDims, staticDims);
+    }
+}
+
 }   // namespace node
 }   // namespace intel_cpu
 }   // namespace ov
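Note: both executors above advance through the tensors with odometer-style counters. Rather than recomputing a dot product of indices and strides for every element, the inc/inc2 lambdas add the innermost stride on each step and, whenever a digit wraps, correct the running offset by (outer stride - inner stride * inner extent). A self-contained sketch of that walk on a dense 2x3x4 buffer (an illustration only, not plugin code):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    int main() {
        const std::vector<size_t> dims    = {2, 3, 4};
        const std::vector<size_t> strides = {12, 4, 1};  // dense row-major strides
        std::vector<size_t> cur(dims.size(), 0);
        size_t offset = 0;
        for (size_t flat = 0; flat < 2 * 3 * 4; ++flat) {
            // invariant maintained by the incremental update:
            assert(offset == cur[0] * strides[0] + cur[1] * strides[1] + cur[2] * strides[2]);
            offset += strides.back();                      // step one element forward
            for (int i = static_cast<int>(dims.size()) - 1; i >= 0; --i) {
                if (++cur[i] == dims[i]) {                 // this digit wrapped:
                    cur[i] = 0;
                    if (i > 0)                             // carry into the next digit
                        offset += strides[i - 1] - strides[i] * dims[i];
                } else {
                    break;
                }
            }
        }
        return 0;
    }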
diff --git a/src/plugins/intel_cpu/src/nodes/reorder.h b/src/plugins/intel_cpu/src/nodes/reorder.h
index 5e6a125ca94346..6d94433da17645 100644
--- a/src/plugins/intel_cpu/src/nodes/reorder.h
+++ b/src/plugins/intel_cpu/src/nodes/reorder.h
@@ -16,6 +16,88 @@ namespace ov {
 namespace intel_cpu {
 namespace node {
 
+class ReorderExecutor {
+public:
+    // static jit_permute_config_params prepareParams(const PermuteParams& params);
+    explicit ReorderExecutor() {} // todo:
+    // explicit ReorderExecutor(const ExecutorContext::CPtr context); // todo:
+    // virtual bool init(const TransposeParams& transposeParams,
+    //                   const std::vector<MemoryDescPtr>& srcDescs,
+    //                   const std::vector<MemoryDescPtr>& dstDescs,
+    //                   const dnnl::primitive_attr &attr) = 0;
+    virtual void exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst) = 0; // todo:
+    // virtual impl_desc_type getImplType() const = 0;
+    virtual ~ReorderExecutor() = default;
+protected:
+    // PermuteParams permuteParams;
+    const ExecutorContext::CPtr context; // todo:
+};
+using ReorderExecutorPtr = std::shared_ptr<ReorderExecutor>;
+using ReorderExecutorCPtr = std::shared_ptr<const ReorderExecutor>;
+
+// class ReorderExecutorBuilder {
+// public:
+//     virtual ~ReorderExecutorBuilder() = default;
+//     // virtual bool isSupported(const TransposeParams& transposeParams,
+//     //                          const std::vector<MemoryDescPtr>& srcDescs,
+//     //                          const std::vector<MemoryDescPtr>& dstDescs) const = 0;
+//     virtual ReorderExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const = 0;
+// };
+
+// using ReorderExecutorBuilderPtr = std::shared_ptr<ReorderExecutorBuilder>;
+// using ReorderExecutorBuilderCPtr = std::shared_ptr<const ReorderExecutorBuilder>;
+
+class Blocked5Dx3ReorderExecutor : public ReorderExecutor {
+public:
+    using ReorderExecutor::ReorderExecutor;
+
+    // bool init(const TransposeParams &transposeParams,
+    //           const std::vector<MemoryDescPtr> &srcDescs,
+    //           const std::vector<MemoryDescPtr> &dstDescs,
+    //           const dnnl::primitive_attr &attr) override;
+    void exec(const std::vector<MemoryCPtr> &src, const std::vector<MemoryPtr> &dst) override;
+    // impl_desc_type getImplType() const override { return implType; }
+private:
+    // static const impl_desc_type implType = impl_desc_type::ref;
+};
+
+// class GenericReorderExecutorBuilder : public ReorderExecutorBuilder {
+// public:
+//     // bool isSupported(const TransposeParams& transposeParams,
+//     //                  const std::vector<MemoryDescPtr>& srcDescs,
+//     //                  const std::vector<MemoryDescPtr>& dstDescs) const override {
+//     //     static const std::vector<std::vector<size_t>> optimizedOrders = {
+//     //         std::vector<size_t>{0, 3, 1, 2},
+//     //         std::vector<size_t>{0, 4, 1, 2, 3},
+//     //         std::vector<size_t>{0, 5, 1, 2, 3, 4},
+//     //     };
+//     //     if (srcDescs[0]->hasLayoutType(LayoutType::ncsp) &&
+//     //         std::find(optimizedOrders.begin(), optimizedOrders.end(), transposeParams.permuteParams.order) != optimizedOrders.end()) {
+//     //         return true;
+//     //     }
+//     //     DEBUG_LOG("RefOptimizedTransposeExecutor is not supported, because passed order is not optimized");
+//     //     return false;
+//     // }
+
+//     ReorderExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
+//         return std::make_shared<GenericReorderExecutor>(context);
+//     }
+// };
+
+class GenericReorderExecutor : public ReorderExecutor {
+public:
+    using ReorderExecutor::ReorderExecutor;
+
+    // bool init(const TransposeParams &transposeParams,
+    //           const std::vector<MemoryDescPtr> &srcDescs,
+    //           const std::vector<MemoryDescPtr> &dstDescs,
+    //           const dnnl::primitive_attr &attr) override;
+    void exec(const std::vector<MemoryCPtr> &src, const std::vector<MemoryPtr> &dst) override;
+    // impl_desc_type getImplType() const override { return implType; }
+private:
+    // static const impl_desc_type implType = impl_desc_type::ref;
+};
+
 class Reorder : public Node {
 public:
     Reorder(const std::shared_ptr<ov::Node>& op, const GraphContext::CPtr context);
@@ -88,6 +170,8 @@ class Reorder : public Node {
     void prepareReorderAsTranspose(MemoryDescPtr parentDesc, MemoryDescPtr childDesc);
     TransposeExecutorPtr transposeExecutor;
 #endif
+    bool testReorder = false;
+    ReorderExecutorPtr execPtr = nullptr;
 };
 
 }   // namespace node
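Note: the commented-out builder scaffolding above mirrors the TransposeExecutor/TransposeExecutorBuilder pattern the plugin already uses elsewhere. If it were enabled, executor selection would presumably look like the sketch below (my reading only; isSupported() and the builder aliases are still commented out in the header, so the function and its name are hypothetical):

    // Pick the first builder that can produce an executor for the given context.
    ReorderExecutorPtr pickReorderExecutor(const std::vector<ReorderExecutorBuilderCPtr>& builders,
                                           const ExecutorContext::CPtr context) {
        for (const auto& builder : builders) {
            // a full implementation would consult builder->isSupported(...) first
            if (auto executor = builder->makeExecutor(context))
                return executor;
        }
        return nullptr;
    }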
diff --git a/src/plugins/intel_cpu/src/nodes/transpose.cpp b/src/plugins/intel_cpu/src/nodes/transpose.cpp
index 923fcced1ef229..72208646d5d789 100644
--- a/src/plugins/intel_cpu/src/nodes/transpose.cpp
+++ b/src/plugins/intel_cpu/src/nodes/transpose.cpp
@@ -60,6 +60,8 @@ Transpose::Transpose(const std::shared_ptr<ov::Node>& op, const GraphContext::CP
 void Transpose::getSupportedDescriptors() {
 }
 
+size_t transposeCount = 0;
+
 void Transpose::initSupportedPrimitiveDescriptors() {
     if (!supportedPrimitiveDescriptors.empty())
         return;
@@ -95,6 +97,42 @@ void Transpose::initSupportedPrimitiveDescriptors() {
 
     const auto& inputDataShape = getInputShapeAtPort(INPUT_DATA_IDX);
     const auto& outputDataShape = getOutputShapeAtPort(0);
+
+    transposeCount++;
+    if (transposeCount == 1) {
+        config.inConfs[0].setMemDesc(creatorsMap.at(LayoutType::nCsp16c)->createSharedDesc(prec, inputDataShape));
+        config.outConfs[0].setMemDesc(creatorsMap.at(LayoutType::nCsp16c)->createSharedDesc(prec, outputDataShape));
+        supportedPrimitiveDescriptorsBuilder(config, transposeParams);
+        return;
+    } else if (transposeCount == 2) {
+        config.inConfs[0].setMemDesc(creatorsMap.at(LayoutType::nCsp8c)->createSharedDesc(prec, inputDataShape));
+        config.outConfs[0].setMemDesc(creatorsMap.at(LayoutType::nCsp8c)->createSharedDesc(prec, outputDataShape));
+        supportedPrimitiveDescriptorsBuilder(config, transposeParams);
+        return;
+    }
+    // if (transposeCount == 1) {
+    //     config.inConfs[0].setMemDesc(creatorsMap.at(LayoutType::nspc)->createSharedDesc(prec, inputDataShape));
+    //     config.outConfs[0].setMemDesc(creatorsMap.at(LayoutType::nspc)->createSharedDesc(prec, outputDataShape));
+    //     supportedPrimitiveDescriptorsBuilder(config, transposeParams);
+    //     return;
+    // } else if (transposeCount == 2) {
+    //     config.inConfs[0].setMemDesc(creatorsMap.at(LayoutType::nCsp16c)->createSharedDesc(prec, inputDataShape));
+    //     config.outConfs[0].setMemDesc(creatorsMap.at(LayoutType::nCsp16c)->createSharedDesc(prec, outputDataShape));
+    //     supportedPrimitiveDescriptorsBuilder(config, transposeParams);
+    //     return;
+    // }
+    // if (getName() == "TRANSPOSE_1") {
+    //     config.inConfs[0].setMemDesc(creatorsMap.at(LayoutType::nspc)->createSharedDesc(prec, inputDataShape));
+    //     config.outConfs[0].setMemDesc(creatorsMap.at(LayoutType::nspc)->createSharedDesc(prec, outputDataShape));
+    //     supportedPrimitiveDescriptorsBuilder(config, transposeParams);
+    //     return;
+    // } else if (getName() == "TRANSPOSE_2") {
+    //     config.inConfs[0].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(prec, inputDataShape));
+    //     config.outConfs[0].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(prec, outputDataShape));
+    //     supportedPrimitiveDescriptorsBuilder(config, transposeParams);
+    //     return;
+    // }
+
     if (inputDataShape.getRank() == 4 || inputDataShape.getRank() == 5) {
         config.inConfs[0].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(prec, inputDataShape));
         config.outConfs[0].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(prec, outputDataShape));
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index bff7379e4aa684..3fe040181376ae 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -257,9 +257,9 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
     type_to_fuse_map type_to_fuse = {{ov::opset10::Convert::get_type_info_static(), fuse_type_to_convert}};
 
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::AUGRUCellFusion);
-    CPU_REGISTER_PASS_COMMON(manager, ov::pass::CommonOptimizations);
+    // CPU_REGISTER_PASS_COMMON(manager, ov::pass::CommonOptimizations);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::WrapInterpolateIntoTransposes);
-    CPU_REGISTER_PASS_COMMON(manager, ov::pass::TransposeSinking);
+    // CPU_REGISTER_PASS_COMMON(manager, ov::pass::TransposeSinking);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConvertSequenceToTensorIterator);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConvertOpSet3ToOpSet2);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConvertOpSet2ToOpSet1);
@@ -287,11 +287,11 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
     // Common ConvertPrecision pass handles only a limited set of opevino operations to match the list of precisions supported by the plugin.
     // However, if the extension operation produces an output precision that is not natively supported, this may lead to inconsistency during
     // element type propagation. This transformation is called before the ConvertPrecision pass to align the actual precisions with the list of supported ones.
-    CPU_REGISTER_PASS_COMMON(manager, ov::pass::InsertConvertAfterExtension);
-    CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConvertPrecision, precisions, type_to_fuse);
+    // CPU_REGISTER_PASS_COMMON(manager, ov::pass::InsertConvertAfterExtension);
+    // CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConvertPrecision, precisions, type_to_fuse);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::EliminateConvert);
 
-    CPU_REGISTER_PASS_COMMON(manager, SwapConvertTranspose);
+    // CPU_REGISTER_PASS_COMMON(manager, SwapConvertTranspose);
     CPU_REGISTER_PASS_X64(manager, ConvertToInteraction);
     CPU_REGISTER_PASS_X64(manager, ConvertInteractionInt8);
     CPU_REGISTER_PASS_ARM(manager, ConvertReduceMultiAxis);
diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/transpose.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/transpose.cpp
index 029e97a2eae21d..e9e979362161e9 100644
--- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/transpose.cpp
+++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/transpose.cpp
@@ -53,27 +53,45 @@ void TransposeLayerCPUTest::SetUp() {
     inType = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
     outType = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
 
-    std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
+    // std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
 
     updateSelectedType("unknown", inType, configuration);
 
     init_input_shapes({inputShapes});
 
-    auto params = std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[0]);
+    auto params = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, inputDynamicShapes[0]);
 
-    const auto inputOrderOp =
-        std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape({inputOrder.size()}), inputOrder);
-    const auto transpose = std::make_shared<ov::op::v1::Transpose>(params, inputOrderOp);
-    transpose->get_rt_info() = getCPUInfo();
-    const ov::ResultVector results{std::make_shared<ov::op::v0::Result>(transpose)};
+    std::vector<size_t> inputOrder1 = {0, 1, 2, 4, 3};
+    const auto inputOrderOp1 = std::make_shared<ov::op::v0::Constant>(ov::element::i64,
+                                                                      ov::Shape({inputOrder1.size()}), inputOrder1);
+    const auto transpose1 = std::make_shared<ov::op::v1::Transpose>(params, inputOrderOp1);
+    transpose1->set_friendly_name("TRANSPOSE_1");
+    // transpose1->get_rt_info() = makeCPUInfo({nchw}, {}, {}); // todo:
+
+    const auto convert = std::make_shared<ov::op::v0::Convert>(transpose1, ov::element::f64);
+    convert->set_friendly_name("CONVERT");
+
+    // const auto addConst =
+    //     std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape({1}), std::vector<float>{1});
+
+    // const auto addNode = std::make_shared<ov::op::v1::Add>(transpose1, addConst);
+
+    const auto inputOrderOp2 = std::make_shared<ov::op::v0::Constant>(ov::element::i64,
+                                                                      ov::Shape({inputOrder1.size()}), inputOrder1);
+    const auto transpose2 = std::make_shared<ov::op::v1::Transpose>(convert, inputOrderOp2);
+    transpose2->set_friendly_name("TRANSPOSE_2");
+    // transpose2->get_rt_info() = makeCPUInfo({nChw16c}, {}, {}); // todo:
+
+    const ov::ResultVector results{std::make_shared<ov::op::v0::Result>(transpose2)};
 
     function = std::make_shared<ov::Model>(results, ov::ParameterVector{params}, "TransposeLayerCPUTest");
+    functionRefs = ngraph::clone_function(*function);
 }
 
 TEST_P(TransposeLayerCPUTest, CompareWithRefs) {
     run();
-    CheckPluginRelatedResults(compiledModel, "Transpose");
+    // CheckPluginRelatedResults(compiledModel, "Transpose");
 }
 
 namespace Transpose {
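Note: with the hard-coded inputOrder1 = {0, 1, 2, 4, 3} above, the order passed in through the test parameters is effectively ignored (the instance file below even labels its {0, 1, 3, 2} entry "fake"). The graph is Parameter(f32) -> TRANSPOSE_1 -> Convert(f64) -> TRANSPOSE_2 -> Result, and each transpose just swaps the last two dimensions. A worked sketch using the {2, 48, 5, 6, 7} shape from the instance below (illustration only):

    #include <cstddef>
    #include <vector>

    static std::vector<size_t> permute(const std::vector<size_t>& shape,
                                       const std::vector<size_t>& order) {
        std::vector<size_t> out(order.size());
        for (size_t i = 0; i < order.size(); ++i)
            out[i] = shape[order[i]];  // out dim i comes from input dim order[i]
        return out;
    }

    int main() {
        const std::vector<size_t> in    = {2, 48, 5, 6, 7};
        const std::vector<size_t> order = {0, 1, 2, 4, 3};
        const auto t1 = permute(in, order);  // {2, 48, 5, 7, 6}, still f32
        const auto t2 = permute(t1, order);  // back to {2, 48, 5, 6, 7}, now f64
        return (t1[3] == 7 && t2 == in) ? 0 : 1;
    }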
--- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/transpose.cpp
+++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/transpose.cpp
@@ -117,6 +117,36 @@ const std::vector<CPUSpecificParams> CPUParams5D = {
     cpuParams_ncdhw,
 };
 
+const std::vector<InputShape> TEST_SHAPE = {
+    // InputShape{{-1, 39, -1, -1}, {{2, 39, 5, 6}}},
+    InputShape{{-1, 48, -1, -1, -1}, {{2, 48, 5, 6, 7}}},
+};
+
+const std::vector<std::vector<size_t>> TEST_ORDER = {
+    std::vector<size_t>{0, 1, 3, 2}, // fake
+    // std::vector<size_t>{0, 1, 3, 2},
+};
+
+const std::vector<Precision> TEST_NET_PRECISION = {
+    Precision::FP32
+};
+
+const std::vector<CPUSpecificParams> TEST_CPU_PARAMS = {
+    cpuParams_nCdhw16c,
+    // cpuParams_nChw8c,
+    // cpuParams_nchw,
+};
+
+INSTANTIATE_TEST_SUITE_P(TEST_TRANSPOSE, TransposeLayerCPUTest,
+                         ::testing::Combine(
+                             ::testing::ValuesIn(TEST_SHAPE),
+                             ::testing::ValuesIn(TEST_ORDER),
+                             ::testing::ValuesIn(TEST_NET_PRECISION),
+                             ::testing::Values(ov::test::utils::DEVICE_CPU),
+                             ::testing::Values(additional_config),
+                             ::testing::ValuesIn(TEST_CPU_PARAMS)),
+                         TransposeLayerCPUTest::getTestCaseName);
+
 INSTANTIATE_TEST_SUITE_P(smoke_staticShapes5DC16_Transpose, TransposeLayerCPUTest,
                          ::testing::Combine(
                              ::testing::ValuesIn(staticInputShapes5DC16),
diff --git a/src/tests/test_utils/common_test_utils/src/ov_tensor_utils.cpp b/src/tests/test_utils/common_test_utils/src/ov_tensor_utils.cpp
index f559e8f945274e..ed75b502631b13 100644
--- a/src/tests/test_utils/common_test_utils/src/ov_tensor_utils.cpp
+++ b/src/tests/test_utils/common_test_utils/src/ov_tensor_utils.cpp
@@ -282,6 +282,11 @@ void compare(const ov::Tensor& expected,
     }
 
     Error abs_error(abs_threshold), rel_error(rel_threshold);
+    // for (size_t i = 0; i < shape_size_cnt && i < 100; ++i) {
+    //     double expected_value = expected_data[i];
+    //     double actual_value = actual_data[i];
+    //     std::cout << i << ". expected_value = " << expected_value << ", actual_value = " << actual_value << std::endl;
+    // }
     for (size_t i = 0; i < shape_size_cnt; ++i) {
         double expected_value = expected_data[i];
         double actual_value = actual_data[i];
diff --git a/src/tests/test_utils/functional_test_utils/src/crash_handler.cpp b/src/tests/test_utils/functional_test_utils/src/crash_handler.cpp
index 25769df69e3476..1b19fe18846054 100644
--- a/src/tests/test_utils/functional_test_utils/src/crash_handler.cpp
+++ b/src/tests/test_utils/functional_test_utils/src/crash_handler.cpp
@@ -34,7 +34,7 @@ bool CrashHandler::IGNORE_CRASH = false;
 CrashHandler::CrashHandler(CONFORMANCE_TYPE type) {
     // setup default value for timeout in 15 minutes
     if (MAX_TEST_WORK_TIME == UINT_MAX) {
-        MAX_TEST_WORK_TIME = 900;
+        MAX_TEST_WORK_TIME = 9000;
     }
 
     sighandler crashHandler = [](int errCode) {