If operation memory reuse
EgorDuplensky committed Nov 12, 2024
1 parent b002dd0 commit da1d1a0
Showing 8 changed files with 457 additions and 145 deletions.
14 changes: 14 additions & 0 deletions src/plugins/intel_cpu/src/cpu_memory.cpp
@@ -204,10 +204,24 @@ void MemoryBlockWithReuse::setExtBuff(void *ptr, size_t size) {
m_data = decltype(m_data)(ptr, release);
}

// class MemoryUsage {
// public:
// MemoryUsage() {}

// ~MemoryUsage() {
// std::cout << "Total memory usage: " << total << "\n";
// }

// int total = 0;
// };

bool MemoryBlockWithReuse::resize(size_t size) {
// static MemoryUsage mu;

constexpr int cacheLineSize = 64;
bool sizeChanged = false;
if (size > m_memUpperBound) {
// mu.total += size;
void *ptr = dnnl::impl::malloc(size, cacheLineSize);
if (!ptr) {
OPENVINO_THROW("Failed to allocate ", size, " bytes of memory");
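
A note on the reuse strategy in MemoryBlockWithReuse::resize: the block only reallocates when the requested size exceeds the largest size seen so far, and every allocation is cache-line aligned. The standalone sketch below illustrates that grow-only pattern; it uses std::aligned_alloc and std::free purely for illustration instead of the dnnl::impl::malloc call from the diff, so the types and error handling are simplifications, not the plugin's actual code.

#include <cstdlib>
#include <memory>
#include <new>

// Grow-only buffer: reallocate only when the requested size exceeds the high-water
// mark, and keep allocations cache-line aligned (a sketch of the pattern above).
class GrowOnlyBuffer {
public:
    bool resize(std::size_t size) {
        constexpr std::size_t cacheLineSize = 64;
        if (size <= m_upperBound)
            return false;  // current allocation is large enough, reuse it as-is

        // std::aligned_alloc requires the size to be a multiple of the alignment
        const std::size_t padded = (size + cacheLineSize - 1) / cacheLineSize * cacheLineSize;
        void* ptr = std::aligned_alloc(cacheLineSize, padded);
        if (!ptr)
            throw std::bad_alloc{};

        m_data = std::unique_ptr<void, void (*)(void*)>(ptr, std::free);
        m_upperBound = size;
        return true;  // size changed, dependent memory descriptors must be refreshed
    }

private:
    std::size_t m_upperBound = 0;
    std::unique_ptr<void, void (*)(void*)> m_data{nullptr, std::free};
};
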
14 changes: 8 additions & 6 deletions src/plugins/intel_cpu/src/graph.cpp
@@ -283,9 +283,9 @@ static std::tuple<std::vector<NodePtr>, std::vector<size_t>> ExtractExecutableNo
std::vector<NodePtr> executableGraphNodes;
for (size_t i = 0; i < graphNodes.size(); i++) {
const auto& graphNode = graphNodes[i];
// if ((!graphNode->isConstant() && CPU_DEBUG_CAPS_ALWAYS_TRUE(!graphNode->canBeSkipped())) || // non-constant executable or
if ((!graphNode->isConstant() && CPU_DEBUG_CAPS_ALWAYS_TRUE(!graphNode->canBeSkipped())) || // non-constant executable or
// if ((!graphNode->isConstant()) || // non-constant executable or
if ((!graphNode->isConstant() && !graphNode->canBeSkipped()) || // non-constant executable or
// if ((!graphNode->isConstant() && !graphNode->canBeSkipped()) || // non-constant executable or
(graphNode->isDynamicNode() && !one_of(graphNode->getType(), Type::Input, Type::Output))) { // dynamic, except inputs / outputs
/* @todo
* Revise implementation.
@@ -941,9 +941,10 @@ int Graph::RegisterToAllocationContext(int offset, AllocationContext& context) {
const auto& node = graphNodes[execIndex];
const auto inputExecIndex = execIndex + offset;
// an offset is the number of nodes in the internal graph minus the current node (-1)
offset = node->registerToAllocationContext(inputExecIndex, context) - 1;
offset = node->registerToAllocationContext(offset, context) - 1;
const auto outputExecIndex = execIndex + offset;
context.execIndex[node] = {inputExecIndex, outputExecIndex};
// std::cout << node->getName() << " - " << "[" << inputExecIndex << "," << outputExecIndex << "] offset " << offset << "\n";

if (j < syncNodesInds.size() && syncNodesInds[j] == execIndex) {
context.syncPoints.push_back(inputExecIndex);
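
For orientation, here is a toy, self-contained illustration of the index bookkeeping in Graph::RegisterToAllocationContext: every node of the outer graph receives a global [input, output] execution-index range, and a node that wraps an inner graph shifts the indices of all subsequent nodes by the number of inner nodes it registers. The return-value convention of registerToAllocationContext is simplified here; this is not OpenVINO code.

#include <cstdio>
#include <vector>

int main() {
    // Outer graph: Input, a container node with a 3-node inner graph, Output.
    // innerNodes[i] is the number of extra executable nodes node i contributes.
    const std::vector<int> innerNodes = {0, 3, 0};

    int offset = 0;
    for (int execIndex = 0; execIndex < static_cast<int>(innerNodes.size()); ++execIndex) {
        const int inputExecIndex = execIndex + offset;   // where this node starts globally
        offset += innerNodes[execIndex];                  // an inner graph shifts later nodes
        const int outputExecIndex = execIndex + offset;   // where this node finishes globally
        std::printf("node %d -> global [%d, %d]\n", execIndex, inputExecIndex, outputExecIndex);
    }
    // prints: node 0 -> global [0, 0], node 1 -> global [1, 4], node 2 -> global [5, 5]
    return 0;
}
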
@@ -1063,12 +1064,13 @@ static MemoryRegions FormMemoryRegions(const EdgeClusters& clusters,
const auto& parent = edge->getParent();
const auto& child = edge->getChild();

// std::cout << "[" << globalExecIndex.at(parent).second << " - " << globalExecIndex.at(child).first << "]"
int e_start = globalExecIndex.at(parent).second;
int e_finish = globalExecIndex.at(child).first;

// std::cout << "[" << e_start << " - " << e_finish << "]"
// << edge->name()
// << "\n";

int e_start = globalExecIndex.at(parent).second;
int e_finish = globalExecIndex.at(child).first;
// int e_finish = edge->getChild()->getExecIndex();

auto&& desc = edge->getDesc();
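
The e_start / e_finish pair introduced above defines the lifetime of an edge's memory region in the global execution-index space: the region comes alive when its producer finishes (globalExecIndex.at(parent).second) and is no longer needed once its consumer has run (globalExecIndex.at(child).first). The sketch below shows the general idea of why such intervals are useful; the actual MemoryControl solver in the plugin is more involved, so treat this as an assumption-laden illustration only.

#include <cstddef>

struct MemoryRegion {
    int start;          // producer's global output index (e_start)
    int finish;         // consumer's global input index (e_finish)
    std::size_t size;   // byte size taken from the edge's memory descriptor
};

// Two regions with disjoint lifetimes can, in principle, be placed into the same
// physical buffer, which is what makes the shared (outer + inner graph) index space
// valuable: tensors of nested graphs become candidates for reuse as well.
inline bool disjointLifetimes(const MemoryRegion& a, const MemoryRegion& b) {
    return a.finish < b.start || b.finish < a.start;
}
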
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/src/nodes/composite.cpp
@@ -45,7 +45,7 @@ void Composite::selectOptimalPrimitiveDescriptor() {
std::vector<Input::InputConfig> graphInputConfig;

// @todo should be always inplace after global memory reuse is fully supported by all the nodes
bool isInPlace = context->memoryReuseGlobal();
bool isInPlace = true;

for (size_t i = 0; i < getParentEdges().size(); i++) {
auto desc = getParentOutputMemDesc(getParentEdgeAt(i));
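
The Composite change above unconditionally marks the inner graph's inputs as in-place. The intended semantics of Input::InputConfig{desc, isInPlace}, as assumed here (generic types, not the plugin's actual classes): an in-place inner input adopts the outer edge's buffer, while a non-in-place one owns a private copy.

#include <memory>

template <typename Buffer>
std::shared_ptr<Buffer> resolveSubgraphInput(const std::shared_ptr<Buffer>& outer, bool inPlace) {
    if (inPlace)
        return outer;                          // share the caller's memory, zero copy
    return std::make_shared<Buffer>(*outer);   // private copy owned by the inner graph
}
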
339 changes: 234 additions & 105 deletions src/plugins/intel_cpu/src/nodes/if.cpp

Large diffs are not rendered by default.
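
Since the if.cpp diff is not rendered, the sketch below is only a guess at the shape of the new allocation hook, inferred from the if.h changes in the next file and from the TensorIterator implementation later in this commit; it is not the commit's actual code, and the exact registration order of the two bodies is an assumption.

int If::registerToAllocationContext(int offset, AllocationContext& context) {
    // Register both bodies into the shared allocation context so that their tensors
    // participate in the same global memory-reuse solution as the outer graph.
    offset = m_thenGraph.RegisterToAllocationContext(offset, context);
    return m_elseGraph.RegisterToAllocationContext(offset, context);
}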

12 changes: 7 additions & 5 deletions src/plugins/intel_cpu/src/nodes/if.h
@@ -20,8 +20,10 @@ class If : public Node {
If(const std::shared_ptr<ov::Node>& op, const GraphContext::CPtr context);

static bool isSupportedOperation(const std::shared_ptr<const ov::Node>& op, std::string& errorMessage) noexcept;
void initSupportedPrimitiveDescriptors() override;
void getSupportedDescriptors() override;
// void initSupportedPrimitiveDescriptors() override;
void getSupportedDescriptors() override {}
void selectOptimalPrimitiveDescriptor() override;
int registerToAllocationContext(int offset, AllocationContext& context) override;
void createPrimitive() override;
bool created() const override;
void execute(dnnl::stream strm) override;
@@ -60,8 +62,8 @@
ptrdiff_t size;
};

Graph subGraphThen;
Graph subGraphElse;
Graph m_thenGraph;
Graph m_elseGraph;
std::vector<std::deque<MemoryPtr>> inputMemThen, inputMemElse;
std::deque<MemoryPtr> outputMemThen, outputMemElse;

@@ -77,7 +79,7 @@
elseInputPortMap,
elseOutputPortMap;

const std::shared_ptr<ov::Node> ovOp;
const std::shared_ptr<ov::Node> m_op;
};

} // namespace node
7 changes: 3 additions & 4 deletions src/plugins/intel_cpu/src/nodes/lora.cpp
@@ -52,7 +52,7 @@ void LoRA::selectOptimalPrimitiveDescriptor() {

inConfs.emplace_back(mainInputDesc);
// @todo should be always inplace after global memory reuse is fully supported by all the nodes
bool isInPlace = context->memoryReuseGlobal();
bool isInPlace = true;
graphInputConfig.emplace_back(node::Input::InputConfig{mainInputDesc, isInPlace});

for (size_t i = 1; i < getParentEdges().size(); i++) {
@@ -89,9 +89,8 @@ void LoRA::selectOptimalPrimitiveDescriptor() {
}

int LoRA::registerToAllocationContext(int offset, AllocationContext& context) {
if (!this->context->memoryReuseGlobal())
return Node::registerToAllocationContext(offset, context);

// if (!this->context->memoryReuseGlobal())
// return Node::registerToAllocationContext(offset, context);
for (size_t i = 0; i < getOriginalInputsNumber(); i++) {
auto parentEdge = getParentEdgeAt(i);
auto inputEdges = m_graph.GetInputNodesMap().at(i)->getChildEdgesAtPort(0);
208 changes: 186 additions & 22 deletions src/plugins/intel_cpu/src/nodes/tensoriterator.cpp
@@ -402,33 +402,203 @@ TensorIterator::TensorIterator(const std::shared_ptr<ov::Node>& op, const GraphC
}
}

void TensorIterator::getSupportedDescriptors() {
auto tiOp = ov::as_type_ptr<const ov::op::util::SubGraphOp>(ngraphOp);
if (!tiOp) {
THROW_ERROR("cannot be cast to ov::op::util::SubGraphOp");
// void TensorIterator::getSupportedDescriptors() {
void TensorIterator::selectOptimalPrimitiveDescriptor() {
// auto tiOp = ov::as_type_ptr<const ov::op::util::SubGraphOp>(ngraphOp);
// if (!tiOp) {
// THROW_ERROR("cannot be cast to ov::op::util::SubGraphOp");
// }
// const std::shared_ptr<const ov::Model> body = tiOp->get_function();
// sub_graph.CreateGraph(body, context);

// const auto &inMap = sub_graph.GetInputNodesMap();
// for (const auto &param : tiOp->get_function()->get_parameters()) {
// auto inNode = inMap.find(tiOp->get_function()->get_parameter_index(param));
// if (inNode != inMap.end()) {
// input_mems.push_back(getToMemories(inNode->second.get(), 0));
// }
// }

// const auto &outMap = sub_graph.GetOutputNodesMap();
// for (const auto &out : tiOp->get_function()->get_results()) {
// auto outNode = outMap.find(tiOp->get_function()->get_result_index(out));
// if (outNode != outMap.end()) {
// auto outMem = outNode->second->getSrcMemoryAtPort(0);
// output_mem.push_back(outMem);
// }
// }

// // Port map: outputs
// for (const auto& desc : tiOp->get_output_descriptions()) {
// auto body_output_idx = desc->m_body_value_index;

// std::string type_name = desc->get_type_info().name;
// if (type_name == "ConcatOutputDescription") {
// auto output_desc = ov::as_type_ptr<const ov::op::util::SubGraphOp::ConcatOutputDescription>(desc);
// OPENVINO_ASSERT(output_desc != nullptr);

// outputPortMap.emplace_back(PortMap {
// static_cast<int>(output_desc->m_output_index), static_cast<int>(body_output_idx),
// static_cast<int>(output_desc->m_axis), static_cast<int>(output_desc->m_stride),
// static_cast<int>(output_desc->m_start), static_cast<int>(output_desc->m_end),
// static_cast<int>(output_desc->m_part_size)});
// } else if (type_name == "BodyOutputDescription") {
// auto output_desc = ov::as_type_ptr<const ov::op::util::SubGraphOp::BodyOutputDescription>(desc);
// OPENVINO_ASSERT(output_desc != nullptr);

// outputPortMap.emplace_back(PortMap {
// static_cast<int>(output_desc->m_output_index), static_cast<int>(body_output_idx), -1, 1, 0, -1, 1});
// } else {
// OPENVINO_THROW("Incorrect type of the output description.");
// }
// }

// // Port map : inputs and back edges
// for (const auto& desc : tiOp->get_input_descriptions()) {
// auto body_input_index = desc->m_body_parameter_index;

// if (auto slice_desc = ov::as_type_ptr<const ov::op::util::SubGraphOp::SliceInputDescription>(desc)) {
// inputPortMap.emplace_back(PortMap {
// static_cast<int>(slice_desc->m_input_index), static_cast<int>(body_input_index),
// static_cast<int>(slice_desc->m_axis), static_cast<int>(slice_desc->m_stride),
// static_cast<int>(slice_desc->m_start), static_cast<int>(slice_desc->m_end),
// static_cast<int>(slice_desc->m_part_size)});
// } else if (auto merge_desc = ov::as_type_ptr<const ov::op::util::SubGraphOp::MergedInputDescription>(desc)) {
// inputPortMap.emplace_back(PortMap {
// static_cast<int>(merge_desc->m_input_index), static_cast<int>(body_input_index), -1, 1, 0, -1, 1});

// auto body_output_idx = merge_desc->m_body_value_index;

// backEdges.emplace_back(PortMap {
// static_cast<int>(body_output_idx), static_cast<int>(body_input_index), -1, 1, 0, -1, 1});
// } else if (auto inv_desc = ov::as_type_ptr<const ov::op::util::SubGraphOp::InvariantInputDescription>(desc)) {
// inputPortMap.emplace_back(PortMap {
// static_cast<int>(inv_desc->m_input_index), static_cast<int>(body_input_index), -1, 1, 0, -1, 1});
// } else {
// THROW_ERROR("has incorrect type of the input description.");
// }
// }

// if (auto loopOp = ov::as_type_ptr<const ov::op::v5::Loop>(ngraphOp)) {
// algorithm = Algorithm::TensorIteratorLoop;
// auto spec_port = loopOp->get_special_body_ports();
// if (spec_port.current_iteration_input_idx != -1) {
// loopBodyCurrentIterationIdx.push_back(spec_port.current_iteration_input_idx);
// }
// if (spec_port.body_condition_output_idx != -1) {
// loopBodyConditionOutputIdx = spec_port.body_condition_output_idx;
// }
// loopTripCountIdx = 0;
// loopExecutionConditionIdx = 1;
// } else if (auto ti = ov::as_type_ptr<const ov::op::v0::TensorIterator>(ngraphOp)) {
// algorithm = Algorithm::TensorIteratorCommon;
// } else {
// THROW_ERROR("isn't supported!");
// }
// supportedPrimitiveDescriptors.emplace_back(make_plain_config(ngraphOp), impl_desc_type::unknown);
// selectPrimitiveDescriptorByIndex(0);

// for the input configuration, just always use the parent configuration
auto subgraphOp = ov::as_type_ptr<const ov::op::util::SubGraphOp>(ngraphOp);
const auto numParameters = subgraphOp->get_function()->get_parameters().size();
const auto numResults = subgraphOp->get_function()->get_results().size();

std::vector<PortConfig> inConfs(inputShapes.size());
std::vector<PortConfig> outConfs(outputShapes.size());

std::vector<Input::InputConfig> inputConfig(numParameters);
std::vector<Input::OutputConfig> outputConfig(numResults);

// @todo should be always inplace when global memory reuse is fully supported by all the nodes
bool isInPlace = false;

for (const auto& description : subgraphOp->get_output_descriptions()) {
const auto outIdx = description->m_output_index;
const auto resultIdx = description->m_body_value_index;

const auto &origShape = subgraphOp->get_output_partial_shape(outIdx);
const auto& shape = Shape(origShape.rank().get_length() == 0 ? ov::PartialShape{1} : origShape);
const auto prec = subgraphOp->get_output_element_type(outIdx);

auto descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp);
auto desc = descCreator->createSharedDesc(prec, shape);

outConfs.at(outIdx) = PortConfig(desc);
outputConfig.at(resultIdx) = node::Input::OutputConfig{desc, isInPlace};
}

auto inputDescriptions = subgraphOp->get_input_descriptions();

for (const auto& description : inputDescriptions) {
const auto inIdx = description->m_input_index;
const auto paramIdx = description->m_body_parameter_index;

const auto &origShape = subgraphOp->get_input_partial_shape(inIdx);
const auto& shape = Shape(origShape.rank().get_length() == 0 ? ov::PartialShape{1} : origShape);
const auto prec = subgraphOp->get_input_element_type(inIdx);

auto descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp);
auto desc = descCreator->createSharedDesc(prec, shape);

// auto desc = getParentOutputMemDesc(getParentEdgeAt(inIdx));
inConfs.at(inIdx) = PortConfig(desc);
inputConfig.at(paramIdx) = node::Input::InputConfig{desc, isInPlace};
}
const std::shared_ptr<const ov::Model> body = tiOp->get_function();
sub_graph.CreateGraph(body, context);

// configure the inner graph to get the information about output memory descriptors
// sub_graph.Init(subgraphOp->get_function(), context, inputConfig, outputConfig);
sub_graph.Init(subgraphOp->get_function(), context);

// for the output descriptors, use the configuration of the graph's output nodes
// auto outputDescriptors = sub_graph.getOutputMemoryDescriptors();
// auto outputDescriptions = subgraphOp->get_output_descriptions();

// for (const auto& description : outputDescriptions) {
// auto outIdx = description->m_output_index;
// auto resultIdx = description->m_body_value_index;
// outConfs.at(outIdx) = PortConfig(outputDescriptors.at(resultIdx));
// }

const NodeConfig config(inConfs, outConfs);

supportedPrimitiveDescriptors.clear();
supportedPrimitiveDescriptors.emplace_back(make_plain_config(ngraphOp), impl_desc_type::unknown);
// supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::undef);

selectPrimitiveDescriptorByIndex(0);
}

// void TensorIterator::initSupportedPrimitiveDescriptors() {
// if (!supportedPrimitiveDescriptors.empty())
// return;

// supportedPrimitiveDescriptors.emplace_back(make_plain_config(ngraphOp), impl_desc_type::unknown);
// }

void TensorIterator::createPrimitive() {
sub_graph.Activate();

auto subgraphOp = ov::as_type_ptr<const ov::op::util::SubGraphOp>(ngraphOp);

const auto &inMap = sub_graph.GetInputNodesMap();
for (const auto &param : tiOp->get_function()->get_parameters()) {
auto inNode = inMap.find(tiOp->get_function()->get_parameter_index(param));
for (const auto &param : subgraphOp->get_function()->get_parameters()) {
auto inNode = inMap.find(subgraphOp->get_function()->get_parameter_index(param));
if (inNode != inMap.end()) {
input_mems.push_back(getToMemories(inNode->second.get(), 0));
}
}

const auto &outMap = sub_graph.GetOutputNodesMap();
for (const auto &out : tiOp->get_function()->get_results()) {
auto outNode = outMap.find(tiOp->get_function()->get_result_index(out));
for (const auto &out : subgraphOp->get_function()->get_results()) {
auto outNode = outMap.find(subgraphOp->get_function()->get_result_index(out));
if (outNode != outMap.end()) {
auto outMem = outNode->second->getSrcMemoryAtPort(0);
output_mem.push_back(outMem);
}
}

// Port map: outputs
for (const auto& desc : tiOp->get_output_descriptions()) {
for (const auto& desc : subgraphOp->get_output_descriptions()) {
auto body_output_idx = desc->m_body_value_index;

std::string type_name = desc->get_type_info().name;
@@ -453,7 +623,7 @@ void TensorIterator::getSupportedDescriptors() {
}

// Port map : inputs and back edges
for (const auto& desc : tiOp->get_input_descriptions()) {
for (const auto& desc : subgraphOp->get_input_descriptions()) {
auto body_input_index = desc->m_body_parameter_index;

if (auto slice_desc = ov::as_type_ptr<const ov::op::util::SubGraphOp::SliceInputDescription>(desc)) {
@@ -494,16 +664,7 @@ void TensorIterator::getSupportedDescriptors() {
} else {
THROW_ERROR("isn't supported!");
}
}

void TensorIterator::initSupportedPrimitiveDescriptors() {
if (!supportedPrimitiveDescriptors.empty())
return;

supportedPrimitiveDescriptors.emplace_back(make_plain_config(ngraphOp), impl_desc_type::unknown);
}

void TensorIterator::createPrimitive() {
if (loopBodyConditionOutputIdx == -1)
continue_cond_check.reset(new staticValueCheck(true)); // always true
if (loopExecutionConditionIdx == -1) {
@@ -521,6 +682,10 @@ void TensorIterator::createPrimitive() {
}
}

int TensorIterator::registerToAllocationContext(int offset, AllocationContext& context) {
return sub_graph.RegisterToAllocationContext(offset, context);
}

bool TensorIterator::needPrepareParams() const {
if (getAlgorithm() == Algorithm::TensorIteratorLoop) {
const auto tripCountPtr = getSrcDataAtPortAs<const uint32_t>(loopTripCountIdx);
@@ -876,7 +1041,6 @@ int TensorIterator::getNumIteration(const std::vector<PortMap>& inputPortMap, co
return static_cast<int>(length / step);
};


int numIterations = 1;
bool isDefault = true;
for (const auto& rule : inputPortMap) {