Skip to content

Commit

Permalink
Added block size settings via environment + perf counter only for mha
Browse files Browse the repository at this point in the history
  • Loading branch information
a-sidorova committed Nov 12, 2024
1 parent 602f701 commit 8e85f8e
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 56 deletions.
39 changes: 4 additions & 35 deletions samples/cpp/common/utils/include/samples/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -386,41 +386,10 @@ static UNUSED void printPerformanceCounts(std::vector<ov::ProfilingInfo> perform
totalTimeCpu += it.cpu_time;
}

std::string toPrint(it.node_name);
const int maxPrintLength = 20;

if (it.node_name.length() >= maxPrintLength) {
toPrint = it.node_name.substr(0, maxPrintLength - 5);
toPrint += "...";
}

stream << std::setw(maxPrintLength) << std::left << toPrint << " ";
switch (it.status) {
case ov::ProfilingInfo::Status::EXECUTED:
stream << std::setw(21) << std::left << "EXECUTED ";
break;
case ov::ProfilingInfo::Status::NOT_RUN:
stream << std::setw(21) << std::left << "NOT_RUN ";
break;
case ov::ProfilingInfo::Status::OPTIMIZED_OUT:
stream << std::setw(21) << std::left << "OPTIMIZED_OUT ";
break;
}

stream << "layerType: ";
if (it.node_type.length() >= maxPrintLength) {
stream << std::setw(maxPrintLength) << std::left << it.node_type.substr(0, maxPrintLength - 3) + "..."
<< " ";
} else {
stream << std::setw(maxPrintLength) << std::left << it.node_type << " ";
}

stream << std::setw(30) << std::left << "execType: " + std::string(it.exec_type) << " ";
stream << "realTime (ms): " << std::setw(10) << std::left << std::fixed << std::setprecision(3)
<< it.real_time.count() / 1000.0 << " ";
stream << "cpuTime (ms): " << std::setw(10) << std::left << std::fixed << std::setprecision(3)
<< it.cpu_time.count() / 1000.0 << " ";
stream << std::endl;
if (it.node_name.find("_MHA") == std::string::npos)
continue;

stream << "SDPA_TIME: " << std::fixed << std::setprecision(3) << it.cpu_time.count() / 1000.0 << std::endl;
}
stream << std::setw(25) << std::left << "Total time: " << std::fixed << std::setprecision(3)
<< totalTime.count() / 1000.0 << " milliseconds" << std::endl;
Expand Down
2 changes: 1 addition & 1 deletion src/common/snippets/src/pass/mha_tokenization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken
auto subgraph = std::make_shared<op::Subgraph>(subgraph_inputs, body);
// Copy runtime info from last node to subgraph - to copy topological order
copy_runtime_info(last_node, subgraph);
subgraph->set_friendly_name(last_node->get_friendly_name());
subgraph->set_friendly_name(last_node->get_friendly_name() + "_MHA");

for (size_t i = 0; i < subgraph->get_output_size(); ++i) {
for (const auto& target_input : subgraph_result_inputs[i]) {
Expand Down
28 changes: 14 additions & 14 deletions src/inference/src/dev/isync_infer_request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -273,20 +273,20 @@ void ov::ISyncInferRequest::check_tensor(const ov::Output<const ov::Node>& port,
tensor->get_element_type(),
" != ",
port.get_element_type());
bool is_dynamic = port.get_partial_shape().is_dynamic();
OPENVINO_ASSERT(is_dynamic || port.get_shape() == tensor->get_shape(),
"The ",
tensor_type,
" tensor size is not equal to the model ",
tensor_type,
" type: got ",
tensor->get_shape(),
" expecting ",
port.get_shape(),
".");
OPENVINO_ASSERT(
std::dynamic_pointer_cast<ov::IRemoteTensor>(tensor._ptr) || tensor->data() != nullptr || is_dynamic,
"Tensor data equal nullptr!");
//bool is_dynamic = port.get_partial_shape().is_dynamic();
//OPENVINO_ASSERT(is_dynamic || port.get_shape() == tensor->get_shape(),
// "The ",
// tensor_type,
// " tensor size is not equal to the model ",
// tensor_type,
// " type: got ",
// tensor->get_shape(),
// " expecting ",
// port.get_shape(),
// ".");
//OPENVINO_ASSERT(
// std::dynamic_pointer_cast<ov::IRemoteTensor>(tensor._ptr) || tensor->data() != nullptr || is_dynamic,
// "Tensor data equal nullptr!");
}

void ov::ISyncInferRequest::allocate_tensor(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,18 +48,34 @@ size_t BrgemmCPUBlocking::get_default_n_blk(size_t n) const {
return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) ? 64 : 24;
}

static size_t count = 0;

std::tuple<size_t, size_t, size_t> BrgemmCPUBlocking::get_blocking_params(const ov::snippets::lowered::ExpressionPtr& brgemm_expr) const {
const auto brgemm = ov::as_type_ptr<ov::intel_cpu::BrgemmCPU>(brgemm_expr->get_node());
OPENVINO_ASSERT(brgemm, "BrgemmCPU is expected!");

size_t m_blk, n_blk, k_blk;
std::tie(m_blk, n_blk, k_blk) = BrgemmBlockingBase::get_blocking_params(brgemm_expr);
// Note: K,N blocking is functionally enabled, need to turn it on after blocking heuristic is updated to cover
// the low precision cases (ticket: 156014)
if (with_repacking(brgemm->get_type())) {
n_blk = get_full_dim_value();
k_blk = get_full_dim_value();
}
//// Note: K,N blocking is functionally enabled, need to turn it on after blocking heuristic is updated to cover
//// the low precision cases (ticket: 156014)
//if (with_repacking(brgemm->get_type())) {
// n_blk = get_full_dim_value();
// k_blk = get_full_dim_value();
//}

auto m_string = std::string("M_blk");
auto k_string = std::string("K" + std::to_string(count % 2) + "_blk");
auto n_string = std::string("N" + std::to_string(count % 2) + "_blk");

if (const auto* m = std::getenv(m_string.c_str()))
m_blk = atoi(m);
if (const auto* k = std::getenv(k_string.c_str()))
k_blk = atoi(k);
if (const auto* n = std::getenv(n_string.c_str()))
n_blk = atoi(n);

++count;

return std::make_tuple(m_blk, n_blk, k_blk);
}

Expand Down

0 comments on commit 8e85f8e

Please sign in to comment.