Added block size settings via environment + perf counter only for mha

a-sidorova · Nov 12, 2024 · 8e85f8e · 8e85f8e
1 parent 602f701
commit 8e85f8e
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 56 deletions.
diff --git a/samples/cpp/common/utils/include/samples/common.hpp b/samples/cpp/common/utils/include/samples/common.hpp
@@ -386,41 +386,10 @@ static UNUSED void printPerformanceCounts(std::vector<ov::ProfilingInfo> perform
             totalTimeCpu += it.cpu_time;
         }
 
-        std::string toPrint(it.node_name);
-        const int maxPrintLength = 20;
-
-        if (it.node_name.length() >= maxPrintLength) {
-            toPrint = it.node_name.substr(0, maxPrintLength - 5);
-            toPrint += "...";
-        }
-
-        stream << std::setw(maxPrintLength) << std::left << toPrint << " ";
-        switch (it.status) {
-        case ov::ProfilingInfo::Status::EXECUTED:
-            stream << std::setw(21) << std::left << "EXECUTED ";
-            break;
-        case ov::ProfilingInfo::Status::NOT_RUN:
-            stream << std::setw(21) << std::left << "NOT_RUN ";
-            break;
-        case ov::ProfilingInfo::Status::OPTIMIZED_OUT:
-            stream << std::setw(21) << std::left << "OPTIMIZED_OUT ";
-            break;
-        }
-
-        stream << "layerType: ";
-        if (it.node_type.length() >= maxPrintLength) {
-            stream << std::setw(maxPrintLength) << std::left << it.node_type.substr(0, maxPrintLength - 3) + "..."
-                   << " ";
-        } else {
-            stream << std::setw(maxPrintLength) << std::left << it.node_type << " ";
-        }
-
-        stream << std::setw(30) << std::left << "execType: " + std::string(it.exec_type) << " ";
-        stream << "realTime (ms): " << std::setw(10) << std::left << std::fixed << std::setprecision(3)
-               << it.real_time.count() / 1000.0 << " ";
-        stream << "cpuTime (ms): " << std::setw(10) << std::left << std::fixed << std::setprecision(3)
-               << it.cpu_time.count() / 1000.0 << " ";
-        stream << std::endl;
+        if (it.node_name.find("_MHA") == std::string::npos)
+            continue;
+
+        stream << "SDPA_TIME: " << std::fixed << std::setprecision(3) << it.cpu_time.count() / 1000.0 << std::endl;
     }
     stream << std::setw(25) << std::left << "Total time: " << std::fixed << std::setprecision(3)
            << totalTime.count() / 1000.0 << " milliseconds" << std::endl;

diff --git a/src/common/snippets/src/pass/mha_tokenization.cpp b/src/common/snippets/src/pass/mha_tokenization.cpp
@@ -563,7 +563,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken
         auto subgraph = std::make_shared<op::Subgraph>(subgraph_inputs, body);
         // Copy runtime info from last node to subgraph - to copy topological order
         copy_runtime_info(last_node, subgraph);
-        subgraph->set_friendly_name(last_node->get_friendly_name());
+        subgraph->set_friendly_name(last_node->get_friendly_name() + "_MHA");
 
         for (size_t i = 0; i < subgraph->get_output_size(); ++i) {
             for (const auto& target_input : subgraph_result_inputs[i]) {

diff --git a/src/inference/src/dev/isync_infer_request.cpp b/src/inference/src/dev/isync_infer_request.cpp
@@ -273,20 +273,20 @@ void ov::ISyncInferRequest::check_tensor(const ov::Output<const ov::Node>& port,
                     tensor->get_element_type(),
                     " != ",
                     port.get_element_type());
-    bool is_dynamic = port.get_partial_shape().is_dynamic();
-    OPENVINO_ASSERT(is_dynamic || port.get_shape() == tensor->get_shape(),
-                    "The ",
-                    tensor_type,
-                    " tensor size is not equal to the model ",
-                    tensor_type,
-                    " type: got ",
-                    tensor->get_shape(),
-                    " expecting ",
-                    port.get_shape(),
-                    ".");
-    OPENVINO_ASSERT(
-        std::dynamic_pointer_cast<ov::IRemoteTensor>(tensor._ptr) || tensor->data() != nullptr || is_dynamic,
-        "Tensor data equal nullptr!");
+    //bool is_dynamic = port.get_partial_shape().is_dynamic();
+    //OPENVINO_ASSERT(is_dynamic || port.get_shape() == tensor->get_shape(),
+    //                "The ",
+    //                tensor_type,
+    //                " tensor size is not equal to the model ",
+    //                tensor_type,
+    //                " type: got ",
+    //                tensor->get_shape(),
+    //                " expecting ",
+    //                port.get_shape(),
+    //                ".");
+    //OPENVINO_ASSERT(
+    //    std::dynamic_pointer_cast<ov::IRemoteTensor>(tensor._ptr) || tensor->data() != nullptr || is_dynamic,
+    //    "Tensor data equal nullptr!");
 }
 
 void ov::ISyncInferRequest::allocate_tensor(

diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp
@@ -48,18 +48,34 @@ size_t BrgemmCPUBlocking::get_default_n_blk(size_t n) const {
     return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) ? 64 : 24;
 }
 
+static size_t count = 0;
+
 std::tuple<size_t, size_t, size_t> BrgemmCPUBlocking::get_blocking_params(const ov::snippets::lowered::ExpressionPtr& brgemm_expr) const {
     const auto brgemm = ov::as_type_ptr<ov::intel_cpu::BrgemmCPU>(brgemm_expr->get_node());
     OPENVINO_ASSERT(brgemm, "BrgemmCPU is expected!");
 
     size_t m_blk, n_blk, k_blk;
     std::tie(m_blk, n_blk, k_blk) = BrgemmBlockingBase::get_blocking_params(brgemm_expr);
-    // Note: K,N blocking is functionally enabled, need to turn it on after blocking heuristic is updated to cover
-    // the low precision cases (ticket: 156014)
-    if (with_repacking(brgemm->get_type())) {
-        n_blk = get_full_dim_value();
-        k_blk = get_full_dim_value();
-    }
+    //// Note: K,N blocking is functionally enabled, need to turn it on after blocking heuristic is updated to cover
+    //// the low precision cases (ticket: 156014)
+    //if (with_repacking(brgemm->get_type())) {
+    //    n_blk = get_full_dim_value();
+    //    k_blk = get_full_dim_value();
+    //}
+
+    auto m_string = std::string("M_blk");
+    auto k_string = std::string("K" + std::to_string(count % 2) + "_blk");
+    auto n_string = std::string("N" + std::to_string(count % 2) + "_blk");
+
+    if (const auto* m = std::getenv(m_string.c_str()))
+        m_blk = atoi(m);
+    if (const auto* k = std::getenv(k_string.c_str()))
+        k_blk = atoi(k);
+    if (const auto* n = std::getenv(n_string.c_str()))
+        n_blk = atoi(n);
+
+    ++count;
+
     return std::make_tuple(m_blk, n_blk, k_blk);
 }