[Snippets] Add debug caps for dumping snippets parameters #28378

Status: Open. Wants to merge 3 commits into base `master`.
20 changes: 20 additions & 0 deletions src/common/snippets/docs/debug_capabilities/parameters_dump.md
@@ -0,0 +1,20 @@
# Snippet parameters dump

The pass dumps selected properties of performance-critical operations inside Subgraphs. Only MatMul operations are currently supported.

To enable the parameters dump, use the following environment variable:
```sh
OV_SNIPPETS_DUMP_BRGEMM_PARAMS="path=<path_to_csv_dump_file>" binary ...
```

Example:
```sh
OV_SNIPPETS_DUMP_BRGEMM_PARAMS="path=brgemm.csv" binary ...
```

Output example:

> **Reviewer:** Please add a line break, so the output will be displayed as a table.
> **Author:** Done

| subgraph_name | name | in_type | out_type | in_shapes | out_shapes | in_layouts | out_layouts | M | N | K | m_block | n_block | k_block | acc_max_time | avg_max_time |
|--------------------|------------|-------------|----------|-------------------------------------|----------------------|--------------------------|-------------|-----|-----|-----|---------|----------|----------|---------------|---------------|
| FakeQuantize_457 | MatMul_438 | i8;i8;f32 | i32 | 1 16 128 64;1 16 64 128;1 16 64 128 | 1 16 128 128 | 0 2 1 3;0 1 2 3;0 1 2 3; | 0 1 2 3; | 128 | 128 | 64 | 32 | FULL_DIM | FULL_DIM | 41482 | 5185 |
| FakeQuantize_457 | MatMul_452 | u8;i8 | i32 | 1 16 128 128;1 16 128 64 | 1 16 128 64 | 0 1 2 3;0 1 2 3; | 0 1 2 3; | 128 | 64 | 128 | 32 | FULL_DIM | FULL_DIM | 39427 | 4928 |
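Since the dump is plain CSV, it can be post-processed with any tooling. As a hypothetical example (not part of this PR; `slowest_matmul` and `split_csv_row` are illustration-only helpers), a small C++ routine that picks the MatMul with the largest `avg_max_time` from such a file:

```cpp
#include <cassert>
#include <cstdint>
#include <istream>
#include <sstream>
#include <string>
#include <vector>

// Split one dump row into fields: ',' separates columns, ';' is used inside a column.
std::vector<std::string> split_csv_row(const std::string& line) {
    std::vector<std::string> fields;
    std::stringstream ss(line);
    std::string field;
    while (std::getline(ss, field, ','))
        fields.push_back(field);
    return fields;
}

// Return the "name" column of the row with the largest avg_max_time (the last column).
std::string slowest_matmul(std::istream& csv) {
    std::string line;
    std::getline(csv, line);  // skip the header row
    std::string best_name;
    uint64_t best_avg = 0;
    while (std::getline(csv, line)) {
        const auto fields = split_csv_row(line);
        if (fields.size() < 2)
            continue;
        const auto avg = std::stoull(fields.back());
        if (avg >= best_avg) {
            best_avg = avg;
            best_name = fields[1];  // column 1 holds the MatMul friendly name
        }
    }
    return best_name;
}
```

For the table above, this would report `MatMul_438` (avg 5185 ns vs. 4928 ns).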
@@ -0,0 +1,170 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#ifdef SNIPPETS_DEBUG_CAPS

#pragma once

#include "snippets/itt.hpp"
#include "snippets/lowered/loop_manager.hpp"
#include "snippets/lowered/specific_loop_iter_handlers.hpp"
#include "snippets/lowered/pass/iter_handler.hpp"
#include "snippets/op/brgemm.hpp"
#include "snippets/utils/utils.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

/**
* @interface BrgemmDebugParams
* @brief Brgemm parameters dump pass
* @ingroup snippets
*/
template <typename BRGEMM_TYPE,
typename std::enable_if<std::is_base_of<ov::snippets::op::Brgemm, BRGEMM_TYPE>::value, bool>::type = true>
class BrgemmDebugParams : public snippets::lowered::pass::RangedPass {
> **Reviewer (on lines +26 to +28):** Do we really need to implement this pass as a template? What is the reasoning behind this? I'm sure that in the absolute majority of use cases we'll be interested in BrgemmCPU. We can also map onto snippets::op::Brgemm if we want to support all Brgemm types.

public:
BrgemmDebugParams(const std::string& subgraph_name) : m_subgraph_name(subgraph_name) {}
> **Reviewer:** Let's try to come up with more descriptive naming. Since this pass inserts some expressions into LIR, it should probably have "insert" in its name (like insert_buffers or insert_load_store). Moreover, there is already a very similar pass InsertPerfCount, so we should name this one in a similar way: something like InsertPerfCountVerbose or InsertPerfCountCsvDump.

OPENVINO_RTTI("BrgemmDebugParams", "", RangedPass);

bool run(snippets::lowered::LinearIR& linear_ir,
snippets::lowered::LinearIR::constExprIt begin,
snippets::lowered::LinearIR::constExprIt end) override final { // NOLINT
> **Reviewer:** Are you sure that we'll never want to override this method?

OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmDebugParams")
if (linear_ir.get_config().debug_config.dumpParams.csv_path.empty()) {
return false;
}
static size_t seq_number = 0;
bool modified = false;
auto csv_path = linear_ir.get_config().debug_config.dumpParams.csv_path;
for (auto expr_it = begin; expr_it != end; expr_it++) {
const auto& brgemm_expr = *expr_it;
const auto brgemm = ov::as_type_ptr<BRGEMM_TYPE>(brgemm_expr->get_node());
if (!brgemm)
continue;
// Collect brgemm parameters
auto params = collect_params(brgemm_expr, linear_ir);
const auto& perf_count_begin = std::make_shared<snippets::op::PerfCountBegin>();
perf_count_begin->set_friendly_name(std::string("PerfCount_Begin_") + std::to_string(seq_number) +
"_DebugParams");

> **Reviewer:** It's not a very good practice to use the node name to communicate information ("_DebugParams"). If we need a PerfCountBegin with some special properties/functionality, then we should probably create a derived class for this.
> **UPD:** But even if we don't want to create a dedicated class for some reason, I still don't see why we should rename the node, since we already "marked" it by adding meta information to rt_info.
const auto empty_inputs = std::vector<PortConnectorPtr>{};
linear_ir.insert_node(perf_count_begin, empty_inputs, expr_it->get()->get_loop_ids(), false, expr_it);

const auto& perf_count_end = std::make_shared<snippets::op::PerfCountEnd>(perf_count_begin->output(0));
perf_count_end->set_friendly_name(std::string("PerfCount_End_") + std::to_string(seq_number) +
"_DebugParams");
// Attach brgemm parameters to PerfCountEnd node
perf_count_end->get_rt_info()["brgemm_params"] = params;
perf_count_end->get_rt_info()["brgemm_params_csv_path"] = csv_path;
> **Reviewer:** I have two concerns with respect to csv_path:
> 1. All of the perf count nodes should have the same csv_path. This assumption is used implicitly in the node implementation (dump once, when the last node is destroyed), but never enforced. For example, I could update this pass so that a different csv_path is set in rt_info for every counter (or a group of counters), but this won't work as expected.
> 2. It's excessive to store csv_path in every rt_info when we actually need one (and only one). It looks like we indeed need a dedicated class with a proper setter for this static variable.

linear_ir.insert_node(perf_count_end, empty_inputs, expr_it->get()->get_loop_ids(), false, next(expr_it));
seq_number++;
modified = true;
}
return modified;
}

private:
std::string collect_params(const ov::snippets::lowered::ExpressionPtr& brgemm_expr,
const snippets::lowered::LinearIR& linear_ir) {
const auto brgemm = ov::as_type_ptr<BRGEMM_TYPE>(brgemm_expr->get_node());
OPENVINO_ASSERT(brgemm, "Brgemm is nullptr!");
std::stringstream ss;
ss << m_subgraph_name << ',';
ss << brgemm_expr->get_node()->get_friendly_name() << ',';
for (size_t i = 0; i < brgemm->get_input_size(); ++i) {
ss << brgemm->get_input_element_type(i);
if (i != brgemm->get_input_size() - 1) {
ss << ';';
}
}
ss << ',';
for (size_t i = 0; i < brgemm->get_output_size(); ++i) {
ss << brgemm->get_output_element_type(i);
if (i != brgemm->get_output_size() - 1) {
ss << ';';
}
}
ss << ',';
for (size_t i = 0; i < brgemm->inputs().size(); ++i) {
const auto& port_desc = brgemm_expr->get_input_port_descriptor(i);
const auto& shape = ov::snippets::utils::get_planar_vdims(port_desc->get_shape(), port_desc->get_layout());
ss << utils::tensor2str(shape, " ");
ss << ';';
}
ss.seekp(-1, ss.cur);
ss << ',';
for (size_t i = 0; i < brgemm->outputs().size(); ++i) {
const auto& port_desc = brgemm_expr->get_output_port_descriptor(i);
const auto& shape =
ov::snippets::utils::get_preordered_vdims(port_desc->get_shape(), port_desc->get_layout());
ss << utils::tensor2str(shape, " ");
ss << ';';
}
ss.seekp(-1, ss.cur);
ss << ',';
for (size_t i = 0; i < brgemm->inputs().size(); ++i) {
const auto& port_desc = brgemm_expr->get_input_port_descriptor(i);
ss << utils::tensor2str(port_desc->get_layout(), " ");
ss << ';';
}
ss << ',';
for (size_t i = 0; i < brgemm->outputs().size(); ++i) {
const auto& port_desc = brgemm_expr->get_output_port_descriptor(i);
ss << utils::tensor2str(port_desc->get_layout(), " ");
ss << ';';
}
ss << ',';

const auto& in_0_desc = brgemm_expr->get_input_port_descriptor(0);
const auto& in_1_desc = brgemm_expr->get_input_port_descriptor(1);
const auto& out_desc = brgemm_expr->get_output_port_descriptor(0);

const auto& in_0_planar_dims =
ov::snippets::utils::get_planar_vdims(in_0_desc->get_shape(), in_0_desc->get_layout());
const auto& in_1_planar_dims =
ov::snippets::utils::get_planar_vdims(in_1_desc->get_shape(), in_1_desc->get_layout());
const auto& out_preordered_dims =
ov::snippets::utils::get_preordered_vdims(out_desc->get_shape(), out_desc->get_layout());

const auto& m = *++out_preordered_dims.rbegin();
const auto& n = *out_preordered_dims.rbegin();
const auto& k0 = *in_0_planar_dims.rbegin();
const auto& k1 = *++in_1_planar_dims.rbegin();
size_t k = 0;
OPENVINO_ASSERT(utils::merge_dynamic_dim(k, k0, k1),
"Brgemm input descriptors have incompatible K dimension value.");
ss << static_cast<int64_t>(m) << ',' << static_cast<int64_t>(n) << ',' << static_cast<int64_t>(k) << ',';

size_t m_block = in_0_desc->get_subtensor().front();
size_t n_block = in_1_desc->get_subtensor().back();
size_t k_block = out_desc->get_subtensor().back();

auto append_block_info = [&](size_t block) {
if (block == utils::get_full_dim_value()) {
ss << "FULL_DIM";
} else if (block == utils::get_dynamic_value<size_t>()) {
ss << "?";
} else {
ss << block;
}
ss << ',';
};

append_block_info(m_block);
append_block_info(n_block);
append_block_info(k_block);
return ss.str();
}
> **Reviewer (on lines +72 to +160):** I don't see anything specific to a particular BRGEMM_TYPE here, so it's another point to consider mapping onto snippets::op::Brgemm.


std::string m_subgraph_name;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov

#endif // SNIPPETS_DEBUG_CAPS
13 changes: 9 additions & 4 deletions src/common/snippets/include/snippets/op/perf_count.hpp
@@ -74,20 +74,25 @@ class PerfCountEnd : public PerfCountEndBase {
public:
OPENVINO_OP("PerfCountEnd", "SnippetsOpset", PerfCountEndBase);
PerfCountEnd(const Output<Node>& pc_begin);
PerfCountEnd() = default;
~PerfCountEnd() {
output_perf_count();
}
PerfCountEnd();
~PerfCountEnd();

void output_perf_count();
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;

void init_pc_begin();
void set_accumulated_time();

void dump_brgemm_params_to_csv();

private:
ov::threading::ThreadLocal<uint64_t> accumulation;
ov::threading::ThreadLocal<uint32_t> iteration;
std::shared_ptr<PerfCountBegin> m_pc_begin = nullptr;

static std::string brgemm_csv_path;
static std::map<std::string, std::string> m_debug_params_map;
static size_t nodes_count;
};

} // namespace op
@@ -60,6 +60,15 @@ class DebugCapsConfig {
}
} dumpLIR;

struct : PropertyGroup {
std::string csv_path;
std::vector<PropertySetterPtr> getPropertySetters() override {
return {
PropertySetterPtr(new StringPropertySetter("path", csv_path, "path to dumped brgemm params")),
};
}
} dumpParams;

// Snippets performance count mode
// Disabled - default, w/o perf count for snippets
// Chrono - perf count with chrono call. This is a universal method, and support multi-thread case to output perf
9 changes: 9 additions & 0 deletions src/common/snippets/include/snippets/utils/utils.hpp
@@ -324,6 +324,15 @@ void visit_path(const lowered::ExpressionPtr& expr,
std::function<void(lowered::ExpressionPtr)> func,
bool visit_parent_path);

/**
* @brief Converts a tensor to a string representation.
* Each value in the tensor is converted to a string. If the value is a full dimension, it is represented as
* "FULL_DIM". If the value is dynamic, it is represented as "?".
* @param tensor The tensor to be converted to a string.
* @return A string representation of the tensor.
*/
std::string tensor2str(const VectorDims& tensor, const std::string& delimiter = ", ");

} // namespace utils
} // namespace snippets
} // namespace ov
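The body of `tensor2str` is not visible in this diff, but judging from the `subtensor2str` lambda it replaces in expression.cpp, it presumably behaves like the sketch below. The sentinel constants are stand-ins for `utils::get_full_dim_value()` and `utils::get_dynamic_value<size_t>()`; their actual values are assumptions:

```cpp
#include <cassert>
#include <cstddef>
#include <limits>
#include <sstream>
#include <string>
#include <vector>

using VectorDims = std::vector<size_t>;

// Assumed sentinels; the real ones come from snippets/utils.
constexpr size_t kFullDim = std::numeric_limits<size_t>::max();
constexpr size_t kDynamic = std::numeric_limits<size_t>::max() - 1;

// Render each dim as a number, "FULL_DIM" for a full dimension, or "?" for a dynamic one.
std::string tensor2str(const VectorDims& tensor, const std::string& delimiter = ", ") {
    std::stringstream ss;
    for (size_t i = 0; i < tensor.size(); ++i) {
        const auto& v = tensor[i];
        const auto v_str = v == kFullDim  ? std::string("FULL_DIM")
                           : v == kDynamic ? std::string("?")
                                           : std::to_string(v);
        ss << v_str << (i + 1 < tensor.size() ? delimiter : "");
    }
    return ss.str();
}
```

With the `" "` delimiter this produces exactly the shape/layout cells seen in the CSV example, e.g. `1 16 128 64`.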
16 changes: 2 additions & 14 deletions src/common/snippets/src/lowered/expression.cpp
@@ -170,18 +170,6 @@ ExpressionPtr Expression::clone() const {
}

bool Expression::visit_attributes(AttributeVisitor &visitor) {
auto subtensor2str = [](const VectorDims& subtensor) {
std::stringstream ss;
for (size_t i = 0; i < subtensor.size(); ++i) {
const auto& v = subtensor[i];
const auto v_str = utils::is_full_dim_value(v) ? "FULL_DIM" :
utils::is_dynamic_value(v) ? "?" : std::to_string(v);
const auto del = i < subtensor.size() - 1 ? ", " : "";
ss << v_str << del;
}
return ss.str();
};

std::ostringstream in_regs, out_regs;
std::vector<std::pair<std::string, ov::PartialShape>> shapes;
std::vector<std::pair<std::string, std::string>> subtensors;
@@ -194,7 +182,7 @@ bool Expression::visit_attributes(AttributeVisitor &visitor) {

const auto& subtensor = desc->get_subtensor();
if (!subtensor.empty())
subtensors.emplace_back("in_subtensor_" + std::to_string(i), subtensor2str(subtensor));
subtensors.emplace_back("in_subtensor_" + std::to_string(i), utils::tensor2str(subtensor));

const auto& layout = desc->get_layout();
if (!layout.empty() && !utils::is_planar_layout(layout))
Expand All @@ -210,7 +198,7 @@ bool Expression::visit_attributes(AttributeVisitor &visitor) {

const auto& subtensor = desc->get_subtensor();
if (!subtensor.empty())
subtensors.emplace_back("out_subtensor_" + std::to_string(i), subtensor2str(subtensor));
subtensors.emplace_back("out_subtensor_" + std::to_string(i), utils::tensor2str(subtensor));

const auto& layout = desc->get_layout();
if (!layout.empty() && !utils::is_planar_layout(layout))
11 changes: 8 additions & 3 deletions src/common/snippets/src/lowered/pass/validate.cpp
@@ -153,9 +153,14 @@ bool Validate::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lo
if (found != m_validation_map.cend()) {
(found->second)(expr, linear_ir);
}
OPENVINO_ASSERT(expr->get_output_count() == node->get_output_size() ||
ov::is_type<op::LoopEnd>(node) ||
ov::is_type<ov::op::v0::Result>(node), "Incorrect count of output port descriptors!");
bool bypass_output_size_check =
#ifdef SNIPPETS_DEBUG_CAPS
ov::is_type<snippets::op::PerfCountBegin>(node) || ov::is_type<snippets::op::PerfCountEnd>(node) ||
#endif // SNIPPETS_DEBUG_CAPS
ov::is_type<op::LoopEnd>(node) || ov::is_type<ov::op::v0::Result>(node);

OPENVINO_ASSERT(expr->get_output_count() == node->get_output_size() || bypass_output_size_check,
"Incorrect count of output port descriptors!");
expr->validate();
// Loop expr doesn't have shapes and layouts
if (!ov::is_type<op::LoopBase>(node))
56 changes: 55 additions & 1 deletion src/common/snippets/src/op/perf_count.cpp
@@ -3,6 +3,8 @@
//
#ifdef SNIPPETS_DEBUG_CAPS

#include <fstream>

#include "snippets/op/perf_count.hpp"

namespace ov {
@@ -62,9 +64,30 @@ void PerfCountBegin::set_start_time() {
}

//////////////////PerfCountEnd///////////////
PerfCountEnd::PerfCountEnd(const Output<Node>& pc_begin) : PerfCountEndBase({pc_begin}), accumulation(0ul), iteration(0u) {

size_t PerfCountEnd::nodes_count = 0;
std::map<std::string, std::string> PerfCountEnd::m_debug_params_map;
std::string PerfCountEnd::brgemm_csv_path; // NOLINT

PerfCountEnd::PerfCountEnd() : PerfCountEndBase() {
++nodes_count;
}

PerfCountEnd::PerfCountEnd(const Output<Node>& pc_begin)
: PerfCountEndBase({pc_begin}),
accumulation(0ul),
iteration(0u) {
constructor_validate_and_infer_types();
init_pc_begin();
++nodes_count;
}

PerfCountEnd::~PerfCountEnd() {
output_perf_count();
--nodes_count;
if (nodes_count == 0) {
dump_brgemm_params_to_csv();
}
> **Reviewer (on lines +87 to +90):** It's quite an elegant solution to minimize dumping overheads 👍
> I think we should align the dumping behavior for the default and csv modes. One way to do it is to implement the csv-related functionality in a derived class and use counters like here.
> But if you think about it, what you do here is mimicking shared-pointer functionality in a way. So why not take another step and move the dumping functionality to a separate class altogether? It would be more scalable and convenient, since different types of output could be supported, or different groups of nodes could output to different files (it might be convenient to dump MatMul params and, say, generic counters to separate files). Here is how it might look:
> 1. A developer creates a dumper class first and passes relevant arguments to its constructor (the csv path in our case, or nothing for a std::cout dumper).
> 2. The dumper instance is then passed to every PerfCountEnd node in the constructor (through a shared_ptr).
> 3. PerfCountEnd calls dumper::update(this) in its destructor to add a row to the table (or m_debug_params_map).
> 4. When the last PerfCountEnd is deleted, the dumper destructor is called, where we can conveniently write all accumulated table rows to whatever output we support.
>
> What do you think?
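The shared-ownership dumper described in this suggestion could be sketched roughly as follows. `PerfCountCsvDumper` and `PerfCountEndSketch` are hypothetical names used only for illustration, not types from this PR:

```cpp
#include <cassert>
#include <cstdint>
#include <fstream>
#include <map>
#include <memory>
#include <sstream>
#include <string>

// Step 1: the developer constructs the dumper with its output target.
class PerfCountCsvDumper {
public:
    explicit PerfCountCsvDumper(std::string csv_path) : m_csv_path(std::move(csv_path)) {}

    // Step 3: every PerfCountEnd contributes its row on destruction.
    void update(const std::string& name, uint64_t acc_max, uint64_t avg_max) {
        std::ostringstream row;
        row << name << ',' << acc_max << ',' << avg_max;
        m_rows[name] = row.str();
    }

    // Step 4: when the last shared_ptr owner is gone, flush everything at once.
    ~PerfCountCsvDumper() {
        if (m_rows.empty())
            return;
        std::ofstream csv(m_csv_path);
        csv << "name,acc_max_time,avg_max_time\n";
        for (const auto& kv : m_rows)
            csv << kv.second << '\n';
    }

private:
    std::string m_csv_path;
    std::map<std::string, std::string> m_rows;
};

// Step 2: a stand-in for PerfCountEnd that holds the dumper through a shared_ptr,
// so the dumper outlives every node and flushes exactly once.
struct PerfCountEndSketch {
    std::shared_ptr<PerfCountCsvDumper> dumper;
    std::string name;
    uint64_t acc_max = 0;
    uint64_t avg_max = 0;
    ~PerfCountEndSketch() { dumper->update(name, acc_max, avg_max); }
};
```

This keeps the "dump on last destruction" behavior without static state: the csv path lives in one object, and a std::cout dumper (or a second file for generic counters) would just be another dumper instance.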

}

std::shared_ptr<Node> PerfCountEnd::clone_with_new_inputs(const OutputVector& inputs) const {
@@ -109,6 +132,37 @@ void PerfCountEnd::output_perf_count() {
std::cout << "max accumulated time:" << acc_max << "ns" << std::endl;
// max avg
std::cout << "max avg time:" << avg_max << "ns" << std::endl;

// Dump brgemm debug parameters to csv file
if (acc_max != 0 && avg_max != 0 && get_friendly_name().find("_DebugParams") != std::string::npos) {
> **Reviewer:** What is the reason for these conditions on acc_max and avg_max? Why can't they be 0? Also, "_DebugParams" should not be used like this, as discussed in the transformation file.

const auto& rt_info = get_rt_info();
auto brgemm_params_it = rt_info.find("brgemm_params");
if (brgemm_params_it == rt_info.end()) {
return;
}
if (brgemm_csv_path.empty()) {
auto brgemm_csv_path_it = rt_info.find("brgemm_params_csv_path");
if (brgemm_csv_path_it != rt_info.end()) {
brgemm_csv_path = brgemm_csv_path_it->second.as<std::string>();
}
}
m_debug_params_map[get_friendly_name()] =
brgemm_params_it->second.as<std::string>() + std::to_string(acc_max) + ',' + std::to_string(avg_max);
}
}

void PerfCountEnd::dump_brgemm_params_to_csv() {
if (m_debug_params_map.empty() || brgemm_csv_path.empty()) {
return;
}
std::ofstream csv_file(brgemm_csv_path);
OPENVINO_ASSERT(csv_file.is_open(), "Failed to open csv file for brgemm debug parameters.");
csv_file << "subgraph_name,name,in_type,out_type,in_shapes,out_shapes,in_layouts,out_layouts,M,N,K,m_block,"
"n_block,k_block,acc_max_time,avg_max_time\n";
for (const auto& [_, params] : m_debug_params_map) {
csv_file << params << '\n';
}
csv_file.close();
}

} // namespace op
3 changes: 3 additions & 0 deletions src/common/snippets/src/utils/debug_caps_config.cpp
@@ -22,6 +22,9 @@ void DebugCapsConfig::readProperties() {
dumpLIR.parseAndSet(envVarValue);
OPENVINO_ASSERT(!dumpLIR.passes.empty(), "Passes option in OV_SNIPPETS_DUMP_LIR must be provided.");
}
if ((envVarValue = readEnv("OV_SNIPPETS_DUMP_BRGEMM_PARAMS"))) {
dumpParams.parseAndSet(envVarValue);
}
}

void DebugCapsConfig::PropertyGroup::parseAndSet(const std::string& str) {