Commit

Merge branch 'master' into ImportExport
sbalandi authored May 2, 2024
2 parents 40e198a + 853a48a commit 45e630e
Showing 46 changed files with 1,452 additions and 666 deletions.
8 changes: 8 additions & 0 deletions CMakeLists.txt
@@ -33,6 +33,14 @@ if(POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
endif()

if(POLICY CMP0149)
# VS generator looks for most recent Windows SDK, ignoring
# CMAKE_SYSTEM_VERSION and allowing override by WindowsSDKVersion
# environment variable. New in 3.27. This is to allow override
# in the Windows CI builds.
cmake_policy(SET CMP0149 NEW)
endif()

project(OpenVINO DESCRIPTION "OpenVINO toolkit")

find_package(OpenVINODeveloperScripts REQUIRED
4 changes: 2 additions & 2 deletions README.md
@@ -100,9 +100,9 @@ Check out the [Awesome OpenVINO](https://github.com/openvinotoolkit/awesome-open

## Documentation

[**User documentation**](https://docs.openvino.ai/) contains detailed information about OpenVINO and guides you from installation through optimizing and deploying models for your AI applications.
[User documentation](https://docs.openvino.ai/) contains detailed information about OpenVINO and guides you from installation through optimizing and deploying models for your AI applications.

[**Developer documentation**](./docs/dev/index.md) focuses on how OpenVINO [components](./docs/dev/index.md#openvino-components) work and describes [building](./docs/dev/build.md) and [contributing](./CONTRIBUTING.md) processes.
[Developer documentation](./docs/dev/index.md) focuses on how OpenVINO [components](./docs/dev/index.md#openvino-components) work and describes [building](./docs/dev/build.md) and [contributing](./CONTRIBUTING.md) processes.

## Contribution and Support

2 changes: 2 additions & 0 deletions cmake/developer_package/OpenVINODeveloperScriptsConfig.cmake
@@ -223,6 +223,8 @@ set(CMAKE_POLICY_DEFAULT_CMP0111 NEW)
set(CMAKE_POLICY_DEFAULT_CMP0127 NEW)
# CMake 3.24+: prefers to set the timestamps of all extracted contents to the time of the extraction
set(CMAKE_POLICY_DEFAULT_CMP0135 NEW)
# CMake 3.27+: Visual Studio Generators select the latest Windows SDK by default.
set(CMAKE_POLICY_DEFAULT_CMP0149 NEW)

set(CMAKE_WARN_DEPRECATED OFF CACHE BOOL "Don't warn about obsolete cmake versions in 3rdparty")
set(CMAKE_WARN_ON_ABSOLUTE_INSTALL_DESTINATION ON CACHE BOOL "Warn about absolute paths in destination")
23 changes: 13 additions & 10 deletions docs/dev/index.md
@@ -32,8 +32,9 @@ flowchart LR
auto(["AUTO"])
cpu(["Intel_CPU"])
gpu(["Intel_GPU"])
npu(["Intel_NPU"])
classDef daisy3 fill:#EDB200, stroke: #C98F00, color: #262626
class auto,cpu,gpu daisy3
class auto,cpu,gpu,npu daisy3
end
subgraph frontends [OV Frontends]
ir_fe["IR Frontend"]
@@ -96,20 +96,22 @@ The OpenVINO Repository includes the following components. Click on the componen
</code>
</pre>


### OpenVINO Components

OpenVINO Components include:

* [OpenVINO™ Runtime](https://docs.openvino.ai/2024/openvino-workflow/running-inference.html) - is a set of C++ libraries with C and Python bindings providing a common API to deliver inference solutions on the platform of your choice.
* [core](./src/core) - provides the base API for model representation and modification.
* [inference](./src/inference) - provides an API to infer models on the device.
* [transformations](./src/common/transformations) - contains the set of common transformations which are used in OpenVINO plugins.
* [low precision transformations](./src/common/low_precision_transformations) - contains the set of transformations that are used in low precision models
* [bindings](./src/bindings) - contains all available OpenVINO bindings which are maintained by the OpenVINO team.
* [c](./src/bindings/c) - C API for OpenVINO™ Runtime
* [python](./src/bindings/python) - Python API for OpenVINO™ Runtime
* [Plugins](./src/plugins) - contains OpenVINO plugins which are maintained in open-source by the OpenVINO team. For more information, take a look at the [list of supported devices](#supported-hardware-matrix).
* [Frontends](./src/frontends) - contains available OpenVINO frontends that allow reading models from the native framework format.
* [core](../../src/core) - provides the base API for model representation and modification.
* [inference](../../src/inference) - provides an API to infer models on the device.
* [transformations](../../src/common/transformations) - contains the set of common transformations which are used in OpenVINO plugins.
* [low precision transformations](../../src/common/low_precision_transformations) - contains the set of transformations that are used in low precision models
* [bindings](../../src/bindings) - contains all available OpenVINO bindings which are maintained by the OpenVINO team.
* [c](../../src/bindings/c) - C API for OpenVINO™ Runtime
* [python](../../src/bindings/python) - Python API for OpenVINO™ Runtime
* [Plugins](../../src/plugins) - contains OpenVINO plugins which are maintained in open-source by the OpenVINO team. For more information, take a look at the [list of supported devices](https://docs.openvino.ai/2024/about-openvino/compatibility-and-support/supported-devices.html).
* [Frontends](../../src/frontends) - contains available OpenVINO frontends that allow reading models from the native framework format.
* [OpenVINO Model Converter (OVC)](https://docs.openvino.ai/2024/openvino-workflow/model-preparation.html) - is a cross-platform command-line tool that facilitates the transition between training and deployment environments, and adjusts deep learning models for optimal execution on end-point target devices.
* [Samples](https://github.com/openvinotoolkit/openvino/tree/master/samples) - applications in C, C++ and Python languages that show basic OpenVINO use cases.

@@ -17,3 +17,4 @@
from openvino._pyopenvino._offline_transformations import compress_model_transformation
from openvino._pyopenvino._offline_transformations import compress_quantize_weights_transformation
from openvino._pyopenvino._offline_transformations import convert_sequence_to_tensor_iterator_transformation
from openvino._pyopenvino._offline_transformations import paged_attention_transformation
@@ -8,6 +8,7 @@

#include <compress_quantize_weights.hpp>
#include <openvino/pass/make_stateful.hpp>
#include <openvino/pass/sdpa_to_paged_attention.hpp>
#include <openvino/pass/serialize.hpp>
#include <pruning.hpp>
#include <transformations/common_optimizations/compress_float_constants.hpp>
@@ -127,4 +128,13 @@ void regmodule_offline_transformations(py::module m) {
manager.run_passes(model);
},
py::arg("model"));

m_offline_transformations.def(
"paged_attention_transformation",
[](std::shared_ptr<ov::Model> model) {
ov::pass::Manager manager;
manager.register_pass<ov::pass::SDPAToPagedAttention>();
manager.run_passes(model);
},
py::arg("model"));
}
@@ -5,162 +5,13 @@
#include "pyopenvino/graph/ops/paged_attention_extension.hpp"

#include "openvino/op/op.hpp"
#include "openvino/op/paged_attention.hpp"
#include "pyopenvino/core/common.hpp"

namespace py = pybind11;

namespace {

// This is an experimental operation that is implemented in the plugins.
// Do not use in user applications, backward compatibility is not guaranteed in future releases.
class PagedAttentionExtension : public ov::op::Op {
public:
OPENVINO_OP("PagedAttentionExtension");

PagedAttentionExtension(const ov::OutputVector& args) : ov::op::Op(args) {
constructor_validate_and_infer_types();
}

void validate_and_infer_types() override {
auto value_cache_shape = get_input_partial_shape(4);
// m_num_kv_heads = value_cache_shape[1];
// m_head_size = value_cache_shape[2];
// m_block_size = value_cache_shape[3];
NODE_VALIDATION_CHECK(this, value_cache_shape.size() == 4, "Value cache shape must be 4 dims");

// key_cache: shape [num_blocks, num_kv_heads, head_size/x, block_size, x]
auto key_cache_shape = get_input_partial_shape(3);
NODE_VALIDATION_CHECK(this,
value_cache_shape.size() == 4,
// value_cache_shape[0] == key_cache_shape[0] && // num_blocks
// key_cache_shape[1] == m_num_kv_heads &&
// key_cache_shape[2] * key_cache_shape[4] == m_head_size &&
// m_block_size == key_cache_shape[3], // block_size,
"Key cache shape must be 4 dims");

// query: shape [batch_size, seq_len, num_heads * head_size]
auto query_type = get_input_element_type(0);
auto query_shape = get_input_partial_shape(0);
NODE_VALIDATION_CHECK(
this,
// query_type.is_real() &&
query_shape.size() == 3,
// query_shape[2] == m_num_heads * m_head_size,
"Query type must be real, shape must be like [batch_size, seq_len, num_heads * head_size]. ",
"Got element type ",
query_type,
", shape ",
query_shape);

// key: shape [batch_size, seq_len, num_kv_heads * head_size]
auto key_type = get_input_element_type(1);
auto key_shape = get_input_partial_shape(1);
NODE_VALIDATION_CHECK(this,
// query_type == key_type &&
key_shape.size() == 3,
"Key type must be the same as query, shape must be the same as query. "
"Got element type ",
key_type,
", shape ",
key_shape);

// value: shape [batch_size, seq_len, num_kv_heads * head_size]
// auto value_type = get_input_element_type(2);
auto value_shape = get_input_partial_shape(2);

// is_prompt: boolean scalar
NODE_VALIDATION_CHECK(this,
// get_input_element_type(5) == ov::element::boolean &&
get_input_shape(5) == ov::Shape({}),
"is_prompt validation failed. ",
"Got element type ",
get_input_element_type(5),
", shape ",
get_input_shape(5));

// slot_mapping: shape [batch_size, max_context_len]
auto slot_mapping_shape = get_input_partial_shape(6);
NODE_VALIDATION_CHECK(this,
// get_input_element_type(6) == ov::element::i64 &&
slot_mapping_shape.size() == 2,
"slot_mapping validation failed. ",
"Got element type ",
get_input_element_type(6),
", shape ",
slot_mapping_shape);

// max_context_len: integer scalar
NODE_VALIDATION_CHECK(this,
// get_input_element_type(7) == ov::element::i32 &&
get_input_shape(7) == ov::Shape({}),
"max_context_len validation failed. ",
"Got element type ",
get_input_element_type(7),
", shape ",
get_input_shape(7));

// context_lens: shape [batch_size]
auto context_lens_shape = get_input_partial_shape(8);
NODE_VALIDATION_CHECK(this,
// get_input_element_type(8) == ov::element::i32 &&
context_lens_shape.size() == 1,
"context_lens validation failed. ",
"Got element type ",
get_input_element_type(8),
", shape ",
context_lens_shape);

// block_tables: shape [batch_size, max_block_per_request]
NODE_VALIDATION_CHECK(this,
// get_input_element_type(9) == ov::element::i32 &&
get_input_partial_shape(9).size() == 2,
"block_tables validation failed. ",
"Got element type ",
get_input_element_type(9),
", shape ",
get_input_partial_shape(9));

// scale: float scalar
NODE_VALIDATION_CHECK(this,
// get_input_element_type(10) == ov::element::f32 &&
get_input_shape(10) == ov::Shape({}),
"block_tables validation failed. ",
"Got element type ",
get_input_element_type(10),
", shape ",
get_input_shape(10));

// alibi_slopes: 1D float tensor
NODE_VALIDATION_CHECK(this,
// get_input_element_type(11) == ov::element::f32 &&
get_input_partial_shape(11).rank().get_length() == 1,
"alibi_slopes should be a 1D float tensor. ",
"Got element type ",
get_input_element_type(11),
", shape ",
get_input_partial_shape(11));

// sliding_window: int scalar
NODE_VALIDATION_CHECK(this,
// get_input_element_type(12) == ov::element::i32 &&
get_input_partial_shape(12).rank().get_length() == 0,
"sliding_window argument should be an i32 scalar. ",
"Got element type ",
get_input_element_type(12),
", shape ",
get_input_partial_shape(12));

set_output_type(0, query_type, query_shape);
}

std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override {
return std::make_shared<PagedAttentionExtension>(new_args);
}
};

} // namespace

void regclass_graph_op_PagedAttentionExtension(py::module m) {
using ov::op::PagedAttentionExtension;
py::class_<PagedAttentionExtension, std::shared_ptr<PagedAttentionExtension>, ov::Node> cls(
m,
"_PagedAttentionExtension");
82 changes: 82 additions & 0 deletions src/common/snippets/include/snippets/kernel_executor_table.hpp
@@ -0,0 +1,82 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "snippets/lowered/expression.hpp"

namespace ov {
namespace snippets {

/**
* @brief Base class for all kernel executors. This class should not be instantiated directly.
* Derive from KernelExecutor<> to create desired executor implementations.
*/
class KernelExecutorBase {
public:
class GenericConfig {
public:
/**
* @brief Returns true if the config specifies all the parameters necessary for kernel compilation.
* Configs for static kernels should be completed at the code emission stage,
* while configs for dynamic kernels are completed only at runtime, when all the shapes are known.
*/
virtual bool is_completed() const = 0;
virtual ~GenericConfig() = default;
};
virtual ~KernelExecutorBase() = default;

private:
KernelExecutorBase() = default;
template<typename Conf, typename KernelType,
typename std::enable_if<std::is_base_of<GenericConfig, Conf>::value, bool>::type> friend class KernelExecutor;
};

template<typename Conf, typename KernelType,
typename std::enable_if<std::is_base_of<KernelExecutorBase::GenericConfig, Conf>::value, bool>::type = true>
class KernelExecutor : public snippets::KernelExecutorBase {
public:
explicit KernelExecutor(std::shared_ptr<Conf> c) : KernelExecutorBase(), m_config{std::move(c)} {}
/**
* @brief Check the current config and recompile the kernel if necessary. Use kernel caching to avoid redundant recompilations.
* This method must be called only for complete configs. It is the user's responsibility to check is_completed() before calling.
*/
virtual void update_kernel() = 0;
protected:
/**
* @brief Takes shared_ptr to compilation config, returns shared_ptr to compiled kernel.
* Should be called only if actual compilation is required. Kernel caching must be implemented in update_kernel().
*/
virtual std::shared_ptr<KernelType> compile_kernel(const std::shared_ptr<Conf>& c) const = 0;
/** Contains all the necessary information to compile a desired kernel*/
std::shared_ptr<Conf> m_config = nullptr;
/** Stores pointer to compiled kernel since the last update_kernel() call */
std::shared_ptr<KernelType> m_kernel = nullptr;
};

class KernelExecutorTable {
public:
template<typename T, class ...C,
typename std::enable_if<std::is_base_of<KernelExecutorBase, T>::value, bool>::type = true>
std::shared_ptr<T> register_kernel(const snippets::lowered::ExpressionPtr& expr, C... args) {
OPENVINO_ASSERT(!m_table.count(expr), "This expression already has an alterable kernel");
const auto& instance = std::make_shared<T>(args...);
m_table[expr] = instance;
return instance;
}
std::shared_ptr<KernelExecutorBase> get_kernel_executor(const snippets::lowered::ExpressionPtr& expr) const {
OPENVINO_ASSERT(m_table.count(expr), "This expression doesn't have a registered kernel executor");
return m_table.at(expr);
}
virtual ~KernelExecutorTable() = default;

protected:
std::unordered_map<snippets::lowered::ExpressionPtr, std::shared_ptr<KernelExecutorBase>> m_table{};
};

using KernelExecutorTablePtr = std::shared_ptr<KernelExecutorTable>;


} // namespace snippets
} // namespace ov
@@ -5,7 +5,6 @@
#pragma once

#include "pass.hpp"
#include "snippets/lowered/linear_ir.hpp"

namespace ov {
namespace snippets {
3 changes: 3 additions & 0 deletions src/common/snippets/include/snippets/target_machine.hpp
@@ -10,6 +10,7 @@

#include "emitter.hpp"
#include "snippets/lowered/expression.hpp"
#include "kernel_executor_table.hpp"

namespace ov {
namespace snippets {
@@ -62,10 +63,12 @@ class TargetMachine {
* @return true, if supported
*/
bool has(const ov::DiscreteTypeInfo& type) const;
virtual std::shared_ptr<TargetMachine> clone() const = 0;
virtual ~TargetMachine() = default;

protected:
std::map<const ov::DiscreteTypeInfo, jitters_value> jitters;
std::shared_ptr<KernelExecutorTable> kernel_executor_table;
};

} // namespace snippets
1 change: 1 addition & 0 deletions src/common/snippets/tests/include/lowering_utils.hpp
@@ -37,6 +37,7 @@ class DummyTargetMachine : public ov::snippets::TargetMachine {
bool is_supported() const override { return true; }
ov::snippets::CompiledSnippetPtr get_snippet() override { return std::make_shared<DummyCompiledSnippet>(); }
size_t get_lanes() const override { return 10; }
std::shared_ptr<TargetMachine> clone() const override { return std::make_shared<DummyTargetMachine>(); }
};

class DummyGenerator : public ov::snippets::Generator {