
Commit

Merge branch 'master' into issue#27715
amanmogal authored Dec 4, 2024
2 parents 8ae7a6c + c69e5d7 commit ab3ecba
Showing 33 changed files with 546 additions and 61 deletions.
1 change: 0 additions & 1 deletion .github/workflows/linux_conditional_compilation.yml
@@ -212,7 +212,6 @@ jobs:
tar -cvf - \
tests/ov_cpu_func_tests \
tests/libopenvino_template_extension.so \
tests/libze_loader.so* \
tests/libhwloc* \
tests/libtbb* \
tests/functional_test_utils/layer_tests_summary/* \
2 changes: 1 addition & 1 deletion .github/workflows/windows_conditional_compilation.yml
@@ -237,7 +237,7 @@ jobs:
Compress-Archive @compress
$compress = @{
Path = "${{ env.OPENVINO_REPO }}/bin/intel64/${{ env.CMAKE_BUILD_TYPE }}/ov_cpu_func_tests.exe", "${{ env.BUILD_DIR }}/bin/${{ env.CMAKE_BUILD_TYPE }}/ze_loader.dll", "${{ env.OPENVINO_REPO }}/bin/intel64/${{ env.CMAKE_BUILD_TYPE }}/openvino_template_extension.dll", "${{ env.OPENVINO_REPO }}/src/tests/test_utils/functional_test_utils/layer_tests_summary", "${{ env.INSTALL_DIR }}/runtime/3rdparty/tbb"
Path = "${{ env.OPENVINO_REPO }}/bin/intel64/${{ env.CMAKE_BUILD_TYPE }}/ov_cpu_func_tests.exe", "${{ env.OPENVINO_REPO }}/bin/intel64/${{ env.CMAKE_BUILD_TYPE }}/openvino_template_extension.dll", "${{ env.OPENVINO_REPO }}/src/tests/test_utils/functional_test_utils/layer_tests_summary", "${{ env.INSTALL_DIR }}/runtime/3rdparty/tbb"
CompressionLevel = "Optimal"
DestinationPath = "${{ env.BUILD_DIR }}/openvino_tests.zip"
}
1 change: 1 addition & 0 deletions docs/articles_en/learn-openvino/llm_inference_guide.rst
@@ -9,6 +9,7 @@ Generative AI workflow
:maxdepth: 1
:hidden:

Generative Model Preparation <llm_inference_guide/genai-model-preparation>
Inference with OpenVINO GenAI <llm_inference_guide/genai-guide>
Inference with Optimum Intel <llm_inference_guide/llm-inference-hf>
Generative AI with Base OpenVINO (not recommended) <llm_inference_guide/llm-inference-native-ov>
@@ -0,0 +1,159 @@
Generative Model Preparation
===============================================================================

.. meta::
:description: Learn how to use Hugging Face Hub and Optimum Intel APIs to
prepare generative models for inference.



Since generative AI models tend to be large and resource-heavy, it is advisable to store them
locally and optimize them for efficient inference. This article shows how to prepare
generative models for inference with OpenVINO by:

* `Downloading Models from Hugging Face <#download-generative-models-from-hugging-face-hub>`__
* `Downloading Models from Model Scope <#download-generative-models-from-model-scope>`__
* `Converting and Optimizing Generative Models <#convert-and-optimize-generative-models>`__



Download Generative Models From Hugging Face Hub
###############################################################################

Pre-converted and pre-optimized models are available in the `OpenVINO Toolkit <https://huggingface.co/OpenVINO>`__
organization, under the `model section <https://huggingface.co/OpenVINO#models>`__, or under
different model collections:

* `LLM <https://huggingface.co/collections/OpenVINO/llm-6687aaa2abca3bbcec71a9bd>`__
* `Speech-to-Text <https://huggingface.co/collections/OpenVINO/speech-to-text-672321d5c070537a178a8aeb>`__
* `Speculative Decoding Draft Models <https://huggingface.co/collections/OpenVINO/speculative-decoding-draft-models-673f5d944d58b29ba6e94161>`__

You can also use the **huggingface_hub** package to download models:

.. code-block:: console

   pip install huggingface_hub
   huggingface-cli download "OpenVINO/phi-2-fp16-ov" --local-dir model_path

The models can be used in OpenVINO immediately after download. No dependencies
are required except **huggingface_hub**.
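
For instance, a model downloaded this way can be loaded directly with the OpenVINO GenAI API.
The snippet below is only a minimal sketch, assuming ``openvino-genai`` is installed and that
``model_path`` is the folder created by the ``huggingface-cli`` command above:

.. code-block:: python

   import openvino_genai as ov_genai

   # Load the pre-converted OpenVINO model from the local folder and run it on CPU.
   pipe = ov_genai.LLMPipeline("model_path", "CPU")

   # Generate a short completion; max_new_tokens limits the length of the answer.
   print(pipe.generate("What is OpenVINO?", max_new_tokens=100))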


Download Generative Models From Model Scope
###############################################################################

To download models from `Model Scope <https://www.modelscope.cn/home>`__,
use the **modelscope** package:

.. code-block:: console

   pip install modelscope
   modelscope download --model "Qwen/Qwen2-7b" --local_dir model_path

Models downloaded via Model Scope are available in PyTorch format only and must
be :doc:`converted to OpenVINO IR <../../openvino-workflow/model-preparation/convert-model-to-ir>`
before inference.
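
As a rough illustration, such a locally downloaded PyTorch model can also be converted
programmatically with the Optimum Intel API; this sketch assumes ``optimum-intel[openvino]``
is installed, ``model_path`` is the ModelScope download folder used above, and
``ov_model_path`` is only an example output directory:

.. code-block:: python

   from optimum.intel import OVModelForCausalLM

   # export=True converts the PyTorch checkpoint to OpenVINO IR on the fly.
   ov_model = OVModelForCausalLM.from_pretrained("model_path", export=True)

   # Save the converted model so it can be reused without converting it again.
   ov_model.save_pretrained("ov_model_path")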

Convert and Optimize Generative Models
###############################################################################

OpenVINO works best with models in the OpenVINO IR format, both full-precision and quantized.
If your selected model has not been pre-optimized, you can easily convert and optimize it yourself
with a single **optimum-cli** command. First, make sure optimum-intel is installed on your system:

.. code-block:: console

   pip install optimum-intel[openvino]

While optimizing models, you can decide to keep the original precision or select a lower one.

.. tab-set::

   .. tab-item:: Keeping full model precision
      :sync: full-precision

      .. code-block:: console

         optimum-cli export openvino --model <model_id> --weight-format fp16 <exported_model_name>

      Examples:

      .. tab-set::

         .. tab-item:: LLM (text generation)
            :sync: llm-text-gen

            .. code-block:: console

               optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf --weight-format fp16 ov_llama_2

         .. tab-item:: Diffusion models (text2image)
            :sync: diff-text-img

            .. code-block:: console

               optimum-cli export openvino --model stabilityai/stable-diffusion-xl-base-1.0 --weight-format fp16 ov_SDXL

         .. tab-item:: VLM (Image processing)
            :sync: vlm-img-proc

            .. code-block:: console

               optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code --weight-format fp16 ov_MiniCPM-V-2_6

         .. tab-item:: Whisper models (speech2text)
            :sync: whisp-speech-txt

            .. code-block:: console

               optimum-cli export openvino --trust-remote-code --model openai/whisper-base ov_whisper

   .. tab-item:: Exporting to selected precision
      :sync: low-precision

      .. code-block:: console

         optimum-cli export openvino --model <model_id> --weight-format int4 <exported_model_name>

      Examples:

      .. tab-set::

         .. tab-item:: LLM (text generation)
            :sync: llm-text-gen

            .. code-block:: console

               optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf --weight-format int4 ov_llama_2

         .. tab-item:: Diffusion models (text2image)
            :sync: diff-text-img

            .. code-block:: console

               optimum-cli export openvino --model stabilityai/stable-diffusion-xl-base-1.0 --weight-format int4 ov_SDXL

         .. tab-item:: VLM (Image processing)
            :sync: vlm-img-proc

            .. code-block:: console

               optimum-cli export openvino -m model_path --task text-generation-with-past --weight-format int4 ov_MiniCPM-V-2_6

.. note::

   Any other ``model_id``, for example ``openbmb/MiniCPM-V-2_6``, or the path
   to a local model file can be used.

   Also, you can specify a different data type, such as ``int8``.
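
If you prefer to stay in Python rather than use the command line, a comparable export with
4-bit weight compression can be sketched with the Optimum Intel API. This assumes
``optimum-intel[openvino]`` is installed; the model ID and output folder are only examples:

.. code-block:: python

   from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

   # Compress weights to 4 bit while exporting the model to OpenVINO IR.
   q_config = OVWeightQuantizationConfig(bits=4)
   ov_model = OVModelForCausalLM.from_pretrained(
       "meta-llama/Llama-2-7b-chat-hf", export=True, quantization_config=q_config
   )
   ov_model.save_pretrained("ov_llama_2_int4")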


Additional Resources
###############################################################################

* `Full set of optimum-cli parameters <https://huggingface.co/docs/optimum/en/intel/openvino/export>`__
* :doc:`Model conversion in OpenVINO <../../openvino-workflow/model-preparation/convert-model-to-ir>`
* :doc:`Model optimization in OpenVINO <../../openvino-workflow/model-optimization>`
Binary file modified docs/sphinx_setup/_static/download/GenAI_Quick_Start_Guide.pdf
3 changes: 2 additions & 1 deletion src/bindings/python/constraints.txt
@@ -10,10 +10,11 @@ pytest-timeout==2.3.1
# Python bindings
build<1.3
pygments>=2.8.1
setuptools>=65.6.1,<75.3.0
setuptools>=70.1,<75.6
sympy>=1.10
wheel>=0.38.1
patchelf<=0.17.2.1
packaging>=22.0

# Frontends
h5py>=3.1.0,<3.13.0
2 changes: 1 addition & 1 deletion src/frontends/tensorflow/src/checkpoint_v1_reader.cpp
@@ -254,7 +254,7 @@ void CheckpointV1Reader::read_variable(const std::string& variable_name, ov::Any

// This is only present at the first item of each checkpoint file and serves
// as a table of contents, listing all the tensor slices saved in this file.
::tensorflow::SavedTensorSlices sts;
::tensorflow::SavedTensorSlices sts{};
FRONT_END_GENERAL_CHECK(sts.ParseFromArray(raw_data.data(), static_cast<int>(raw_data.size())),
"[TensorFlow Frontend] incorrect input checkpoint file or internal error: cannot parse "
"SavedTensorSlices entry");
2 changes: 1 addition & 1 deletion src/frontends/tensorflow/src/op/var_handle.cpp
@@ -98,7 +98,7 @@ OutputVector translate_varhandle_op(const NodeContext& node) {

TENSORFLOW_OP_VALIDATION(node, result, "[TensorFlow Frontend] Internal error: Cannot find requested variable.");

::tensorflow::BundleEntryProto entry;
::tensorflow::BundleEntryProto entry{};
TENSORFLOW_OP_VALIDATION(node,
entry.ParseFromArray(entry_data, static_cast<int>(entry_size)),
"[TensorFlow Frontend] Internal error: Cannot get read bundle entry.");
2 changes: 1 addition & 1 deletion src/frontends/tensorflow/src/op/xla_conv_v2.cpp
@@ -111,7 +111,7 @@ OutputVector translate_xla_conv_v2_op(const NodeContext& node) {
is_all_one,
"[TensorFlow Frontend] internal error: convolutional kernel with holes is not supported");

ConvolutionDimensionNumbers dimension_numbers;
ConvolutionDimensionNumbers dimension_numbers{};
TENSORFLOW_OP_VALIDATION(
node,
dimension_numbers.ParseFromArray(dimension_numbers_message.data(),
4 changes: 2 additions & 2 deletions src/frontends/tensorflow/src/variables_index.cpp
@@ -128,7 +128,7 @@ void VariablesIndex::read_bundle_header() {
auto item = m_variables_index.find("");
FRONT_END_GENERAL_CHECK(item != m_variables_index.end(), "Bundle Header isn't found in index");

::tensorflow::BundleHeaderProto bundleHeader;
::tensorflow::BundleHeaderProto bundleHeader{};
FRONT_END_GENERAL_CHECK(bundleHeader.ParseFromArray(item->second.data(), static_cast<int>(item->second.size())),
"Bundle Header: Cannot parse Bundle Header");
FRONT_END_GENERAL_CHECK(bundleHeader.version().producer() == 1, "Bundle Header: Unsupported producer version");
@@ -147,7 +147,7 @@ void VariablesIndex::read_checkpointable_object_graph() {
return;
}

::tensorflow::BundleEntryProto entry;
::tensorflow::BundleEntryProto entry{};
FRONT_END_GENERAL_CHECK(entry.ParseFromArray(item->second.data(), static_cast<int>(item->second.size())),
"CMO: Cannot parse Bundle Entry");

4 changes: 2 additions & 2 deletions src/inference/dev_api/openvino/runtime/icompiled_model.hpp
@@ -136,11 +136,11 @@ class OPENVINO_RUNTIME_API ICompiledModel : public std::enable_shared_from_this<

/**
* @brief Release intermediate memory
*
*
*/
virtual void release_memory();

virtual ~ICompiledModel() = default;
virtual ~ICompiledModel();

private:
std::shared_ptr<const ov::IPlugin> m_plugin;
10 changes: 0 additions & 10 deletions src/inference/src/cpp/compiled_model.cpp
@@ -8,10 +8,6 @@
#include "openvino/runtime/icompiled_model.hpp"
#include "openvino/runtime/properties.hpp"

#if defined(OPENVINO_GNU_LIBC) && !defined(__ANDROID__)
# include <malloc.h>
#endif

#define OV_COMPILED_MODEL_CALL_STATEMENT(...) \
if (_impl == nullptr) \
OPENVINO_THROW("CompiledModel was not initialized."); \
@@ -27,12 +23,6 @@ namespace ov {

CompiledModel::~CompiledModel() {
_impl = {};
#if defined(OPENVINO_GNU_LIBC) && !defined(__ANDROID__)
// Linux memory margent doesn't return system memory immediate after release.
// It depends on memory chunk size and allocation history.
// Try return memory from a process to system now to reduce memory usage and not wait to the end of the process.
malloc_trim(0);
#endif
}

CompiledModel::CompiledModel(const std::shared_ptr<ov::ICompiledModel>& impl, const std::shared_ptr<void>& so)
13 changes: 13 additions & 0 deletions src/inference/src/dev/icompiled_model.cpp
@@ -10,6 +10,10 @@
#include "openvino/runtime/properties.hpp"
#include "transformations/utils/utils.hpp"

#if defined(OPENVINO_GNU_LIBC) && !defined(__ANDROID__)
# include <malloc.h>
#endif

ov::ICompiledModel::ICompiledModel(const std::shared_ptr<const ov::Model>& model,
const std::shared_ptr<const ov::IPlugin>& plugin,
const std::shared_ptr<ov::threading::ITaskExecutor>& task_executor,
@@ -151,3 +155,12 @@ void ov::ICompiledModel::set_model_shared_object(ov::Model& model, const std::sh
void ov::ICompiledModel::release_memory() {
// nothing to do
}

ov::ICompiledModel::~ICompiledModel() {
#if defined(OPENVINO_GNU_LIBC) && !defined(__ANDROID__)
// Linux memory margent doesn't return system memory immediate after release.
// It depends on memory chunk size and allocation history.
// Try return memory from a process to system now to reduce memory usage and not wait to the end of the process.
malloc_trim(0);
#endif
}
5 changes: 3 additions & 2 deletions src/plugins/intel_cpu/src/config.cpp
@@ -461,10 +461,11 @@ void Config::updateProperties() {
}

void Config::applyRtInfo(const std::shared_ptr<const ov::Model>& model) {
if (model->has_rt_info({"runtime_options", ov::hint::kv_cache_precision.name()})) {
// if user sets explicitly, it will be higher priority than rt_info
if (!kvCachePrecisionSetExplicitly && model->has_rt_info({"runtime_options", ov::hint::kv_cache_precision.name()})) {
this->kvCachePrecision = model->get_rt_info<ov::element::Type>({"runtime_options", ov::hint::kv_cache_precision.name()});
}
if (model->has_rt_info({"runtime_options", ov::hint::dynamic_quantization_group_size.name()})) {
if (!fcDynamicQuantizationGroupSizeSetExplicitly && model->has_rt_info({"runtime_options", ov::hint::dynamic_quantization_group_size.name()})) {
this->fcDynamicQuantizationGroupSize =
model->get_rt_info<uint64_t>({"runtime_options", ov::hint::dynamic_quantization_group_size.name()});
}
@@ -358,4 +358,23 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptionsWithCompil
ASSERT_EQ(size.as<uint64_t>(), 16);
}

TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptionsWithCoreProperties) {
ov::Core core;
ov::Any type;
ov::Any size;

core.set_property(deviceName, ov::hint::kv_cache_precision(ov::element::f32));
core.set_property(deviceName, ov::hint::dynamic_quantization_group_size(16));

ov::CompiledModel compiledModel;
model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name());
model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name());

OV_ASSERT_NO_THROW(compiledModel = core.compile_model(model, deviceName));
OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision));
OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size));
ASSERT_EQ(type.as<ov::element::Type>(), ov::element::f32);
ASSERT_EQ(size.as<uint64_t>(), 16);
}

} // namespace
@@ -846,6 +846,27 @@ void prepare_buffer_fusing::run(program& p) {
if (user_info.first) {
node.get_users().front()->set_output_layout(user_info.second);
}

// In case that the rank of weight node of gemm is less than 4 and,
// it transforms to extend to 4 dims by adding 1 to begin().
// Therefore, the padding of crop_layout should be shifted properly.
const size_t TDIM = 4;
auto user = node.get_users().front();
bool allow_new_shape_infer = node.get_program().is_new_shape_infer();
if (!allow_new_shape_infer && user->is_type<gemm>() && user->get_dependency(1).id().compare(node.id()) == 0) {
auto input_rank = user->get_kernel_impl_params()->typed_desc<gemm>()->weight_rank;
if (input_rank < TDIM) {
std::vector<int32_t> l_pad = {0, 0, 0, 0};
std::vector<int32_t> u_pad = {0, 0, 0, 0};

//shift right
size_t shift_right = TDIM - input_rank;
std::copy_n(crop_layout.data_padding._lower_size.begin(), l_pad.size() - shift_right, l_pad.begin() + shift_right);
std::copy_n(crop_layout.data_padding._upper_size.begin(), u_pad.size() - shift_right, u_pad.begin() + shift_right);

crop_layout.data_padding = padding(l_pad, u_pad);
}
}
}
node.set_output_layout(crop_layout);
node.can_be_optimized(true);