[CPU] General pattern & matcher & RoPE (with EliminateStridedSlice fi…
usstq authored Nov 21, 2023
1 parent a6e4d8f commit 61c5a80
Showing 14 changed files with 3,276 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/cpu_types.cpp
@@ -214,6 +214,7 @@ static const TypeToNameMap& get_type_to_name_tbl() {
        { "Unique", Type::Unique},
        { "Ngram", Type::Ngram},
        { "ScaledDotProductAttention", Type::ScaledDotProductAttention},
        { "RoPE", Type::RoPE},
    };
    return type_to_name_tbl;
}
@@ -328,6 +329,7 @@ std::string NameFromType(const Type type) {
        CASE(Unique);
        CASE(Ngram);
        CASE(ScaledDotProductAttention);
        CASE(RoPE);
        CASE(Unknown);
    }
#undef CASE
1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/cpu_types.h
@@ -114,6 +114,7 @@ enum class Type {
    Unique,
    Ngram,
    ScaledDotProductAttention,
    RoPE,
};

enum class Algorithm {
201 changes: 201 additions & 0 deletions src/plugins/intel_cpu/src/nodes/rope.cpp
@@ -0,0 +1,201 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "rope.h"

#include <chrono>
#include <cpu/x64/cpu_isa_traits.hpp>
#include <ie_ngraph_utils.hpp>
#include <shape_inference/shape_inference_internal_dyn.hpp>
#include <string>
#include <vector>

#include "common/bfloat16.hpp"
#include "common/cpu_memcpy.h"
#include "utils/plain_tensor.hpp"

using namespace InferenceEngine;

namespace ov {
namespace intel_cpu {
namespace node {

RoPE::RoPE(const std::shared_ptr<ngraph::Node>& op, const GraphContext::CPtr context)
    : Node(op, context, NgraphShapeInferFactory(op, EMPTY_PORT_MASK)) {
    std::string errorMessage;
    if (!isSupportedOperation(op, errorMessage)) {
        OPENVINO_THROW("CPU: " + errorMessage);
    }

    const auto node = std::dynamic_pointer_cast<const RoPENode>(op);
    m_config = node->get_config();
}

template <typename T>
struct RoPE::RoPEExecutorRotateHalf : public RoPE::Executor {
    void execute(dnnl::stream strm,
                 const RoPENode::Config& config,
                 const std::vector<MemoryPtr>& inputs,
                 const std::vector<MemoryPtr>& outputs) override {
        ov::intel_cpu::PlainTensor<T> t_src(inputs[0]);
        ov::intel_cpu::PlainTensor<float> t_cos(inputs[1]);
        ov::intel_cpu::PlainTensor<float> t_sin(inputs[2]);
        ov::intel_cpu::PlainTensor<T> t_dst(outputs[0]);
        ov::intel_cpu::PlainTensor<int32_t> gather;

        if (config.slice_stop - config.slice_start > 0) {
            t_src = t_src.slice(3, config.slice_start, config.slice_stop);
        }
        if (config.input_trans0213) {
            t_src = t_src.permute({0, 2, 1, 3});
        }
        if (config.gather_position_arg_id > 0) {
            gather.reset(inputs[config.gather_position_arg_id]);
        }

        auto batch_size = t_src.size(0);
        auto head_cnt = t_src.size(1);
        auto seq_len = t_src.size(2);
        auto feature_size = t_src.size(3);

        auto rotary_dims = config.rotary_ndims;
        auto half_rotary_dims = rotary_dims / 2;

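        // Rotate-half rotation (GPT-NeoX / LLaMA style):
        //   dst[i]        = cos[i] * src[i] - sin[i] * src[i + half]          (i < half)
        //   dst[i + half] = cos[i + half] * src[i + half] + sin[i + half] * src[i]
        // i.e. dst = cos * src + sin * rotate_half(src); in the usual table
        // layout the two halves of cos/sin repeat the same angles. Features
        // beyond rotary_ndims are copied through unchanged.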
        parallel_for3d(batch_size, head_cnt, seq_len, [&](size_t b, size_t h, size_t p) {
            auto cos_pos = p;
            if (gather) {
                if (gather.m_rank == 4)
                    cos_pos = gather.at({b, h, p, 0}, true);
                else
                    cos_pos = gather.at({b, p}, true);
            }
            auto* src = &t_src.at({b, h, p, 0});
            auto* cos = &t_cos.at({b, h, cos_pos, 0}, true);
            auto* sin = &t_sin.at({b, h, cos_pos, 0}, true);
            auto* dst = &t_dst.at({b, h, p, 0});

            size_t i = 0;
            for (; i < half_rotary_dims; i++) {
                dst[i] = cos[i] * src[i] + sin[i] * (-src[i + half_rotary_dims]);
            }
            for (; i < rotary_dims; i++) {
                dst[i] = cos[i] * src[i] + sin[i] * (src[i - half_rotary_dims]);
            }
            for (; i < feature_size; i++) {
                dst[i] = src[i];
            }
        });
    }
};

template <typename T>
struct RoPE::RoPEExecutorInterleaved : public RoPE::Executor {
    void execute(dnnl::stream strm,
                 const RoPENode::Config& config,
                 const std::vector<MemoryPtr>& inputs,
                 const std::vector<MemoryPtr>& outputs) override {
        ov::intel_cpu::PlainTensor<T> t_src(inputs[0]);
        ov::intel_cpu::PlainTensor<float> t_sin_cos(inputs[1]);
        ov::intel_cpu::PlainTensor<T> t_dst(outputs[0]);

        auto batch_size = t_src.size(0);
        auto seq_len = t_src.size(1);
        auto head_cnt = t_src.size(2);
        auto head_dims = t_src.size(3);

        auto rotary_dims = config.rotary_ndims;
        auto half_rotary_dims = rotary_dims / 2;
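        // Interleaved rotation (GPT-J style): adjacent pairs (x[2j], x[2j+1])
        // are rotated by the same angle. t_sin_cos packs sin in its first
        // half_rotary_dims entries and cos in the second half. Note dst is
        // indexed as [B,H,L,S] while src is [B,L,H,S]: the 0213 transpose
        // that follows RoPE in the original graph is fused into this kernel.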
        parallel_for3d(batch_size, seq_len, head_cnt, [&](size_t b, size_t p, size_t h) {
            auto* x = &t_src.at({b, p, h, 0});
            float* sin = &t_sin_cos.at({b, p, 0}, true);
            float* cos = &t_sin_cos.at({b, p, half_rotary_dims}, true);
            auto* dst = &t_dst.at({b, h, p, 0});

            size_t i = 0;
            for (size_t j = 0; i < rotary_dims; i += 2, j++) {
                dst[i] = cos[j] * x[i] - sin[j] * x[i + 1];
                dst[i + 1] = cos[j] * x[i + 1] + sin[j] * x[i];
            }
            for (; i < head_dims; i++) {
                dst[i] = x[i];
            }
        });
    }
};

void RoPE::initSupportedPrimitiveDescriptors() {
    if (!supportedPrimitiveDescriptors.empty())
        return;
    auto srcPrecision = getOriginalInputPrecisionAtPort(0);

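    // Runtime precision follows the activation precision: bf16 is kept as
    // bf16, anything else falls back to f32 below; the cos/sin inputs are
    // always consumed as f32.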
    auto rtPrecision = srcPrecision;
    auto CosSinPrecision = ov::element::f32;

    if (m_config.is_interleaved) {
        OPENVINO_ASSERT(m_config.input_trans0213 == false);
        OPENVINO_ASSERT(m_config.slice_start == 0);
        OPENVINO_ASSERT(m_config.slice_stop == 0);
        OPENVINO_ASSERT(m_config.gather_position_arg_id == 0);
        if (rtPrecision == ov::element::bf16) {
            m_executor = std::make_shared<RoPEExecutorInterleaved<ov::bfloat16>>();
        } else {
            m_executor = std::make_shared<RoPEExecutorInterleaved<float>>();
            rtPrecision = ov::element::f32;
        }
    } else {
        if (rtPrecision == ov::element::bf16) {
            m_executor = std::make_shared<RoPEExecutorRotateHalf<ov::bfloat16>>();
        } else {
            m_executor = std::make_shared<RoPEExecutorRotateHalf<float>>();
            rtPrecision = ov::element::f32;
        }
    }

    // initialize input ports
    std::vector<PortConfigurator> inPortConfigs;
    inPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getInputShapeAtPort(0), false, -1);
    inPortConfigs.emplace_back(LayoutType::ncsp, CosSinPrecision, getInputShapeAtPort(1), false, -1);
    inPortConfigs.emplace_back(LayoutType::ncsp, CosSinPrecision, getInputShapeAtPort(2), false, -1);
    if (m_config.gather_position_arg_id > 0) {
        inPortConfigs.emplace_back(LayoutType::ncsp,
                                   ov::element::i32,
                                   getInputShapeAtPort(m_config.gather_position_arg_id),
                                   false,
                                   -1);
    }

    // initialize output port
    std::vector<PortConfigurator> outPortConfigs;
    outPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getOutputShapeAtPort(0), false, -1);

    addSupportedPrimDesc(inPortConfigs, outPortConfigs, impl_desc_type::ref_any);
}

void RoPE::execute(dnnl::stream strm) {
    std::vector<MemoryPtr> inputs(getParentEdges().size()), outputs(getChildEdges().size());
    for (size_t i = 0; i < inputs.size(); i++) {
        inputs[i] = getParentEdgeAt(i)->getMemoryPtr();
    }
    for (size_t i = 0; i < outputs.size(); i++) {
        outputs[i] = getChildEdgeAt(i)->getMemoryPtr();
    }
    m_executor->execute(strm, m_config, inputs, outputs);
}

bool RoPE::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
    try {
        const auto node = std::dynamic_pointer_cast<const RoPENode>(op);
        if (!node) {
            errorMessage = "Only RoPENode operation is supported";
            return false;
        }
    } catch (...) {
        return false;
    }
    return true;
}

} // namespace node
} // namespace intel_cpu
} // namespace ov
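Note: the two executors above implement the standard rotate-half (GPT-NeoX/LLaMA) and interleaved (GPT-J) RoPE variants. As a sanity check against the kernel, a minimal standalone scalar reference for the rotate-half path is sketched below. It assumes the conventional inverse-frequency schedule theta_i = base^(-2i/d) with base 10000 and angle-duplicated cos/sin halves; the node itself consumes precomputed cos/sin tensors, so rope_rotate_half_ref and its parameters are illustrative only and not part of this commit.

#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference for rotate-half RoPE at token position p over the first
// rotary_dims features (rotary_dims even, <= x.size()); features beyond
// rotary_dims pass through unchanged, mirroring RoPEExecutorRotateHalf.
std::vector<float> rope_rotate_half_ref(const std::vector<float>& x,
                                        size_t p,
                                        size_t rotary_dims,
                                        float base = 10000.0f) {
    std::vector<float> dst(x);  // copy: features >= rotary_dims stay as-is
    const size_t half = rotary_dims / 2;
    for (size_t i = 0; i < half; i++) {
        // theta_i = base^(-2i/d), scaled by the token position p
        float theta = p * std::pow(base, -2.0f * static_cast<float>(i) / rotary_dims);
        float c = std::cos(theta);
        float s = std::sin(theta);
        // dst = cos * x + sin * rotate_half(x), pairing features (i, i + half)
        dst[i] = c * x[i] - s * x[i + half];
        dst[i + half] = c * x[i + half] + s * x[i];
    }
    return dst;
}

Running this against RoPEExecutorRotateHalf<float> with matching cos/sin tables should agree to within float rounding.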
54 changes: 54 additions & 0 deletions src/plugins/intel_cpu/src/nodes/rope.h
@@ -0,0 +1,54 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once
#include <ie_common.h>
#include <node.h>

#include <memory>
#include <string>
#include <vector>

#include "transformations/cpu_opset/common/op/rope.hpp"

namespace ov {
namespace intel_cpu {
namespace node {

class RoPE : public Node {
public:
    RoPE(const std::shared_ptr<ngraph::Node>& op, const GraphContext::CPtr context);

    void getSupportedDescriptors() override {}
    bool created() const override {
        return getType() == Type::RoPE;
    }
    bool needPrepareParams() const override {
        return false;
    }
    void executeDynamicImpl(dnnl::stream strm) override {
        execute(strm);
    }
    void initSupportedPrimitiveDescriptors() override;
    void execute(dnnl::stream strm) override;
    static bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept;

private:
    struct Executor {
        virtual void execute(dnnl::stream strm,
                             const RoPENode::Config& config,
                             const std::vector<MemoryPtr>& inputs,
                             const std::vector<MemoryPtr>& outputs) = 0;
    };
    template <typename T>
    struct RoPEExecutorRotateHalf;
    template <typename T>
    struct RoPEExecutorInterleaved;
    RoPENode::Config m_config;
    std::shared_ptr<Executor> m_executor;
};

} // namespace node
} // namespace intel_cpu
} // namespace ov
2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/nodes_factory.cpp
@@ -94,6 +94,7 @@
#include "nodes/unique.hpp"
#include "nodes/ngram.h"
#include "nodes/scaled_attn.h"
#include "nodes/rope.h"

namespace ov {
namespace intel_cpu {
@@ -181,6 +182,7 @@ Node::NodesFactory::NodesFactory()
    INTEL_CPU_NODE(Eye, Type::Eye);
    INTEL_CPU_NODE(Unique, Type::Unique);
    INTEL_CPU_NODE(Ngram, Type::Ngram);
    INTEL_CPU_NODE(RoPE, Type::RoPE);
    INTEL_CPU_NODE(Interpolate, Type::Interpolate);
    INTEL_CPU_NODE(RandomUniform, Type::RandomUniform);
    INTEL_CPU_NODE(Reduce, Type::Reduce);
50 changes: 50 additions & 0 deletions src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/rope.cpp
@@ -0,0 +1,50 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "rope.hpp"

#include <algorithm>

#include "transformations/itt.hpp"

ov::intel_cpu::RoPENode::RoPENode(const OutputVector& args, const Config& cfg) : Op(args), m_config(cfg) {
    constructor_validate_and_infer_types();
}

std::shared_ptr<ngraph::Node> ov::intel_cpu::RoPENode::clone_with_new_inputs(
    const ngraph::OutputVector& new_args) const {
    INTERNAL_OP_SCOPE(RoPENode_with_new_inputs);
    check_new_args_count(this, new_args);
    return std::make_shared<ov::intel_cpu::RoPENode>(new_args, m_config);
}

void ov::intel_cpu::RoPENode::validate_and_infer_types() {
    INTERNAL_OP_SCOPE(RoPENode_validate_and_infer_types);
    auto input_pshape = get_input_partial_shape(0);
    auto input_slice_size = m_config.slice_stop - m_config.slice_start;
    if (input_slice_size > 0) {
        input_pshape[3] = input_slice_size;
    }
    if (m_config.input_trans0213) {
        // transpose 0213 ([B,L,H,S]=>[B,H,L,S]) happens before RoPE
        std::swap(input_pshape[2], input_pshape[1]);
    } else if (m_config.is_interleaved) {
        // transpose 0213 ([B,L,H,S]=>[B,H,L,S]) happens after RoPE
        std::swap(input_pshape[2], input_pshape[1]);
    }

    set_output_type(0, get_input_element_type(0), input_pshape);
}

bool ov::intel_cpu::RoPENode::visit_attributes(ngraph::AttributeVisitor& visitor) {
    INTERNAL_OP_SCOPE(RoPENode_visit_attributes);
    visitor.start_structure("config");
    visitor.on_attribute("slice_start", m_config.slice_start);
    visitor.on_attribute("slice_stop", m_config.slice_stop);
    visitor.on_attribute("input_trans0213", m_config.input_trans0213);
    visitor.on_attribute("is_interleaved", m_config.is_interleaved);
    visitor.on_attribute("rotary_ndims", m_config.rotary_ndims);
    visitor.on_attribute("gather_position_arg_id", m_config.gather_position_arg_id);
    visitor.finish_structure();
    return true;
}
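
The Config struct visited above is declared in rope.hpp, which this diff references but does not show; the following is a hypothetical reconstruction inferred from the attribute names and how the executors read the fields (types and defaults are assumptions):

// Hypothetical sketch of RoPENode::Config; the authoritative definition is in
// src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/rope.hpp.
struct Config {
    int slice_start = 0;               // slice of input dim 3, active when slice_stop > slice_start
    int slice_stop = 0;
    bool input_trans0213 = false;      // fold a [B,L,H,S] => [B,H,L,S] transpose into the op input
    bool is_interleaved = false;       // GPT-J-style interleaved rotation instead of rotate-half
    size_t rotary_ndims = 0;           // how many leading features are rotated
    int gather_position_arg_id = 0;    // >0: index of the input port holding gathered positions
};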
