Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update openvino-mlir-gc integration #167

Open
wants to merge 22 commits into
base: mlir
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions cmake/graph-compiler.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ if (NOT DEFINED GRAPH_COMPILER_LIBS)
FetchContent_Declare(
GC
GIT_REPOSITORY https://github.com/intel/graph-compiler.git
GIT_TAG main
GIT_TAG xgniu/constant_weights_folding # zhicong/perf_test2 # yifei/mlp_benching_new
FIND_PACKAGE_ARGS NAMES GraphCompiler
)

set(GC_ENABLE_OPT OFF)
set(GC_ENABLE_TEST OFF)
set(GC_ENABLE_DNNL OFF)
set(GC_ENABLE_DNNL_API OFF)
set(GC_ENABLE_LEGACY OFF)
set(GC_ENABLE_BINDINGS_PYTHON OFF)
set(OV_BUILD_SHARED_LIBS_TMP ${BUILD_SHARED_LIBS})
Expand All @@ -31,6 +31,9 @@ if (NOT DEFINED GRAPH_COMPILER_LIBS)
GcInterface
GcJitWrapper
GcCpuRuntime
# For some branches:
# MLIRCPURuntimeTransforms
# MLIRMicrokernelTransforms
)
set_property(GLOBAL PROPERTY GRAPH_COMPILER_LIBS ${GRAPH_COMPILER_LIBS})
endif ()
Expand Down
104 changes: 104 additions & 0 deletions gc.diff
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
diff --git a/include/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmInterface.h b/include/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmInterface.h
index cb2e080..5c3dc9e 100644
--- a/include/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmInterface.h
+++ b/include/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmInterface.h
@@ -26,7 +26,7 @@ extern "C" {
* given in dnnl type value.
* Output: A handle of dispatched kernel.
*/
-int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
+__attribute__((__visibility__("default"))) int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
int64_t LDB, int64_t LDC, int64_t stride_a,
int64_t stride_b, float beta, int64_t dtypeA,
int64_t dtypeB);
@@ -36,14 +36,14 @@ int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
* Inputs: A handle of dispatched kernel.
* Output: None.
*/
-void dnnl_brgemm_tileconfig(int64_t kernel);
+__attribute__((__visibility__("default"))) void dnnl_brgemm_tileconfig(int64_t kernel);

/**
* Release the current AMX tile context.
* Inputs: None.
* Output: None.
*/
-void dnnl_brgemm_tilerelease();
+__attribute__((__visibility__("default"))) void dnnl_brgemm_tilerelease();

/**
* Execute the given kernel with given parameters.
@@ -54,7 +54,7 @@ void dnnl_brgemm_tilerelease();
* num: Batch size of Brgemm.
* Output: None.
*/
-void dnnl_brgemm_execute(int64_t kernel, void *A, uint64_t A_offset, void *B,
+__attribute__((__visibility__("default"))) void dnnl_brgemm_execute(int64_t kernel, void *A, uint64_t A_offset, void *B,
uint64_t B_offset, void *C, uint64_t C_offset,
int num);
}
diff --git a/lib/gc/ExecutionEngine/CPURuntime/MemoryPool.cpp b/lib/gc/ExecutionEngine/CPURuntime/MemoryPool.cpp
index 7b2790c..36b9c51 100644
--- a/lib/gc/ExecutionEngine/CPURuntime/MemoryPool.cpp
+++ b/lib/gc/ExecutionEngine/CPURuntime/MemoryPool.cpp
@@ -239,16 +239,16 @@ static thread_local FILOMemoryPool mainMemoryPool_{mainChunkSize};
// if the current thread is a worker thread, use this pool
static thread_local FILOMemoryPool threadMemoryPool_{threadlocalChunkSize};

-extern "C" void *gcAlignedMalloc(size_t sz) noexcept {
+extern "C" __attribute__((__visibility__("default"))) void *gcAlignedMalloc(size_t sz) noexcept {
return mainMemoryPool_.alloc(sz);
}

-extern "C" void gcAlignedFree(void *p) noexcept { mainMemoryPool_.dealloc(p); }
+extern "C" __attribute__((__visibility__("default"))) void gcAlignedFree(void *p) noexcept { mainMemoryPool_.dealloc(p); }

-extern "C" void *gcThreadAlignedMalloc(size_t sz) noexcept {
+extern "C" __attribute__((__visibility__("default"))) void *gcThreadAlignedMalloc(size_t sz) noexcept {
return threadMemoryPool_.alloc(sz);
}

-extern "C" void gcThreadAlignedFree(void *p) noexcept {
+extern "C" __attribute__((__visibility__("default"))) void gcThreadAlignedFree(void *p) noexcept {
threadMemoryPool_.dealloc(p);
}
diff --git a/lib/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmOnednn.cpp b/lib/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmOnednn.cpp
index d7dd2cc..bdf0c4d 100644
--- a/lib/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmOnednn.cpp
+++ b/lib/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmOnednn.cpp
@@ -64,7 +64,7 @@ static thread_local char scratch[SCRATCH_SIZE] = {0};

extern "C" {

-int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
+__attribute__((__visibility__("default"))) int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
int64_t LDB, int64_t LDC, int64_t stride_a,
int64_t stride_b, float beta, int64_t dtypeA,
int64_t dtypeB) {
@@ -109,7 +109,7 @@ int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
return g_brgemm_desc_list.size() - 1;
}

-void dnnl_brgemm_tileconfig(int64_t kernel_idx) {
+__attribute__((__visibility__("default"))) void dnnl_brgemm_tileconfig(int64_t kernel_idx) {
char *palette_buffer = nullptr;
{
read_lock_guard_t g(g_brgemm_lock);
@@ -126,7 +126,7 @@ void dnnl_brgemm_tileconfig(int64_t kernel_idx) {
amx_tile_configure(palette_buffer);
}

-void dnnl_brgemm_tilerelease() {
+__attribute__((__visibility__("default"))) void dnnl_brgemm_tilerelease() {
if (!mayiuse(avx512_core_amx)) {
return;
}
@@ -134,7 +134,7 @@ void dnnl_brgemm_tilerelease() {
amx_tile_release();
}

-void dnnl_brgemm_execute(int64_t kernel_idx, void *A, uint64_t A_offset,
+__attribute__((__visibility__("default"))) void dnnl_brgemm_execute(int64_t kernel_idx, void *A, uint64_t A_offset,
void *B, uint64_t B_offset, void *C, uint64_t C_offset,
int num) {
brgemm_kernel_t *kernel = nullptr;
66 changes: 63 additions & 3 deletions src/common/transformations/src/transformations/mlir/convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,13 +137,73 @@ mlir::OwningOpRef<mlir::ModuleOp> ngraph_to_mlir(MLIRContext* context,
// Affix target information attribute to the module to be used, at its discretion,
// by the MLIR-compiler that consumes this module.
auto tileSize = IntegerAttr::get(IntegerType::get(context, 32), 32);
auto key = StringAttr::get(context, "tile_size");
DataLayoutEntryInterface entry = DataLayoutEntryAttr::get(context, key, tileSize);
TargetDeviceSpecInterface deviceSpec = TargetDeviceSpecAttr::get(context, ArrayRef(entry));
auto tileSizeKey = StringAttr::get(context, "tile_size");
DataLayoutEntryInterface tileSizeEntry = DataLayoutEntryAttr::get(context, tileSizeKey, tileSize);

int numThreadsInt = 1;
if (char* numThreadsEnv = std::getenv("OMP_NUM_THREADS")) {
numThreadsInt = std::atoi(numThreadsEnv);
}
auto numThreads = IntegerAttr::get(IntegerType::get(context, 32), numThreadsInt);
auto numThreadsKey = StringAttr::get(context, "num_threads");
DataLayoutEntryInterface numThreadsEntry = DataLayoutEntryAttr::get(context, numThreadsKey, numThreads);

int L1CacheSizeInt = 49152;
if (char* L1CacheSizeEnv = std::getenv("L1_CACHE_SIZE")) {
L1CacheSizeInt = std::atoi(L1CacheSizeEnv);
}
auto L1CacheSize = IntegerAttr::get(IntegerType::get(context, 32), L1CacheSizeInt);
auto L1CacheSizeKey = StringAttr::get(context, "L1_cache_size_in_bytes");
DataLayoutEntryInterface L1CacheSizeEntry = DataLayoutEntryAttr::get(context, L1CacheSizeKey, L1CacheSize);

int L2CacheSizeInt = 2097152;
if (char* L2CacheSizeEnv = std::getenv("L2_CACHE_SIZE")) {
L2CacheSizeInt = std::atoi(L2CacheSizeEnv);
}
auto L2CacheSize = IntegerAttr::get(IntegerType::get(context, 32), L2CacheSizeInt);
auto L2CacheSizeKey = StringAttr::get(context, "L2_cache_size_in_bytes");
DataLayoutEntryInterface L2CacheSizeEntry = DataLayoutEntryAttr::get(context, L2CacheSizeKey, L2CacheSize);

int L3CacheSizeInt = 1966080;
if (char* L3CacheSizeEnv = std::getenv("L3_CACHE_SIZE")) {
L3CacheSizeInt = std::atoi(L3CacheSizeEnv);
}
auto L3CacheSize = IntegerAttr::get(IntegerType::get(context, 32), L3CacheSizeInt);
auto L3CacheSizeKey = StringAttr::get(context, "L3_cache_size_in_bytes");
DataLayoutEntryInterface L3CacheSizeEntry = DataLayoutEntryAttr::get(context, L3CacheSizeKey, L3CacheSize);

int maxVectorWidthInt = 512;
if (char* maxVectorWidthEnv = std::getenv("MAX_VECTOR_WIDTH")) {
maxVectorWidthInt = std::atoi(maxVectorWidthEnv);
}
auto maxVectorWidth = IntegerAttr::get(IntegerType::get(context, 32), maxVectorWidthInt);
auto maxVectorWidthKey = StringAttr::get(context, "max_vector_width");
DataLayoutEntryInterface maxVectorWidthEntry = DataLayoutEntryAttr::get(context, maxVectorWidthKey, maxVectorWidth);

TargetDeviceSpecInterface deviceSpec = TargetDeviceSpecAttr::get(context,
ArrayRef({tileSizeEntry,
numThreadsEntry,
L1CacheSizeEntry,
L2CacheSizeEntry,
L3CacheSizeEntry,
maxVectorWidthEntry}));
auto deviceStr = StringAttr::get(context, "CPU");
auto sysSpec = TargetSystemSpecAttr::get(context, ArrayRef(std::pair(deviceStr, deviceSpec)));
module.getOperation()->setAttr("#dlti.sys_spec", sysSpec);

std::vector<int> compiletime_const_args_index;
for (size_t i = 0; i < inputs.size(); ++i) {
auto parent = inputs[i].get_node_shared_ptr();
if (auto data_const = std::dynamic_pointer_cast<ov::op::v0::Constant>(parent)) {
OPENVINO_MLIR_DEBUG_PRINT("Mark #" << i << " input as Constant tensor\n");
compiletime_const_args_index.push_back(i);
}
}
func.getOperation()->setAttr("compiletime_const_args_index",
moduleBuilder.getI32ArrayAttr(compiletime_const_args_index));

func.getOperation()->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(), UnitAttr::get(context));

ConversionContext conversion_context(context, &block_builder);

for (size_t i = 0; i < inputs.size(); ++i) {
Expand Down
139 changes: 139 additions & 0 deletions src/common/transformations/src/transformations/mlir/mlir_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <functional>
#include <memory>
#include <string>
#include <unordered_set>

#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"
Expand Down Expand Up @@ -241,6 +242,13 @@ struct MemRefDescriptor {
}
}

// Wrap a constant-folding cache buffer as a MemRef descriptor so it can be
// appended to the packed-argument list like any other tensor argument.
// The buffer pointer is used for both the allocated and aligned fields
// (cache buffers are allocated already aligned), and the offset is 0.
// NOTE(review): takes CachedBuffer by value, which copies its shape and
// strides vectors on every construction — a const reference would avoid
// the copies; confirm no caller relies on the copy.
MemRefDescriptor(ov::mlir::CachedBuffer buffer)
: allocated(buffer.buffer),
aligned(buffer.buffer),
offset(0),
shape(buffer.shape),
strides(buffer.strides) {}

void* allocated;
void* aligned;
int64_t offset;
Expand All @@ -267,6 +275,100 @@ namespace mlir {

using namespace ::mlir;

static std::unordered_set<const MLIROp *> executed_ops;

// Query the JIT-compiled module for the constant-weights-folding metadata
// symbols and populate folding_info; allocate the cached buffers that will
// hold folded constants. Every lookup is optional: if any symbol is missing
// the function consumes the lookup error and returns early, leaving folding
// disabled (callers treat a null folding_info.fold_func as "no folding").
// Note that an early return leaves the fields assigned so far in place.
//
// Array-valued symbols use the layout [count, element0, element1, ...]:
// the first slot holds the number of elements, hence the `raw + 1, raw[0]`
// ArrayRef constructions below.
void MLIREvaluate::set_folding_info() {
{
// Number of arguments of the original (unfolded) entry function.
auto expectArgs = engine->lookup("__num_orig_args");
if (!expectArgs) {
llvm::consumeError(expectArgs.takeError());
return;
}
folding_info.num_orig_args = *reinterpret_cast<int32_t*>(*expectArgs);
}

{
// Packed-ABI fold function; executed once per op to produce the folded
// constant buffers.
auto expectFold = engine->lookupPacked(defaultFoldName);
if (!expectFold) {
llvm::consumeError(expectFold.takeError());
return;
}
folding_info.fold_func = *expectFold;
}

{
// Ids of the runtime buffers that receive folded constants.
auto expectBufferIds = engine->lookup("__runtime_fold_buffer_ids");
if (!expectBufferIds) {
llvm::consumeError(expectBufferIds.takeError());
return;
}
auto raw = reinterpret_cast<int64_t*>(*expectBufferIds);
folding_info.fold_buffer_ids = llvm::ArrayRef<int64_t>{raw + 1, raw[0]};
}

{
// Indices (into the memref argument list) passed to the fold function.
auto expectFold = engine->lookup("__fold_args");
if (!expectFold) {
llvm::consumeError(expectFold.takeError());
return;
}
auto raw = reinterpret_cast<int32_t*>(*expectFold);
folding_info.fold_args = llvm::ArrayRef<int32_t>{raw + 1, raw[0]};
}

{
// Indices (into the memref argument list) passed to the compute entry.
auto expect = engine->lookup("__compute_args");
if (!expect) {
llvm::consumeError(expect.takeError());
return;
}
auto raw = reinterpret_cast<int32_t*>(*expect);
folding_info.compute_args = llvm::ArrayRef<int32_t>{raw + 1, raw[0]};
}

{
// Rank of each folded buffer; this symbol carries no leading count —
// its length is implied by fold_buffer_ids, which must already be set.
auto expect = engine->lookup("__folded_ranks");
if (!expect) {
llvm::consumeError(expect.takeError());
return;
}
auto raw = reinterpret_cast<int32_t*>(*expect);
folding_info.folded_ranks = llvm::ArrayRef<int32_t>{raw, folding_info.fold_buffer_ids.size()};
}

{
// Flattened per-buffer shapes: for buffer i the stream holds ranks[i]
// dimension extents followed by one extra entry, the per-element byte
// width. Total stream length is therefore sum(ranks) + one extra slot
// per buffer.
auto expect = engine->lookup("__folded_shapes");
if (!expect) {
llvm::consumeError(expect.takeError());
return;
}
int32_t size = folding_info.fold_buffer_ids.size(); // element bytes of each buffer
for (auto r : folding_info.folded_ranks) {
size += r;
}
auto raw = reinterpret_cast<int64_t*>(*expect);
llvm::ArrayRef<int64_t> folded_shapes = llvm::ArrayRef<int64_t>{raw, size};
int pos = 0;
// Slice the flat stream into one vector of ranks[i] + 1 entries per
// buffer (dims plus the trailing element-byte-width entry).
for (int i = 0; i < folding_info.folded_ranks.size(); ++i) {
std::vector<int64_t> shape(folded_shapes.begin() + pos,
folded_shapes.begin() + pos + folding_info.folded_ranks[i] + 1);
pos += folding_info.folded_ranks[i] + 1;
folding_info.folded_shapes.push_back(shape);
}
}

// Allocate one 64-byte-aligned cache buffer per folded constant. Because
// each stored shape still ends with the element byte width, the product of
// all entries is the total size in bytes.
// NOTE(review): folded_shapes is indexed by buffer id here — this assumes
// ids are dense and 0-based in stream order; confirm against the producer.
// NOTE(review): std::aligned_alloc requires size to be a multiple of the
// alignment; a buffer whose byte size is not a multiple of 64 yields
// undefined behavior (glibc returns nullptr) — TODO confirm sizes.
for (auto id : folding_info.fold_buffer_ids) {
std::vector<int64_t> shape = folding_info.folded_shapes[id];
size_t size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int64_t>());
shape.pop_back(); // delete the last which is bytes of element
// Row-major strides over the dimension-only shape.
std::vector<int64_t> strides(shape.size(), 1);
for (int i = strides.size() - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * shape[i + 1];
}
void* buffer = std::aligned_alloc(/*alignment*/ 64, size);
cached_const_buffers[id] = CachedBuffer{buffer, shape, strides};
}
}

MLIREvaluate::MLIREvaluate(OwningOpRef<mlir::ModuleOp> _module, MlirMode mode) :
module(std::move(_module)) {
Expand Down Expand Up @@ -302,6 +404,14 @@ MLIREvaluate::MLIREvaluate(OwningOpRef<mlir::ModuleOp> _module, MlirMode mode) :
llvm::errs() << "failed to construct an execution engine\n";
abort();
}

set_folding_info();
}

// Release the constant-folding cache buffers allocated in
// set_folding_info(). They come from std::aligned_alloc, so std::free is
// the matching deallocation.
MLIREvaluate::~MLIREvaluate() {
    // Iterate by reference: the original `auto pair` copied each
    // pair<int64_t, CachedBuffer> — including CachedBuffer's shape and
    // strides vectors — on every iteration.
    for (auto& entry : cached_const_buffers) {
        std::free(entry.second.buffer);
    }
}

bool MLIREvaluate::invoke_packed(std::vector<void*>& args) {
Expand Down Expand Up @@ -361,6 +471,35 @@ bool MLIROp::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs)
x.append_to_packed_args(args);
});

if (engine->folding_info.fold_func == nullptr) { // No folding, call entry directly
OPENVINO_MLIR_DEBUG_PRINT("[ DEBUG ] Call entry func directly\n");
return engine->invoke_packed(args);
}

for (auto id : engine->folding_info.fold_buffer_ids) {
memref_args.push_back(MemRefDescriptor(engine->cached_const_buffers[id]));
}

args.clear();
if (executed_ops.count(this) == 0) { // Call fold
for (auto id : engine->folding_info.fold_args) {
memref_args[id].append_to_packed_args(args);
}
OPENVINO_MLIR_DEBUG_PRINT("[ DEBUG ] First executon, call fold func\n");
engine->folding_info.fold_func(args.data());

// TODO: Find a better way to check if the op has executed.
// This is a const function and can not modify member attributes directly.
executed_ops.insert(this);
}

// Call entry
args.clear();
OPENVINO_MLIR_DEBUG_PRINT("[ DEBUG ] Call entry func\n");
for (auto id : engine->folding_info.compute_args) {
memref_args[id].append_to_packed_args(args);
}

//std::cerr << "[ INFO ] Running kernel in MLIROp::evaluate\n";
return engine->invoke_packed(args);
}
Expand Down
Loading
Loading