Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update openvino-mlir-gc integration #167

Open
wants to merge 22 commits into
base: mlir
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions cmake/graph-compiler.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ if (NOT DEFINED GRAPH_COMPILER_LIBS)
FetchContent_Declare(
GC
GIT_REPOSITORY https://github.com/intel/graph-compiler.git
GIT_TAG main
GIT_TAG xgniu/constant_weights_folding # zhicong/perf_test2 # yifei/mlp_benching_new
FIND_PACKAGE_ARGS NAMES GraphCompiler
)

set(GC_ENABLE_OPT OFF)
set(GC_ENABLE_TEST OFF)
set(GC_ENABLE_DNNL OFF)
set(GC_ENABLE_DNNL_API OFF)
set(GC_ENABLE_LEGACY OFF)
set(GC_ENABLE_BINDINGS_PYTHON OFF)
set(OV_BUILD_SHARED_LIBS_TMP ${BUILD_SHARED_LIBS})
Expand All @@ -31,6 +31,9 @@ if (NOT DEFINED GRAPH_COMPILER_LIBS)
GcInterface
GcJitWrapper
GcCpuRuntime
# For some branches:
# MLIRCPURuntimeTransforms
# MLIRMicrokernelTransforms
)
set_property(GLOBAL PROPERTY GRAPH_COMPILER_LIBS ${GRAPH_COMPILER_LIBS})
endif ()
Expand Down
104 changes: 104 additions & 0 deletions gc.diff
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
diff --git a/include/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmInterface.h b/include/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmInterface.h
index cb2e080..5c3dc9e 100644
--- a/include/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmInterface.h
+++ b/include/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmInterface.h
@@ -26,7 +26,7 @@ extern "C" {
* given in dnnl type value.
* Output: A handle of dispatched kernel.
*/
-int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
+__attribute__((__visibility__("default"))) int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
int64_t LDB, int64_t LDC, int64_t stride_a,
int64_t stride_b, float beta, int64_t dtypeA,
int64_t dtypeB);
@@ -36,14 +36,14 @@ int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
* Inputs: A handle of dispatched kernel.
* Output: None.
*/
-void dnnl_brgemm_tileconfig(int64_t kernel);
+__attribute__((__visibility__("default"))) void dnnl_brgemm_tileconfig(int64_t kernel);

/**
* Release the current AMX tile context.
* Inputs: None.
* Output: None.
*/
-void dnnl_brgemm_tilerelease();
+__attribute__((__visibility__("default"))) void dnnl_brgemm_tilerelease();

/**
* Execute the given kernel with given parameters.
@@ -54,7 +54,7 @@ void dnnl_brgemm_tilerelease();
* num: Batch size of Brgemm.
* Output: None.
*/
-void dnnl_brgemm_execute(int64_t kernel, void *A, uint64_t A_offset, void *B,
+__attribute__((__visibility__("default"))) void dnnl_brgemm_execute(int64_t kernel, void *A, uint64_t A_offset, void *B,
uint64_t B_offset, void *C, uint64_t C_offset,
int num);
}
diff --git a/lib/gc/ExecutionEngine/CPURuntime/MemoryPool.cpp b/lib/gc/ExecutionEngine/CPURuntime/MemoryPool.cpp
index 7b2790c..36b9c51 100644
--- a/lib/gc/ExecutionEngine/CPURuntime/MemoryPool.cpp
+++ b/lib/gc/ExecutionEngine/CPURuntime/MemoryPool.cpp
@@ -239,16 +239,16 @@ static thread_local FILOMemoryPool mainMemoryPool_{mainChunkSize};
// if the current thread is a worker thread, use this pool
static thread_local FILOMemoryPool threadMemoryPool_{threadlocalChunkSize};

-extern "C" void *gcAlignedMalloc(size_t sz) noexcept {
+extern "C" __attribute__((__visibility__("default"))) void *gcAlignedMalloc(size_t sz) noexcept {
return mainMemoryPool_.alloc(sz);
}

-extern "C" void gcAlignedFree(void *p) noexcept { mainMemoryPool_.dealloc(p); }
+extern "C" __attribute__((__visibility__("default"))) void gcAlignedFree(void *p) noexcept { mainMemoryPool_.dealloc(p); }

-extern "C" void *gcThreadAlignedMalloc(size_t sz) noexcept {
+extern "C" __attribute__((__visibility__("default"))) void *gcThreadAlignedMalloc(size_t sz) noexcept {
return threadMemoryPool_.alloc(sz);
}

-extern "C" void gcThreadAlignedFree(void *p) noexcept {
+extern "C" __attribute__((__visibility__("default"))) void gcThreadAlignedFree(void *p) noexcept {
threadMemoryPool_.dealloc(p);
}
diff --git a/lib/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmOnednn.cpp b/lib/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmOnednn.cpp
index d7dd2cc..bdf0c4d 100644
--- a/lib/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmOnednn.cpp
+++ b/lib/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmOnednn.cpp
@@ -64,7 +64,7 @@ static thread_local char scratch[SCRATCH_SIZE] = {0};

extern "C" {

-int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
+__attribute__((__visibility__("default"))) int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
int64_t LDB, int64_t LDC, int64_t stride_a,
int64_t stride_b, float beta, int64_t dtypeA,
int64_t dtypeB) {
@@ -109,7 +109,7 @@ int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
return g_brgemm_desc_list.size() - 1;
}

-void dnnl_brgemm_tileconfig(int64_t kernel_idx) {
+__attribute__((__visibility__("default"))) void dnnl_brgemm_tileconfig(int64_t kernel_idx) {
char *palette_buffer = nullptr;
{
read_lock_guard_t g(g_brgemm_lock);
@@ -126,7 +126,7 @@ void dnnl_brgemm_tileconfig(int64_t kernel_idx) {
amx_tile_configure(palette_buffer);
}

-void dnnl_brgemm_tilerelease() {
+__attribute__((__visibility__("default"))) void dnnl_brgemm_tilerelease() {
if (!mayiuse(avx512_core_amx)) {
return;
}
@@ -134,7 +134,7 @@ void dnnl_brgemm_tilerelease() {
amx_tile_release();
}

-void dnnl_brgemm_execute(int64_t kernel_idx, void *A, uint64_t A_offset,
+__attribute__((__visibility__("default"))) void dnnl_brgemm_execute(int64_t kernel_idx, void *A, uint64_t A_offset,
void *B, uint64_t B_offset, void *C, uint64_t C_offset,
int num) {
brgemm_kernel_t *kernel = nullptr;
66 changes: 63 additions & 3 deletions src/common/transformations/src/transformations/mlir/convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,13 +137,73 @@ mlir::OwningOpRef<mlir::ModuleOp> ngraph_to_mlir(MLIRContext* context,
// Affix target information attribute to the module to be used, at its discretion,
// by the MLIR-compiler that consumes this module.
auto tileSize = IntegerAttr::get(IntegerType::get(context, 32), 32);
auto key = StringAttr::get(context, "tile_size");
DataLayoutEntryInterface entry = DataLayoutEntryAttr::get(context, key, tileSize);
TargetDeviceSpecInterface deviceSpec = TargetDeviceSpecAttr::get(context, ArrayRef(entry));
auto tileSizeKey = StringAttr::get(context, "tile_size");
DataLayoutEntryInterface tileSizeEntry = DataLayoutEntryAttr::get(context, tileSizeKey, tileSize);

int numThreadsInt = 1;
if (char* numThreadsEnv = std::getenv("OMP_NUM_THREADS")) {
numThreadsInt = std::atoi(numThreadsEnv);
}
auto numThreads = IntegerAttr::get(IntegerType::get(context, 32), numThreadsInt);
auto numThreadsKey = StringAttr::get(context, "num_threads");
DataLayoutEntryInterface numThreadsEntry = DataLayoutEntryAttr::get(context, numThreadsKey, numThreads);

int L1CacheSizeInt = 49152;
if (char* L1CacheSizeEnv = std::getenv("L1_CACHE_SIZE")) {
L1CacheSizeInt = std::atoi(L1CacheSizeEnv);
}
auto L1CacheSize = IntegerAttr::get(IntegerType::get(context, 32), L1CacheSizeInt);
auto L1CacheSizeKey = StringAttr::get(context, "L1_cache_size_in_bytes");
DataLayoutEntryInterface L1CacheSizeEntry = DataLayoutEntryAttr::get(context, L1CacheSizeKey, L1CacheSize);

int L2CacheSizeInt = 2097152;
if (char* L2CacheSizeEnv = std::getenv("L2_CACHE_SIZE")) {
L2CacheSizeInt = std::atoi(L2CacheSizeEnv);
}
auto L2CacheSize = IntegerAttr::get(IntegerType::get(context, 32), L2CacheSizeInt);
auto L2CacheSizeKey = StringAttr::get(context, "L2_cache_size_in_bytes");
DataLayoutEntryInterface L2CacheSizeEntry = DataLayoutEntryAttr::get(context, L2CacheSizeKey, L2CacheSize);

int L3CacheSizeInt = 1966080;
if (char* L3CacheSizeEnv = std::getenv("L3_CACHE_SIZE")) {
L3CacheSizeInt = std::atoi(L3CacheSizeEnv);
}
auto L3CacheSize = IntegerAttr::get(IntegerType::get(context, 32), L3CacheSizeInt);
auto L3CacheSizeKey = StringAttr::get(context, "L3_cache_size_in_bytes");
DataLayoutEntryInterface L3CacheSizeEntry = DataLayoutEntryAttr::get(context, L3CacheSizeKey, L3CacheSize);

int maxVectorWidthInt = 512;
if (char* maxVectorWidthEnv = std::getenv("MAX_VECTOR_WIDTH")) {
maxVectorWidthInt = std::atoi(maxVectorWidthEnv);
}
auto maxVectorWidth = IntegerAttr::get(IntegerType::get(context, 32), maxVectorWidthInt);
auto maxVectorWidthKey = StringAttr::get(context, "max_vector_width");
DataLayoutEntryInterface maxVectorWidthEntry = DataLayoutEntryAttr::get(context, maxVectorWidthKey, maxVectorWidth);

TargetDeviceSpecInterface deviceSpec = TargetDeviceSpecAttr::get(context,
ArrayRef({tileSizeEntry,
numThreadsEntry,
L1CacheSizeEntry,
L2CacheSizeEntry,
L3CacheSizeEntry,
maxVectorWidthEntry}));
auto deviceStr = StringAttr::get(context, "CPU");
auto sysSpec = TargetSystemSpecAttr::get(context, ArrayRef(std::pair(deviceStr, deviceSpec)));
module.getOperation()->setAttr("#dlti.sys_spec", sysSpec);

std::vector<int> compiletime_const_args_index;
for (size_t i = 0; i < inputs.size(); ++i) {
auto parent = inputs[i].get_node_shared_ptr();
if (auto data_const = std::dynamic_pointer_cast<ov::op::v0::Constant>(parent)) {
OPENVINO_MLIR_DEBUG_PRINT("Mark #" << i << " input as Constant tensor\n");
compiletime_const_args_index.push_back(i);
}
}
func.getOperation()->setAttr("compiletime_const_args_index",
moduleBuilder.getI32ArrayAttr(compiletime_const_args_index));

func.getOperation()->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(), UnitAttr::get(context));

ConversionContext conversion_context(context, &block_builder);

for (size_t i = 0; i < inputs.size(); ++i) {
Expand Down
139 changes: 139 additions & 0 deletions src/common/transformations/src/transformations/mlir/mlir_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <functional>
#include <memory>
#include <string>
#include <unordered_set>

#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"
Expand Down Expand Up @@ -241,6 +242,13 @@ struct MemRefDescriptor {
}
}

// Wrap a constant-folding cache buffer as a MemRef descriptor so it can be
// appended to the packed-argument list like any other tensor argument.
// The buffer pointer is used for both the allocated and aligned fields
// (cache buffers are allocated already aligned), and the offset is 0.
// NOTE(review): takes CachedBuffer by value, which copies its shape and
// strides vectors on every construction — a const reference would avoid
// the copies; confirm no caller relies on the copy.
MemRefDescriptor(ov::mlir::CachedBuffer buffer)
: allocated(buffer.buffer),
aligned(buffer.buffer),
offset(0),
shape(buffer.shape),
strides(buffer.strides) {}

void* allocated;
void* aligned;
int64_t offset;
Expand All @@ -267,6 +275,100 @@ namespace mlir {

using namespace ::mlir;

static std::unordered_set<const MLIROp *> executed_ops;

// Query the JIT-compiled module for the constant-weights-folding metadata
// symbols and populate folding_info; allocate the cached buffers that will
// hold folded constants. Every lookup is optional: if any symbol is missing
// the function consumes the lookup error and returns early, leaving folding
// disabled (callers treat a null folding_info.fold_func as "no folding").
// Note that an early return leaves the fields assigned so far in place.
//
// Array-valued symbols use the layout [count, element0, element1, ...]:
// the first slot holds the number of elements, hence the `raw + 1, raw[0]`
// ArrayRef constructions below.
void MLIREvaluate::set_folding_info() {
{
// Number of arguments of the original (unfolded) entry function.
auto expectArgs = engine->lookup("__num_orig_args");
if (!expectArgs) {
llvm::consumeError(expectArgs.takeError());
return;
}
folding_info.num_orig_args = *reinterpret_cast<int32_t*>(*expectArgs);
}

{
// Packed-ABI fold function; executed once per op to produce the folded
// constant buffers.
auto expectFold = engine->lookupPacked(defaultFoldName);
if (!expectFold) {
llvm::consumeError(expectFold.takeError());
return;
}
folding_info.fold_func = *expectFold;
}

{
// Ids of the runtime buffers that receive folded constants.
auto expectBufferIds = engine->lookup("__runtime_fold_buffer_ids");
if (!expectBufferIds) {
llvm::consumeError(expectBufferIds.takeError());
return;
}
auto raw = reinterpret_cast<int64_t*>(*expectBufferIds);
folding_info.fold_buffer_ids = llvm::ArrayRef<int64_t>{raw + 1, raw[0]};
}

{
// Indices (into the memref argument list) passed to the fold function.
auto expectFold = engine->lookup("__fold_args");
if (!expectFold) {
llvm::consumeError(expectFold.takeError());
return;
}
auto raw = reinterpret_cast<int32_t*>(*expectFold);
folding_info.fold_args = llvm::ArrayRef<int32_t>{raw + 1, raw[0]};
}

{
// Indices (into the memref argument list) passed to the compute entry.
auto expect = engine->lookup("__compute_args");
if (!expect) {
llvm::consumeError(expect.takeError());
return;
}
auto raw = reinterpret_cast<int32_t*>(*expect);
folding_info.compute_args = llvm::ArrayRef<int32_t>{raw + 1, raw[0]};
}

{
// Rank of each folded buffer; this symbol carries no leading count —
// its length is implied by fold_buffer_ids, which must already be set.
auto expect = engine->lookup("__folded_ranks");
if (!expect) {
llvm::consumeError(expect.takeError());
return;
}
auto raw = reinterpret_cast<int32_t*>(*expect);
folding_info.folded_ranks = llvm::ArrayRef<int32_t>{raw, folding_info.fold_buffer_ids.size()};
}

{
// Flattened per-buffer shapes: for buffer i the stream holds ranks[i]
// dimension extents followed by one extra entry, the per-element byte
// width. Total stream length is therefore sum(ranks) + one extra slot
// per buffer.
auto expect = engine->lookup("__folded_shapes");
if (!expect) {
llvm::consumeError(expect.takeError());
return;
}
int32_t size = folding_info.fold_buffer_ids.size(); // element bytes of each buffer
for (auto r : folding_info.folded_ranks) {
size += r;
}
auto raw = reinterpret_cast<int64_t*>(*expect);
llvm::ArrayRef<int64_t> folded_shapes = llvm::ArrayRef<int64_t>{raw, size};
int pos = 0;
// Slice the flat stream into one vector of ranks[i] + 1 entries per
// buffer (dims plus the trailing element-byte-width entry).
for (int i = 0; i < folding_info.folded_ranks.size(); ++i) {
std::vector<int64_t> shape(folded_shapes.begin() + pos,
folded_shapes.begin() + pos + folding_info.folded_ranks[i] + 1);
pos += folding_info.folded_ranks[i] + 1;
folding_info.folded_shapes.push_back(shape);
}
}

// Allocate one 64-byte-aligned cache buffer per folded constant. Because
// each stored shape still ends with the element byte width, the product of
// all entries is the total size in bytes.
// NOTE(review): folded_shapes is indexed by buffer id here — this assumes
// ids are dense and 0-based in stream order; confirm against the producer.
// NOTE(review): std::aligned_alloc requires size to be a multiple of the
// alignment; a buffer whose byte size is not a multiple of 64 yields
// undefined behavior (glibc returns nullptr) — TODO confirm sizes.
for (auto id : folding_info.fold_buffer_ids) {
std::vector<int64_t> shape = folding_info.folded_shapes[id];
size_t size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int64_t>());
shape.pop_back(); // delete the last which is bytes of element
// Row-major strides over the dimension-only shape.
std::vector<int64_t> strides(shape.size(), 1);
for (int i = strides.size() - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * shape[i + 1];
}
void* buffer = std::aligned_alloc(/*alignment*/ 64, size);
cached_const_buffers[id] = CachedBuffer{buffer, shape, strides};
}
}

MLIREvaluate::MLIREvaluate(OwningOpRef<mlir::ModuleOp> _module, MlirMode mode) :
module(std::move(_module)) {
Expand Down Expand Up @@ -302,6 +404,14 @@ MLIREvaluate::MLIREvaluate(OwningOpRef<mlir::ModuleOp> _module, MlirMode mode) :
llvm::errs() << "failed to construct an execution engine\n";
abort();
}

set_folding_info();
}

// Release the constant-folding cache buffers allocated in
// set_folding_info(). They come from std::aligned_alloc, so std::free is
// the matching deallocation.
MLIREvaluate::~MLIREvaluate() {
    // Iterate by reference: the original `auto pair` copied each
    // pair<int64_t, CachedBuffer> — including CachedBuffer's shape and
    // strides vectors — on every iteration.
    for (auto& entry : cached_const_buffers) {
        std::free(entry.second.buffer);
    }
}

bool MLIREvaluate::invoke_packed(std::vector<void*>& args) {
Expand Down Expand Up @@ -361,6 +471,35 @@ bool MLIROp::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs)
x.append_to_packed_args(args);
});

if (engine->folding_info.fold_func == nullptr) { // No folding, call entry directly
OPENVINO_MLIR_DEBUG_PRINT("[ DEBUG ] Call entry func directly\n");
return engine->invoke_packed(args);
}

for (auto id : engine->folding_info.fold_buffer_ids) {
memref_args.push_back(MemRefDescriptor(engine->cached_const_buffers[id]));
}

args.clear();
if (executed_ops.count(this) == 0) { // Call fold
for (auto id : engine->folding_info.fold_args) {
memref_args[id].append_to_packed_args(args);
}
OPENVINO_MLIR_DEBUG_PRINT("[ DEBUG ] First executon, call fold func\n");
engine->folding_info.fold_func(args.data());

// TODO: Find a better way to check if the op has executed.
// This is a const function and can not modify member attributes directly.
executed_ops.insert(this);
}

// Call entry
args.clear();
OPENVINO_MLIR_DEBUG_PRINT("[ DEBUG ] Call entry func\n");
for (auto id : engine->folding_info.compute_args) {
memref_args[id].append_to_packed_args(args);
}

//std::cerr << "[ INFO ] Running kernel in MLIROp::evaluate\n";
return engine->invoke_packed(args);
}
Expand Down
Loading
Loading