bytedance · Xinyu302 · Jun 23, 2024 · Jun 23, 2024 · Jun 23, 2024 · Jun 23, 2024
diff --git a/compiler/include/byteir/Conversion/Passes.h b/compiler/include/byteir/Conversion/Passes.h
@@ -34,6 +34,7 @@
 #include "byteir/Conversion/ToLLVM/ToLLVM.h"
 #include "byteir/Conversion/ToLinalg/ToLinalg.h"
 #include "byteir/Conversion/ToPTX/ToPTX.h"
+#include "byteir/Conversion/VectorToGPU/GPUVectorToGPU.h"
 
 namespace mlir {
 

diff --git a/compiler/include/byteir/Conversion/Passes.td b/compiler/include/byteir/Conversion/Passes.td
@@ -45,6 +45,16 @@ def GPUToNVVMExt : Pass<"gpu-to-nvvm-ext", "gpu::GPUModuleOp"> {
   ];
 }
 
+
+//===----------------------------------------------------------------------===//
+// GPUVectorToGPU
+//===----------------------------------------------------------------------===//
+// Pass anchored on func::FuncOp and exposed on the command line as
+// `gpu-vector-to-gpu`; instances are built by mlir::createGPUVectorToGPUPass().
+// NOTE(review): the summary names `gpu.mma.sync`, but the op with that
+// mnemonic is in the nvgpu dialect (`nvgpu.mma.sync`) - confirm which op is
+// meant and whether this pass needs a matching dependentDialects entry.
+def GPUVectorToGPU : Pass<"gpu-vector-to-gpu", "func::FuncOp"> {
+  let summary = "Transform vector.contract to gpu.mma.sync.";
+  let constructor = "mlir::createGPUVectorToGPUPass()";
+}
+
+
 //===----------------------------------------------------------------------===//
 // ToLinalg
 //===----------------------------------------------------------------------===//

diff --git a/compiler/include/byteir/Conversion/VectorToGPU/GPUVectorToGPU.h b/compiler/include/byteir/Conversion/VectorToGPU/GPUVectorToGPU.h
@@ -0,0 +1,34 @@
+//===- GPUVectorToGPU.h --------------------------------------*--- C++ -*-===//
+//
+// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_CONVERSION_VECTORTOGPU_GPUVECTORTOGPU_H
+#define BYTEIR_CONVERSION_VECTORTOGPU_GPUVECTORTOGPU_H
+
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/StringRef.h"
+#include <memory>
+
+namespace mlir {
+namespace func {
+// Forward declaration: only the name is needed to spell the factory's return
+// type below.
+class FuncOp;
+} // namespace func
+
+/// Factory for the pass that Conversion/Passes.td declares as
+/// `gpu-vector-to-gpu` (its constructor field names this function).
+/// Returns a newly created pass that operates on func::FuncOp.
+std::unique_ptr<OperationPass<func::FuncOp>> createGPUVectorToGPUPass();
+
+} // namespace mlir
+
+#endif // BYTEIR_CONVERSION_VECTORTOGPU_GPUVECTORTOGPU_H
diff --git a/compiler/include/byteir/Dialect/GPU/Passes.h b/compiler/include/byteir/Dialect/GPU/Passes.h
@@ -21,8 +21,10 @@
 #include "byteir/Dialect/GPU/Transforms/GPUBlockSwizzle.h"
 #include "byteir/Dialect/GPU/Transforms/GPUDistributeSharedMemoryCopy.h"
 #include "byteir/Dialect/GPU/Transforms/GPUDistributeToWarp.h"
+#include "byteir/Dialect/GPU/Transforms/GPUInputSharedMemorySwizzle.h"
 #include "byteir/Dialect/GPU/Transforms/GPUPackSharedMemoryAlloc.h"
 #include "byteir/Dialect/GPU/Transforms/GPUTensorCoreVectorization.h"
+#include "byteir/Dialect/GPU/Transforms/LegalizeGPULaunch.h"
 #include "byteir/Dialect/GPU/Transforms/OptimizeVectorTransfer.h"
 #include "byteir/Dialect/GPU/Transforms/RemoveTrivialLoops.h"
 #include "mlir/Pass/Pass.h"

diff --git a/compiler/include/byteir/Dialect/GPU/Passes.td b/compiler/include/byteir/Dialect/GPU/Passes.td
@@ -101,6 +101,24 @@ def GPUTensorCoreVectorization : Pass<"gpu-tensorcore-vectorization", "func::Fun
 def GPUPackSharedMemoryAlloc : Pass<"gpu-pack-shared-memory-alloc", "func::FuncOp"> {
   let summary = "Analysis shared memory reuse and pack it into i8 alloc.";
   let constructor = "mlir::createGPUPackSharedMemoryAllocPass()";
+  // Declares the NVGPU dialect as a dependency of this pass.
+  // NOTE(review): presumably the pass creates or inspects nvgpu ops; the
+  // implementation is not visible here - confirm.
+  let dependentDialects = [
+    "nvgpu::NVGPUDialect",
+  ];
 }
 
+//===----------------------------------------------------------------------===//
+// LegalizeGPULaunch
+//===----------------------------------------------------------------------===//
+// Pass anchored on func::FuncOp and exposed on the command line as
+// `legalize-gpu-launch`; instances are built by
+// mlir::createLegalizeGPULaunchPass().
+def LegalizeGPULaunch : Pass<"legalize-gpu-launch", "func::FuncOp"> {
+  let summary = "Legalize GPU launch ops.";
+  let constructor = "mlir::createLegalizeGPULaunchPass()";
+}
+
+//===----------------------------------------------------------------------===//
+// GPUInputSharedMemorySwizzle
+//===----------------------------------------------------------------------===//
+// Pass anchored on func::FuncOp and exposed on the command line as
+// `gpu-input-shared-memory-swizzle`; instances are built by
+// mlir::createGPUInputSharedMemorySwizzlePass().
+def GPUInputSharedMemorySwizzle : Pass<"gpu-input-shared-memory-swizzle", "func::FuncOp"> {
+  let summary = "Swizzle shared memory for gemm's input to improve performance.";
+  let constructor = "mlir::createGPUInputSharedMemorySwizzlePass()";
+}
 #endif // BYTEIR_DIALECT_GPU_PASSES
diff --git a/compiler/include/byteir/Dialect/GPU/Transforms/GPUInputSharedMemorySwizzle.h b/compiler/include/byteir/Dialect/GPU/Transforms/GPUInputSharedMemorySwizzle.h
@@ -0,0 +1,36 @@
+//===- GPUInputSharedMemorySwizzle.h -------------------------*--- C++ -*-===//
+//
+// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_DIALECT_GPU_TRANSFORMS_GPUINPUTSHAREDMEMORYSWIZZLE_H
+#define BYTEIR_DIALECT_GPU_TRANSFORMS_GPUINPUTSHAREDMEMORYSWIZZLE_H
+
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/StringRef.h"
+#include <memory>
+
+namespace mlir {
+namespace func {
+// Forward declaration: only the name is needed to spell the factory's return
+// type below.
+class FuncOp;
+} // namespace func
+
+/// Factory for the pass that Dialect/GPU/Passes.td declares as
+/// `gpu-input-shared-memory-swizzle` (its constructor field names this
+/// function). Returns a newly created pass that operates on func::FuncOp.
+std::unique_ptr<OperationPass<func::FuncOp>>
+createGPUInputSharedMemorySwizzlePass();
+
+} // namespace mlir
+
+#endif // BYTEIR_DIALECT_GPU_TRANSFORMS_GPUINPUTSHAREDMEMORYSWIZZLE_H
diff --git a/compiler/include/byteir/Dialect/GPU/Transforms/LegalizeGPULaunch.h b/compiler/include/byteir/Dialect/GPU/Transforms/LegalizeGPULaunch.h
@@ -0,0 +1,34 @@
+//===- LegalizeGPULaunch.h ---------------------------------*--- C++ -*-===//
+//
+// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_DIALECT_GPU_TRANSFORMS_LEGALIZEGPULAUNCH_H
+#define BYTEIR_DIALECT_GPU_TRANSFORMS_LEGALIZEGPULAUNCH_H
+
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/StringRef.h"
+#include <memory>
+
+namespace mlir {
+namespace func {
+// Forward declaration: only the name is needed to spell the factory's return
+// type below.
+class FuncOp;
+} // namespace func
+
+/// Factory for the pass that Dialect/GPU/Passes.td declares as
+/// `legalize-gpu-launch` (its constructor field names this function).
+/// Returns a newly created pass that operates on func::FuncOp.
+std::unique_ptr<OperationPass<func::FuncOp>> createLegalizeGPULaunchPass();
+
+} // namespace mlir
+
+#endif // BYTEIR_DIALECT_GPU_TRANSFORMS_LEGALIZEGPULAUNCH_H
diff --git a/compiler/include/byteir/Dialect/GPU/Transforms/Utils.h b/compiler/include/byteir/Dialect/GPU/Transforms/Utils.h
@@ -49,7 +49,43 @@ static constexpr StringRef getCopyRelatedToWorkgroupMemoryMarker() {
   return "__byteir_copy_related_to_workgroup_memory__";
 }
 
-static constexpr StringRef getVectorizeMarker() { return "vectorizeMarker"; }
+// Marker / attribute-name accessors. Each one returns a fixed string literal.
+// What a given marker is attached to is inferred from the accessor's name
+// only; confirm against the passes that set and query it.
+
+// Marker string "vectorize".
+static constexpr StringRef getVectorizeMarker() { return "vectorize"; }
+
+// Marker for the allocation associated with matmul operand A.
+static constexpr StringRef getAllocSharedMemoryAMarker() {
+  return "__byteir_alloca_matrix_a__";
+}
+
+// Marker for the allocation associated with matmul operand B.
+static constexpr StringRef getAllocSharedMemoryBMarker() {
+  return "__byteir_alloca_matrix_b__";
+}
+
+// Marker for the allocation associated with the matmul accumulator.
+static constexpr StringRef getAllocSharedMemoryAccMarker() {
+  return "__byteir_alloca_accumulator__";
+}
+
+// Marker for the copy that loads matmul operand A.
+static constexpr StringRef getCopyToSharedMemoryAMarker() {
+  return "__byteir_load_matrix_a__";
+}
+
+// Marker for the copy that loads matmul operand B.
+static constexpr StringRef getCopyToSharedMemoryBMarker() {
+  return "__byteir_load_matrix_b__";
+}
+
+// Marker for the copy that stores the matmul result (matrix C).
+static constexpr StringRef getCopyFromSharedMemoryAccMarker() {
+  return "__byteir_store_matrix_c__";
+}
+
+// Marker for the matmul main loop.
+static constexpr StringRef getMatmulMainLoopMarker() {
+  return "__byteir_main_loop__";
+}
+
+// Attribute name "__byteir_mma_level__".
+constexpr StringRef getLinalgMMALevelAttrName() {
+  return "__byteir_mma_level__";
+}
+
+// Attribute name "__byteir_mma__".
+constexpr StringRef getMMAPatternAttrName() { return "__byteir_mma__"; }
+
+// Marker for the matmul epilogue.
+static constexpr StringRef getEpilogueMarker() { return "__byteir_epilogue__"; }
 
 std::optional<SmallVector<int64_t, 3>> getGemmTileSize(func::FuncOp funcOp);
 std::optional<SmallVector<int64_t, 3>> getGemmBlockSize(func::FuncOp funcOp);
@@ -72,7 +108,7 @@ bool isMappedToGPUThreads(Operation *op);
 // Get the ForallOp which mapped to threadblock level in a function.
 // There should be only one valid ForallOp, otherwise the function will return
 // std::nullopt;
-std::optional<scf::ForallOp> getForallOpMappedTo2DBlock(func::FuncOp funcOp);
+std::optional<scf::ForallOp> getForallOpMappedToBlock(func::FuncOp funcOp);
 
 // Set a marker attribute on the operation.
 // The marker is represented as a UnitAttr.
@@ -104,6 +140,8 @@ LogicalResult
 distributeLinalgOpsWithFilter(IRRewriter &rewriter, Operation *root,
                               linalg::LinalgTilingOptions tilingOptions,
                               linalg_ext::LinalgTransformationFilter filter);
+
+// Predicate over an arbitrary operation.
+// NOTE(review): presumably returns true when `op` is a Linalg matmul-like op;
+// the definition is not in this header - confirm the exact set of ops
+// accepted and whether a null `op` is allowed.
+bool isLinalgOpMatmul(Operation *op);
 } // namespace mlir
 
 #endif // BYTEIR_UTILS_GPU_CODEGEN_UTILS_H
diff --git a/compiler/include/byteir/Dialect/Linalg/Passes.h b/compiler/include/byteir/Dialect/Linalg/Passes.h
@@ -19,6 +19,7 @@
 #define BYTEIR_DIALECT_LINALG_PASSES_H
 
 #include "byteir/Dialect/Linalg/Transforms/Bufferize.h"
+#include "byteir/Dialect/Linalg/Transforms/CanonicalizeMatmulEpilogue.h"
 #include "byteir/Dialect/Linalg/Transforms/FuseElementwise.h"
 #include "byteir/Dialect/Linalg/Transforms/LinalgCollapseLoops.h"
 #include "byteir/Dialect/Linalg/Transforms/LinalgDataPlace.h"

diff --git a/compiler/include/byteir/Dialect/Linalg/Passes.td b/compiler/include/byteir/Dialect/Linalg/Passes.td
@@ -198,4 +198,13 @@ def LinalgGeneralizationExt : Pass<"linalg-generalization-ext", "func::FuncOp">
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// CanonicalizeMatmulEpilogue
+//===----------------------------------------------------------------------===//
+
+// Pass anchored on func::FuncOp and exposed on the command line as
+// `canonicalize-matmul-epilogue`; instances are built by
+// mlir::createCanonicalizeMatmulEpiloguePass().
+def CanonicalizeMatmulEpilogue : Pass<"canonicalize-matmul-epilogue", "func::FuncOp"> {
+  let summary = "Canonicalize matmul epilogue";
+  let constructor = "mlir::createCanonicalizeMatmulEpiloguePass()";
+}
+
 #endif // BYTEIR_DIALECT_LINALG_PASSES
diff --git a/compiler/include/byteir/Dialect/Linalg/Transforms/CanonicalizeMatmulEpilogue.h b/compiler/include/byteir/Dialect/Linalg/Transforms/CanonicalizeMatmulEpilogue.h
@@ -0,0 +1,35 @@
+//===- CanonicalizeMatmulEpilogue.h ---------------------------*--- C++ -*-===//
+//
+// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_DIALECT_LINALG_TRANSFORMS_CANONICALIZEMATMULEPILOGUE_H
+#define BYTEIR_DIALECT_LINALG_TRANSFORMS_CANONICALIZEMATMULEPILOGUE_H
+
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/StringRef.h"
+#include <memory>
+
+namespace mlir {
+namespace func {
+// Forward declaration: only the name is needed to spell the factory's return
+// type below.
+class FuncOp;
+} // namespace func
+
+/// Factory for the pass that Dialect/Linalg/Passes.td declares as
+/// `canonicalize-matmul-epilogue` (its constructor field names this
+/// function). Returns a newly created pass that operates on func::FuncOp.
+std::unique_ptr<OperationPass<func::FuncOp>>
+createCanonicalizeMatmulEpiloguePass();
+
+} // namespace mlir
+
+#endif // BYTEIR_DIALECT_LINALG_TRANSFORMS_CANONICALIZEMATMULEPILOGUE_H
diff --git a/compiler/include/byteir/Pipelines/GPU/GemmCodegen.h b/compiler/include/byteir/Pipelines/GPU/GemmCodegen.h
@@ -0,0 +1,88 @@
+//===- GemmCodegen.h -----------------------------------------*--- C++ -*-===//
+//
+// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_PIPELINES_GPU_GEMM_CODEGEN_H
+#define BYTEIR_PIPELINES_GPU_GEMM_CODEGEN_H
+
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassOptions.h"
+#include "mlir/Pass/PassRegistry.h"
+
+namespace mlir {
+
+/// Command-line options for the pipeline registered as
+/// `insert-gpu-gemm-codegen-transform`.
+///
+/// Options and defaults:
+///   - `func-anchor`      (string, default ""): see its description string.
+///   - `annotate-prefix`  (string, default "__byteir_gpu_tile_gemm").
+///   - `tile-size-config` (list of int64, no default given).
+///   - `workgroup-size`   (list of int64, no default given).
+///   - `stages`           (int64, default 3).
+struct GPUGemmCodegenConfigOptions
+    : public PassPipelineOptions<GPUGemmCodegenConfigOptions> {
+  // Same flag name, description and default as
+  // GPUGemmGeneralOptions::funcAnchor.
+  Option<std::string> funcAnchor{
+      *this, "func-anchor",
+      llvm::cl::desc(
+          "An optional Unit attribute anchoring on target functions."),
+      llvm::cl::init("")};
+  // Same flag name, description and default as
+  // GPUGemmGeneralOptions::annotatePrefix.
+  Option<std::string> annotatePrefix{
+      *this, "annotate-prefix",
+      llvm::cl::desc("An optional annotate prefix attribute on target ops."),
+      llvm::cl::init("__byteir_gpu_tile_gemm")};
+  // NOTE(review): expected length/order of the list (e.g. M, N, K) is not
+  // stated here - confirm against the pipeline implementation.
+  ListOption<int64_t> tileSizeConfig{
+      *this, "tile-size-config",
+      llvm::cl::desc("An optional tile size config for tile matmul op.")};
+  // NOTE(review): expected length/order of the list is not stated here -
+  // confirm against the pipeline implementation.
+  ListOption<int64_t> workgroupSize{
+      *this, "workgroup-size",
+      llvm::cl::desc("An optional workgroup size config for tile matmul op.")};
+  // Defaults to 3 when the flag is not given.
+  Option<int64_t> stages{
+      *this, "stages", llvm::cl::desc("An optional stages for tile matmul op."),
+      llvm::cl::init(3)};
+};
+
+/// Command-line options shared by the pipelines registered as
+/// `insert-gpu-tile-gemm-transform` and `insert-gpu-pipelining-transform`.
+///
+/// Options and defaults:
+///   - `func-anchor`     (string, default "").
+///   - `annotate-prefix` (string, default "__byteir_gpu_tile_gemm").
+struct GPUGemmGeneralOptions
+    : public PassPipelineOptions<GPUGemmGeneralOptions> {
+  // Empty by default.
+  Option<std::string> funcAnchor{
+      *this, "func-anchor",
+      llvm::cl::desc(
+          "An optional Unit attribute anchoring on target functions."),
+      llvm::cl::init("")};
+  // Defaults to "__byteir_gpu_tile_gemm".
+  Option<std::string> annotatePrefix{
+      *this, "annotate-prefix",
+      llvm::cl::desc("An optional annotate prefix attribute on target ops."),
+      llvm::cl::init("__byteir_gpu_tile_gemm")};
+};
+
+/// Pipeline builder registered under `insert-gpu-tile-gemm-transform`.
+/// Takes the pass manager `pm` and the parsed `options`; defined elsewhere.
+void createGPUTileGemmTransform(OpPassManager &pm,
+                                const GPUGemmGeneralOptions &options);
+
+/// Pipeline builder registered under `insert-gpu-gemm-codegen-transform`.
+/// Takes the pass manager `pm` and the parsed `options`; defined elsewhere.
+void createGPUAddGemmCodegenLoweringConfigTransform(
+    OpPassManager &pm, const GPUGemmCodegenConfigOptions &options);
+
+/// Pipeline builder registered under `insert-gpu-pipelining-transform`.
+/// Takes the pass manager `pm` and the parsed `options`; defined elsewhere.
+void createGPUPipeliningTransform(OpPassManager &pm,
+                                  const GPUGemmGeneralOptions &options);
+
+/// Registers the three GEMM codegen pass pipelines with the global pass
+/// pipeline registry:
+///   - `insert-gpu-tile-gemm-transform`    -> createGPUTileGemmTransform
+///   - `insert-gpu-gemm-codegen-transform` ->
+///         createGPUAddGemmCodegenLoweringConfigTransform
+///   - `insert-gpu-pipelining-transform`   -> createGPUPipeliningTransform
+///
+/// Each pipeline gets its own description so that the help output can tell
+/// them apart.
+inline void registerGPUGemmCodegenPipelines() {
+  PassPipelineRegistration<GPUGemmGeneralOptions>(
+      "insert-gpu-tile-gemm-transform",
+      "Insert transformation IR to tile linalg matmul op",
+      createGPUTileGemmTransform);
+  PassPipelineRegistration<GPUGemmCodegenConfigOptions>(
+      "insert-gpu-gemm-codegen-transform",
+      "Insert transformation IR to add gemm codegen lowering config",
+      createGPUAddGemmCodegenLoweringConfigTransform);
+  PassPipelineRegistration<GPUGemmGeneralOptions>(
+      "insert-gpu-pipelining-transform",
+      "Insert transformation IR to apply gpu pipelining",
+      createGPUPipeliningTransform);
+}
+
+} // namespace mlir
+
+#endif // BYTEIR_PIPELINES_GPU_GEMM_CODEGEN_H
diff --git a/compiler/include/byteir/Pipelines/HloFusionOpt.h b/compiler/include/byteir/Pipelines/HloFusionOpt.h
@@ -47,6 +47,10 @@ struct HloFusionOptPipelineOptions
       *this, "outline-cat-op",
       llvm::cl::desc("whether to outline cat ops and AIT as an backend"),
       llvm::cl::init(false)};
+  // Boolean flag `outline-dot-op`; off (false) unless explicitly set.
+  Option<bool> outlineDotOp{
+      *this, "outline-dot-op",
+      llvm::cl::desc("whether to outline dot ops and use gemm codegen"),
+      llvm::cl::init(false)};
 };
 
 void createHloFusionOptPipeline(OpPassManager &pm,

diff --git a/compiler/lib/Conversion/CMakeLists.txt b/compiler/lib/Conversion/CMakeLists.txt
@@ -13,3 +13,4 @@ add_subdirectory(ToLinalg)
 add_subdirectory(ToLLVM)
 add_subdirectory(ToPTX)
 add_subdirectory(LcclToByre)
+add_subdirectory(VectorToGPU)