add gpu-to-vector and nvvm lowering and handle dynamic shared memory

bytedance · Jun 23, 2024 · 0c7ed3e · 0c7ed3e
1 parent 59c2bbb
commit 0c7ed3e
Show file tree

Hide file tree

Showing 29 changed files with 1,768 additions and 30 deletions.
diff --git a/compiler/include/byteir/Dialect/GPU/Passes.h b/compiler/include/byteir/Dialect/GPU/Passes.h
@@ -18,11 +18,14 @@
 #ifndef BYTEIR_DIALECT_GPU_PASSES_H
 #define BYTEIR_DIALECT_GPU_PASSES_H
 
+#include "byteir/Dialect/GPU/Transforms/LegalizeGPULaunch.h"
 #include "byteir/Dialect/GPU/Transforms/GPUBlockSwizzle.h"
 #include "byteir/Dialect/GPU/Transforms/GPUDistributeSharedMemoryCopy.h"
 #include "byteir/Dialect/GPU/Transforms/GPUDistributeToWarp.h"
 #include "byteir/Dialect/GPU/Transforms/GPUPackSharedMemoryAlloc.h"
+#include "byteir/Dialect/GPU/Transforms/GPUPipelining.h"
 #include "byteir/Dialect/GPU/Transforms/GPUTensorCoreVectorization.h"
+#include "byteir/Dialect/GPU/Transforms/GPUVectorToGPU.h"
 #include "byteir/Dialect/GPU/Transforms/OptimizeVectorTransfer.h"
 #include "byteir/Dialect/GPU/Transforms/RemoveTrivialLoops.h"
 #include "mlir/Pass/Pass.h"

diff --git a/compiler/include/byteir/Dialect/GPU/Passes.td b/compiler/include/byteir/Dialect/GPU/Passes.td
@@ -103,4 +103,36 @@ def GPUPackSharedMemoryAlloc : Pass<"gpu-pack-shared-memory-alloc", "func::FuncO
   let constructor = "mlir::createGPUPackSharedMemoryAllocPass()";
 }
 
+//===----------------------------------------------------------------------===//
+// GPUPipelining
+//===----------------------------------------------------------------------===//
+def GPUPipelining : Pass<"gpu-pipelining", "func::FuncOp"> {
+  let summary = "Pipelining async copy and mma oprations to improve performance.";
+  let constructor = "mlir::createGPUPipeliningPass()";
+  let options = [
+    Option<"stages", "stages", "int64_t", /*default=*/"0", "the number of stages for pipelining">,
+  ];
+  let dependentDialects = [
+    "NVVM::NVVMDialect",
+  ];
+}
+
+//===----------------------------------------------------------------------===//
+// GPUVectorToGPU
+//===----------------------------------------------------------------------===//
+def GPUVectorToGPU : Pass<"gpu-vector-to-gpu", "func::FuncOp"> {
+  let summary = "Transform vector.contract to gpu.mma.sync.";
+  let constructor = "mlir::createGPUVectorToGPUPass()";
+  let dependentDialects = [
+    "nvgpu::NVGPUDialect",
+  ];
+}
+
+//===----------------------------------------------------------------------===//
+// LegalizeGPULaunch
+//===----------------------------------------------------------------------===//
+def LegalizeGPULaunch : Pass<"legalize-gpu-launch", "func::FuncOp"> {
+  let summary = "Legalize GPU launch ops.";
+  let constructor = "mlir::createLegalizeGPULaunchPass()";
+}
 #endif // BYTEIR_DIALECT_GPU_PASSES
diff --git a/compiler/include/byteir/Dialect/GPU/Transforms/GPUPipelining.h b/compiler/include/byteir/Dialect/GPU/Transforms/GPUPipelining.h
@@ -0,0 +1,36 @@
+//===- GPUPipelining.h ---------------------------------------*--- C++-*-===//
+//
+// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_DIALECT_GPU_TRANSFORMS_GPUPIPELINING_H
+#define BYTEIR_DIALECT_GPU_TRANSFORMS_GPUPIPELINING_H
+
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/StringRef.h"
+#include <memory>
+
+namespace mlir {
+namespace func {
+class FuncOp;
+} // namespace func
+
+/// Pipelining async copy and mma oprations to improve performance.
+std::unique_ptr<OperationPass<func::FuncOp>>
+createGPUPipeliningPass(int64_t stages = 0);
+
+} // namespace mlir
+
+#endif // BYTEIR_DIALECT_GPU_TRANSFORMS_GPUPIPELINING_H
diff --git a/compiler/include/byteir/Dialect/GPU/Transforms/GPUVectorToGPU.h b/compiler/include/byteir/Dialect/GPU/Transforms/GPUVectorToGPU.h
@@ -0,0 +1,35 @@
+//===- GPUVectorToGPU.h --------------------------------------*--- C++ -*-===//
+//
+// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_DIALECT_GPU_TRANSFORMS_GPUVECTORTOGPU_H
+#define BYTEIR_DIALECT_GPU_TRANSFORMS_GPUVECTORTOGPU_H
+
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/StringRef.h"
+#include <memory>
+
+namespace mlir {
+namespace func {
+class FuncOp;
+} // namespace func
+
+std::unique_ptr<OperationPass<func::FuncOp>>
+createGPUVectorToGPUPass();
+
+} // namespace mlir
+
+#endif // BYTEIR_DIALECT_GPU_TRANSFORMS_GPUVECTORTOGPU_H
diff --git a/compiler/include/byteir/Dialect/GPU/Transforms/LegalizeGPULaunch.h b/compiler/include/byteir/Dialect/GPU/Transforms/LegalizeGPULaunch.h
@@ -0,0 +1,34 @@
+//===- LegalizeGPULaunch.h ---------------------------------*--- C++ -*-===//
+//
+// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_DIALECT_GPU_TRANSFORMS_LEGALIZEGPULAUNCH_H
+#define BYTEIR_DIALECT_GPU_TRANSFORMS_LEGALIZEGPULAUNCH_H
+
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/StringRef.h"
+#include <memory>
+
+namespace mlir {
+namespace func {
+class FuncOp;
+} // namespace func
+
+std::unique_ptr<OperationPass<func::FuncOp>> createLegalizeGPULaunchPass();
+
+} // namespace mlir
+
+#endif // BYTEIR_DIALECT_GPU_TRANSFORMS_LEGALIZEGPULAUNCH_H
diff --git a/compiler/include/byteir/Dialect/GPU/Transforms/Utils.h b/compiler/include/byteir/Dialect/GPU/Transforms/Utils.h
@@ -51,6 +51,30 @@ static constexpr StringRef getCopyRelatedToWorkgroupMemoryMarker() {
 
 static constexpr StringRef getVectorizeMarker() { return "vectorizeMarker"; }
 
+static constexpr StringRef getAllocSharedMemoryAMarker() {
+  return "__byteir_alloca_matrix_a__";
+};
+
+static constexpr StringRef getAllocSharedMemoryBMarker() {
+  return "__byteir_alloca_matrix_b__";
+};
+
+static constexpr StringRef getAllocSharedMemoryAccMarker() {
+  return "__byteir_alloca_accumulator__";
+};
+
+static constexpr StringRef getCopyToSharedMemoryAMarker() {
+  return "__byteir_load_matrix_a__";
+};
+
+static constexpr StringRef getCopyToSharedMemoryBMarker() {
+  return "__byteir_load_matrix_b__";
+};
+
+static constexpr StringRef getCopyFromSharedMemoryAccMarker() {
+  return "__byteir_store_matrix_c__";
+};
+
 std::optional<SmallVector<int64_t, 3>> getGemmTileSize(func::FuncOp funcOp);
 std::optional<SmallVector<int64_t, 3>> getGemmBlockSize(func::FuncOp funcOp);
 std::optional<int64_t> getGemmPipelineDepth(func::FuncOp funcOp);

diff --git a/compiler/include/byteir/Dialect/MemRef/Transforms/MultiBufferExt.h b/compiler/include/byteir/Dialect/MemRef/Transforms/MultiBufferExt.h
@@ -0,0 +1,81 @@
+//===- RemoveCopy.h -------------------------------------------*--- C++ -*-===//
+//
+// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_DIALECT_MEMREF_TRANSFORMS_MULTIBUFFEREXT_H
+#define BYTEIR_DIALECT_MEMREF_TRANSFORMS_MULTIBUFFEREXT_H
+
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
+
+namespace mlir {
+class OpBuilder;
+class RewritePatternSet;
+class RewriterBase;
+class Value;
+class ValueRange;
+
+namespace arith {
+class WideIntEmulationConverter;
+class NarrowTypeEmulationConverter;
+} // namespace arith
+
+namespace memref {
+class AllocOp;
+class AllocaOp;
+class DeallocOp;
+
+/// Transformation to do multi-buffering/array expansion to remove dependencies
+/// on the temporary allocation between consecutive loop iterations.
+/// It returns the new allocation if the original allocation was multi-buffered
+/// and returns failure() otherwise.
+/// When `skipOverrideAnalysis`, the pass will apply the transformation
+/// without checking thwt the buffer is overrided at the beginning of each
+/// iteration. This implies that user knows that there is no data carried across
+/// loop iterations. Example:
+/// ```
+/// %0 = memref.alloc() : memref<4x128xf32>
+/// scf.for %iv = %c1 to %c1024 step %c3 {
+///   memref.copy %1, %0 : memref<4x128xf32> to memref<4x128xf32>
+///   "some_use"(%0) : (memref<4x128xf32>) -> ()
+/// }
+/// ```
+/// into:
+/// ```
+/// %0 = memref.alloc() : memref<5x4x128xf32>
+/// scf.for %iv = %c1 to %c1024 step %c3 {
+///   %s = arith.subi %iv, %c1 : index
+///   %d = arith.divsi %s, %c3 : index
+///   %i = arith.remsi %d, %c5 : index
+///   %sv = memref.subview %0[%i, 0, 0] [1, 4, 128] [1, 1, 1] :
+///     memref<5x4x128xf32> to memref<4x128xf32, strided<[128, 1], offset: ?>>
+///   memref.copy %1, %sv : memref<4x128xf32> to memref<4x128xf32, strided<...>>
+///   "some_use"(%sv) : (memref<4x128xf32, strided<...>) -> ()
+/// }
+/// ```
+template <typename AllocOpType>
+FailureOr<AllocOpType> multiBufferExt(RewriterBase &rewriter,
+                                      AllocOpType allocOp, unsigned multiplier,
+                                      bool skipOverrideAnalysis = false);
+/// Call into `multiBuffer` with  locally constructed IRRewriter.
+template <typename AllocOpType>
+FailureOr<AllocOpType> multiBufferExt(AllocOpType allocOp, unsigned multiplier,
+                                      bool skipOverrideAnalysis = false);
+
+} // namespace memref
+} // namespace mlir
+
+#endif // BYTEIR_DIALECT_MEMREF_TRANSFORMS_MULTIBUFFEREXT_H
diff --git a/compiler/include/byteir/Pipelines/GPU/GemmCodegen.h b/compiler/include/byteir/Pipelines/GPU/GemmCodegen.h
@@ -0,0 +1,88 @@
+//===- GemmCodegen.h -----------------------------------------*--- C++ -*-===//
+//
+// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_PIPELINES_GPU_GEMM_CODEGEN_H
+#define BYTEIR_PIPELINES_GPU_GEMM_CODEGEN_H
+
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassOptions.h"
+#include "mlir/Pass/PassRegistry.h"
+
+namespace mlir {
+
+struct GPUGemmCodegenConfigOptions
+    : public PassPipelineOptions<GPUGemmCodegenConfigOptions> {
+  Option<std::string> funcAnchor{
+      *this, "func-anchor",
+      llvm::cl::desc(
+          "An optional Unit attribute anchoring on target functions."),
+      llvm::cl::init("")};
+  Option<std::string> annotatePrefix{
+      *this, "annotate-prefix",
+      llvm::cl::desc("An optional annotate prefix attribute on target ops."),
+      llvm::cl::init("__byteir_gpu_tile_gemm")};
+  ListOption<int64_t> tileSizeConfig{
+      *this, "tile-size-config",
+      llvm::cl::desc("An optional tile size config for tile matmul op.")};
+  ListOption<int64_t> workgroupSize{
+      *this, "workgroup-size",
+      llvm::cl::desc("An optional workgroup size config for tile matmul op.")};
+  Option<int64_t> stages{
+      *this, "stages", llvm::cl::desc("An optional stages for tile matmul op."),
+      llvm::cl::init(3)};
+};
+
+struct GPUGemmGeneralOptions
+    : public PassPipelineOptions<GPUGemmGeneralOptions> {
+  Option<std::string> funcAnchor{
+      *this, "func-anchor",
+      llvm::cl::desc(
+          "An optional Unit attribute anchoring on target functions."),
+      llvm::cl::init("")};
+  Option<std::string> annotatePrefix{
+      *this, "annotate-prefix",
+      llvm::cl::desc("An optional annotate prefix attribute on target ops."),
+      llvm::cl::init("__byteir_gpu_tile_gemm")};
+};
+
+void createGPUTileGemmTransform(OpPassManager &pm,
+                                const GPUGemmGeneralOptions &options);
+
+void createGPUAddGemmCodegenLoweringConfigTransform(
+    OpPassManager &pm, const GPUGemmCodegenConfigOptions &options);
+
+void createGPUPipeliningTransform(OpPassManager &pm,
+                                  const GPUGemmGeneralOptions &options);
+
+inline void registerGPUGemmCodegenPipelines() {
+  PassPipelineRegistration<GPUGemmGeneralOptions>(
+      "insert-gpu-tile-gemm-transform",
+      "Insert transformation IR to tile linalg matmul op",
+      createGPUTileGemmTransform);
+  PassPipelineRegistration<GPUGemmCodegenConfigOptions>(
+      "insert-gpu-gemm-codegen-transform",
+      "Insert transformation IR to tile linalg matmul op",
+      createGPUAddGemmCodegenLoweringConfigTransform);
+  PassPipelineRegistration<GPUGemmGeneralOptions>(
+      "insert-gpu-pipelining-transform",
+      "Insert transformation IR to tile linalg matmul op",
+      createGPUPipeliningTransform);
+}
+
+} // namespace mlir
+
+#endif // BYTEIR_PIPELINES_GPU_GEMM_CODEGEN_H
diff --git a/compiler/lib/Conversion/FuncToByre/FuncToByre.cpp b/compiler/lib/Conversion/FuncToByre/FuncToByre.cpp
@@ -129,6 +129,14 @@ class ConvertGPULaunchFuncToByrePattern
     computeOp->setAttr("BlockSize.y", rewriter.getI32IntegerAttr(by));
     computeOp->setAttr("BlockSize.z", rewriter.getI32IntegerAttr(bz));
 
+    auto sharedMemorySize = launchOp.getDynamicSharedMemorySize();
+    if (sharedMemorySize) {
+      auto sharedMemorySizeValue =
+          cast<arith::ConstantOp>(sharedMemorySize.getDefiningOp());
+      IntegerAttr smem = cast<IntegerAttr>(sharedMemorySizeValue.getValue());
+      computeOp->setAttr("DynamicSharedMemorySize", smem);
+    }
+
     if (useBarePtrCallConv) {
       computeOp->setAttr(byre::getKernelCallConventionAttrName(),
                          rewriter.getStringAttr("bare_ptr"));