bytedance · YellowHCH · May 25, 2024 · May 25, 2024 · May 25, 2024 · May 25, 2024
diff --git a/compiler/include/byteir/Dialect/SCF/Passes.h b/compiler/include/byteir/Dialect/SCF/Passes.h
@@ -18,6 +18,7 @@
 #ifndef BYTEIR_DIALECT_SCF_PASSES_H
 #define BYTEIR_DIALECT_SCF_PASSES_H
 
+#include "byteir/Dialect/SCF/Transforms/FuseNestedForall.h"
 #include "byteir/Dialect/SCF/Transforms/InsertTrivialSCFLoop.h"
 
 namespace mlir {

diff --git a/compiler/include/byteir/Dialect/SCF/Passes.td b/compiler/include/byteir/Dialect/SCF/Passes.td
@@ -38,4 +38,21 @@ def InsertTrivialSCFLoop : Pass<"insert-trivial-scf-loop", "mlir::func::FuncOp">
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// FuseNestedForall
+//===----------------------------------------------------------------------===//
+
+// Registered under the `fuse-nested-forall` flag and anchored on func.func;
+// instances are built by mlir::createFuseNestedForallPass().
+def FuseNestedForall : Pass<"fuse-nested-forall", "mlir::func::FuncOp"> {
+  let summary = "Fuse nested forall if possible";
+  let constructor = "mlir::createFuseNestedForallPass()";
+  // Only the SCF dialect is declared as a dependency of this pass.
+  let dependentDialects = [
+    "scf::SCFDialect"
+  ];
+  let options = [
+    // `anchor-tag` defaults to the empty string. NOTE(review): presumably an
+    // empty tag means "no filtering" -- confirm against the implementation.
+    Option<"anchorTag", "anchor-tag", "std::string",
+            /*default=*/"",
+            "Optional unitAttr anchored tag to apply this pass">
+  ];
+}
+
 #endif // BYTEIR_DIALECT_SCF_PASSES
diff --git a/compiler/include/byteir/Dialect/SCF/Transforms/FuseNestedForall.h b/compiler/include/byteir/Dialect/SCF/Transforms/FuseNestedForall.h
@@ -0,0 +1,34 @@
+//===- FuseNestedForall.h ---------------------------------------*- C++ -*-===//
+//
+// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_DIALECT_SCF_TRANSFORMS_FUSENESTEDFORALL_H
+#define BYTEIR_DIALECT_SCF_TRANSFORMS_FUSENESTEDFORALL_H
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace mlir {
+namespace func {
+class FuncOp; // forward declaration; only used as a template argument below
+} // namespace func
+
+/// Creates the function-level pass that Passes.td registers as
+/// `fuse-nested-forall`.
+///
+/// \p anchorTag has the same name and default (empty string) as the pass's
+/// `anchor-tag` option. NOTE(review): presumably only code tagged with a unit
+/// attribute of that name is processed -- confirm in the implementation.
+std::unique_ptr<OperationPass<func::FuncOp>>
+createFuseNestedForallPass(llvm::StringRef anchorTag = "");
+
+} // namespace mlir
+
+#endif // BYTEIR_DIALECT_SCF_TRANSFORMS_FUSENESTEDFORALL_H
diff --git a/compiler/include/byteir/Dialect/Transform/Passes.td b/compiler/include/byteir/Dialect/Transform/Passes.td
@@ -42,6 +42,9 @@ def DetensorizeTransformInsertion : Pass<"insert-detensorize-transform", "Module
   let summary = "Insert detensorize transform IR to functions.";
   let constructor = "mlir::createDetensorizeTransformInsertionPass()";
   let options = [
+    Option<"usingVectorizeOp", "using-vectorize-op", "bool",
+            /*default=*/"false",
+            "using vectorizeOp to detensorize linalg op.">,
     Option<"funcAnchorAttr", "func-anchor", "std::string",
             /*default=*/"",
             "An optional Unit attribute anchoring on target functions.">,

diff --git a/compiler/include/byteir/Dialect/Transform/Transforms/TransformInsertion.h b/compiler/include/byteir/Dialect/Transform/Transforms/TransformInsertion.h
@@ -39,7 +39,7 @@ createGenericTransformInsertionPass(const TransformInsertionConfig &config);
 
+// NOTE(review): `usingVectorizeOp` is inserted ahead of `funcAnchor`. An
+// existing caller that passes a string literal as the first argument still
+// compiles, because `const char *` converts implicitly to `bool`, but it now
+// sets usingVectorizeOp=true and leaves funcAnchor at its default. Audit all
+// call sites that pass arguments positionally.
 std::unique_ptr<OperationPass<ModuleOp>>
 createDetensorizeTransformInsertionPass(
-    const std::string &funcAnchor = "",
+    const bool usingVectorizeOp = false, const std::string &funcAnchor = "",
     const std::string &matchPrefix = "__byteir_detensorize");
 
 std::unique_ptr<OperationPass<ModuleOp>> createFuseExtTransformInsertionPass(

diff --git a/compiler/include/byteir/Dialect/Vector/Transforms/MoveForallRegionIntoWarpOp.h b/compiler/include/byteir/Dialect/Vector/Transforms/MoveForallRegionIntoWarpOp.h
@@ -0,0 +1,39 @@
+//===- MoveForallRegionIntoWarpOp.h ---------------------------*--- C++ -*-===//
+//
+// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_DIALECT_VECTOR_TRANSFORMS_MOVEFORALLREGIONINTOWARPOP_H
+#define BYTEIR_DIALECT_VECTOR_TRANSFORMS_MOVEFORALLREGIONINTOWARPOP_H
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace mlir {
+namespace func {
+class FuncOp; // forward declaration; only used as a template argument below
+} // namespace func
+
+/// Returns the attribute name
+/// "__byteir_move_forall_region_into_warp_execute_on_lane0".
+/// NOTE(review): presumably the marker this pass attaches to or looks for on
+/// ops -- confirm against the pass implementation.
+constexpr StringRef getMoveForallRegionIntoWarpOpAttrName() {
+  return "__byteir_move_forall_region_into_warp_execute_on_lane0";
+}
+
+/// Creates the function-level pass that the Vector Passes.td registers as
+/// `move-forall-region-into-warp-op`.
+///
+/// \p warpSize and \p anchorTag carry the same names and defaults (32 and the
+/// empty string) as the pass's `warp-size` and `anchor-tag` options.
+std::unique_ptr<OperationPass<func::FuncOp>>
+createMoveForallRegionIntoWarpOpPass(int64_t warpSize = 32,
+                                     llvm::StringRef anchorTag = "");
+
+} // namespace mlir
+
+#endif // BYTEIR_DIALECT_VECTOR_TRANSFORMS_MOVEFORALLREGIONINTOWARPOP_H
diff --git a/compiler/include/byteir/Dialect/Vector/Transforms/Passes.h b/compiler/include/byteir/Dialect/Vector/Transforms/Passes.h
@@ -18,13 +18,20 @@
 #ifndef BYTEIR_DIALECT_VECTOR_TRANSFORMS_PASSES_H
 #define BYTEIR_DIALECT_VECTOR_TRANSFORMS_PASSES_H
 
+#include "byteir/Dialect/Vector/Transforms/MoveForallRegionIntoWarpOp.h"
+#include "byteir/Dialect/Vector/Transforms/VectorWarpDistribute.h"
 #include "mlir/Pass/Pass.h"
 #include <memory>
 
 namespace mlir {
+namespace func {
+class FuncOp;
+} // namespace func
 
 /// Generate the code for registering transforms passes.
 #define GEN_PASS_DECL_VECTORTRANSPOSELOWERINGPASS
+#define GEN_PASS_DECL_MOVEFORALLREGIONINTOWARPOPPASS
+#define GEN_PASS_DECL_SCALARVECTORLOWERINGPASS
 #define GEN_PASS_REGISTRATION
 #include "byteir/Dialect/Vector/Transforms/Passes.h.inc"
 

diff --git a/compiler/include/byteir/Dialect/Vector/Transforms/Passes.td b/compiler/include/byteir/Dialect/Vector/Transforms/Passes.td
@@ -36,5 +36,72 @@ def VectorTransposeLoweringPass : Pass<"vector-transpose-lowering", "func::FuncO
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// Move Forall Region Into WarpOp
+//===----------------------------------------------------------------------===//
+
+// Registered under the `move-forall-region-into-warp-op` flag and anchored on
+// func.func; instances are built by mlir::createMoveForallRegionIntoWarpOpPass().
+def MoveForallRegionIntoWarpOpPass : Pass<"move-forall-region-into-warp-op", "mlir::func::FuncOp"> {
+  let summary = "move region of forall into warp_execute_on_lane_0 op";
+  let constructor = "mlir::createMoveForallRegionIntoWarpOpPass()";
+  let dependentDialects = [
+    "memref::MemRefDialect",
+    "vector::VectorDialect",
+    "gpu::GPUDialect",
+  ];
+  let options = [
+    // Defaults here (32 and "") match the default arguments of the C++
+    // constructor declared in MoveForallRegionIntoWarpOp.h.
+    Option<"warpSize", "warp-size", "int64_t", "32", "warp size">,
+    Option<"anchorTag", "anchor-tag", "std::string",
+            /*default=*/"",
+            "Optional unitAttr anchored tag to apply this pass">
+  ];
+}
+
+//===----------------------------------------------------------------------===//
+// Vector Warp Distribute
+//===----------------------------------------------------------------------===//
 
+// Registered under the `vector-warp-distribute` flag and anchored on
+// func.func; instances are built by mlir::createVectorWarpDistributePass().
+// Every boolean option below defaults to false, so each behavior must be
+// requested explicitly.
+def VectorWarpDistributePass : Pass<"vector-warp-distribute", "mlir::func::FuncOp"> {
+  let summary = "vector warp distribute transformation";
+  let constructor = "mlir::createVectorWarpDistributePass()";
+  let dependentDialects = [
+    "scf::SCFDialect",
+    "memref::MemRefDialect",
+    "vector::VectorDialect",
+    "gpu::GPUDialect",
+    "affine::AffineDialect",
+  ];
+  let options = [
+    Option<"warpOpToSCF", "rewrite-warp-ops-to-scf-if", "bool",
+          /*default=*/"false",
+          "Lower vector.warp_execute_on_lane_0 to scf.if op">,
+
+    Option<"distributeTransferWriteOps", "distribute-transfer-write", "bool",
+          /*default=*/"false",
+          "distribution of transfer write">,
+
+    Option<"hoistUniform", "hoist-uniform", "bool",
+          /*default=*/"false",
+          "hoist-uniform">,
+
+    Option<"propagateDistribution", "propagate-distribution", "bool",
+          /*default=*/"false",
+          "distribution propagation">,
+
+    Option<"maxTransferWriteElements", "max-transfer-write-elements", "int64_t",
+          /*default=*/"1",
+          "Maximum number of transfer write elements to distribute">,
+  ];
+}
+
+//===----------------------------------------------------------------------===//
+// Scalar Vector Lowering
+//===----------------------------------------------------------------------===//
+
+// Registered under the `scalar-vector-lowering` flag and anchored on
+// func::FuncOp. No `constructor` and no options are given here.
+// NOTE(review): the create function is then presumably the TableGen-generated
+// one enabled by GEN_PASS_DECL_SCALARVECTORLOWERINGPASS in Passes.h -- confirm.
+def ScalarVectorLoweringPass : Pass<"scalar-vector-lowering", "func::FuncOp"> {
+  let summary = "Pass to lower scalar vector";
+  let dependentDialects = [
+    "memref::MemRefDialect",
+    "vector::VectorDialect"
+  ];
+}
 #endif // BYTEIR_DIALECT_VECTOR_TRANSFORMS_PASSES
diff --git a/compiler/include/byteir/Dialect/Vector/Transforms/VectorWarpDistribute.h b/compiler/include/byteir/Dialect/Vector/Transforms/VectorWarpDistribute.h
@@ -0,0 +1,38 @@
+//===- VectorWarpDistribute.h ---------------------------*--- C++ -*-===//
+//
+// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_DIALECT_VECTOR_TRANSFORMS_VECTORWARPDISTRIBUTE_H
+#define BYTEIR_DIALECT_VECTOR_TRANSFORMS_VECTORWARPDISTRIBUTE_H
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace mlir {
+namespace func {
+class FuncOp; // forward declaration; only used as a template argument below
+} // namespace func
+
+// Pull in the TableGen-generated declarations for VectorWarpDistributePass.
+// VectorWarpDistributePassOptions, used as the parameter type below, is not
+// declared anywhere else in this header.
+#define GEN_PASS_DECL_VECTORWARPDISTRIBUTEPASS
+#include "byteir/Dialect/Vector/Transforms/Passes.h.inc"
+
+/// Creates the function-level pass that the Vector Passes.td registers as
+/// `vector-warp-distribute`.
+///
+/// \p options defaults to a default-constructed
+/// VectorWarpDistributePassOptions.
+std::unique_ptr<OperationPass<func::FuncOp>>
+createVectorWarpDistributePass(const VectorWarpDistributePassOptions &options =
+                                   VectorWarpDistributePassOptions());
+
+} // namespace mlir
+
+#endif // BYTEIR_DIALECT_VECTOR_TRANSFORMS_VECTORWARPDISTRIBUTE_H
diff --git a/compiler/include/byteir/Dialect/mhlo/Passes.h b/compiler/include/byteir/Dialect/mhlo/Passes.h
@@ -62,6 +62,10 @@ inline void registerByteIRMhloPassesExt() {
     return mlir::createConcatSliceFusionPass();
   });
 
+  // register createInsertSliceWithElemwiseFusionPass
+  ::mlir::registerPass([]() -> std::unique_ptr<::mlir::Pass> {
+    return mlir::createInsertSliceWithElemwiseFusionPass();
+  });
+
   // register createCatFusionPass
   ::mlir::registerPass([]() -> std::unique_ptr<::mlir::Pass> {
     return mlir::createCatFusionPass();

diff --git a/compiler/include/byteir/Dialect/mhlo/Transforms/HloFuser.h b/compiler/include/byteir/Dialect/mhlo/Transforms/HloFuser.h
@@ -95,6 +95,9 @@ createElementFusionPass(bool clusterSingleElemwiseOp = false,
 
 std::unique_ptr<OperationPass<func::FuncOp>> createConcatSliceFusionPass();
 
+/// Creates the function-level pass that registerByteIRMhloPassesExt()
+/// registers in byteir/Dialect/mhlo/Passes.h. Takes no options.
+/// NOTE(review): going by its name, it presumably fuses an insert-slice with
+/// neighbouring elementwise ops -- confirm against the implementation.
+std::unique_ptr<OperationPass<func::FuncOp>>
+createInsertSliceWithElemwiseFusionPass();
+
 std::unique_ptr<OperationPass<func::FuncOp>> createMatmulEpilogueFusionPass();
 
 std::unique_ptr<OperationPass<func::FuncOp>> createIOConvertFusionPass();

diff --git a/compiler/include/byteir/Pipelines/GPU/MappingForall.h b/compiler/include/byteir/Pipelines/GPU/MappingForall.h
@@ -18,6 +18,7 @@
 #ifndef BYTEIR_PIPELINES_GPU_MAPPING_FORALL_H
 #define BYTEIR_PIPELINES_GPU_MAPPING_FORALL_H
 
+#include "byteir/Utils/OptionUtils.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Pass/PassOptions.h"
 #include "mlir/Pass/PassRegistry.h"
@@ -34,6 +35,12 @@ struct GPUMappingForallOptions
       *this, "annotate-prefix",
       llvm::cl::desc("An optional annotate prefix attribute on target ops."),
       llvm::cl::init("__byteir_gpu_split_grid_reduction")};
+  Option<int64_t> warpSize{*this, "warp-size", llvm::cl::desc("warp size."),
+                           llvm::cl::init(32)};
+  Option<llvm::cl::KernelDims> blockDimsHint{
+      *this, "block-size-hint",
+      llvm::cl::desc("block dims hint for dynamic shape."),
+      llvm::cl::init(llvm::cl::KernelDims{1024, 1, 1})};
   // TODO: option for grid/block dims hint
 };
 

diff --git a/compiler/include/byteir/Pipelines/GPU/ReductionCodegen.h b/compiler/include/byteir/Pipelines/GPU/ReductionCodegen.h
@@ -54,9 +54,6 @@ struct GPUTileGridReductionOptions
                            llvm::cl::init(32)};
   Option<int64_t> blockSize{*this, "block-size", llvm::cl::desc("block size"),
                             llvm::cl::init(256)};
-  Option<bool> usingForall{*this, "using-forall",
-                           llvm::cl::desc("using forall"),
-                           llvm::cl::init(true)};
 };
 
 struct GPUSplitBlockReductionOptions
@@ -92,9 +89,44 @@ struct GPUTileBlockReductionOptions
                            llvm::cl::init(32)};
   Option<int64_t> blockSize{*this, "block-size", llvm::cl::desc("block size"),
                             llvm::cl::init(256)};
-  Option<bool> usingForall{*this, "using-forall",
-                           llvm::cl::desc("using forall"),
-                           llvm::cl::init(true)};
+};
+
+/// Command-line options for the pipeline registered as
+/// `insert-gpu-tile-split-warp-reduction-transform` (built by
+/// createGPUTileSplitWarpReductionTransform).
+struct GPUTileSplitWarpReductionOptions
+    : public PassPipelineOptions<GPUTileSplitWarpReductionOptions> {
+  // Empty by default.
+  Option<std::string> funcAnchor{
+      *this, "func-anchor",
+      llvm::cl::desc(
+          "An optional Unit attribute anchoring on target functions."),
+      llvm::cl::init("")};
+  Option<std::string> annotatePrefix{
+      *this, "annotate-prefix",
+      llvm::cl::desc("An optional annotate prefix attribute on target ops."),
+      llvm::cl::init("__byteir_gpu_split_warp_reduction")};
+  // Defaults: 256 for the block size, 32 for the warp size.
+  Option<int64_t> blockSize{*this, "block-size", llvm::cl::desc("block size"),
+                            llvm::cl::init(256)};
+  Option<int64_t> warpSize{*this, "warp-size", llvm::cl::desc("warp size"),
+                           llvm::cl::init(32)};
+};
+
+/// Command-line options for the pipeline registered as
+/// `insert-gpu-tile-warp-reduction-transform` (built by
+/// createGPUTileWarpReductionTransform).
+struct GPUTileWarpReductionOptions
+    : public PassPipelineOptions<GPUTileWarpReductionOptions> {
+  // Empty by default.
+  Option<std::string> funcAnchor{
+      *this, "func-anchor",
+      llvm::cl::desc(
+          "An optional Unit attribute anchoring on target functions."),
+      llvm::cl::init("")};
+  Option<std::string> annotatePrefix{
+      *this, "annotate-prefix",
+      llvm::cl::desc("An optional annotate prefix attribute on target ops."),
+      llvm::cl::init("__byteir_gpu_warp_reduction")};
+  // Split factor and warp size both default to 32.
+  Option<int64_t> splitFactor{*this, "split-factor",
+                              llvm::cl::desc("split factor"),
+                              llvm::cl::init(32)};
+  Option<int64_t> warpSize{*this, "warp-size", llvm::cl::desc("warp size"),
+                           llvm::cl::init(32)};
+  // Enabled by default.
+  Option<bool> usingGPUShuffle{*this, "using-gpu-shuffle",
+                               llvm::cl::desc("using gpu shuffle"),
+                               llvm::cl::init(true)};
 };
 
 struct GPUTileThreadReductionOptions
@@ -118,6 +150,10 @@ void createGPUSplitBlockReductionTransform(
     OpPassManager &pm, const GPUSplitBlockReductionOptions &options);
 void createGPUTileBlockReductionTransform(
     OpPassManager &pm, const GPUTileBlockReductionOptions &options);
+// Builders for the two warp-level reduction pipelines. Each takes the pass
+// manager plus its own options struct and is registered under an
+// `insert-gpu-tile-*-warp-reduction-transform` flag in
+// registerGPUReductionCodegenPipelines().
+void createGPUTileSplitWarpReductionTransform(
+    OpPassManager &pm, const GPUTileSplitWarpReductionOptions &options);
+void createGPUTileWarpReductionTransform(
+    OpPassManager &pm, const GPUTileWarpReductionOptions &options);
 void createGPUTileThreadReductionTransform(
     OpPassManager &pm, const GPUTileThreadReductionOptions &options);
 
@@ -142,6 +178,16 @@ inline void registerGPUReductionCodegenPipelines() {
       "Insert transformation IR to tile linalg reduction op",
       createGPUTileBlockReductionTransform);
 
+  // Warp-level reduction pipelines: each flag is paired with the builder that
+  // takes the matching options struct.
+  PassPipelineRegistration<GPUTileSplitWarpReductionOptions>(
+      "insert-gpu-tile-split-warp-reduction-transform",
+      "Insert transformation IR to split block reduction to warp",
+      createGPUTileSplitWarpReductionTransform);
+
+  PassPipelineRegistration<GPUTileWarpReductionOptions>(
+      "insert-gpu-tile-warp-reduction-transform",
+      "Insert transformation IR to vectorize warp reduction",
+      createGPUTileWarpReductionTransform);
+
   PassPipelineRegistration<GPUTileThreadReductionOptions>(
       "insert-gpu-tile-thread-reduction-transform",
       "Insert transformation IR to tile linalg reduction op",