Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GEMM codegen] Add e2e pipeline #316

Open
wants to merge 49 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
0c7ed3e
add gpu-to-vector and nvvm lowering and handle dynamic shared memory
Jun 23, 2024
2f1dfc0
remove gpu pipelining
Jun 23, 2024
973466a
clang-format
Jun 23, 2024
d534ccb
dynamic shared mem support in runtime
Jun 23, 2024
17fbbb2
modify test file
Jun 23, 2024
d60f90c
handle kernel outline and add barrier at linalg promotion
Jun 23, 2024
1644860
add isLinalgOpMatmul
Jun 23, 2024
c4d3b35
fix bug of GPUPackSharedMemory
Jun 23, 2024
9dcf5d7
fix bug for single iteration
Jun 24, 2024
0b256ec
add shared memory swizzle
Jun 24, 2024
b976885
support fp16 fp16 fp32
Jun 24, 2024
fee4551
judge fp16 ext matmul
Jun 24, 2024
770a99c
fix bug for multibuffer
Jun 24, 2024
98f8f42
fix bug for pack shared memory, don't know why yet
Jun 24, 2024
c43cd7b
add whole pipeline
Jun 24, 2024
e991f04
add support for bmm
Jun 25, 2024
c17477d
add epilogue fusion support at LinalgTensorOp level
Jun 25, 2024
b8d7471
bug fix: GPUDistributeWarp tilesize calculation
Jun 25, 2024
0f7c2a7
support TF32x1
Jun 26, 2024
bce0e27
disable createGPUBlockSwizzlePass temporarily
Jun 26, 2024
f1b8d2e
Merge remote-tracking branch 'upstream/main' into gemm-e2e
Jul 2, 2024
24a307d
fix test failure
Jul 2, 2024
18ae7bc
add enable_gemm_codegen option
Jul 2, 2024
e8e0279
small fix
Jul 2, 2024
0da6bd1
fix small bug
Jul 2, 2024
5999ee7
rename a utils and add support for threadblock swizzle in bmm
Jul 2, 2024
c6b0e59
clang-format
Jul 2, 2024
93e0780
add numerical test for gemm codegen
Jul 2, 2024
b5f4c1a
Merge remote-tracking branch 'upstream/main' into gemm-e2e
Jul 2, 2024
9924fa8
bug fix
Jul 2, 2024
b490893
fix filecheck
Jul 3, 2024
36b10ac
fix
Jul 3, 2024
04b2050
fix according to xg's opinion
Jul 3, 2024
3e488e3
support epilogue fusion started from linalg-memref-opt
Jul 3, 2024
39671f6
Merge remote-tracking branch 'upstream/main' into gemm-e2e
Jul 4, 2024
8e8b2e6
shape judgement
Jul 4, 2024
a372c78
fix tensor shape in matmulEpilogueFusion.mlir
Xinyu302 Jul 5, 2024
9987ca3
Merge remote-tracking branch 'upstream/main' into gemm-e2e
Xinyu302 Jul 5, 2024
94fce00
Merge branch 'main' into gemm-e2e
Xinyu302 Jul 5, 2024
d7d714d
support epilogue fussion
Jul 5, 2024
9d53a1e
delete useless print
Jul 5, 2024
d93c6be
Merge branch 'main' into gemm-e2e
Jul 5, 2024
ee2ab65
Merge remote-tracking branch 'upstream/main' into gemm-e2e
Jul 16, 2024
976eff8
add test case for matmul epilogue canonicalize
Jul 16, 2024
30f4ef8
to pass test
Jul 19, 2024
e929786
Merge remote-tracking branch 'upstream/main' into gemm-e2e
Xinyu302 Aug 12, 2024
aa396a0
fix bug
Xinyu302 Aug 15, 2024
a9e8f3b
renew test case
Xinyu302 Aug 15, 2024
d78c3d3
add gemm relu test
Xinyu302 Aug 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions compiler/include/byteir/Conversion/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "byteir/Conversion/ToLLVM/ToLLVM.h"
#include "byteir/Conversion/ToLinalg/ToLinalg.h"
#include "byteir/Conversion/ToPTX/ToPTX.h"
#include "byteir/Conversion/VectorToGPU/GPUVectorToGPU.h"

namespace mlir {

Expand Down
10 changes: 10 additions & 0 deletions compiler/include/byteir/Conversion/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,16 @@ def GPUToNVVMExt : Pass<"gpu-to-nvvm-ext", "gpu::GPUModuleOp"> {
];
}


//===----------------------------------------------------------------------===//
// GPUVectorToGPU
//===----------------------------------------------------------------------===//
def GPUVectorToGPU : Pass<"gpu-vector-to-gpu", "func::FuncOp"> {
let summary = "Transform vector.contract to gpu.mma.sync.";
let constructor = "mlir::createGPUVectorToGPUPass()";
}


//===----------------------------------------------------------------------===//
// ToLinalg
//===----------------------------------------------------------------------===//
Expand Down
34 changes: 34 additions & 0 deletions compiler/include/byteir/Conversion/VectorToGPU/GPUVectorToGPU.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
//===- GPUVectorToGPU.h --------------------------------------*--- C++ -*-===//
//
// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//

#ifndef BYTEIR_CONVERSION_VECTORTOGPU_GPUVECTORTOGPU_H
#define BYTEIR_CONVERSION_VECTORTOGPU_GPUVECTORTOGPU_H

#include "mlir/Pass/Pass.h"
#include "llvm/ADT/StringRef.h"
#include <memory>

namespace mlir {
namespace func {
class FuncOp;
} // namespace func

std::unique_ptr<OperationPass<func::FuncOp>> createGPUVectorToGPUPass();

} // namespace mlir

#endif // BYTEIR_CONVERSION_VECTORTOGPU_GPUVECTORTOGPU_H
2 changes: 2 additions & 0 deletions compiler/include/byteir/Dialect/GPU/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@
#include "byteir/Dialect/GPU/Transforms/GPUBlockSwizzle.h"
#include "byteir/Dialect/GPU/Transforms/GPUDistributeSharedMemoryCopy.h"
#include "byteir/Dialect/GPU/Transforms/GPUDistributeToWarp.h"
#include "byteir/Dialect/GPU/Transforms/GPUInputSharedMemorySwizzle.h"
#include "byteir/Dialect/GPU/Transforms/GPUPackSharedMemoryAlloc.h"
#include "byteir/Dialect/GPU/Transforms/GPUTensorCoreVectorization.h"
#include "byteir/Dialect/GPU/Transforms/LegalizeGPULaunch.h"
#include "byteir/Dialect/GPU/Transforms/OptimizeVectorTransfer.h"
#include "byteir/Dialect/GPU/Transforms/RemoveTrivialLoops.h"
#include "mlir/Pass/Pass.h"
Expand Down
18 changes: 18 additions & 0 deletions compiler/include/byteir/Dialect/GPU/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,24 @@ def GPUTensorCoreVectorization : Pass<"gpu-tensorcore-vectorization", "func::Fun
def GPUPackSharedMemoryAlloc : Pass<"gpu-pack-shared-memory-alloc", "func::FuncOp"> {
let summary = "Analysis shared memory reuse and pack it into i8 alloc.";
let constructor = "mlir::createGPUPackSharedMemoryAllocPass()";
let dependentDialects = [
"nvgpu::NVGPUDialect",
];
}

//===----------------------------------------------------------------------===//
// LegalizeGPULaunch
//===----------------------------------------------------------------------===//
def LegalizeGPULaunch : Pass<"legalize-gpu-launch", "func::FuncOp"> {
let summary = "Legalize GPU launch ops.";
let constructor = "mlir::createLegalizeGPULaunchPass()";
}

//===----------------------------------------------------------------------===//
// GPUInputSharedMemorySwizzle
//===----------------------------------------------------------------------===//
def GPUInputSharedMemorySwizzle: Pass<"gpu-input-shared-memory-swizzle", "func::FuncOp"> {
let summary = "Swizzle shared memory for gemm's input to improve performance.";
let constructor = "mlir::createGPUInputSharedMemorySwizzlePass()";
}
#endif // BYTEIR_DIALECT_GPU_PASSES
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
//===- GPUInputSharedMemorySwizzle.h ---------------------------------*---
// C++-*-===//
//
// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//

#ifndef BYTEIR_DIALECT_GPU_TRANSFORMS_GPUINPUTSHAREDMEMORYSWIZZLE_H
#define BYTEIR_DIALECT_GPU_TRANSFORMS_GPUINPUTSHAREDMEMORYSWIZZLE_H

#include "mlir/Pass/Pass.h"
#include "llvm/ADT/StringRef.h"
#include <memory>

namespace mlir {
namespace func {
class FuncOp;
} // namespace func

std::unique_ptr<OperationPass<func::FuncOp>>
createGPUInputSharedMemorySwizzlePass();

} // namespace mlir

#endif // BYTEIR_DIALECT_GPU_TRANSFORMS_GPUINPUTSHAREDMEMORYSWIZZLE_H
34 changes: 34 additions & 0 deletions compiler/include/byteir/Dialect/GPU/Transforms/LegalizeGPULaunch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
//===- LegalizeGPULaunch.h ---------------------------------*--- C++ -*-===//
//
// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//

#ifndef BYTEIR_DIALECT_GPU_TRANSFORMS_LEGALIZEGPULAUNCH_H
#define BYTEIR_DIALECT_GPU_TRANSFORMS_LEGALIZEGPULAUNCH_H

#include "mlir/Pass/Pass.h"
#include "llvm/ADT/StringRef.h"
#include <memory>

namespace mlir {
namespace func {
class FuncOp;
} // namespace func

std::unique_ptr<OperationPass<func::FuncOp>> createLegalizeGPULaunchPass();

} // namespace mlir

#endif // BYTEIR_DIALECT_GPU_TRANSFORMS_LEGALIZEGPULAUNCH_H
42 changes: 40 additions & 2 deletions compiler/include/byteir/Dialect/GPU/Transforms/Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,43 @@ static constexpr StringRef getCopyRelatedToWorkgroupMemoryMarker() {
return "__byteir_copy_related_to_workgroup_memory__";
}

static constexpr StringRef getVectorizeMarker() { return "vectorizeMarker"; }
static constexpr StringRef getVectorizeMarker() { return "vectorize"; }

static constexpr StringRef getAllocSharedMemoryAMarker() {
return "__byteir_alloca_matrix_a__";
};

static constexpr StringRef getAllocSharedMemoryBMarker() {
return "__byteir_alloca_matrix_b__";
};

static constexpr StringRef getAllocSharedMemoryAccMarker() {
return "__byteir_alloca_accumulator__";
};

static constexpr StringRef getCopyToSharedMemoryAMarker() {
return "__byteir_load_matrix_a__";
};

static constexpr StringRef getCopyToSharedMemoryBMarker() {
return "__byteir_load_matrix_b__";
};

static constexpr StringRef getCopyFromSharedMemoryAccMarker() {
return "__byteir_store_matrix_c__";
};

static constexpr StringRef getMatmulMainLoopMarker() {
return "__byteir_main_loop__";
}

constexpr StringRef getLinalgMMALevelAttrName() {
return "__byteir_mma_level__";
}

constexpr StringRef getMMAPatternAttrName() { return "__byteir_mma__"; }

static constexpr StringRef getEpilogueMarker() { return "__byteir_epilogue__"; }

std::optional<SmallVector<int64_t, 3>> getGemmTileSize(func::FuncOp funcOp);
std::optional<SmallVector<int64_t, 3>> getGemmBlockSize(func::FuncOp funcOp);
Expand All @@ -72,7 +108,7 @@ bool isMappedToGPUThreads(Operation *op);
// Get the ForallOp which mapped to threadblock level in a function.
// There should be only one valid ForallOp, otherwise the function will return
// std::nullopt;
std::optional<scf::ForallOp> getForallOpMappedTo2DBlock(func::FuncOp funcOp);
std::optional<scf::ForallOp> getForallOpMappedToBlock(func::FuncOp funcOp);

// Set a marker attribute on the operation.
// The marker is represented as a UnitAttr.
Expand Down Expand Up @@ -104,6 +140,8 @@ LogicalResult
distributeLinalgOpsWithFilter(IRRewriter &rewriter, Operation *root,
linalg::LinalgTilingOptions tilingOptions,
linalg_ext::LinalgTransformationFilter filter);

bool isLinalgOpMatmul(Operation *op);
} // namespace mlir

#endif // BYTEIR_UTILS_GPU_CODEGEN_UTILS_H
1 change: 1 addition & 0 deletions compiler/include/byteir/Dialect/Linalg/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#define BYTEIR_DIALECT_LINALG_PASSES_H

#include "byteir/Dialect/Linalg/Transforms/Bufferize.h"
#include "byteir/Dialect/Linalg/Transforms/CanonicalizeMatmulEpilogue.h"
#include "byteir/Dialect/Linalg/Transforms/FuseElementwise.h"
#include "byteir/Dialect/Linalg/Transforms/LinalgCollapseLoops.h"
#include "byteir/Dialect/Linalg/Transforms/LinalgDataPlace.h"
Expand Down
9 changes: 9 additions & 0 deletions compiler/include/byteir/Dialect/Linalg/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -198,4 +198,13 @@ def LinalgGeneralizationExt : Pass<"linalg-generalization-ext", "func::FuncOp">
];
}

//===----------------------------------------------------------------------===//
// CanonicalizeMatmulEpilogue
//===----------------------------------------------------------------------===//

def CanonicalizeMatmulEpilogue : Pass<"canonicalize-matmul-epilogue", "func::FuncOp"> {
let summary = "Canonicalize matmul epilogue";
let constructor = "mlir::createCanonicalizeMatmulEpiloguePass()";
}

#endif // BYTEIR_DIALECT_LINALG_PASSES
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
//===- LinalgPromote.h --------------------------------------*--- C++ -*-===//
//
// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//

#ifndef BYTEIR_DIALECT_LINALG_TRANSFORMS_CANONICALIZEMATMULEPILOGUE_H
#define BYTEIR_DIALECT_LINALG_TRANSFORMS_CANONICALIZEMATMULEPILOGUE_H

#include "mlir/Pass/Pass.h"
#include "llvm/ADT/StringRef.h"
#include <memory>

namespace mlir {
namespace func {
class FuncOp;
} // namespace func

std::unique_ptr<OperationPass<func::FuncOp>>
createCanonicalizeMatmulEpiloguePass();

} // namespace mlir

#endif // BYTEIR_DIALECT_LINALG_TRANSFORMS_CANONICALIZEMATMULEPILOGUE_H
88 changes: 88 additions & 0 deletions compiler/include/byteir/Pipelines/GPU/GemmCodegen.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
//===- GemmCodegen.h -----------------------------------------*--- C++ -*-===//
//
// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//

#ifndef BYTEIR_PIPELINES_GPU_GEMM_CODEGEN_H
#define BYTEIR_PIPELINES_GPU_GEMM_CODEGEN_H

#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassOptions.h"
#include "mlir/Pass/PassRegistry.h"

namespace mlir {

struct GPUGemmCodegenConfigOptions
: public PassPipelineOptions<GPUGemmCodegenConfigOptions> {
Option<std::string> funcAnchor{
*this, "func-anchor",
llvm::cl::desc(
"An optional Unit attribute anchoring on target functions."),
llvm::cl::init("")};
Option<std::string> annotatePrefix{
*this, "annotate-prefix",
llvm::cl::desc("An optional annotate prefix attribute on target ops."),
llvm::cl::init("__byteir_gpu_tile_gemm")};
ListOption<int64_t> tileSizeConfig{
*this, "tile-size-config",
llvm::cl::desc("An optional tile size config for tile matmul op.")};
ListOption<int64_t> workgroupSize{
*this, "workgroup-size",
llvm::cl::desc("An optional workgroup size config for tile matmul op.")};
Option<int64_t> stages{
*this, "stages", llvm::cl::desc("An optional stages for tile matmul op."),
llvm::cl::init(3)};
};

struct GPUGemmGeneralOptions
: public PassPipelineOptions<GPUGemmGeneralOptions> {
Option<std::string> funcAnchor{
*this, "func-anchor",
llvm::cl::desc(
"An optional Unit attribute anchoring on target functions."),
llvm::cl::init("")};
Option<std::string> annotatePrefix{
*this, "annotate-prefix",
llvm::cl::desc("An optional annotate prefix attribute on target ops."),
llvm::cl::init("__byteir_gpu_tile_gemm")};
};

void createGPUTileGemmTransform(OpPassManager &pm,
const GPUGemmGeneralOptions &options);

void createGPUAddGemmCodegenLoweringConfigTransform(
OpPassManager &pm, const GPUGemmCodegenConfigOptions &options);

void createGPUPipeliningTransform(OpPassManager &pm,
const GPUGemmGeneralOptions &options);

inline void registerGPUGemmCodegenPipelines() {
PassPipelineRegistration<GPUGemmGeneralOptions>(
"insert-gpu-tile-gemm-transform",
"Insert transformation IR to tile linalg matmul op",
createGPUTileGemmTransform);
PassPipelineRegistration<GPUGemmCodegenConfigOptions>(
"insert-gpu-gemm-codegen-transform",
"Insert transformation IR to tile linalg matmul op",
createGPUAddGemmCodegenLoweringConfigTransform);
PassPipelineRegistration<GPUGemmGeneralOptions>(
"insert-gpu-pipelining-transform",
"Insert transformation IR to tile linalg matmul op",
createGPUPipeliningTransform);
}

} // namespace mlir

#endif // BYTEIR_PIPELINES_GPU_GEMM_CODEGEN_H
4 changes: 4 additions & 0 deletions compiler/include/byteir/Pipelines/HloFusionOpt.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ struct HloFusionOptPipelineOptions
*this, "outline-cat-op",
llvm::cl::desc("whether to outline cat ops and AIT as an backend"),
llvm::cl::init(false)};
Option<bool> outlineDotOp{
*this, "outline-dot-op",
llvm::cl::desc("whether to outline dot ops and use gemm codegen"),
llvm::cl::init(false)};
};

void createHloFusionOptPipeline(OpPassManager &pm,
Expand Down
1 change: 1 addition & 0 deletions compiler/lib/Conversion/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ add_subdirectory(ToLinalg)
add_subdirectory(ToLLVM)
add_subdirectory(ToPTX)
add_subdirectory(LcclToByre)
add_subdirectory(VectorToGPU)
Loading
Loading