Skip to content

Commit

Permalink
add gpu-to-vector and nvvm lowering and handle dynamic shared memory
Browse files Browse the repository at this point in the history
  • Loading branch information
yangxinyu committed Jun 23, 2024
1 parent 59c2bbb commit 0c7ed3e
Show file tree
Hide file tree
Showing 29 changed files with 1,768 additions and 30 deletions.
3 changes: 3 additions & 0 deletions compiler/include/byteir/Dialect/GPU/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,14 @@
#ifndef BYTEIR_DIALECT_GPU_PASSES_H
#define BYTEIR_DIALECT_GPU_PASSES_H

#include "byteir/Dialect/GPU/Transforms/LegalizeGPULaunch.h"
#include "byteir/Dialect/GPU/Transforms/GPUBlockSwizzle.h"
#include "byteir/Dialect/GPU/Transforms/GPUDistributeSharedMemoryCopy.h"
#include "byteir/Dialect/GPU/Transforms/GPUDistributeToWarp.h"
#include "byteir/Dialect/GPU/Transforms/GPUPackSharedMemoryAlloc.h"
#include "byteir/Dialect/GPU/Transforms/GPUPipelining.h"
#include "byteir/Dialect/GPU/Transforms/GPUTensorCoreVectorization.h"
#include "byteir/Dialect/GPU/Transforms/GPUVectorToGPU.h"
#include "byteir/Dialect/GPU/Transforms/OptimizeVectorTransfer.h"
#include "byteir/Dialect/GPU/Transforms/RemoveTrivialLoops.h"
#include "mlir/Pass/Pass.h"
Expand Down
32 changes: 32 additions & 0 deletions compiler/include/byteir/Dialect/GPU/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -103,4 +103,36 @@ def GPUPackSharedMemoryAlloc : Pass<"gpu-pack-shared-memory-alloc", "func::FuncO
let constructor = "mlir::createGPUPackSharedMemoryAllocPass()";
}

//===----------------------------------------------------------------------===//
// GPUPipelining
//===----------------------------------------------------------------------===//
def GPUPipelining : Pass<"gpu-pipelining", "func::FuncOp"> {
let summary = "Pipelining async copy and mma oprations to improve performance.";
let constructor = "mlir::createGPUPipeliningPass()";
let options = [
Option<"stages", "stages", "int64_t", /*default=*/"0", "the number of stages for pipelining">,
];
let dependentDialects = [
"NVVM::NVVMDialect",
];
}

//===----------------------------------------------------------------------===//
// GPUVectorToGPU
//===----------------------------------------------------------------------===//
def GPUVectorToGPU : Pass<"gpu-vector-to-gpu", "func::FuncOp"> {
let summary = "Transform vector.contract to gpu.mma.sync.";
let constructor = "mlir::createGPUVectorToGPUPass()";
let dependentDialects = [
"nvgpu::NVGPUDialect",
];
}

//===----------------------------------------------------------------------===//
// LegalizeGPULaunch
//===----------------------------------------------------------------------===//
def LegalizeGPULaunch : Pass<"legalize-gpu-launch", "func::FuncOp"> {
let summary = "Legalize GPU launch ops.";
let constructor = "mlir::createLegalizeGPULaunchPass()";
}
#endif // BYTEIR_DIALECT_GPU_PASSES
36 changes: 36 additions & 0 deletions compiler/include/byteir/Dialect/GPU/Transforms/GPUPipelining.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
//===- GPUPipelining.h ---------------------------------------*--- C++-*-===//
//
// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//

#ifndef BYTEIR_DIALECT_GPU_TRANSFORMS_GPUPIPELINING_H
#define BYTEIR_DIALECT_GPU_TRANSFORMS_GPUPIPELINING_H

#include "mlir/Pass/Pass.h"
#include "llvm/ADT/StringRef.h"
#include <memory>

namespace mlir {
namespace func {
class FuncOp;
} // namespace func

/// Pipelining async copy and mma oprations to improve performance.
std::unique_ptr<OperationPass<func::FuncOp>>
createGPUPipeliningPass(int64_t stages = 0);

} // namespace mlir

#endif // BYTEIR_DIALECT_GPU_TRANSFORMS_GPUPIPELINING_H
35 changes: 35 additions & 0 deletions compiler/include/byteir/Dialect/GPU/Transforms/GPUVectorToGPU.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
//===- GPUVectorToGPU.h --------------------------------------*--- C++ -*-===//
//
// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//

#ifndef BYTEIR_DIALECT_GPU_TRANSFORMS_GPUVECTORTOGPU_H
#define BYTEIR_DIALECT_GPU_TRANSFORMS_GPUVECTORTOGPU_H

#include "mlir/Pass/Pass.h"
#include "llvm/ADT/StringRef.h"
#include <memory>

namespace mlir {
namespace func {
class FuncOp;
} // namespace func

std::unique_ptr<OperationPass<func::FuncOp>>
createGPUVectorToGPUPass();

} // namespace mlir

#endif // BYTEIR_DIALECT_GPU_TRANSFORMS_GPUVECTORTOGPU_H
34 changes: 34 additions & 0 deletions compiler/include/byteir/Dialect/GPU/Transforms/LegalizeGPULaunch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
//===- LegalizeGPULaunch.h ---------------------------------*--- C++ -*-===//
//
// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//

#ifndef BYTEIR_DIALECT_GPU_TRANSFORMS_LEGALIZEGPULAUNCH_H
#define BYTEIR_DIALECT_GPU_TRANSFORMS_LEGALIZEGPULAUNCH_H

#include "mlir/Pass/Pass.h"
#include "llvm/ADT/StringRef.h"
#include <memory>

namespace mlir {
namespace func {
class FuncOp;
} // namespace func

std::unique_ptr<OperationPass<func::FuncOp>> createLegalizeGPULaunchPass();

} // namespace mlir

#endif // BYTEIR_DIALECT_GPU_TRANSFORMS_LEGALIZEGPULAUNCH_H
24 changes: 24 additions & 0 deletions compiler/include/byteir/Dialect/GPU/Transforms/Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,30 @@ static constexpr StringRef getCopyRelatedToWorkgroupMemoryMarker() {

static constexpr StringRef getVectorizeMarker() { return "vectorizeMarker"; }

static constexpr StringRef getAllocSharedMemoryAMarker() {
return "__byteir_alloca_matrix_a__";
};

static constexpr StringRef getAllocSharedMemoryBMarker() {
return "__byteir_alloca_matrix_b__";
};

static constexpr StringRef getAllocSharedMemoryAccMarker() {
return "__byteir_alloca_accumulator__";
};

static constexpr StringRef getCopyToSharedMemoryAMarker() {
return "__byteir_load_matrix_a__";
};

static constexpr StringRef getCopyToSharedMemoryBMarker() {
return "__byteir_load_matrix_b__";
};

static constexpr StringRef getCopyFromSharedMemoryAccMarker() {
return "__byteir_store_matrix_c__";
};

std::optional<SmallVector<int64_t, 3>> getGemmTileSize(func::FuncOp funcOp);
std::optional<SmallVector<int64_t, 3>> getGemmBlockSize(func::FuncOp funcOp);
std::optional<int64_t> getGemmPipelineDepth(func::FuncOp funcOp);
Expand Down
81 changes: 81 additions & 0 deletions compiler/include/byteir/Dialect/MemRef/Transforms/MultiBufferExt.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
//===- RemoveCopy.h -------------------------------------------*--- C++ -*-===//
//
// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//

#ifndef BYTEIR_DIALECT_MEMREF_TRANSFORMS_MULTIBUFFEREXT_H
#define BYTEIR_DIALECT_MEMREF_TRANSFORMS_MULTIBUFFEREXT_H

#include "mlir/Support/LogicalResult.h"
#include "llvm/ADT/STLFunctionalExtras.h"

namespace mlir {
class OpBuilder;
class RewritePatternSet;
class RewriterBase;
class Value;
class ValueRange;

namespace arith {
class WideIntEmulationConverter;
class NarrowTypeEmulationConverter;
} // namespace arith

namespace memref {
class AllocOp;
class AllocaOp;
class DeallocOp;

/// Transformation to do multi-buffering/array expansion to remove dependencies
/// on the temporary allocation between consecutive loop iterations.
/// It returns the new allocation if the original allocation was multi-buffered
/// and returns failure() otherwise.
/// When `skipOverrideAnalysis`, the pass will apply the transformation
/// without checking thwt the buffer is overrided at the beginning of each
/// iteration. This implies that user knows that there is no data carried across
/// loop iterations. Example:
/// ```
/// %0 = memref.alloc() : memref<4x128xf32>
/// scf.for %iv = %c1 to %c1024 step %c3 {
/// memref.copy %1, %0 : memref<4x128xf32> to memref<4x128xf32>
/// "some_use"(%0) : (memref<4x128xf32>) -> ()
/// }
/// ```
/// into:
/// ```
/// %0 = memref.alloc() : memref<5x4x128xf32>
/// scf.for %iv = %c1 to %c1024 step %c3 {
/// %s = arith.subi %iv, %c1 : index
/// %d = arith.divsi %s, %c3 : index
/// %i = arith.remsi %d, %c5 : index
/// %sv = memref.subview %0[%i, 0, 0] [1, 4, 128] [1, 1, 1] :
/// memref<5x4x128xf32> to memref<4x128xf32, strided<[128, 1], offset: ?>>
/// memref.copy %1, %sv : memref<4x128xf32> to memref<4x128xf32, strided<...>>
/// "some_use"(%sv) : (memref<4x128xf32, strided<...>) -> ()
/// }
/// ```
template <typename AllocOpType>
FailureOr<AllocOpType> multiBufferExt(RewriterBase &rewriter,
AllocOpType allocOp, unsigned multiplier,
bool skipOverrideAnalysis = false);
/// Call into `multiBuffer` with locally constructed IRRewriter.
template <typename AllocOpType>
FailureOr<AllocOpType> multiBufferExt(AllocOpType allocOp, unsigned multiplier,
bool skipOverrideAnalysis = false);

} // namespace memref
} // namespace mlir

#endif // BYTEIR_DIALECT_MEMREF_TRANSFORMS_MULTIBUFFEREXT_H
88 changes: 88 additions & 0 deletions compiler/include/byteir/Pipelines/GPU/GemmCodegen.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
//===- GemmCodegen.h -----------------------------------------*--- C++ -*-===//
//
// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//

#ifndef BYTEIR_PIPELINES_GPU_GEMM_CODEGEN_H
#define BYTEIR_PIPELINES_GPU_GEMM_CODEGEN_H

#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassOptions.h"
#include "mlir/Pass/PassRegistry.h"

namespace mlir {

struct GPUGemmCodegenConfigOptions
: public PassPipelineOptions<GPUGemmCodegenConfigOptions> {
Option<std::string> funcAnchor{
*this, "func-anchor",
llvm::cl::desc(
"An optional Unit attribute anchoring on target functions."),
llvm::cl::init("")};
Option<std::string> annotatePrefix{
*this, "annotate-prefix",
llvm::cl::desc("An optional annotate prefix attribute on target ops."),
llvm::cl::init("__byteir_gpu_tile_gemm")};
ListOption<int64_t> tileSizeConfig{
*this, "tile-size-config",
llvm::cl::desc("An optional tile size config for tile matmul op.")};
ListOption<int64_t> workgroupSize{
*this, "workgroup-size",
llvm::cl::desc("An optional workgroup size config for tile matmul op.")};
Option<int64_t> stages{
*this, "stages", llvm::cl::desc("An optional stages for tile matmul op."),
llvm::cl::init(3)};
};

struct GPUGemmGeneralOptions
: public PassPipelineOptions<GPUGemmGeneralOptions> {
Option<std::string> funcAnchor{
*this, "func-anchor",
llvm::cl::desc(
"An optional Unit attribute anchoring on target functions."),
llvm::cl::init("")};
Option<std::string> annotatePrefix{
*this, "annotate-prefix",
llvm::cl::desc("An optional annotate prefix attribute on target ops."),
llvm::cl::init("__byteir_gpu_tile_gemm")};
};

void createGPUTileGemmTransform(OpPassManager &pm,
const GPUGemmGeneralOptions &options);

void createGPUAddGemmCodegenLoweringConfigTransform(
OpPassManager &pm, const GPUGemmCodegenConfigOptions &options);

void createGPUPipeliningTransform(OpPassManager &pm,
const GPUGemmGeneralOptions &options);

inline void registerGPUGemmCodegenPipelines() {
PassPipelineRegistration<GPUGemmGeneralOptions>(
"insert-gpu-tile-gemm-transform",
"Insert transformation IR to tile linalg matmul op",
createGPUTileGemmTransform);
PassPipelineRegistration<GPUGemmCodegenConfigOptions>(
"insert-gpu-gemm-codegen-transform",
"Insert transformation IR to tile linalg matmul op",
createGPUAddGemmCodegenLoweringConfigTransform);
PassPipelineRegistration<GPUGemmGeneralOptions>(
"insert-gpu-pipelining-transform",
"Insert transformation IR to tile linalg matmul op",
createGPUPipeliningTransform);
}

} // namespace mlir

#endif // BYTEIR_PIPELINES_GPU_GEMM_CODEGEN_H
8 changes: 8 additions & 0 deletions compiler/lib/Conversion/FuncToByre/FuncToByre.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,14 @@ class ConvertGPULaunchFuncToByrePattern
computeOp->setAttr("BlockSize.y", rewriter.getI32IntegerAttr(by));
computeOp->setAttr("BlockSize.z", rewriter.getI32IntegerAttr(bz));

auto sharedMemorySize = launchOp.getDynamicSharedMemorySize();
if (sharedMemorySize) {
auto sharedMemorySizeValue =
cast<arith::ConstantOp>(sharedMemorySize.getDefiningOp());
IntegerAttr smem = cast<IntegerAttr>(sharedMemorySizeValue.getValue());
computeOp->setAttr("DynamicSharedMemorySize", smem);
}

if (useBarePtrCallConv) {
computeOp->setAttr(byre::getKernelCallConventionAttrName(),
rewriter.getStringAttr("bare_ptr"));
Expand Down
Loading

0 comments on commit 0c7ed3e

Please sign in to comment.