Wave Reduce Intrinsics for Integer Type(32 bit) -> Operations: Add, Sub, Min, Max, AND, OR, XOR
easyonaadit committed Oct 8, 2024
1 parent c808e66 commit 16e26e7
Showing 21 changed files with 8,874 additions and 24 deletions.
12 changes: 9 additions & 3 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2109,7 +2109,7 @@ def int_amdgcn_s_quadmask :
def int_amdgcn_s_wqm :
DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;

class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
[data_ty],
[
LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR)
@@ -2119,8 +2119,14 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;

def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
multiclass AMDGPUWaveReduceGenerator<list<string> Operations> {
foreach Op = Operations in {
def Op : AMDGPUWaveReduce;
}
}

defvar Operations = ["umin", "min", "umax", "max", "add", "sub", "and", "or", "xor"];
defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceGenerator<Operations>;

def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
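
For orientation: the AMDGPUWaveReduceGenerator multiclass above stamps out one intrinsic per entry in Operations, yielding int_amdgcn_wave_reduce_umin through int_amdgcn_wave_reduce_xor. Each intrinsic folds the per-lane values of all active lanes in a wave into a single wave-uniform result; the trailing immediate operand is the lowering-strategy hint. A minimal host-side C++ model of the intended semantics (illustrative only, not part of the patch):

#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

enum class WaveOp { UMin, Min, UMax, Max, Add, Sub, And, Or, Xor };

// Folds the values of the active lanes into one result, starting from the
// identity element of the operation (mirrors lowerWaveReduce's InitalValue).
uint32_t waveReduce(WaveOp Op, const std::vector<uint32_t> &ActiveLaneVals) {
  uint32_t Acc;
  switch (Op) {
  case WaveOp::UMin: Acc = std::numeric_limits<uint32_t>::max(); break;
  case WaveOp::Min:  Acc = uint32_t(std::numeric_limits<int32_t>::max()); break;
  case WaveOp::Max:  Acc = uint32_t(std::numeric_limits<int32_t>::min()); break;
  case WaveOp::And:  Acc = 0xFFFFFFFFu; break;
  default:           Acc = 0; break; // umax, add, sub, or, xor
  }
  for (uint32_t V : ActiveLaneVals) {
    switch (Op) {
    case WaveOp::UMin: Acc = std::min(Acc, V); break;
    case WaveOp::Min:  Acc = uint32_t(std::min(int32_t(Acc), int32_t(V))); break;
    case WaveOp::UMax: Acc = std::max(Acc, V); break;
    case WaveOp::Max:  Acc = uint32_t(std::max(int32_t(Acc), int32_t(V))); break;
    case WaveOp::Add:  Acc += V; break;
    case WaveOp::Sub:  Acc -= V; break;
    case WaveOp::And:  Acc &= V; break;
    case WaveOp::Or:   Acc |= V; break;
    case WaveOp::Xor:  Acc ^= V; break;
    }
  }
  return Acc;
}

From IR these surface as overloads such as @llvm.amdgcn.wave.reduce.umin.i32(%val, i32 0), where the immediate selects the lowering strategy (0 requesting the default).
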
11 changes: 9 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4846,8 +4846,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
break;
}
case Intrinsic::amdgcn_wave_reduce_umin:
case Intrinsic::amdgcn_wave_reduce_umax: {
case Intrinsic::amdgcn_wave_reduce_add:
case Intrinsic::amdgcn_wave_reduce_sub:
case Intrinsic::amdgcn_wave_reduce_min:
case Intrinsic::amdgcn_wave_reduce_umin:
case Intrinsic::amdgcn_wave_reduce_max:
case Intrinsic::amdgcn_wave_reduce_umax:
case Intrinsic::amdgcn_wave_reduce_and:
case Intrinsic::amdgcn_wave_reduce_or:
case Intrinsic::amdgcn_wave_reduce_xor: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
125 changes: 115 additions & 10 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4861,10 +4861,80 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
Register DstReg = MI.getOperand(0).getReg();
MachineBasicBlock *RetBB = nullptr;
if (isSGPR) {
// These operations with a uniform value i.e. SGPR are idempotent.
// Reduced value will be same as given sgpr.
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
RetBB = &BB;
switch(Opc){
case AMDGPU::S_MIN_U32:
case AMDGPU::S_MIN_I32:
case AMDGPU::S_MAX_U32:
case AMDGPU::S_MAX_I32:
case AMDGPU::S_AND_B32:
case AMDGPU::S_OR_B32:{
// These operations with a uniform value i.e. SGPR are idempotent.
// Reduced value will be same as given sgpr.
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
RetBB = &BB;
break;
}
case AMDGPU::S_XOR_B32:
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32:{
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
Register CountOfActiveLanesReg = MRI.createVirtualRegister(DstRegClass);

bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
unsigned CountReg = IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

// Create initail values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlock
auto Exec =
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);

auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), CountOfActiveLanesReg)
.addReg(Exec->getOperand(0).getReg());

switch(Opc){
case AMDGPU::S_XOR_B32:{
// Performing an XOR operation on a uniform value
// depends on the number of active lanes. If there
// are an even number of active lanes, then the XOR
// will result in 0. And if there are an odd number
// of Active lanes then the XOR will result in the
// same value as that in the SGPR. This comes from
// the fact that A^A = 0 and A^0 = A.

Register ParityRegister = MRI.createVirtualRegister(DstRegClass);

auto ParityReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
.addReg(NewAccumulator->getOperand(0).getReg())
.addImm(1);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(SrcReg)
.addReg(ParityReg->getOperand(0).getReg()) ;
break;
}
case AMDGPU::S_SUB_I32:{
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);

// Take the negation of the source operand.
auto InvertedValReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(InvertedValReg->getOperand(0).getReg())
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
case AMDGPU::S_ADD_I32:{
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(SrcReg)
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
}
RetBB = &BB;
}
}
} else {
// TODO: Implement DPP Strategy and switch based on immediate strategy
// operand. For now, for all the cases (default, Iterative and DPP we use
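
As an aside on the SGPR fast path above: because the input is wave-uniform, no loop is needed and a closed form is materialized instead. The idempotent ops simply forward the value, while xor/add/sub scale it by the active-lane count produced by S_BCNT1. A compact sketch of those identities, assuming X is the uniform input and N the number of active lanes (names illustrative, not the committed code):

#include <cstdint>

// Closed forms used by the uniform fast path; Op mirrors the scalar opcodes.
enum class UniformOp { MinMaxAndOr, Xor, Add, Sub };

uint32_t uniformWaveReduce(UniformOp Op, uint32_t X, uint32_t N) {
  switch (Op) {
  case UniformOp::MinMaxAndOr: return X;           // idempotent: f(x, x) == x
  case UniformOp::Xor:         return (N & 1) * X; // x ^ x == 0, parity decides
  case UniformOp::Add:         return N * X;       // n copies of x summed
  case UniformOp::Sub:         return N * (0u - X); // 0 - x - ... == -n*x
  }
  return X;
}
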
@@ -4900,9 +4970,30 @@
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

// Create initail values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlockk
uint32_t InitalValue =
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
// insert branch instr to newly created ComputeBlock
uint32_t InitalValue;
switch(Opc){
case AMDGPU::S_MIN_U32:
InitalValue = std::numeric_limits<uint32_t>::max();
break;
case AMDGPU::S_MIN_I32:
InitalValue = std::numeric_limits<int32_t>::max();
break;
case AMDGPU::S_MAX_U32:
InitalValue = 0;
break;
case AMDGPU::S_MAX_I32:
InitalValue = std::numeric_limits<int32_t>::min();
break;
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32:
case AMDGPU::S_OR_B32:
case AMDGPU::S_XOR_B32:
InitalValue = 0x00000000;
break;
case AMDGPU::S_AND_B32:
InitalValue = 0xFFFFFFFF;
}
auto TmpSReg =
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
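
For the divergent case, this hunk only shows the loop setup: EXEC is copied into the loop iterator and the accumulator is seeded with the operation's identity chosen in the switch above. The elided ComputeLoop body walks the exec mask one active lane at a time. A scalar C++ model of that iterative strategy, assuming the usual find-first-bit / read-lane / clear-bit sequence (illustrative, not the committed code; requires C++20):

#include <bit>
#include <cstdint>

// Models ComputeLoop: peel off one active lane per iteration and fold its
// VGPR value into the SGPR accumulator seeded with the identity element.
uint32_t iterativeWaveReduce(uint64_t ExecMask, const uint32_t LaneVal[64],
                             uint32_t Identity,
                             uint32_t (*Fold)(uint32_t, uint32_t)) {
  uint32_t Acc = Identity;
  while (ExecMask != 0) {
    unsigned Lane = std::countr_zero(ExecMask); // like S_FF1_I32_B64
    Acc = Fold(Acc, LaneVal[Lane]);             // like V_READLANE + scalar op
    ExecMask &= ExecMask - 1;                   // clear that lane's exec bit
  }
  return Acc;
}
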
@@ -4968,10 +5059,24 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

switch (MI.getOpcode()) {
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
const DebugLoc &DL = MI.getDebugLoc();
@@ -6859,7 +6964,7 @@ SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {

SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64
// If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
// with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
// 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
39 changes: 37 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -255,15 +255,50 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;

let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
def WAVE_REDUCE_MIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
def WAVE_REDUCE_MIN_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_min i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_MAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_MAX_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_max i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_ADD_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_add i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_SUB_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_sub i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_AND_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_OR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_XOR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_xor i32:$src, i32:$strategy))]> {
}
}

let usesCustomInserter = 1, Defs = [VCC] in {
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
@@ -132,7 +132,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_value(ptr addrspace(1) %ptr) #
; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0
; IR-ITERATIVE-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]]
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4
; IR-ITERATIVE-NEXT: br label [[TMP14]]
; IR-ITERATIVE: 14:
; IR-ITERATIVE-NEXT: ret void
@@ -151,12 +151,12 @@
; IR-DPP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0
; IR-DPP-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]]
; IR-DPP: 12:
; IR-DPP-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4
; IR-DPP-NEXT: [[TMP13:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4
; IR-DPP-NEXT: br label [[TMP14]]
; IR-DPP: 14:
; IR-DPP-NEXT: ret void
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst
%result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 seq_cst
ret void
}

(The remaining 16 changed files are not rendered here.)
