Skip to content

Commit

Permalink
Wave Reduce Intrinsics for Integer Type(32 bit) -> Operations: Add, S…
Browse files Browse the repository at this point in the history
…ub, Min, Max, AND, OR, XOR
  • Loading branch information
easyonaadit committed Oct 20, 2024
1 parent c808e66 commit 490e21e
Show file tree
Hide file tree
Showing 24 changed files with 11,484 additions and 46 deletions.
10 changes: 8 additions & 2 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2119,8 +2119,14 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;

def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
multiclass AMDGPUWaveReduceGenerator<list<string> Operations> {
foreach Op = Operations in {
def Op : AMDGPUWaveReduce;
}
}

defvar Operations = ["umin", "min", "umax", "max", "uadd", "add", "usub", "sub", "and", "or", "xor"];
defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceGenerator<Operations>;

def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
Expand Down
13 changes: 11 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4846,8 +4846,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
break;
}
case Intrinsic::amdgcn_wave_reduce_umin:
case Intrinsic::amdgcn_wave_reduce_umax: {
case Intrinsic::amdgcn_wave_reduce_add:
case Intrinsic::amdgcn_wave_reduce_uadd:
case Intrinsic::amdgcn_wave_reduce_sub:
case Intrinsic::amdgcn_wave_reduce_usub:
case Intrinsic::amdgcn_wave_reduce_min:
case Intrinsic::amdgcn_wave_reduce_umin:
case Intrinsic::amdgcn_wave_reduce_max:
case Intrinsic::amdgcn_wave_reduce_umax:
case Intrinsic::amdgcn_wave_reduce_and:
case Intrinsic::amdgcn_wave_reduce_or:
case Intrinsic::amdgcn_wave_reduce_xor: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
Expand Down
117 changes: 110 additions & 7 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4846,6 +4846,26 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
return LoopBB;
}

static uint32_t getInitialValueForWaveReduction(unsigned Opc){
switch(Opc){
case AMDGPU::S_MIN_U32:
return std::numeric_limits<uint32_t>::max();
case AMDGPU::S_MIN_I32:
return std::numeric_limits<int32_t>::max();
case AMDGPU::S_MAX_U32:
return 0;
case AMDGPU::S_MAX_I32:
return std::numeric_limits<int32_t>::min();
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32:
case AMDGPU::S_OR_B32:
case AMDGPU::S_XOR_B32:
return 0x00000000;
case AMDGPU::S_AND_B32:
return 0xFFFFFFFF;
}
}

static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
MachineBasicBlock &BB,
const GCNSubtarget &ST,
Expand All @@ -4861,10 +4881,75 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
Register DstReg = MI.getOperand(0).getReg();
MachineBasicBlock *RetBB = nullptr;
if (isSGPR) {
// These operations with a uniform value i.e. SGPR are idempotent.
// Reduced value will be same as given sgpr.
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
RetBB = &BB;
switch(Opc){
case AMDGPU::S_MIN_U32:
case AMDGPU::S_MIN_I32:
case AMDGPU::S_MAX_U32:
case AMDGPU::S_MAX_I32:
case AMDGPU::S_AND_B32:
case AMDGPU::S_OR_B32:{
// Idempotent operations.
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
RetBB = &BB;
break;
}
case AMDGPU::S_XOR_B32:
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32:{
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
Register CountOfActiveLanesReg = MRI.createVirtualRegister(DstRegClass);

bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
unsigned CountReg = IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

// Create initail values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlock
auto Exec =
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);

auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), CountOfActiveLanesReg)
.addReg(Exec->getOperand(0).getReg());

switch(Opc){
case AMDGPU::S_XOR_B32:{
// Performing an XOR operation on a uniform value
// depends on the parity of the number of active lanes.
// For even parity, the result will be 0, for odd
// parity the result will be the same as the input value.
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);

auto ParityReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
.addReg(NewAccumulator->getOperand(0).getReg())
.addImm(1);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(SrcReg)
.addReg(ParityReg->getOperand(0).getReg()) ;
break;
}
case AMDGPU::S_SUB_I32:{
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);

// Take the negation of the source operand.
auto InvertedValReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(InvertedValReg->getOperand(0).getReg())
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
case AMDGPU::S_ADD_I32:{
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(SrcReg)
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
}
RetBB = &BB;
}
}
} else {
// TODO: Implement DPP Strategy and switch based on immediate strategy
// operand. For now, for all the cases (default, Iterative and DPP we use
Expand Down Expand Up @@ -4900,9 +4985,9 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

// Create initail values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlockk
uint32_t InitalValue =
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
// insert branch instr to newly created ComputeBlock
uint32_t InitalValue = getInitialValueForWaveReduction(Opc);

auto TmpSReg =
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
Expand Down Expand Up @@ -4970,8 +5055,26 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
switch (MI.getOpcode()) {
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
case AMDGPU::WAVE_REDUCE_UADD_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_USUB_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
const DebugLoc &DL = MI.getDebugLoc();
Expand Down
45 changes: 45 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -260,10 +260,55 @@ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses
[(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_MIN_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_min i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_MAX_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_max i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_UADD_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_uadd i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_ADD_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_add i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_USUB_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_usub i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_SUB_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_sub i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_AND_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_OR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> {
}

def WAVE_REDUCE_XOR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_xor i32:$src, i32:$strategy))]> {
}
}

let usesCustomInserter = 1, Defs = [VCC] in {
Expand Down
Loading

0 comments on commit 490e21e

Please sign in to comment.