diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 4cd32a0502c66d..816078db3b1e06 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2109,7 +2109,7 @@ def int_amdgcn_s_quadmask : def int_amdgcn_s_wqm : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>; -class AMDGPUWaveReduce : Intrinsic< +class AMDGPUWaveReduce : Intrinsic< [data_ty], [ LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR) @@ -2119,8 +2119,14 @@ class AMDGPUWaveReduce : Intrinsic< ], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>; -def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce; -def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce; +multiclass AMDGPUWaveReduceGenerator<list<string> Operations> { + foreach Op = Operations in { + def Op : AMDGPUWaveReduce; + } +} + +defvar Operations = ["umin", "min", "umax", "max", "add", "sub", "and", "or", "xor"]; +defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceGenerator<Operations>; def int_amdgcn_readfirstlane : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index bc771d4ef6c080..468ee031a60048 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4846,8 +4846,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize); break; } - case Intrinsic::amdgcn_wave_reduce_umin: - case Intrinsic::amdgcn_wave_reduce_umax: { + case Intrinsic::amdgcn_wave_reduce_add: + case Intrinsic::amdgcn_wave_reduce_sub: + case Intrinsic::amdgcn_wave_reduce_min: + case Intrinsic::amdgcn_wave_reduce_umin: + case Intrinsic::amdgcn_wave_reduce_max: + case Intrinsic::amdgcn_wave_reduce_umax: + case Intrinsic::amdgcn_wave_reduce_and: + case Intrinsic::amdgcn_wave_reduce_or: + case 
Intrinsic::amdgcn_wave_reduce_xor: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 885ecab891b1f5..41b3fc65c8bf06 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4861,10 +4861,80 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, Register DstReg = MI.getOperand(0).getReg(); MachineBasicBlock *RetBB = nullptr; if (isSGPR) { - // These operations with a uniform value i.e. SGPR are idempotent. - // Reduced value will be same as given sgpr. - BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg); - RetBB = &BB; + switch(Opc){ + case AMDGPU::S_MIN_U32: + case AMDGPU::S_MIN_I32: + case AMDGPU::S_MAX_U32: + case AMDGPU::S_MAX_I32: + case AMDGPU::S_AND_B32: + case AMDGPU::S_OR_B32:{ + // These operations with a uniform value i.e. SGPR are idempotent. + // Reduced value will be same as given sgpr. + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg); + RetBB = &BB; + break; + } + case AMDGPU::S_XOR_B32: + case AMDGPU::S_ADD_I32: + case AMDGPU::S_SUB_I32:{ + const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); + const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); + Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass); + Register CountOfActiveLanesReg = MRI.createVirtualRegister(DstRegClass); + + bool IsWave32 = ST.isWave32(); + unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + unsigned CountReg = IsWave32 ? 
AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64; + + // Copy EXEC into an SGPR and count its set bits to get the number of + // active lanes; a uniform input needs no loop or ComputeBlock + auto Exec = + BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg); + + auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), CountOfActiveLanesReg) + .addReg(Exec->getOperand(0).getReg()); + + switch(Opc){ + case AMDGPU::S_XOR_B32:{ + // Performing an XOR operation on a uniform value + // depends on the number of active lanes. If there + // are an even number of active lanes, then the XOR + // will result in 0. And if there are an odd number + // of active lanes then the XOR will result in the + // same value as that in the SGPR. This comes from + // the fact that A^A = 0 and A^0 = A. + + Register ParityRegister = MRI.createVirtualRegister(DstRegClass); + + auto ParityReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister) + .addReg(NewAccumulator->getOperand(0).getReg()) + .addImm(1); + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg) + .addReg(SrcReg) + .addReg(ParityReg->getOperand(0).getReg()) ; + break; + } + case AMDGPU::S_SUB_I32:{ + Register NegatedVal = MRI.createVirtualRegister(DstRegClass); + + // Take the negation of the source operand. + auto InvertedValReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg); + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg) + .addReg(InvertedValReg->getOperand(0).getReg()) + .addReg(NewAccumulator->getOperand(0).getReg()); + break; + } + case AMDGPU::S_ADD_I32:{ + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg) + .addReg(SrcReg) + .addReg(NewAccumulator->getOperand(0).getReg()); + break; + } + } + RetBB = &BB; + } + } } else { // TODO: Implement DPP Strategy and switch based on immediate strategy // operand. 
For now, for all the cases (default, Iterative and DPP we use @@ -4900,9 +4970,30 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; // Create initail values of induction variable from Exec, Accumulator and - // insert branch instr to newly created ComputeBlockk - uint32_t InitalValue = - (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0; + // insert branch instr to newly created ComputeBlock + uint32_t InitalValue; + switch(Opc){ + case AMDGPU::S_MIN_U32: + InitalValue = std::numeric_limits<uint32_t>::max(); + break; + case AMDGPU::S_MIN_I32: + InitalValue = std::numeric_limits<int32_t>::max(); + break; + case AMDGPU::S_MAX_U32: + InitalValue = 0; + break; + case AMDGPU::S_MAX_I32: + InitalValue = std::numeric_limits<int32_t>::min(); + break; + case AMDGPU::S_ADD_I32: + case AMDGPU::S_SUB_I32: + case AMDGPU::S_OR_B32: + case AMDGPU::S_XOR_B32: + InitalValue = 0x00000000; + break; + case AMDGPU::S_AND_B32: + InitalValue = 0xFFFFFFFF; + } auto TmpSReg = BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg); BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg) @@ -4968,10 +5059,24 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); switch (MI.getOpcode()) { - case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32: + case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_U32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32); - case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32: + case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32); + case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_U32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32); + case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32); + case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32); + case 
AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32); + case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32); + case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32); + case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32); case AMDGPU::S_UADDO_PSEUDO: case AMDGPU::S_USUBO_PSEUDO: { const DebugLoc &DL = MI.getDebugLoc(); @@ -6859,7 +6964,7 @@ SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); - // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64 + // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo. 
KnownBits Op0KnownBits = DAG.computeKnownBits(Op0); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 9afb29d95abd7d..08a489e549bd63 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -255,15 +255,50 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)), (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>; let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { - def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), + def WAVE_REDUCE_MIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), (ins VSrc_b32: $src, VSrc_b32:$strategy), [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> { } - def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), + def WAVE_REDUCE_MIN_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_min i32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_MAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), (ins VSrc_b32: $src, VSrc_b32:$strategy), [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> { } + + def WAVE_REDUCE_MAX_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_max i32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_ADD_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_add i32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_SUB_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_sub i32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_AND_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, 
VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_OR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_XOR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_xor i32:$src, i32:$strategy))]> { + } } let usesCustomInserter = 1, Defs = [VCC] in { diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll index d1e50bd560cb23..fe47715681c389 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll @@ -132,7 +132,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_value(ptr addrspace(1) %ptr) # ; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 ; IR-ITERATIVE-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] ; IR-ITERATIVE: 12: -; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4 +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4 ; IR-ITERATIVE-NEXT: br label [[TMP14]] ; IR-ITERATIVE: 14: ; IR-ITERATIVE-NEXT: ret void @@ -151,12 +151,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_value(ptr addrspace(1) %ptr) # ; IR-DPP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 ; IR-DPP-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] ; IR-DPP: 12: -; IR-DPP-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4 +; IR-DPP-NEXT: [[TMP13:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4 ; IR-DPP-NEXT: br label [[TMP14]] ; IR-DPP: 14: ; IR-DPP-NEXT: ret void ; - %result = 
atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst + %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 seq_cst ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll new file mode 100644 index 00000000000000..20e8b76aeed16b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll @@ -0,0 +1,1280 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 
-verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + +declare i32 @llvm.amdgcn.wave.reduce.add.i32(i32, i32 immarg) +declare i32 @llvm.amdgcn.workitem.id.x() + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, 
s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: uniform_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_clause 0x1 +; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: uniform_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_clause 0x1 +; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: uniform_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_clause 0x1 +; GFX1032DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; 
GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: uniform_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_clause 0x1 +; GFX1032GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @const_value(ptr addrspace(1) 
%out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: const_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry 
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: const_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: const_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: const_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; 
GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm + entry: + %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; 
GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: poison_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: poison_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: poison_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: poison_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: poison_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: poison_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: poison_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: poison_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 poison, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8DAGISEL-NEXT: 
s_add_i32 s4, s4, s6 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s4, 0 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9DAGISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: 
divergent_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s4, 0 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064DAGISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], 
s5 +; GFX1064GISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032DAGISEL-NEXT: s_add_i32 s2, s2, s5 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL-NEXT: s_add_i32 s2, s2, s5 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: 
s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; 
GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_add_i32 s2, s2, s5 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_add_i32 s2, s2, s5 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; 
GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: divergent_cfg: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s4, s6, s4 +; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8DAGISEL-NEXT: ; %bb.5: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; 
GFX8DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s6, s6, s4 +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 
vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s4, s6, s4 +; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9DAGISEL-NEXT: ; %bb.5: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_mov_b64 
s[4:5], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s4 +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s4, s6, s4 +; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064DAGISEL-NEXT: ; %bb.5: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s4 +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 
s[4:5], s7 +; GFX1064GISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s4 +; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032DAGISEL-NEXT: s_add_i32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032DAGISEL-NEXT: ; %bb.5: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif +; 
GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s4 +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032GISEL-NEXT: s_add_i32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s4, s6, s4 +; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164DAGISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164DAGISEL-NEXT: ; %bb.5: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_load_b32 s6, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s6, s6, s4 +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164GISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: 
divergent_cfg: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s4 +; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132DAGISEL-NEXT: s_add_i32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132DAGISEL-NEXT: ; %bb.5: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; 
GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s4 +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132GISEL-NEXT: s_add_i32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm + entry: + %tid = 
call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %tid, i32 1) + br label %endif + +else: + %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10DAGISEL: {{.*}} +; GFX10GISEL: {{.*}} +; GFX11DAGISEL: {{.*}} +; GFX11GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll new file mode 100644 index 00000000000000..f70d9b476e830d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll @@ -0,0 +1,1032 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc 
-mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + +declare i32 @llvm.amdgcn.wave.reduce.and.i32(i32, i32 immarg) +declare i32 @llvm.amdgcn.workitem.id.x() + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value: +; GFX1164GISEL: ; %bb.0: 
; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: 
flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: const_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: const_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; 
GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm + entry: + %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: 
flat_store_dword v[0:1], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: poison_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: poison_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX11DAGISEL-LABEL: poison_value: +; GFX11DAGISEL: ; %bb.0: ; %entry +; GFX11DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11DAGISEL-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11DAGISEL-NEXT: s_nop 0 +; GFX11DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11DAGISEL-NEXT: s_endpgm +; +; GFX11GISEL-LABEL: poison_value: +; GFX11GISEL: ; 
%bb.0: ; %entry +; GFX11GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11GISEL-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11GISEL-NEXT: s_nop 0 +; GFX11GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 poison, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s4, -1 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8DAGISEL-NEXT: s_and_b32 s4, s4, s6 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s4, -1 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL-NEXT: s_and_b32 s4, s4, s6 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s4, -1 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9DAGISEL-NEXT: s_and_b32 s4, s4, s6 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s4, -1 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL-NEXT: s_and_b32 s4, s4, s6 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s4, -1 +; GFX1064DAGISEL-NEXT: 
.LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064DAGISEL-NEXT: s_and_b32 s4, s4, s6 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s4, -1 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL-NEXT: s_and_b32 s4, s4, s6 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, -1 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, s5 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032DAGISEL-NEXT: ; 
%bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s2, -1 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL-NEXT: s_and_b32 s2, s2, s5 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s4, -1 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_and_b32 s4, s4, s6 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: 
s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s4, -1 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_and_b32 s4, s4, s6 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, -1 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, s5 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; 
GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s2, -1 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_and_b32 s2, s2, s5 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: divergent_cfg: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: 
; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, -1 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: s_and_b32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8DAGISEL-NEXT: ; %bb.5: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b32 s6, s4 +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, -1 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; 
GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: s_and_b32 s6, s6, s8 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, -1 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: s_and_b32 s6, s6, s8 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9DAGISEL-NEXT: ; %bb.5: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b32 s6, s4 +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, -1 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: s_and_b32 s6, s6, s8 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; 
GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, -1 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_and_b32 s6, s6, s8 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064DAGISEL-NEXT: ; %bb.5: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: ; %bb.3: ; 
%if +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, -1 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_and_b32 s6, s6, s8 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s1, -1 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032DAGISEL-NEXT: s_and_b32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: 
s_cbranch_scc1 .LBB4_4 +; GFX1032DAGISEL-NEXT: ; %bb.5: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s0, -1 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032GISEL-NEXT: s_and_b32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg: +; GFX1164DAGISEL: ; %bb.0: ; 
%entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s6, -1 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164DAGISEL-NEXT: s_and_b32 s6, s6, s8 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164DAGISEL-NEXT: ; %bb.5: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; 
GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164GISEL-NEXT: s_mov_b32 s6, -1 +; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164GISEL-NEXT: s_and_b32 s6, s6, s8 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; 
GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, -1 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132DAGISEL-NEXT: s_and_b32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132DAGISEL-NEXT: ; %bb.5: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s0, -1 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132GISEL-NEXT: s_and_b32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm + entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %tid, i32 1) + br label %endif + +else: + %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll new file mode 100644 index 00000000000000..f38fb59109443c --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll @@ -0,0 +1,1031 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + +declare i32 @llvm.amdgcn.wave.reduce.max.i32(i32, i32 immarg) +declare i32 
@llvm.amdgcn.workitem.id.x() + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1132DAGISEL-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; 
%entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: const_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: const_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: 
v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm + entry: + %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: poison_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: poison_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX11DAGISEL-LABEL: poison_value: +; GFX11DAGISEL: ; %bb.0: ; %entry +; GFX11DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11DAGISEL-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11DAGISEL-NEXT: s_nop 0 +; GFX11DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11DAGISEL-NEXT: s_endpgm +; +; GFX11GISEL-LABEL: poison_value: +; GFX11GISEL: ; %bb.0: ; %entry +; GFX11GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11GISEL-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11GISEL-NEXT: s_nop 0 +; GFX11GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 poison, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_brev_b32 s4, 1 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; 
GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8DAGISEL-NEXT: s_max_i32 s4, s4, s6 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_brev_b32 s4, 1 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL-NEXT: s_max_i32 s4, s4, s6 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_brev_b32 s4, 1 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9DAGISEL-NEXT: s_max_i32 s4, s4, s6 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; 
GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_brev_b32 s4, 1 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL-NEXT: s_max_i32 s4, s4, s6 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_brev_b32 s4, 1 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064DAGISEL-NEXT: s_max_i32 s4, s4, s6 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_brev_b32 s4, 1 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop 
Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL-NEXT: s_max_i32 s4, s4, s6 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_brev_b32 s2, 1 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032DAGISEL-NEXT: s_max_i32 s2, s2, s5 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: s_brev_b32 s2, 1 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL-NEXT: s_max_i32 s2, s2, s5 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_brev_b32 s4, 1 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_max_i32 s4, s4, s6 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_brev_b32 s4, 1 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_max_i32 s4, s4, s6 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; 
GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_brev_b32 s2, 1 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_max_i32 s2, s2, s5 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: s_brev_b32 s2, 1 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL-NEXT: s_bitset0_b32 
s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_max_i32 s2, s2, s5 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: divergent_cfg: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_brev_b32 s6, 1 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: s_max_i32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8DAGISEL-NEXT: ; %bb.5: +; GFX8DAGISEL-NEXT: 
v_mov_b32_e32 v1, s6 +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b32 s6, s4 +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_brev_b32 s6, 1 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: s_max_i32 s6, s6, s8 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 +; 
GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_brev_b32 s6, 1 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: s_max_i32 s6, s6, s8 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9DAGISEL-NEXT: ; %bb.5: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b32 s6, s4 +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; 
GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_brev_b32 s6, 1 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: s_max_i32 s6, s6, s8 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_brev_b32 s6, 1 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_max_i32 s6, s6, s8 +; GFX1064DAGISEL-NEXT: 
s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064DAGISEL-NEXT: ; %bb.5: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_brev_b32 s6, 1 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_max_i32 s6, s6, s8 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; 
GFX1032DAGISEL-LABEL: divergent_cfg: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_brev_b32 s1, 1 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032DAGISEL-NEXT: s_max_i32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032DAGISEL-NEXT: ; %bb.5: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else 
+; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_brev_b32 s0, 1 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032GISEL-NEXT: s_max_i32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; 
GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164DAGISEL-NEXT: s_brev_b32 s6, 1 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164DAGISEL-NEXT: s_max_i32 s6, s6, s8 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164DAGISEL-NEXT: ; %bb.5: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164GISEL-NEXT: s_brev_b32 s6, 1 +; GFX1164GISEL-NEXT: 
.LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164GISEL-NEXT: s_max_i32 s6, s6, s8 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132DAGISEL-NEXT: s_brev_b32 s1, 1 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132DAGISEL-NEXT: s_max_i32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132DAGISEL-NEXT: ; %bb.5: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132GISEL-NEXT: s_brev_b32 s0, 1 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132GISEL-NEXT: 
s_max_i32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm + entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %tid, i32 1) + br label %endif + +else: + %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll new file mode 100644 index 00000000000000..20a7fd25bb729f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll @@ -0,0 +1,1031 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + +declare i32 @llvm.amdgcn.wave.reduce.min.i32(i32, i32 immarg) +declare i32 @llvm.amdgcn.workitem.id.x() + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; 
GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define 
amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: const_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: const_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10GISEL-NEXT: 
v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm + entry: + %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + 
ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: poison_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: poison_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX11DAGISEL-LABEL: poison_value: +; GFX11DAGISEL: ; %bb.0: ; %entry +; 
GFX11DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11DAGISEL-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11DAGISEL-NEXT: s_nop 0 +; GFX11DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11DAGISEL-NEXT: s_endpgm +; +; GFX11GISEL-LABEL: poison_value: +; GFX11GISEL: ; %bb.0: ; %entry +; GFX11GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11GISEL-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11GISEL-NEXT: s_nop 0 +; GFX11GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 poison, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_brev_b32 s4, -2 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8DAGISEL-NEXT: s_min_i32 s4, s4, s6 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_brev_b32 s4, -2 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; 
GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL-NEXT: s_min_i32 s4, s4, s6 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_brev_b32 s4, -2 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9DAGISEL-NEXT: s_min_i32 s4, s4, s6 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_brev_b32 s4, -2 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL-NEXT: s_min_i32 s4, s4, s6 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_brev_b32 s4, -2 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064DAGISEL-NEXT: s_min_i32 s4, s4, s6 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_brev_b32 s4, -2 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL-NEXT: s_min_i32 s4, s4, s6 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_brev_b32 
s2, -2 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032DAGISEL-NEXT: s_min_i32 s2, s2, s5 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: s_brev_b32 s2, -2 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL-NEXT: s_min_i32 s2, s2, s5 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_brev_b32 s4, -2 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; 
GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_min_i32 s4, s4, s6 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_brev_b32 s4, -2 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_min_i32 s4, s4, s6 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_brev_b32 s2, -2 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 +; 
GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_min_i32 s2, s2, s5 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: s_brev_b32 s2, -2 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_min_i32 s2, s2, s5 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: 
divergent_cfg: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_brev_b32 s6, -2 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: s_min_i32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8DAGISEL-NEXT: ; %bb.5: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: ; implicit-def: 
$vgpr0 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b32 s6, s4 +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_brev_b32 s6, -2 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: s_min_i32 s6, s6, s8 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_brev_b32 s6, -2 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: 
v_readlane_b32 s8, v0, s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: s_min_i32 s6, s6, s8 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9DAGISEL-NEXT: ; %bb.5: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b32 s6, s4 +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_brev_b32 s6, -2 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: s_min_i32 s6, s6, s8 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm 
+; +; GFX1064DAGISEL-LABEL: divergent_cfg: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_brev_b32 s6, -2 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_min_i32 s6, s6, s8 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064DAGISEL-NEXT: ; %bb.5: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; 
GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_brev_b32 s6, -2 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_min_i32 s6, s6, s8 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; 
GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_brev_b32 s1, -2 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032DAGISEL-NEXT: s_min_i32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032DAGISEL-NEXT: ; %bb.5: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_brev_b32 s0, -2 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032GISEL-NEXT: s_min_i32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: 
; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164DAGISEL-NEXT: s_brev_b32 s6, -2 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164DAGISEL-NEXT: s_min_i32 s6, s6, s8 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164DAGISEL-NEXT: ; %bb.5: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164GISEL-NEXT: s_brev_b32 s6, -2 +; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164GISEL-NEXT: s_min_i32 s6, s6, s8 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; 
GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132DAGISEL-NEXT: s_brev_b32 s1, -2 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132DAGISEL-NEXT: s_min_i32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132DAGISEL-NEXT: ; %bb.5: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: 
s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132GISEL-NEXT: s_brev_b32 s0, -2 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132GISEL-NEXT: s_min_i32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm + entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %tid, i32 1) + br label %endif + +else: + 
%reducedValIn = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll new file mode 100644 index 00000000000000..064540ec540732 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll @@ -0,0 +1,1031 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + +declare i32 @llvm.amdgcn.wave.reduce.or.i32(i32, i32 immarg) +declare i32 @llvm.amdgcn.workitem.id.x() + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; 
GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: const_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: const_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1164GISEL-NEXT: 
v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm + entry: + %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; 
GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: poison_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: poison_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX11DAGISEL-LABEL: poison_value: +; GFX11DAGISEL: ; %bb.0: ; %entry +; GFX11DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11DAGISEL-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11DAGISEL-NEXT: s_nop 0 +; GFX11DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11DAGISEL-NEXT: s_endpgm +; +; GFX11GISEL-LABEL: poison_value: +; GFX11GISEL: ; %bb.0: ; %entry +; GFX11GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11GISEL-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11GISEL-NEXT: s_nop 0 +; GFX11GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 poison, i32 1) + store i32 %result, ptr 
addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8DAGISEL-NEXT: s_or_b32 s4, s4, s6 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s4, 0 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL-NEXT: s_or_b32 s4, s4, s6 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX9DAGISEL-NEXT: .LBB3_1: ; 
=>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9DAGISEL-NEXT: s_or_b32 s4, s4, s6 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s4, 0 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL-NEXT: s_or_b32 s4, s4, s6 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064DAGISEL-NEXT: s_or_b32 s4, s4, s6 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064DAGISEL-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL-NEXT: s_or_b32 s4, s4, s6 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032DAGISEL-NEXT: s_or_b32 s2, s2, s5 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This 
Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL-NEXT: s_or_b32 s2, s2, s5 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_or_b32 s4, s4, s6 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; 
GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_or_b32 s4, s4, s6 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_or_b32 s2, s2, s5 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, 
v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_or_b32 s2, s2, s5 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: divergent_cfg: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This 
Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: s_or_b32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8DAGISEL-NEXT: ; %bb.5: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b32 s6, s4 +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: s_or_b32 s6, s6, s8 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: s_or_b32 s6, s6, s8 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9DAGISEL-NEXT: ; %bb.5: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; 
GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b32 s6, s4 +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: s_or_b32 s6, s6, s8 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], 
exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_or_b32 s6, s6, s8 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064DAGISEL-NEXT: ; %bb.5: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_or_b32 s6, s6, s8 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif 
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032DAGISEL-NEXT: s_or_b32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032DAGISEL-NEXT: ; %bb.5: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: 
divergent_cfg: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032GISEL-NEXT: s_or_b32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; 
GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164DAGISEL-NEXT: s_or_b32 s6, s6, s8 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164DAGISEL-NEXT: ; %bb.5: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164GISEL-NEXT: s_or_b32 s6, s6, s8 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, 
s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132DAGISEL-NEXT: s_or_b32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132DAGISEL-NEXT: ; %bb.5: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s0, 
0 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132GISEL-NEXT: s_or_b32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm + entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %tid, i32 1) + br label %endif + +else: + %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll new file mode 100644 index 00000000000000..422b688324cd71 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll @@ -0,0 +1,1329 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 
-verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + +declare i32 @llvm.amdgcn.wave.reduce.sub.i32(i32, i32 immarg) +declare i32 @llvm.amdgcn.workitem.id.x() + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s3, s4, -1 +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2 
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s3, s4, -1 +; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s3, s4, -1 +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s3, s4, -1 +; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: uniform_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: 
s_clause 0x1 +; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s4, -1 +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: uniform_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_clause 0x1 +; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s3, s4, -1 +; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: uniform_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_clause 0x1 +; GFX1032DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s4, -1 +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: uniform_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_clause 0x1 +; GFX1032GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1032GISEL-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s3, s4, -1 +; GFX1032GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s4, -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s3, s4, -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s4, -1 +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s3, s4, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; 
GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mul_i32 s4, -1, 0x7b +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; 
GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: const_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: const_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: const_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: const_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_mul_i32 s2, 
s3, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm + entry: + %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s4, s0, -1 +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, 
s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: poison_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: poison_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: poison_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: poison_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1032GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: poison_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; 
GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: poison_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: poison_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: poison_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 poison, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8DAGISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s4, 0 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL-NEXT: ; %bb.2: +; 
GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9DAGISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s4, 0 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_mov_b64 
s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064DAGISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032DAGISEL-NEXT: s_sub_i32 s2, s2, s5 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL-NEXT: s_sub_i32 s2, s2, s5 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_sub_i32 s2, s2, s5 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; 
GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_sub_i32 s2, s2, s5 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: divergent_cfg: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; 
%else +; GFX8DAGISEL-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s5, s6, -1 +; GFX8DAGISEL-NEXT: s_mul_i32 s4, s5, s4 +; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8DAGISEL-NEXT: ; %bb.5: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: 
s_mul_i32 s5, s6, -1 +; GFX8GISEL-NEXT: s_mul_i32 s6, s5, s4 +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s5, s6, -1 +; GFX9DAGISEL-NEXT: s_mul_i32 s4, s5, s4 +; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: 
s_mov_b32 s6, 0 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9DAGISEL-NEXT: ; %bb.5: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s5, s6, -1 +; GFX9GISEL-NEXT: s_mul_i32 s6, s5, s4 +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: 
s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s5, s6, -1 +; GFX1064DAGISEL-NEXT: s_mul_i32 s4, s5, s4 +; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064DAGISEL-NEXT: ; %bb.5: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s5, s6, -1 +; GFX1064GISEL-NEXT: s_mul_i32 s6, s5, s4 +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: 
s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, -1 +; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s4 +; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032DAGISEL-NEXT: s_sub_i32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032DAGISEL-NEXT: ; %bb.5: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; 
GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, -1 +; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s4 +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032GISEL-NEXT: s_sub_i32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; 
GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s5, s6, -1 +; GFX1164DAGISEL-NEXT: s_mul_i32 s4, s5, s4 +; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164DAGISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164DAGISEL-NEXT: ; %bb.5: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_load_b32 s6, 
s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s5, s6, -1 +; GFX1164GISEL-NEXT: s_mul_i32 s6, s5, s4 +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164GISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else 
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, -1 +; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s4 +; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132DAGISEL-NEXT: s_sub_i32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132DAGISEL-NEXT: ; %bb.5: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, -1 +; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s4 +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132GISEL-NEXT: s_sub_i32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm + entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %tid, i32 1) + br label %endif + +else: + %reducedValIn = call i32 
@llvm.amdgcn.wave.reduce.sub.i32(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10DAGISEL: {{.*}} +; GFX10GISEL: {{.*}} +; GFX11DAGISEL: {{.*}} +; GFX11GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll new file mode 100644 index 00000000000000..f04123634a5df6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll @@ -0,0 +1,1333 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + +declare i32 @llvm.amdgcn.wave.reduce.xor.i32(i32, i32 immarg) +declare i32 @llvm.amdgcn.workitem.id.x() + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; 
GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: uniform_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_clause 0x1 +; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: uniform_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_clause 0x1 +; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: 
s_and_b32 s2, s2, 1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: uniform_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_clause 0x1 +; GFX1032DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: uniform_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_clause 0x1 +; GFX1032GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; 
GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: 
uniform_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX8DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX8GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword 
v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX9DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX9GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: const_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1064DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: const_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1064GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: const_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1032DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: const_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1032GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1164DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; 
GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1164GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1132GISEL-NEXT: s_mulk_i32 s2, 
0x7b +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm + entry: + %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; 
GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: poison_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: poison_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: poison_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: 
s_and_b32 s2, s2, 1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: poison_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: poison_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: poison_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s0, s2 
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: poison_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: poison_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 poison, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8DAGISEL-NEXT: s_xor_b32 s4, s4, s6 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s4, 0 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL-NEXT: s_xor_b32 s4, s4, s6 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9DAGISEL-NEXT: 
s_xor_b32 s4, s4, s6 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s4, 0 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL-NEXT: s_xor_b32 s4, s4, s6 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064DAGISEL-NEXT: s_xor_b32 s4, s4, s6 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; 
GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL-NEXT: s_xor_b32 s4, s4, s6 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032DAGISEL-NEXT: s_xor_b32 s2, s2, s5 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL-NEXT: 
s_xor_b32 s2, s2, s5 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_xor_b32 s4, s4, s6 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL-NEXT: 
s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_xor_b32 s4, s4, s6 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_xor_b32 s2, s2, s5 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 
s4, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_xor_b32 s2, s2, s5 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: divergent_cfg: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8DAGISEL-NEXT: s_and_b32 s4, s4, 1 +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s4, s6, s4 +; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop 
Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: s_xor_b32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8DAGISEL-NEXT: ; %bb.5: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8GISEL-NEXT: s_and_b32 s4, s4, 1 +; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s6, s6, s4 +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: s_xor_b32 s6, s6, s8 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; 
GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9DAGISEL-NEXT: s_and_b32 s4, s4, 1 +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s4, s6, s4 +; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: s_xor_b32 s6, s6, s8 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9DAGISEL-NEXT: ; %bb.5: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; 
GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9GISEL-NEXT: s_and_b32 s4, s4, 1 +; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s4 +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: s_xor_b32 s6, s6, s8 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_load_dword s6, s[2:3], 
0x2c +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064DAGISEL-NEXT: s_and_b32 s4, s4, 1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s4, s6, s4 +; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_xor_b32 s6, s6, s8 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064DAGISEL-NEXT: ; %bb.5: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064GISEL-NEXT: 
s_and_b32 s4, s4, 1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s4 +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_xor_b32 s6, s6, s8 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1032DAGISEL-NEXT: s_and_b32 s4, s4, 1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s4 +; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: 
s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032DAGISEL-NEXT: s_xor_b32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032DAGISEL-NEXT: ; %bb.5: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1032GISEL-NEXT: s_and_b32 s4, s4, 1 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s4 +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; 
GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032GISEL-NEXT: s_xor_b32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1164DAGISEL-NEXT: s_and_b32 s4, s4, 1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s4, s6, s4 +; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164DAGISEL-NEXT: s_xor_b32 s6, s6, s8 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164DAGISEL-NEXT: ; %bb.5: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_nop 0 +; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_load_b32 s6, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1164GISEL-NEXT: s_and_b32 s4, s4, 1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s6, s6, s4 +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: 
s_ctz_i32_b64 s7, s[4:5] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164GISEL-NEXT: s_xor_b32 s6, s6, s8 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_nop 0 +; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1132DAGISEL-NEXT: s_and_b32 s4, s4, 1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s4 +; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: 
s_mov_b32 s4, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132DAGISEL-NEXT: s_xor_b32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132DAGISEL-NEXT: ; %bb.5: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_nop 0 +; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1132GISEL-NEXT: s_and_b32 s4, s4, 1 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s4 +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz 
.LBB4_5 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132GISEL-NEXT: s_xor_b32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: s_nop 0 +; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL-NEXT: s_endpgm + entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %tid, i32 1) + br label %endif + +else: + %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX10DAGISEL: {{.*}} +; GFX10GISEL: {{.*}} +; GFX11DAGISEL: {{.*}} +; GFX11GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir new file mode 100644 index 00000000000000..6c0e2bb93afd58 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir @@ -0,0 +1,91 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: uniform_value +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0.entry: + liveins: $sgpr0_sgpr1 + + ; GCN-LABEL: name: uniform_value + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0 + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_BCNT1_I32_B64_:%[0-9]+]]:sgpr_32 = S_BCNT1_I32_B64 [[S_MOV_B64_]], implicit-def $scc + ; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sgpr_32 = S_MUL_I32 [[S_LOAD_DWORD_IMM]], [[S_BCNT1_I32_B64_]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0 + %7:sgpr_32 = WAVE_REDUCE_ADD_PSEUDO_I32 killed %6, 1, implicit $exec + %8:vgpr_32 = COPY %7 
+ GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +--- +name: divergent_value +machineFunctionInfo: + isEntryFunction: true +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: divergent_value + ; GCN: bb.0.entry: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2 + ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2 + ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]] + ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] + ; GCN-NEXT: [[S_ADD_I32_:%[0-9]+]]:sgpr_32 = S_ADD_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc + ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]] + ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: 
[[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3 + ; GCN-NEXT: S_ENDPGM 0 + bb.0.entry: + liveins: $vgpr0, $sgpr0_sgpr1 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %6:sgpr_32 = WAVE_REDUCE_ADD_PSEUDO_I32 %0, 1, implicit $exec + %7:vgpr_32 = COPY %6 + GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec + bb.1: + %8:vgpr_32 = PHI %0, %bb.0 + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir new file mode 100644 index 00000000000000..72b485719c9cd4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir @@ -0,0 +1,89 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: uniform_value +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0.entry: + liveins: $sgpr0_sgpr1 + + ; GCN-LABEL: name: uniform_value + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0 + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; 
GCN-NEXT: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0 + %7:sgpr_32 = WAVE_REDUCE_AND_PSEUDO_B32 killed %6, 1, implicit $exec + %8:vgpr_32 = COPY %7 + GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +--- +name: divergent_value +machineFunctionInfo: + isEntryFunction: true +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: divergent_value + ; GCN: bb.0.entry: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 4294967295 + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2 + ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2 + ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]] + ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc + ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]] + ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; 
GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3 + ; GCN-NEXT: S_ENDPGM 0 + bb.0.entry: + liveins: $vgpr0, $sgpr0_sgpr1 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %6:sgpr_32 = WAVE_REDUCE_AND_PSEUDO_B32 %0, 1, implicit $exec + %7:vgpr_32 = COPY %6 + GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec + bb.1: + %8:vgpr_32 = PHI %0, %bb.0 + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir new file mode 100644 index 00000000000000..17d9a6daac9268 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir @@ -0,0 +1,89 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: uniform_value +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0.entry: + liveins: $sgpr0_sgpr1 + + ; GCN-LABEL: name: uniform_value + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = 
S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0 + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0 + %7:sgpr_32 = WAVE_REDUCE_MAX_PSEUDO_I32 killed %6, 1, implicit $exec + %8:vgpr_32 = COPY %7 + GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +--- +name: divergent_value +machineFunctionInfo: + isEntryFunction: true +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: divergent_value + ; GCN: bb.0.entry: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 2147483648 + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2 + ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2 + ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]] + ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] + ; GCN-NEXT: [[S_MAX_I32_:%[0-9]+]]:sgpr_32 = S_MAX_I32 
[[PHI]], [[V_READLANE_B32_]], implicit-def $scc + ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]] + ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MAX_I32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3 + ; GCN-NEXT: S_ENDPGM 0 + bb.0.entry: + liveins: $vgpr0, $sgpr0_sgpr1 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %6:sgpr_32 = WAVE_REDUCE_MAX_PSEUDO_I32 %0, 1, implicit $exec + %7:vgpr_32 = COPY %6 + GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec + bb.1: + %8:vgpr_32 = PHI %0, %bb.0 + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir new file mode 100644 index 00000000000000..45a6d248e834aa --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir @@ -0,0 +1,89 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: uniform_value +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0.entry: + liveins: $sgpr0_sgpr1 + + ; GCN-LABEL: name: uniform_value + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0 + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0 + %7:sgpr_32 = WAVE_REDUCE_MIN_PSEUDO_I32 killed %6, 1, implicit $exec + %8:vgpr_32 = COPY %7 + GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec + S_ENDPGM 0 + +... 
+ +--- +name: divergent_value +machineFunctionInfo: + isEntryFunction: true +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: divergent_value + ; GCN: bb.0.entry: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 2147483647 + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2 + ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2 + ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]] + ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] + ; GCN-NEXT: [[S_MIN_I32_:%[0-9]+]]:sgpr_32 = S_MIN_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc + ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]] + ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MIN_I32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3 + ; GCN-NEXT: S_ENDPGM 0 + bb.0.entry: + liveins: $vgpr0, 
$sgpr0_sgpr1 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %6:sgpr_32 = WAVE_REDUCE_MIN_PSEUDO_I32 %0, 1, implicit $exec + %7:vgpr_32 = COPY %6 + GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec + bb.1: + %8:vgpr_32 = PHI %0, %bb.0 + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir new file mode 100644 index 00000000000000..1f23509cb4002b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir @@ -0,0 +1,89 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: uniform_value +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0.entry: + liveins: $sgpr0_sgpr1 + + ; GCN-LABEL: name: uniform_value + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0 + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + 
%5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0 + %7:sgpr_32 = WAVE_REDUCE_OR_PSEUDO_B32 killed %6, 1, implicit $exec + %8:vgpr_32 = COPY %7 + GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +--- +name: divergent_value +machineFunctionInfo: + isEntryFunction: true +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: divergent_value + ; GCN: bb.0.entry: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2 + ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2 + ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]] + ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] + ; GCN-NEXT: [[S_OR_B32_:%[0-9]+]]:sgpr_32 = S_OR_B32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc + ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]] + ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] + ; 
GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3 + ; GCN-NEXT: S_ENDPGM 0 + bb.0.entry: + liveins: $vgpr0, $sgpr0_sgpr1 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %6:sgpr_32 = WAVE_REDUCE_OR_PSEUDO_B32 %0, 1, implicit $exec + %7:vgpr_32 = COPY %6 + GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec + bb.1: + %8:vgpr_32 = PHI %0, %bb.0 + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir new file mode 100644 index 00000000000000..6af5f2e9b5a17a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir @@ -0,0 +1,92 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: uniform_value +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0.entry: + liveins: $sgpr0_sgpr1 + + ; GCN-LABEL: name: uniform_value + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0 + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: 
[[S_BCNT1_I32_B64_:%[0-9]+]]:sgpr_32 = S_BCNT1_I32_B64 [[S_MOV_B64_]], implicit-def $scc + ; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sgpr_32 = S_MUL_I32 -1, [[S_LOAD_DWORD_IMM]] + ; GCN-NEXT: [[S_MUL_I32_1:%[0-9]+]]:sgpr_32 = S_MUL_I32 [[S_MUL_I32_]], [[S_BCNT1_I32_B64_]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_1]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0 + %7:sgpr_32 = WAVE_REDUCE_SUB_PSEUDO_I32 killed %6, 1, implicit $exec + %8:vgpr_32 = COPY %7 + GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +--- +name: divergent_value +machineFunctionInfo: + isEntryFunction: true +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: divergent_value + ; GCN: bb.0.entry: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2 + ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2 + ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]] + ; GCN-NEXT: 
[[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] + ; GCN-NEXT: [[S_SUB_I32_:%[0-9]+]]:sgpr_32 = S_SUB_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc + ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]] + ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_SUB_I32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3 + ; GCN-NEXT: S_ENDPGM 0 + bb.0.entry: + liveins: $vgpr0, $sgpr0_sgpr1 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %6:sgpr_32 = WAVE_REDUCE_SUB_PSEUDO_I32 %0, 1, implicit $exec + %7:vgpr_32 = COPY %6 + GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec + bb.1: + %8:vgpr_32 = PHI %0, %bb.0 + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir index 179c9f4f8dc4d0..2983e646208dd6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir @@ -26,7 +26,7 @@ body: | %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0 - %7:sgpr_32 = WAVE_REDUCE_UMAX_PSEUDO_U32 killed %6, 1, implicit $exec + %7:sgpr_32 = WAVE_REDUCE_MAX_PSEUDO_U32 killed %6, 1, implicit $exec %8:vgpr_32 = COPY %7 GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec S_ENDPGM 0 @@ -79,7 +79,7 @@ body: | %0:vgpr_32 = COPY $vgpr0 %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %6:sgpr_32 = WAVE_REDUCE_UMAX_PSEUDO_U32 %0, 1, implicit $exec + %6:sgpr_32 = WAVE_REDUCE_MAX_PSEUDO_U32 %0, 1, implicit $exec %7:vgpr_32 = COPY %6 GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir index 88c35a6417d237..db698a3b29371a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir @@ -26,7 +26,7 @@ body: | %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0 - %7:sgpr_32 = WAVE_REDUCE_UMIN_PSEUDO_U32 killed %6, 1, implicit $exec + %7:sgpr_32 = WAVE_REDUCE_MIN_PSEUDO_U32 killed %6, 1, implicit $exec %8:vgpr_32 = COPY %7 GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec S_ENDPGM 0 @@ -79,7 +79,7 @@ body: | %0:vgpr_32 = COPY $vgpr0 %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 %5:vgpr_32 = 
V_MOV_B32_e32 0, implicit $exec - %6:sgpr_32 = WAVE_REDUCE_UMIN_PSEUDO_U32 %0, 1, implicit $exec + %6:sgpr_32 = WAVE_REDUCE_MIN_PSEUDO_U32 %0, 1, implicit $exec %7:vgpr_32 = COPY %6 GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir new file mode 100644 index 00000000000000..54ac7ceb6a2c1b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir @@ -0,0 +1,92 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: uniform_value +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0.entry: + liveins: $sgpr0_sgpr1 + + ; GCN-LABEL: name: uniform_value + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0 + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_BCNT1_I32_B64_:%[0-9]+]]:sgpr_32 = S_BCNT1_I32_B64 [[S_MOV_B64_]], implicit-def $scc + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[S_BCNT1_I32_B64_]], 1, implicit-def $scc + ; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sgpr_32 = S_MUL_I32 [[S_LOAD_DWORD_IMM]], [[S_AND_B32_]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed 
[[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0 + %7:sgpr_32 = WAVE_REDUCE_XOR_PSEUDO_B32 killed %6, 1, implicit $exec + %8:vgpr_32 = COPY %7 + GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +--- +name: divergent_value +machineFunctionInfo: + isEntryFunction: true +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: divergent_value + ; GCN: bb.0.entry: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2 + ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2 + ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]] + ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] + ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sgpr_32 = S_XOR_B32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc + ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]] + ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: 
{{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_XOR_B32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3 + ; GCN-NEXT: S_ENDPGM 0 + bb.0.entry: + liveins: $vgpr0, $sgpr0_sgpr1 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %6:sgpr_32 = WAVE_REDUCE_XOR_PSEUDO_B32 %0, 1, implicit $exec + %7:vgpr_32 = COPY %6 + GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec + bb.1: + %8:vgpr_32 = PHI %0, %bb.0 + S_ENDPGM 0 + +...