[AMDGPU] Extend type support for update_dpp intrinsic (#114597)
We can split 64-bit DPP as a post-RA pseudo if control values are
supported, but cannot handle other types.
rampitec authored Nov 5, 2024
1 parent dccb1fe commit 6d7e51d
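
For reference, the intrinsic being extended looks like this at the IR level (a minimal illustrative call, not taken from the patch; the operands are old value, source, dpp_ctrl, row_mask, bank_mask, and bound_ctrl, and dpp_ctrl 1 encodes quad_perm:[1,0,0,0], the control used in the tests below):

    %upd = call i64 @llvm.amdgcn.update.dpp.i64(i64 %old, i64 %src, i32 1, i32 1, i32 1, i1 false)

Each piece of %src is moved across lanes according to dpp_ctrl; lanes disabled by the masks or by bound_ctrl keep the corresponding piece of %old.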
Showing 12 changed files with 6,178 additions and 6,104 deletions.
47 changes: 29 additions & 18 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5495,6 +5495,13 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
     }
     case Intrinsic::amdgcn_mov_dpp8:
       return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
+    case Intrinsic::amdgcn_update_dpp:
+      return LaneOp.addUse(Src1)
+          .addImm(MI.getOperand(4).getImm())
+          .addImm(MI.getOperand(5).getImm())
+          .addImm(MI.getOperand(6).getImm())
+          .addImm(MI.getOperand(7).getImm())
+          .getReg(0);
     default:
       llvm_unreachable("unhandled lane op");
     }
@@ -5504,7 +5511,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   Register Src0 = MI.getOperand(2).getReg();
   Register Src1, Src2;
   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
-      IsSetInactive || IsPermLane16) {
+      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
     Src1 = MI.getOperand(3).getReg();
     if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
       Src2 = MI.getOperand(4).getReg();
@@ -5514,15 +5521,21 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   LLT Ty = MRI.getType(DstReg);
   unsigned Size = Ty.getSizeInBits();
 
-  if (Size == 32) {
+  unsigned SplitSize = 32;
+  if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
+      ST.hasDPALU_DPP() &&
+      AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm()))
+    SplitSize = 64;
+
+  if (Size == SplitSize) {
     // Already legal
     return true;
   }
 
   if (Size < 32) {
     Src0 = B.buildAnyExt(S32, Src0).getReg(0);
 
-    if (IsSetInactive || IsPermLane16)
+    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
       Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
 
     if (IID == Intrinsic::amdgcn_writelane)
@@ -5534,31 +5547,28 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
     return true;
   }
 
-  if (Size % 32 != 0)
+  if (Size % SplitSize != 0)
     return false;
 
-  LLT PartialResTy = S32;
+  LLT PartialResTy = LLT::scalar(SplitSize);
   if (Ty.isVector()) {
     LLT EltTy = Ty.getElementType();
-    switch (EltTy.getSizeInBits()) {
-    case 16:
-      PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
-      break;
-    case 32:
+    unsigned EltSize = EltTy.getSizeInBits();
+    if (EltSize == SplitSize) {
       PartialResTy = EltTy;
-      break;
-    default:
-      // Handle all other cases via S32 pieces;
-      break;
+    } else if (EltSize == 16 || EltSize == 32) {
+      unsigned NElem = SplitSize / EltSize;
+      PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
     }
+    // Handle all other cases via S32/S64 pieces;
   }
 
-  SmallVector<Register, 2> PartialRes;
-  unsigned NumParts = Size / 32;
+  SmallVector<Register, 4> PartialRes;
+  unsigned NumParts = Size / SplitSize;
   MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
   MachineInstrBuilder Src1Parts, Src2Parts;
 
-  if (IsSetInactive || IsPermLane16)
+  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
     Src1Parts = B.buildUnmerge(PartialResTy, Src1);
 
   if (IID == Intrinsic::amdgcn_writelane)
@@ -5567,7 +5577,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   for (unsigned i = 0; i < NumParts; ++i) {
     Src0 = Src0Parts.getReg(i);
 
-    if (IsSetInactive || IsPermLane16)
+    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
       Src1 = Src1Parts.getReg(i);
 
     if (IID == Intrinsic::amdgcn_writelane)
@@ -7555,6 +7565,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_set_inactive:
   case Intrinsic::amdgcn_set_inactive_chain_arg:
   case Intrinsic::amdgcn_mov_dpp8:
+  case Intrinsic::amdgcn_update_dpp:
    return legalizeLaneOp(Helper, MI, IntrID);
  case Intrinsic::amdgcn_s_buffer_prefetch_data:
    return legalizeSBufferPrefetch(Helper, MI);
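
The net effect of the SplitSize logic above: an update_dpp whose size is a multiple of 32 bits but which cannot be kept whole is unmerged into pieces, each piece is updated by its own lane op, and the results are remerged. A hand-written IR equivalent of the i64-with-quad_perm case (an illustrative sketch; the legalizer itself operates on generic MIR, not on IR):

    %old.v  = bitcast i64 %old to <2 x i32>
    %src.v  = bitcast i64 %src to <2 x i32>
    %old.lo = extractelement <2 x i32> %old.v, i32 0
    %old.hi = extractelement <2 x i32> %old.v, i32 1
    %src.lo = extractelement <2 x i32> %src.v, i32 0
    %src.hi = extractelement <2 x i32> %src.v, i32 1
    ; one 32-bit update per half, same control and masks as the original i64 call
    %upd.lo = call i32 @llvm.amdgcn.update.dpp.i32(i32 %old.lo, i32 %src.lo, i32 1, i32 1, i32 1, i1 false)
    %upd.hi = call i32 @llvm.amdgcn.update.dpp.i32(i32 %old.hi, i32 %src.hi, i32 1, i32 1, i32 1, i1 false)
    %res.lo = insertelement <2 x i32> poison, i32 %upd.lo, i32 0
    %res.v  = insertelement <2 x i32> %res.lo, i32 %upd.hi, i32 1
    %res    = bitcast <2 x i32> %res.v to i64

This corresponds to the pair of v_mov_b32_dpp instructions in the GFX8 and GFX11 checks of the test diff below.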
48 changes: 32 additions & 16 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6162,13 +6162,20 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
   SDLoc SL(N);
   MVT IntVT = MVT::getIntegerVT(ValSize);
+  const GCNSubtarget *ST = TLI.getSubtarget();
+  unsigned SplitSize = 32;
+  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
+      ST->hasDPALU_DPP() &&
+      AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
+    SplitSize = 64;
 
   auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
                                           SDValue Src2, MVT ValT) -> SDValue {
     SmallVector<SDValue, 8> Operands;
     switch (IID) {
     case Intrinsic::amdgcn_permlane16:
     case Intrinsic::amdgcn_permlanex16:
+    case Intrinsic::amdgcn_update_dpp:
       Operands.push_back(N->getOperand(6));
       Operands.push_back(N->getOperand(5));
       Operands.push_back(N->getOperand(4));
@@ -6206,13 +6213,15 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
   SDValue Src0 = N->getOperand(1);
   SDValue Src1, Src2;
   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
-      IID == Intrinsic::amdgcn_mov_dpp8 || IsSetInactive || IsPermLane16) {
+      IID == Intrinsic::amdgcn_mov_dpp8 ||
+      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
     Src1 = N->getOperand(2);
-    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
+    if (IID == Intrinsic::amdgcn_writelane ||
+        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
       Src2 = N->getOperand(3);
   }
 
-  if (ValSize == 32) {
+  if (ValSize == SplitSize) {
     // Already legal
     return SDValue();
   }
@@ -6222,7 +6231,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
   Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
                               SL, MVT::i32);
 
-  if (IsSetInactive || IsPermLane16) {
+  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
     Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
                                 SL, MVT::i32);
   }
@@ -6237,7 +6246,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
     return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
   }
 
-  if (ValSize % 32 != 0)
+  if (ValSize % SplitSize != 0)
     return SDValue();
 
   auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
@@ -6284,21 +6293,26 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
     switch (MVT::SimpleValueType EltTy =
                 VT.getVectorElementType().getSimpleVT().SimpleTy) {
     case MVT::i32:
-    case MVT::f32: {
-      SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
-      return unrollLaneOp(LaneOp.getNode());
-    }
+    case MVT::f32:
+      if (SplitSize == 32) {
+        SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
+        return unrollLaneOp(LaneOp.getNode());
+      }
+      [[fallthrough]];
     case MVT::i16:
    case MVT::f16:
    case MVT::bf16: {
-      MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
+      unsigned SubVecNumElt =
+          SplitSize / VT.getVectorElementType().getSizeInBits();
+      MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
       SmallVector<SDValue, 4> Pieces;
       SDValue Src0SubVec, Src1SubVec, Src2SubVec;
-      for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
+      for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
        Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
                                 DAG.getConstant(EltIdx, SL, MVT::i32));
 
-        if (IsSetInactive || IsPermLane16)
+        if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
+            IsPermLane16)
          Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
                                   DAG.getConstant(EltIdx, SL, MVT::i32));
@@ -6307,10 +6321,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
                                   DAG.getConstant(EltIdx, SL, MVT::i32));
 
         Pieces.push_back(
-            IsSetInactive || IsPermLane16
+            IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
                 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
                 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
-        EltIdx += 2;
+        EltIdx += SubVecNumElt;
       }
       return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
     }
@@ -6320,10 +6334,11 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
     }
   }
 
-  MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
+  MVT VecVT =
+      MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
   Src0 = DAG.getBitcast(VecVT, Src0);
 
-  if (IsSetInactive || IsPermLane16)
+  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
     Src1 = DAG.getBitcast(VecVT, Src1);
 
   if (IID == Intrinsic::amdgcn_writelane)
@@ -8833,6 +8848,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::amdgcn_set_inactive:
   case Intrinsic::amdgcn_set_inactive_chain_arg:
   case Intrinsic::amdgcn_mov_dpp8:
+  case Intrinsic::amdgcn_update_dpp:
    return lowerLaneOp(*this, Op.getNode(), DAG);
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
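
Both selectors use the same gate: 64-bit pieces are kept only when the subtarget reports hasDPALU_DPP() and the control value passes AMDGPU::isLegalDPALU_DPPControl(). quad_perm, the control in the tests below, does not pass, so the i64 tests still split into two 32-bit moves. A control from the row_newbcast family would let the operation stay whole; an illustrative call (assumptions, not from the patch: a GFX90A-class target, and dpp_ctrl 0x150 being row_newbcast:0, a value isLegalDPALU_DPPControl() is expected to accept):

    ; dpp_ctrl 336 = 0x150, assumed here to encode row_newbcast:0
    %upd = call i64 @llvm.amdgcn.update.dpp.i64(i64 %old, i64 %src, i32 336, i32 15, i32 15, i1 false)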
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -52,11 +52,11 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
 ; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
 ; GFX8-NEXT: s_endpgm
 ;
@@ -77,10 +77,10 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
 ; GFX11-LABEL: update_dppi64_test:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -106,11 +106,11 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
 ; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
 ; GFX8-NEXT: s_endpgm
 ;
@@ -131,10 +131,10 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
 ; GFX11-LABEL: update_dppf64_test:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -160,11 +160,11 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
 ; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
 ; GFX8-NEXT: s_endpgm
 ;
@@ -185,10 +185,10 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
 ; GFX11-LABEL: update_dppv2i32_test:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -214,11 +214,11 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
 ; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
 ; GFX8-NEXT: s_endpgm
 ;
@@ -239,10 +239,10 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
 ; GFX11-LABEL: update_dppv2f32_test:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -268,11 +268,11 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
 ; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
 ; GFX8-NEXT: s_endpgm
 ;
@@ -293,10 +293,10 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
 ; GFX11-LABEL: update_dpp_p0_test:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
[Diffs for the remaining 9 changed files are not shown.]