From a779af3f882e1500ee6b9cb973e60c10ea0819b4 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Sat, 8 Mar 2025 06:28:20 -0800 Subject: [PATCH] [AMDGPU] Change SGPR layout to striped caller/callee saved (#127353) This PR updates the SGPR layout to a striped caller/callee-saved design, similar to the VGPR layout. To ensure that s30-s31 (return address), s32 (stack pointer), s33 (frame pointer), and s34 (base pointer) remain callee-saved, the striped layout starts from s40, with a stripe width of 8. The last stripe is 10 wide instead of 8 to avoid ending with a 2-wide stripe. Fixes #113782. --- llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 10 +- .../amdgpu-simplify-libcall-pow-codegen.ll | 230 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 335 +-- ...der-no-live-segment-at-def-implicit-def.ll | 42 +- .../branch-folding-implicit-def-subreg.ll | 381 +-- .../test/CodeGen/AMDGPU/branch-relax-spill.ll | 205 +- ...l-args-inreg-no-sgpr-for-csrspill-xfail.ll | 4 +- llvm/test/CodeGen/AMDGPU/call-args-inreg.ll | 12 +- .../CodeGen/AMDGPU/call-argument-types.ll | 48 +- .../AMDGPU/call-preserved-registers.ll | 34 +- .../test/CodeGen/AMDGPU/callee-frame-setup.ll | 2235 +++++---------- .../AMDGPU/csr-sgpr-spill-live-ins.mir | 8 - llvm/test/CodeGen/AMDGPU/ds_read2.ll | 14 +- .../AMDGPU/dwarf-multi-register-use-crash.ll | 60 +- .../eliminate-frame-index-s-mov-b32.mir | 53 +- .../CodeGen/AMDGPU/function-args-inreg.ll | 4 +- .../CodeGen/AMDGPU/function-resource-usage.ll | 22 +- .../CodeGen/AMDGPU/gfx-call-non-gfx-func.ll | 68 +- .../AMDGPU/gfx-callable-argument-types.ll | 256 +- .../AMDGPU/global_atomics_scan_fadd.ll | 2492 ++++++++--------- .../AMDGPU/global_atomics_scan_fmax.ll | 2428 ++++++++-------- .../AMDGPU/global_atomics_scan_fmin.ll | 2428 ++++++++-------- .../AMDGPU/global_atomics_scan_fsub.ll | 2492 ++++++++--------- .../greedy-alloc-fail-sgpr1024-spill.mir | 124 +- .../identical-subrange-spill-infloop.ll | 125 +- llvm/test/CodeGen/AMDGPU/indirect-call.ll | 1080 +++---- llvm/test/CodeGen/AMDGPU/issue48473.mir | 2 +- .../AMDGPU/llvm.amdgcn.readfirstlane.ll | 416 +-- llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 45 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll | 81 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 45 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll | 81 +- ...ne-sink-temporal-divergence-swdev407790.ll | 284 +- .../materialize-frame-index-sgpr.gfx10.ll | 842 +----- .../AMDGPU/materialize-frame-index-sgpr.ll | 1415 +++------- ...-knownbits-assign-crash-gh-issue-110930.ll | 26 +- .../AMDGPU/pei-scavenge-sgpr-carry-out.mir | 86 +- .../CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir | 56 +- .../test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir | 30 +- .../AMDGPU/promote-constOffset-to-imm.ll | 48 +- .../ran-out-of-sgprs-allocation-failure.mir | 217 +- .../AMDGPU/schedule-amdgpu-tracker-physreg.ll | 8 +- llvm/test/CodeGen/AMDGPU/select.f16.ll | 15 +- .../AMDGPU/shufflevector.v2i64.v8i64.ll | 1792 +++--------- llvm/test/CodeGen/AMDGPU/sibling-call.ll | 204 +- .../AMDGPU/snippet-copy-bundle-regression.mir | 27 +- .../AMDGPU/spill-sgpr-to-virtual-vgpr.mir | 34 +- .../AMDGPU/spill-sgpr-used-for-exec-copy.mir | 11 +- .../spill_more_than_wavesize_csr_sgprs.ll | 380 +-- .../CodeGen/AMDGPU/splitkit-copy-bundle.mir | 200 +- llvm/test/CodeGen/AMDGPU/stack-realign.ll | 428 +-- .../AMDGPU/tuple-allocation-failure.ll | 313 ++- .../unallocatable-bundle-regression.mir | 48 +- .../AMDGPU/unstructured-cfg-def-use-issue.ll | 188 +- .../CodeGen/AMDGPU/use_restore_frame_reg.mir | 76 +- .../AMDGPU/vgpr-large-tuple-alloc-error.ll | 320 +-- .../CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir | 3 +- 57 files changed, 9334 insertions(+), 13577 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 80969fce3d77f..e891fdba4e03e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -91,7 +91,15 @@ def CSR_AMDGPU_AGPRs : CalleeSavedRegs< >; def CSR_AMDGPU_SGPRs : CalleeSavedRegs< - (sequence "SGPR%u", 30, 105) + // Ensure that s30-s31 (return address), s32 (stack pointer), s33 (frame pointer), + // and s34 (base pointer) are callee-saved. The striped layout starts from s40, + // with a stripe width of 8. The last stripe is 10 wide instead of 8, to avoid + // ending with a 2-wide stripe. + (add (sequence "SGPR%u", 30, 39), + (sequence "SGPR%u", 48, 55), + (sequence "SGPR%u", 64, 71), + (sequence "SGPR%u", 80, 87), + (sequence "SGPR%u", 96, 105)) >; def CSR_AMDGPU_SI_Gfx_SGPRs : CalleeSavedRegs< diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll index 54b4888120e5f..5bda853b76727 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll @@ -127,29 +127,29 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: v_writelane_b32 v43, s38, 6 ; CHECK-NEXT: v_writelane_b32 v43, s39, 7 ; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v43, s40, 8 -; CHECK-NEXT: v_writelane_b32 v43, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: v_writelane_b32 v43, s48, 8 +; CHECK-NEXT: v_writelane_b32 v43, s49, 9 +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v43, s42, 10 +; CHECK-NEXT: v_writelane_b32 v43, s50, 10 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v43, s43, 11 +; CHECK-NEXT: v_writelane_b32 v43, s51, 11 ; CHECK-NEXT: v_mov_b32_e32 v42, v1 -; CHECK-NEXT: v_writelane_b32 v43, s44, 12 +; CHECK-NEXT: v_writelane_b32 v43, s52, 12 ; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v43, s45, 13 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: v_writelane_b32 v43, s53, 13 ; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: v_mov_b32_e32 v41, v2 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b32 s52, s13 +; CHECK-NEXT: s_mov_b32 s53, s12 ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] @@ -160,15 +160,15 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 +; CHECK-NEXT: s_mov_b32 s14, s51 +; CHECK-NEXT: s_mov_b32 s15, s50 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -178,12 +178,12 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_readlane_b32 s45, v43, 13 -; CHECK-NEXT: v_readlane_b32 s44, v43, 12 -; CHECK-NEXT: v_readlane_b32 s43, v43, 11 -; CHECK-NEXT: v_readlane_b32 s42, v43, 10 -; CHECK-NEXT: v_readlane_b32 s41, v43, 9 -; CHECK-NEXT: v_readlane_b32 s40, v43, 8 +; CHECK-NEXT: v_readlane_b32 s53, v43, 13 +; CHECK-NEXT: v_readlane_b32 s52, v43, 12 +; CHECK-NEXT: v_readlane_b32 s51, v43, 11 +; CHECK-NEXT: v_readlane_b32 s50, v43, 10 +; CHECK-NEXT: v_readlane_b32 s49, v43, 9 +; CHECK-NEXT: v_readlane_b32 s48, v43, 8 ; CHECK-NEXT: v_readlane_b32 s39, v43, 7 ; CHECK-NEXT: v_readlane_b32 s38, v43, 6 ; CHECK-NEXT: v_readlane_b32 s37, v43, 5 @@ -267,28 +267,28 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: v_writelane_b32 v43, s38, 6 ; CHECK-NEXT: v_writelane_b32 v43, s39, 7 ; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v43, s40, 8 -; CHECK-NEXT: v_writelane_b32 v43, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: v_writelane_b32 v43, s48, 8 +; CHECK-NEXT: v_writelane_b32 v43, s49, 9 +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v43, s42, 10 -; CHECK-NEXT: v_writelane_b32 v43, s43, 11 -; CHECK-NEXT: v_writelane_b32 v43, s44, 12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: v_writelane_b32 v43, s50, 10 +; CHECK-NEXT: v_writelane_b32 v43, s51, 11 +; CHECK-NEXT: v_writelane_b32 v43, s52, 12 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v43, s45, 13 +; CHECK-NEXT: v_writelane_b32 v43, s53, 13 ; CHECK-NEXT: v_mov_b32_e32 v42, v31 ; CHECK-NEXT: v_mov_b32_e32 v41, v3 ; CHECK-NEXT: v_mov_b32_e32 v40, v2 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b32 s52, s13 +; CHECK-NEXT: s_mov_b32 s53, s12 ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] @@ -299,26 +299,26 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 +; CHECK-NEXT: s_mov_b32 s14, s51 +; CHECK-NEXT: s_mov_b32 s15, s50 ; CHECK-NEXT: v_mov_b32_e32 v31, v42 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s45, v43, 13 -; CHECK-NEXT: v_readlane_b32 s44, v43, 12 -; CHECK-NEXT: v_readlane_b32 s43, v43, 11 -; CHECK-NEXT: v_readlane_b32 s42, v43, 10 -; CHECK-NEXT: v_readlane_b32 s41, v43, 9 -; CHECK-NEXT: v_readlane_b32 s40, v43, 8 +; CHECK-NEXT: v_readlane_b32 s53, v43, 13 +; CHECK-NEXT: v_readlane_b32 s52, v43, 12 +; CHECK-NEXT: v_readlane_b32 s51, v43, 11 +; CHECK-NEXT: v_readlane_b32 s50, v43, 10 +; CHECK-NEXT: v_readlane_b32 s49, v43, 9 +; CHECK-NEXT: v_readlane_b32 s48, v43, 8 ; CHECK-NEXT: v_readlane_b32 s39, v43, 7 ; CHECK-NEXT: v_readlane_b32 s38, v43, 6 ; CHECK-NEXT: v_readlane_b32 s37, v43, 5 @@ -409,29 +409,29 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: v_writelane_b32 v43, s38, 6 ; CHECK-NEXT: v_writelane_b32 v43, s39, 7 ; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v43, s40, 8 -; CHECK-NEXT: v_writelane_b32 v43, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: v_writelane_b32 v43, s48, 8 +; CHECK-NEXT: v_writelane_b32 v43, s49, 9 +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v43, s42, 10 +; CHECK-NEXT: v_writelane_b32 v43, s50, 10 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v43, s43, 11 +; CHECK-NEXT: v_writelane_b32 v43, s51, 11 ; CHECK-NEXT: v_mov_b32_e32 v42, v1 -; CHECK-NEXT: v_writelane_b32 v43, s44, 12 +; CHECK-NEXT: v_writelane_b32 v43, s52, 12 ; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v43, s45, 13 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: v_writelane_b32 v43, s53, 13 ; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: v_mov_b32_e32 v41, v2 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b32 s52, s13 +; CHECK-NEXT: s_mov_b32 s53, s12 ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] @@ -442,15 +442,15 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 +; CHECK-NEXT: s_mov_b32 s14, s51 +; CHECK-NEXT: s_mov_b32 s15, s50 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -460,12 +460,12 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_readlane_b32 s45, v43, 13 -; CHECK-NEXT: v_readlane_b32 s44, v43, 12 -; CHECK-NEXT: v_readlane_b32 s43, v43, 11 -; CHECK-NEXT: v_readlane_b32 s42, v43, 10 -; CHECK-NEXT: v_readlane_b32 s41, v43, 9 -; CHECK-NEXT: v_readlane_b32 s40, v43, 8 +; CHECK-NEXT: v_readlane_b32 s53, v43, 13 +; CHECK-NEXT: v_readlane_b32 s52, v43, 12 +; CHECK-NEXT: v_readlane_b32 s51, v43, 11 +; CHECK-NEXT: v_readlane_b32 s50, v43, 10 +; CHECK-NEXT: v_readlane_b32 s49, v43, 9 +; CHECK-NEXT: v_readlane_b32 s48, v43, 8 ; CHECK-NEXT: v_readlane_b32 s39, v43, 7 ; CHECK-NEXT: v_readlane_b32 s38, v43, 6 ; CHECK-NEXT: v_readlane_b32 s37, v43, 5 @@ -551,26 +551,26 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: v_writelane_b32 v42, s38, 6 ; CHECK-NEXT: v_writelane_b32 v42, s39, 7 ; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v42, s40, 8 -; CHECK-NEXT: v_writelane_b32 v42, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: v_writelane_b32 v42, s48, 8 +; CHECK-NEXT: v_writelane_b32 v42, s49, 9 +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v42, s42, 10 -; CHECK-NEXT: v_writelane_b32 v42, s43, 11 -; CHECK-NEXT: v_writelane_b32 v42, s44, 12 +; CHECK-NEXT: v_writelane_b32 v42, s50, 10 +; CHECK-NEXT: v_writelane_b32 v42, s51, 11 +; CHECK-NEXT: v_writelane_b32 v42, s52, 12 ; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v42, s45, 13 +; CHECK-NEXT: v_writelane_b32 v42, s53, 13 ; CHECK-NEXT: v_mov_b32_e32 v40, v31 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b32 s52, s13 +; CHECK-NEXT: s_mov_b32 s53, s12 ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] @@ -582,26 +582,26 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 +; CHECK-NEXT: s_mov_b32 s14, s51 +; CHECK-NEXT: s_mov_b32 s15, s50 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s45, v42, 13 -; CHECK-NEXT: v_readlane_b32 s44, v42, 12 -; CHECK-NEXT: v_readlane_b32 s43, v42, 11 -; CHECK-NEXT: v_readlane_b32 s42, v42, 10 -; CHECK-NEXT: v_readlane_b32 s41, v42, 9 -; CHECK-NEXT: v_readlane_b32 s40, v42, 8 +; CHECK-NEXT: v_readlane_b32 s53, v42, 13 +; CHECK-NEXT: v_readlane_b32 s52, v42, 12 +; CHECK-NEXT: v_readlane_b32 s51, v42, 11 +; CHECK-NEXT: v_readlane_b32 s50, v42, 10 +; CHECK-NEXT: v_readlane_b32 s49, v42, 9 +; CHECK-NEXT: v_readlane_b32 s48, v42, 8 ; CHECK-NEXT: v_readlane_b32 s39, v42, 7 ; CHECK-NEXT: v_readlane_b32 s38, v42, 6 ; CHECK-NEXT: v_readlane_b32 s37, v42, 5 @@ -692,28 +692,28 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: v_writelane_b32 v43, s38, 6 ; CHECK-NEXT: v_writelane_b32 v43, s39, 7 ; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v43, s40, 8 -; CHECK-NEXT: v_writelane_b32 v43, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: v_writelane_b32 v43, s48, 8 +; CHECK-NEXT: v_writelane_b32 v43, s49, 9 +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v43, s42, 10 +; CHECK-NEXT: v_writelane_b32 v43, s50, 10 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v43, s43, 11 +; CHECK-NEXT: v_writelane_b32 v43, s51, 11 ; CHECK-NEXT: v_mov_b32_e32 v41, v1 -; CHECK-NEXT: v_writelane_b32 v43, s44, 12 +; CHECK-NEXT: v_writelane_b32 v43, s52, 12 ; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v41 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v43, s45, 13 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: v_writelane_b32 v43, s53, 13 ; CHECK-NEXT: v_mov_b32_e32 v40, v31 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b32 s52, s13 +; CHECK-NEXT: s_mov_b32 s53, s12 ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] @@ -725,15 +725,15 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 +; CHECK-NEXT: s_mov_b32 s14, s51 +; CHECK-NEXT: s_mov_b32 s15, s50 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -742,12 +742,12 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_readlane_b32 s45, v43, 13 -; CHECK-NEXT: v_readlane_b32 s44, v43, 12 -; CHECK-NEXT: v_readlane_b32 s43, v43, 11 -; CHECK-NEXT: v_readlane_b32 s42, v43, 10 -; CHECK-NEXT: v_readlane_b32 s41, v43, 9 -; CHECK-NEXT: v_readlane_b32 s40, v43, 8 +; CHECK-NEXT: v_readlane_b32 s53, v43, 13 +; CHECK-NEXT: v_readlane_b32 s52, v43, 12 +; CHECK-NEXT: v_readlane_b32 s51, v43, 11 +; CHECK-NEXT: v_readlane_b32 s50, v43, 10 +; CHECK-NEXT: v_readlane_b32 s49, v43, 9 +; CHECK-NEXT: v_readlane_b32 s48, v43, 8 ; CHECK-NEXT: v_readlane_b32 s39, v43, 7 ; CHECK-NEXT: v_readlane_b32 s38, v43, 6 ; CHECK-NEXT: v_readlane_b32 s37, v43, 5 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index aafdb1c8cc36f..91598496eb984 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -40671,14 +40671,6 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GCN-LABEL: v_vselect_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v31, s30, 0 -; GCN-NEXT: v_writelane_b32 v31, s31, 1 -; GCN-NEXT: v_writelane_b32 v31, s34, 2 -; GCN-NEXT: v_writelane_b32 v31, s35, 3 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_and_b32_e32 v0, 1, v1 @@ -40716,21 +40708,21 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GCN-NEXT: v_and_b32_e32 v8, 1, v14 ; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v7 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 -; GCN-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v8 +; GCN-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v8 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 ; GCN-NEXT: v_and_b32_e32 v9, 1, v15 -; GCN-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v9 +; GCN-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v9 ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60 ; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[34:35] +; GCN-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[42:43] ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_cndmask_b32_e64 v14, v9, v8, s[30:31] +; GCN-NEXT: v_cndmask_b32_e64 v14, v9, v8, s[40:41] ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v29 ; GCN-NEXT: s_waitcnt vmcnt(1) @@ -40806,14 +40798,6 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_readlane_b32 s35, v31, 3 -; GCN-NEXT: v_readlane_b32 s34, v31, 2 -; GCN-NEXT: v_readlane_b32 s31, v31, 1 -; GCN-NEXT: v_readlane_b32 s30, v31, 0 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_vselect_v16bf16: @@ -40954,9 +40938,6 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-LABEL: v_vselect_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 @@ -40982,17 +40963,13 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-NEXT: v_and_b32_e32 v0, 1, v11 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX8-NEXT: v_writelane_b32 v31, s30, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v13 -; GFX8-NEXT: v_writelane_b32 v31, s31, 1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX8-NEXT: v_writelane_b32 v31, s34, 2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX8-NEXT: v_writelane_b32 v31, s35, 3 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v22 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v30 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v0, s[28:29] @@ -41018,9 +40995,9 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v10, v0, v23, s[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v0, v23, s[40:41] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v0, v1, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v0, v1, s[42:43] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v19 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v27 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v0, s[16:17] @@ -41043,14 +41020,6 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_readlane_b32 s35, v31, 3 -; GFX8-NEXT: v_readlane_b32 s34, v31, 2 -; GFX8-NEXT: v_readlane_b32 s31, v31, 1 -; GFX8-NEXT: v_readlane_b32 s30, v31, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v16bf16: @@ -42030,108 +41999,80 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v34, s30, 0 -; GFX8-NEXT: v_writelane_b32 v34, s31, 1 -; GFX8-NEXT: v_writelane_b32 v34, s34, 2 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_writelane_b32 v34, s35, 3 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 -; GFX8-NEXT: v_writelane_b32 v34, s36, 4 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX8-NEXT: v_writelane_b32 v34, s37, 5 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX8-NEXT: v_writelane_b32 v34, s38, 6 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v4 -; GFX8-NEXT: v_writelane_b32 v34, s39, 7 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v5 -; GFX8-NEXT: v_writelane_b32 v34, s40, 8 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX8-NEXT: v_writelane_b32 v34, s41, 9 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v7 -; GFX8-NEXT: v_writelane_b32 v34, s42, 10 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v8 -; GFX8-NEXT: v_writelane_b32 v34, s43, 11 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v9 -; GFX8-NEXT: v_writelane_b32 v34, s44, 12 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v10 -; GFX8-NEXT: v_writelane_b32 v34, s45, 13 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v11 -; GFX8-NEXT: v_writelane_b32 v34, s46, 14 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX8-NEXT: v_writelane_b32 v34, s47, 15 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v13 -; GFX8-NEXT: v_writelane_b32 v34, s48, 16 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX8-NEXT: v_writelane_b32 v34, s49, 17 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX8-NEXT: v_writelane_b32 v34, s50, 18 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v16 -; GFX8-NEXT: v_writelane_b32 v34, s51, 19 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v17 -; GFX8-NEXT: v_writelane_b32 v34, s52, 20 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v18 -; GFX8-NEXT: v_writelane_b32 v34, s53, 21 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v19 -; GFX8-NEXT: v_writelane_b32 v34, s54, 22 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v20 -; GFX8-NEXT: v_writelane_b32 v34, s55, 23 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v21 -; GFX8-NEXT: v_writelane_b32 v34, s56, 24 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v22 -; GFX8-NEXT: v_writelane_b32 v34, s57, 25 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v23 -; GFX8-NEXT: v_writelane_b32 v34, s58, 26 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v24 -; GFX8-NEXT: v_writelane_b32 v34, s59, 27 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v25 -; GFX8-NEXT: v_writelane_b32 v34, s60, 28 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v0 +; GFX8-NEXT: v_writelane_b32 v34, s30, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v26 -; GFX8-NEXT: v_writelane_b32 v34, s61, 29 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 +; GFX8-NEXT: v_writelane_b32 v34, s31, 1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v27 -; GFX8-NEXT: v_writelane_b32 v34, s62, 30 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 +; GFX8-NEXT: v_writelane_b32 v34, s34, 2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v28 -; GFX8-NEXT: v_writelane_b32 v34, s63, 31 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 +; GFX8-NEXT: v_writelane_b32 v34, s35, 3 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v29 -; GFX8-NEXT: v_writelane_b32 v34, s64, 32 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 +; GFX8-NEXT: v_writelane_b32 v34, s36, 4 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v30 -; GFX8-NEXT: v_writelane_b32 v34, s65, 33 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v0 +; GFX8-NEXT: v_writelane_b32 v34, s37, 5 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0 ; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX8-NEXT: v_writelane_b32 v34, s66, 34 -; GFX8-NEXT: v_writelane_b32 v34, s67, 35 +; GFX8-NEXT: v_writelane_b32 v34, s38, 6 +; GFX8-NEXT: v_writelane_b32 v34, s39, 7 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[66:67], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 ; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 @@ -42168,40 +42109,40 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v32 -; GFX8-NEXT: v_cndmask_b32_e64 v28, v33, v28, s[66:67] -; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[64:65] +; GFX8-NEXT: v_cndmask_b32_e64 v28, v33, v28, s[38:39] +; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[36:37] ; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v31 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v32, s[62:63] -; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, v31, s[60:61] +; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v32, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, v31, s[30:31] ; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v27 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX8-NEXT: v_cndmask_b32_e64 v31, v33, v31, s[58:59] -; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[56:57] +; GFX8-NEXT: v_cndmask_b32_e64 v31, v33, v31, s[90:91] +; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89] ; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v25 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v27, v33, v27, s[54:55] -; GFX8-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[52:53] +; GFX8-NEXT: v_cndmask_b32_e64 v27, v33, v27, s[78:79] +; GFX8-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77] ; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v23 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX8-NEXT: v_cndmask_b32_e64 v25, v33, v25, s[50:51] -; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[48:49] +; GFX8-NEXT: v_cndmask_b32_e64 v25, v33, v25, s[74:75] +; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73] ; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v21 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[46:47] -; GFX8-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[44:45] +; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[62:63] +; GFX8-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61] ; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v19 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[42:43] -; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[40:41] +; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[58:59] +; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57] ; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v19, v33, v19, s[38:39] -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[36:37] +; GFX8-NEXT: v_cndmask_b32_e64 v19, v33, v19, s[46:47] +; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45] ; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v17, v33, v17, s[34:35] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v17, v33, v17, s[42:43] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41] ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v13 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v15, v33, v15, s[28:29] @@ -42262,34 +42203,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_readlane_b32 s67, v34, 35 -; GFX8-NEXT: v_readlane_b32 s66, v34, 34 -; GFX8-NEXT: v_readlane_b32 s65, v34, 33 -; GFX8-NEXT: v_readlane_b32 s64, v34, 32 -; GFX8-NEXT: v_readlane_b32 s63, v34, 31 -; GFX8-NEXT: v_readlane_b32 s62, v34, 30 -; GFX8-NEXT: v_readlane_b32 s61, v34, 29 -; GFX8-NEXT: v_readlane_b32 s60, v34, 28 -; GFX8-NEXT: v_readlane_b32 s59, v34, 27 -; GFX8-NEXT: v_readlane_b32 s58, v34, 26 -; GFX8-NEXT: v_readlane_b32 s57, v34, 25 -; GFX8-NEXT: v_readlane_b32 s56, v34, 24 -; GFX8-NEXT: v_readlane_b32 s55, v34, 23 -; GFX8-NEXT: v_readlane_b32 s54, v34, 22 -; GFX8-NEXT: v_readlane_b32 s53, v34, 21 -; GFX8-NEXT: v_readlane_b32 s52, v34, 20 -; GFX8-NEXT: v_readlane_b32 s51, v34, 19 -; GFX8-NEXT: v_readlane_b32 s50, v34, 18 -; GFX8-NEXT: v_readlane_b32 s49, v34, 17 -; GFX8-NEXT: v_readlane_b32 s48, v34, 16 -; GFX8-NEXT: v_readlane_b32 s47, v34, 15 -; GFX8-NEXT: v_readlane_b32 s46, v34, 14 -; GFX8-NEXT: v_readlane_b32 s45, v34, 13 -; GFX8-NEXT: v_readlane_b32 s44, v34, 12 -; GFX8-NEXT: v_readlane_b32 s43, v34, 11 -; GFX8-NEXT: v_readlane_b32 s42, v34, 10 -; GFX8-NEXT: v_readlane_b32 s41, v34, 9 -; GFX8-NEXT: v_readlane_b32 s40, v34, 8 ; GFX8-NEXT: v_readlane_b32 s39, v34, 7 ; GFX8-NEXT: v_readlane_b32 s38, v34, 6 ; GFX8-NEXT: v_readlane_b32 s37, v34, 5 @@ -42310,108 +42223,76 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v33, s30, 0 -; GFX9-NEXT: v_writelane_b32 v33, s31, 1 -; GFX9-NEXT: v_writelane_b32 v33, s34, 2 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_writelane_b32 v33, s35, 3 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX9-NEXT: v_writelane_b32 v33, s36, 4 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT: v_writelane_b32 v33, s37, 5 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v5 -; GFX9-NEXT: v_writelane_b32 v33, s38, 6 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v4 -; GFX9-NEXT: v_writelane_b32 v33, s39, 7 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v7 -; GFX9-NEXT: v_writelane_b32 v33, s40, 8 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX9-NEXT: v_writelane_b32 v33, s41, 9 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v9 -; GFX9-NEXT: v_writelane_b32 v33, s42, 10 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v8 -; GFX9-NEXT: v_writelane_b32 v33, s43, 11 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v11 -; GFX9-NEXT: v_writelane_b32 v33, s44, 12 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v10 -; GFX9-NEXT: v_writelane_b32 v33, s45, 13 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v13 -; GFX9-NEXT: v_writelane_b32 v33, s46, 14 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX9-NEXT: v_writelane_b32 v33, s47, 15 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX9-NEXT: v_writelane_b32 v33, s48, 16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX9-NEXT: v_writelane_b32 v33, s49, 17 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v17 -; GFX9-NEXT: v_writelane_b32 v33, s50, 18 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v16 -; GFX9-NEXT: v_writelane_b32 v33, s51, 19 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v19 -; GFX9-NEXT: v_writelane_b32 v33, s52, 20 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v18 -; GFX9-NEXT: v_writelane_b32 v33, s53, 21 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v21 -; GFX9-NEXT: v_writelane_b32 v33, s54, 22 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v20 -; GFX9-NEXT: v_writelane_b32 v33, s55, 23 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v23 -; GFX9-NEXT: v_writelane_b32 v33, s56, 24 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v22 -; GFX9-NEXT: v_writelane_b32 v33, s57, 25 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v25 -; GFX9-NEXT: v_writelane_b32 v33, s58, 26 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v24 -; GFX9-NEXT: v_writelane_b32 v33, s59, 27 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v27 -; GFX9-NEXT: v_writelane_b32 v33, s60, 28 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v26 -; GFX9-NEXT: v_writelane_b32 v33, s61, 29 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v29 -; GFX9-NEXT: v_writelane_b32 v33, s62, 30 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v28 -; GFX9-NEXT: v_writelane_b32 v33, s63, 31 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX9-NEXT: v_writelane_b32 v33, s64, 32 -; GFX9-NEXT: v_writelane_b32 v33, s65, 33 -; GFX9-NEXT: v_writelane_b32 v33, s66, 34 +; GFX9-NEXT: v_writelane_b32 v33, s30, 0 +; GFX9-NEXT: v_writelane_b32 v33, s31, 1 +; GFX9-NEXT: v_writelane_b32 v33, s34, 2 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_writelane_b32 v33, s67, 35 +; GFX9-NEXT: v_writelane_b32 v33, s35, 3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v30 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[66:67], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 @@ -42445,42 +42326,42 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v29, v31, v32, s[66:67] +; GFX9-NEXT: v_cndmask_b32_e64 v29, v31, v32, s[34:35] ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[64:65] -; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v30, s[62:63] +; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[30:31] +; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v30, s[94:95] ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v30, s[60:61] -; GFX9-NEXT: v_cndmask_b32_e64 v30, v26, v27, s[58:59] +; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v30, s[92:93] +; GFX9-NEXT: v_cndmask_b32_e64 v30, v26, v27, s[90:91] ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[56:57] -; GFX9-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[54:55] +; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89] +; GFX9-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[78:79] ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[52:53] -; GFX9-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[50:51] +; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77] +; GFX9-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[74:75] ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[48:49] -; GFX9-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[46:47] +; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73] +; GFX9-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[62:63] ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[44:45] -; GFX9-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[42:43] +; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61] +; GFX9-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[58:59] ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[40:41] -; GFX9-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[38:39] +; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57] +; GFX9-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[46:47] ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[36:37] -; GFX9-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45] +; GFX9-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[42:43] ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[30:31] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41] ; GFX9-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29] ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 @@ -42526,38 +42407,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX9-NEXT: v_perm_b32 v13, v26, v30, s4 ; GFX9-NEXT: v_perm_b32 v14, v28, v32, s4 ; GFX9-NEXT: v_perm_b32 v15, v31, v29, s4 -; GFX9-NEXT: v_readlane_b32 s67, v33, 35 -; GFX9-NEXT: v_readlane_b32 s66, v33, 34 -; GFX9-NEXT: v_readlane_b32 s65, v33, 33 -; GFX9-NEXT: v_readlane_b32 s64, v33, 32 -; GFX9-NEXT: v_readlane_b32 s63, v33, 31 -; GFX9-NEXT: v_readlane_b32 s62, v33, 30 -; GFX9-NEXT: v_readlane_b32 s61, v33, 29 -; GFX9-NEXT: v_readlane_b32 s60, v33, 28 -; GFX9-NEXT: v_readlane_b32 s59, v33, 27 -; GFX9-NEXT: v_readlane_b32 s58, v33, 26 -; GFX9-NEXT: v_readlane_b32 s57, v33, 25 -; GFX9-NEXT: v_readlane_b32 s56, v33, 24 -; GFX9-NEXT: v_readlane_b32 s55, v33, 23 -; GFX9-NEXT: v_readlane_b32 s54, v33, 22 -; GFX9-NEXT: v_readlane_b32 s53, v33, 21 -; GFX9-NEXT: v_readlane_b32 s52, v33, 20 -; GFX9-NEXT: v_readlane_b32 s51, v33, 19 -; GFX9-NEXT: v_readlane_b32 s50, v33, 18 -; GFX9-NEXT: v_readlane_b32 s49, v33, 17 -; GFX9-NEXT: v_readlane_b32 s48, v33, 16 -; GFX9-NEXT: v_readlane_b32 s47, v33, 15 -; GFX9-NEXT: v_readlane_b32 s46, v33, 14 -; GFX9-NEXT: v_readlane_b32 s45, v33, 13 -; GFX9-NEXT: v_readlane_b32 s44, v33, 12 -; GFX9-NEXT: v_readlane_b32 s43, v33, 11 -; GFX9-NEXT: v_readlane_b32 s42, v33, 10 -; GFX9-NEXT: v_readlane_b32 s41, v33, 9 -; GFX9-NEXT: v_readlane_b32 s40, v33, 8 -; GFX9-NEXT: v_readlane_b32 s39, v33, 7 -; GFX9-NEXT: v_readlane_b32 s38, v33, 6 -; GFX9-NEXT: v_readlane_b32 s37, v33, 5 -; GFX9-NEXT: v_readlane_b32 s36, v33, 4 ; GFX9-NEXT: v_readlane_b32 s35, v33, 3 ; GFX9-NEXT: v_readlane_b32 s34, v33, 2 ; GFX9-NEXT: v_readlane_b32 s31, v33, 1 diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll index f9ffa5ae57f3e..36fa7b97b3c77 100644 --- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll +++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll @@ -9,24 +9,24 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: s_addc_u32 s13, s13, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; CHECK-NEXT: s_load_dwordx8 s[36:43], s[8:9], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[48:55], s[8:9], 0x0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b32 s12, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_cmp_lg_u32 s40, 0 +; CHECK-NEXT: s_cmp_lg_u32 s52, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_8 ; CHECK-NEXT: ; %bb.1: ; %if.end13.i.i -; CHECK-NEXT: s_cmp_eq_u32 s42, 0 +; CHECK-NEXT: s_cmp_eq_u32 s54, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_4 ; CHECK-NEXT: ; %bb.2: ; %if.else251.i.i -; CHECK-NEXT: s_cmp_lg_u32 s43, 0 +; CHECK-NEXT: s_cmp_lg_u32 s55, 0 ; CHECK-NEXT: s_mov_b32 s17, 0 ; CHECK-NEXT: s_cselect_b32 s12, -1, 0 ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s12 ; CHECK-NEXT: s_cbranch_vccz .LBB0_5 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: s_mov_b32 s36, 0 +; CHECK-NEXT: s_mov_b32 s48, 0 ; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12 ; CHECK-NEXT: s_cbranch_vccz .LBB0_6 ; CHECK-NEXT: s_branch .LBB0_7 @@ -34,16 +34,16 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: s_mov_b32 s14, s12 ; CHECK-NEXT: s_mov_b32 s15, s12 ; CHECK-NEXT: s_mov_b32 s13, s12 -; CHECK-NEXT: s_mov_b64 s[38:39], s[14:15] -; CHECK-NEXT: s_mov_b64 s[36:37], s[12:13] +; CHECK-NEXT: s_mov_b64 s[50:51], s[14:15] +; CHECK-NEXT: s_mov_b64 s[48:49], s[12:13] ; CHECK-NEXT: s_branch .LBB0_7 ; CHECK-NEXT: .LBB0_5: ; %if.then263.i.i -; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s41, 0 -; CHECK-NEXT: s_mov_b32 s36, 1.0 +; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s53, 0 +; CHECK-NEXT: s_mov_b32 s48, 1.0 ; CHECK-NEXT: s_mov_b32 s17, 0x7fc00000 -; CHECK-NEXT: s_mov_b32 s37, s36 -; CHECK-NEXT: s_mov_b32 s38, s36 -; CHECK-NEXT: s_mov_b32 s39, s36 +; CHECK-NEXT: s_mov_b32 s49, s48 +; CHECK-NEXT: s_mov_b32 s50, s48 +; CHECK-NEXT: s_mov_b32 s51, s48 ; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_7 ; CHECK-NEXT: .LBB0_6: ; %if.end273.i.i @@ -55,7 +55,7 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; CHECK-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v3, 10, v1 -; CHECK-NEXT: v_add_f32_e64 v1, s17, s36 +; CHECK-NEXT: v_add_f32_e64 v1, s17, s48 ; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], s[12:13] ; CHECK-NEXT: s_mov_b32 s12, s14 @@ -65,13 +65,13 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s14, s16 -; CHECK-NEXT: s_mov_b32 s36, 0 +; CHECK-NEXT: s_mov_b32 s48, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_mov_b64 s[8:9], s[34:35] -; CHECK-NEXT: s_mov_b32 s37, s36 -; CHECK-NEXT: s_mov_b32 s38, s36 -; CHECK-NEXT: s_mov_b32 s39, s36 +; CHECK-NEXT: s_mov_b32 s49, s48 +; CHECK-NEXT: s_mov_b32 s50, s48 +; CHECK-NEXT: s_mov_b32 s51, s48 ; CHECK-NEXT: .LBB0_7: ; %if.end294.i.i ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 @@ -80,11 +80,11 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CHECK-NEXT: .LBB0_8: ; %kernel_direct_lighting.exit ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x20 -; CHECK-NEXT: v_mov_b32_e32 v0, s36 +; CHECK-NEXT: v_mov_b32_e32 v0, s48 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, s37 -; CHECK-NEXT: v_mov_b32_e32 v2, s38 -; CHECK-NEXT: v_mov_b32_e32 v3, s39 +; CHECK-NEXT: v_mov_b32_e32 v1, s49 +; CHECK-NEXT: v_mov_b32_e32 v2, s50 +; CHECK-NEXT: v_mov_b32_e32 v3, s51 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 98136347ab702..e43a021802644 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -17,7 +17,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr33, 0, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 @@ -33,7 +33,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.bb103: ; GFX90A-NEXT: successors: %bb.58(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -41,7 +41,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr44, $sgpr45, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF @@ -54,7 +54,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow17: ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc @@ -62,7 +62,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr4 = COPY renamable $sgpr25, implicit $exec @@ -79,15 +79,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF @@ -111,7 +111,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.6.Flow20: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr19 = COPY renamable $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $vgpr18 = COPY $sgpr17, implicit $exec @@ -124,15 +124,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.7.Flow19: ; GFX90A-NEXT: successors: %bb.62(0x40000000), %bb.8(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.62, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.8.Flow32: ; GFX90A-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr18_sgpr19, implicit-def $exec, implicit-def $scc, implicit $exec @@ -141,58 +141,58 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.9.bb89: ; GFX90A-NEXT: successors: %bb.10(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.10.Flow33: ; GFX90A-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc - ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr58_sgpr59, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.12, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.11.bb84: ; GFX90A-NEXT: successors: %bb.12(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.12.Flow34: ; GFX90A-NEXT: successors: %bb.13(0x40000000), %bb.14(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc - ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.14, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.13.bb79: ; GFX90A-NEXT: successors: %bb.14(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.14.Flow35: ; GFX90A-NEXT: successors: %bb.15(0x40000000), %bb.16(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc - ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.16, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.15.bb72: ; GFX90A-NEXT: successors: %bb.16(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr8 = S_ADD_U32 renamable $sgpr8, 48, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr9 = S_ADDC_U32 killed renamable $sgpr9, 0, implicit-def dead $scc, implicit killed $scc @@ -202,113 +202,113 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: $sgpr13 = COPY killed renamable $sgpr15 ; GFX90A-NEXT: $sgpr14 = COPY killed renamable $sgpr16 ; GFX90A-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr18_sgpr19, @f2, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.16.Flow36: ; GFX90A-NEXT: successors: %bb.17(0x40000000), %bb.18(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr66_sgpr67, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.18, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.17.bb67: ; GFX90A-NEXT: successors: %bb.18(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.18.Flow37: ; GFX90A-NEXT: successors: %bb.19(0x40000000), %bb.20(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr64_sgpr65, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.20, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.19.bb62: ; GFX90A-NEXT: successors: %bb.20(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.20.Flow38: ; GFX90A-NEXT: successors: %bb.21(0x40000000), %bb.22(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.22, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.21.bb54: ; GFX90A-NEXT: successors: %bb.22(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.22.Flow39: ; GFX90A-NEXT: successors: %bb.23(0x40000000), %bb.24(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.24, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.23.bb47: ; GFX90A-NEXT: successors: %bb.24(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.24.Flow40: ; GFX90A-NEXT: successors: %bb.25(0x40000000), %bb.26(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.26, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.25.bb40: ; GFX90A-NEXT: successors: %bb.26(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.26.Flow41: ; GFX90A-NEXT: successors: %bb.27(0x40000000), %bb.28(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.28, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.27.bb33: ; GFX90A-NEXT: successors: %bb.28(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.28.Flow42: ; GFX90A-NEXT: successors: %bb.34(0x40000000), %bb.29(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr38_sgpr39, implicit-def $exec, implicit-def $scc, implicit $exec @@ -317,7 +317,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.29.Flow43: ; GFX90A-NEXT: successors: %bb.30(0x40000000), %bb.31(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc @@ -325,17 +325,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.30.bb19: ; GFX90A-NEXT: successors: %bb.31(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.31.Flow44: ; GFX90A-NEXT: successors: %bb.32(0x40000000), %bb.33(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr56_sgpr57, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr68_sgpr69, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr68_sgpr69, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.33, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.32.UnifiedUnreachableBlock: @@ -351,32 +351,32 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.34.bb26: ; GFX90A-NEXT: successors: %bb.29(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.29 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.35.bb20: ; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1) ; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr43, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_LT_I16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -400,19 +400,22 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.36.Flow21: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.6 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.37.bb27: ; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr46_sgpr47, $sgpr42_sgpr43, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1) ; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF @@ -437,33 +440,34 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.38.Flow22: ; GFX90A-NEXT: successors: %bb.36(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_ANDN2_B64 killed renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr56_sgpr57, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr44_sgpr45, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.36 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.39.bb34: ; GFX90A-NEXT: successors: %bb.41(0x40000000), %bb.40(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr52_sgpr53, $sgpr64_sgpr65, $sgpr66_sgpr67 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1) ; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF @@ -487,38 +491,37 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.40.Flow23: ; GFX90A-NEXT: successors: %bb.38(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr56_sgpr57, killed renamable $sgpr60_sgpr61, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr48_sgpr49, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.38 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.41.bb41: ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc ; GFX90A-NEXT: renamable $vgpr59, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr58_vgpr59, 0, 0, implicit $exec :: (load (s8) from %ir.i42, addrspace 1) ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr18, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -539,41 +542,41 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.42.Flow24: ; GFX90A-NEXT: successors: %bb.40(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr18, implicit $exec ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr56_sgpr57, killed renamable $sgpr60_sgpr61, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr48_sgpr49, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.40 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.43.bb55: ; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.44(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr50_sgpr51, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.48, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.44: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr56, $vgpr47, $vgpr18, $vgpr30, $vgpr31, $vgpr58, $vgpr61, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr57, $vgpr63, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr46, $vgpr45, $vgpr2, $vgpr3, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr60, $vgpr62 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr56, $vgpr47, $vgpr18, $vgpr30, $vgpr31, $vgpr58, $vgpr61, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $vgpr57, $vgpr63, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr46, $vgpr45, $vgpr2, $vgpr3, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr60, $vgpr62 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -587,36 +590,36 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.45.Flow26: ; GFX90A-NEXT: successors: %bb.47(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.46.bb48: ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr1, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec :: (load (s8) from %ir.i49, addrspace 1) + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec @@ -640,51 +643,51 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.47.Flow25: ; GFX90A-NEXT: successors: %bb.42(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr56_sgpr57, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr48_sgpr49, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.42 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.48.bb63: ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.49: ; GFX90A-NEXT: successors: %bb.44(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 ; GFX90A-NEXT: S_BRANCH %bb.44 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.50.bb68: ; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45:0x000000000000000F, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec ; GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.54, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.51: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -701,20 +704,20 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.52.bb80: ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45:0x000000000000000F, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr50_sgpr51 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr52_sgpr53 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.53: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF @@ -730,13 +733,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.54.bb73: ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45:0x000000000000000F, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr6 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1) ; GFX90A-NEXT: renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr5, dead renamable $sgpr58_sgpr59 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr6, implicit $exec ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 @@ -756,14 +759,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.55.Flow29: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.56.bb90: ; GFX90A-NEXT: successors: %bb.60(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45:0x000000000000000F, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec @@ -772,12 +775,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr56, implicit $exec - ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr57, killed $vgpr10, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr44, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr45, killed $vgpr10, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.60 @@ -788,15 +791,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr15 = COPY killed renamable $sgpr23, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF @@ -821,7 +824,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.58.bb105: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr44_sgpr45:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) @@ -840,13 +843,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.59.bb85: ; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45:0x000000000000000F, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec ; GFX90A-NEXT: renamable $vgpr9 = COPY renamable $vgpr7, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = FLAT_LOAD_UBYTE renamable $vgpr8_vgpr9, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) ; GFX90A-NEXT: renamable $sgpr17 = S_MOV_B32 0 - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr10, implicit $exec ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF @@ -857,31 +860,31 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: $sgpr52_sgpr53 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr54_sgpr55 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.56, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.60.Flow31: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr52_sgpr53, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.61.Flow30: ; GFX90A-NEXT: successors: %bb.55(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr50_sgpr51, killed renamable $sgpr56_sgpr57, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr52_sgpr53, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.62.bb140: ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.63(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -889,14 +892,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.63.Flow13: ; GFX90A-NEXT: successors: %bb.64(0x40000000), %bb.66(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.66, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.64.bb159: ; GFX90A-NEXT: successors: %bb.67(0x40000000), %bb.65(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec @@ -905,21 +908,21 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.65.Flow10: ; GFX90A-NEXT: successors: %bb.66(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.66.Flow14: ; GFX90A-NEXT: successors: %bb.8(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = COPY $exec ; GFX90A-NEXT: S_BRANCH %bb.8 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.67.bb161: ; GFX90A-NEXT: successors: %bb.65(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec @@ -938,7 +941,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.68.bb174: ; GFX90A-NEXT: successors: %bb.72(0x40000000), %bb.69(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec @@ -954,14 +957,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.69.Flow: ; GFX90A-NEXT: successors: %bb.70(0x40000000), %bb.71(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.71, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.70.bb186: ; GFX90A-NEXT: successors: %bb.71(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr27, implicit $exec @@ -990,14 +993,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.71.Flow9: ; GFX90A-NEXT: successors: %bb.63(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.63 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.72.bb196: ; GFX90A-NEXT: successors: %bb.69(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec ; GFX90A-NEXT: renamable $vgpr54 = V_OR_B32_e32 killed $vgpr10, killed $vgpr14, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll index 05c2e0077f4ae..83ab6c32aee96 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -900,9 +900,8 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: s_waitcnt expcnt(1) +; CHECK-NEXT: s_waitcnt expcnt(0) ; CHECK-NEXT: v_writelane_b32 v0, s30, 0 ; CHECK-NEXT: v_writelane_b32 v0, s31, 1 ; CHECK-NEXT: v_writelane_b32 v0, s33, 2 @@ -912,71 +911,38 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v0, s37, 6 ; CHECK-NEXT: v_writelane_b32 v0, s38, 7 ; CHECK-NEXT: v_writelane_b32 v0, s39, 8 -; CHECK-NEXT: v_writelane_b32 v0, s40, 9 -; CHECK-NEXT: v_writelane_b32 v0, s41, 10 -; CHECK-NEXT: v_writelane_b32 v0, s42, 11 -; CHECK-NEXT: v_writelane_b32 v0, s43, 12 -; CHECK-NEXT: v_writelane_b32 v0, s44, 13 -; CHECK-NEXT: v_writelane_b32 v0, s45, 14 -; CHECK-NEXT: v_writelane_b32 v0, s46, 15 -; CHECK-NEXT: v_writelane_b32 v0, s47, 16 -; CHECK-NEXT: v_writelane_b32 v0, s48, 17 -; CHECK-NEXT: v_writelane_b32 v0, s49, 18 -; CHECK-NEXT: v_writelane_b32 v0, s50, 19 -; CHECK-NEXT: v_writelane_b32 v0, s51, 20 -; CHECK-NEXT: v_writelane_b32 v0, s52, 21 -; CHECK-NEXT: v_writelane_b32 v0, s53, 22 -; CHECK-NEXT: v_writelane_b32 v0, s54, 23 -; CHECK-NEXT: v_writelane_b32 v0, s55, 24 -; CHECK-NEXT: v_writelane_b32 v0, s56, 25 -; CHECK-NEXT: v_writelane_b32 v0, s57, 26 -; CHECK-NEXT: v_writelane_b32 v0, s58, 27 -; CHECK-NEXT: v_writelane_b32 v0, s59, 28 -; CHECK-NEXT: v_writelane_b32 v0, s60, 29 -; CHECK-NEXT: v_writelane_b32 v0, s61, 30 -; CHECK-NEXT: v_writelane_b32 v0, s62, 31 -; CHECK-NEXT: v_writelane_b32 v0, s63, 32 -; CHECK-NEXT: v_writelane_b32 v0, s64, 33 -; CHECK-NEXT: v_writelane_b32 v0, s65, 34 -; CHECK-NEXT: v_writelane_b32 v0, s66, 35 -; CHECK-NEXT: v_writelane_b32 v0, s67, 36 -; CHECK-NEXT: v_writelane_b32 v0, s68, 37 -; CHECK-NEXT: v_writelane_b32 v0, s69, 38 -; CHECK-NEXT: v_writelane_b32 v0, s70, 39 -; CHECK-NEXT: v_writelane_b32 v0, s71, 40 -; CHECK-NEXT: v_writelane_b32 v0, s72, 41 -; CHECK-NEXT: v_writelane_b32 v0, s73, 42 -; CHECK-NEXT: v_writelane_b32 v0, s74, 43 -; CHECK-NEXT: v_writelane_b32 v0, s75, 44 -; CHECK-NEXT: v_writelane_b32 v0, s76, 45 -; CHECK-NEXT: v_writelane_b32 v0, s77, 46 -; CHECK-NEXT: v_writelane_b32 v0, s78, 47 -; CHECK-NEXT: v_writelane_b32 v0, s79, 48 -; CHECK-NEXT: v_writelane_b32 v0, s80, 49 -; CHECK-NEXT: v_writelane_b32 v0, s81, 50 -; CHECK-NEXT: v_writelane_b32 v0, s82, 51 -; CHECK-NEXT: v_writelane_b32 v0, s83, 52 -; CHECK-NEXT: v_writelane_b32 v0, s84, 53 -; CHECK-NEXT: v_writelane_b32 v0, s85, 54 -; CHECK-NEXT: v_writelane_b32 v0, s86, 55 -; CHECK-NEXT: v_writelane_b32 v0, s87, 56 -; CHECK-NEXT: v_writelane_b32 v0, s88, 57 -; CHECK-NEXT: s_waitcnt expcnt(0) -; CHECK-NEXT: v_writelane_b32 v1, s95, 0 -; CHECK-NEXT: v_writelane_b32 v0, s89, 58 -; CHECK-NEXT: v_writelane_b32 v1, s96, 1 -; CHECK-NEXT: v_writelane_b32 v0, s90, 59 -; CHECK-NEXT: v_writelane_b32 v1, s97, 2 -; CHECK-NEXT: v_writelane_b32 v0, s91, 60 -; CHECK-NEXT: v_writelane_b32 v1, s98, 3 -; CHECK-NEXT: v_writelane_b32 v0, s92, 61 -; CHECK-NEXT: v_writelane_b32 v1, s99, 4 -; CHECK-NEXT: s_mov_b32 s31, s12 -; CHECK-NEXT: v_writelane_b32 v0, s93, 62 -; CHECK-NEXT: v_writelane_b32 v1, s100, 5 -; CHECK-NEXT: s_cmp_eq_u32 s31, 0 -; CHECK-NEXT: v_writelane_b32 v0, s94, 63 -; CHECK-NEXT: v_writelane_b32 v1, s101, 6 +; CHECK-NEXT: v_writelane_b32 v0, s48, 9 +; CHECK-NEXT: v_writelane_b32 v0, s49, 10 +; CHECK-NEXT: v_writelane_b32 v0, s50, 11 +; CHECK-NEXT: v_writelane_b32 v0, s51, 12 +; CHECK-NEXT: v_writelane_b32 v0, s52, 13 +; CHECK-NEXT: v_writelane_b32 v0, s53, 14 +; CHECK-NEXT: v_writelane_b32 v0, s54, 15 +; CHECK-NEXT: v_writelane_b32 v0, s55, 16 +; CHECK-NEXT: v_writelane_b32 v0, s64, 17 +; CHECK-NEXT: v_writelane_b32 v0, s65, 18 +; CHECK-NEXT: v_writelane_b32 v0, s66, 19 +; CHECK-NEXT: v_writelane_b32 v0, s67, 20 +; CHECK-NEXT: v_writelane_b32 v0, s68, 21 +; CHECK-NEXT: v_writelane_b32 v0, s69, 22 +; CHECK-NEXT: v_writelane_b32 v0, s70, 23 +; CHECK-NEXT: v_writelane_b32 v0, s71, 24 +; CHECK-NEXT: v_writelane_b32 v0, s80, 25 +; CHECK-NEXT: v_writelane_b32 v0, s81, 26 +; CHECK-NEXT: v_writelane_b32 v0, s82, 27 +; CHECK-NEXT: v_writelane_b32 v0, s83, 28 +; CHECK-NEXT: v_writelane_b32 v0, s84, 29 +; CHECK-NEXT: v_writelane_b32 v0, s85, 30 +; CHECK-NEXT: v_writelane_b32 v0, s86, 31 +; CHECK-NEXT: v_writelane_b32 v0, s87, 32 +; CHECK-NEXT: v_writelane_b32 v0, s96, 33 +; CHECK-NEXT: v_writelane_b32 v0, s97, 34 +; CHECK-NEXT: v_writelane_b32 v0, s98, 35 +; CHECK-NEXT: v_writelane_b32 v0, s99, 36 +; CHECK-NEXT: s_mov_b32 s40, s12 +; CHECK-NEXT: v_writelane_b32 v0, s100, 37 +; CHECK-NEXT: s_cmp_eq_u32 s40, 0 +; CHECK-NEXT: v_writelane_b32 v0, s101, 38 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: ;;#ASMEND @@ -1292,9 +1258,9 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: s_cbranch_scc0 .LBB1_1 ; CHECK-NEXT: ; %bb.3: ; %entry ; CHECK-NEXT: s_not_b64 exec, exec -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; CHECK-NEXT: v_writelane_b32 v2, s0, 0 -; CHECK-NEXT: v_writelane_b32 v2, s1, 1 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; CHECK-NEXT: v_writelane_b32 v1, s0, 0 +; CHECK-NEXT: v_writelane_b32 v1, s1, 1 ; CHECK-NEXT: s_getpc_b64 s[0:1] ; CHECK-NEXT: .Lpost_getpc1: ; CHECK-NEXT: s_add_u32 s0, s0, (.LBB1_4-.Lpost_getpc1)&4294967295 @@ -1313,9 +1279,9 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_branch .LBB1_2 ; CHECK-NEXT: .LBB1_4: ; %bb3 -; CHECK-NEXT: v_readlane_b32 s0, v2, 0 -; CHECK-NEXT: v_readlane_b32 s1, v2, 1 -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; CHECK-NEXT: v_readlane_b32 s0, v1, 0 +; CHECK-NEXT: v_readlane_b32 s1, v1, 1 +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; CHECK-NEXT: s_not_b64 exec, exec ; CHECK-NEXT: .LBB1_2: ; %bb3 ; CHECK-NEXT: ;;#ASMSTART @@ -1630,68 +1596,36 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; reg use vcc_hi ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s101, v1, 6 -; CHECK-NEXT: v_readlane_b32 s100, v1, 5 -; CHECK-NEXT: v_readlane_b32 s99, v1, 4 -; CHECK-NEXT: v_readlane_b32 s98, v1, 3 -; CHECK-NEXT: v_readlane_b32 s97, v1, 2 -; CHECK-NEXT: v_readlane_b32 s96, v1, 1 -; CHECK-NEXT: v_readlane_b32 s95, v1, 0 -; CHECK-NEXT: v_readlane_b32 s94, v0, 63 -; CHECK-NEXT: v_readlane_b32 s93, v0, 62 -; CHECK-NEXT: v_readlane_b32 s92, v0, 61 -; CHECK-NEXT: v_readlane_b32 s91, v0, 60 -; CHECK-NEXT: v_readlane_b32 s90, v0, 59 -; CHECK-NEXT: v_readlane_b32 s89, v0, 58 -; CHECK-NEXT: v_readlane_b32 s88, v0, 57 -; CHECK-NEXT: v_readlane_b32 s87, v0, 56 -; CHECK-NEXT: v_readlane_b32 s86, v0, 55 -; CHECK-NEXT: v_readlane_b32 s85, v0, 54 -; CHECK-NEXT: v_readlane_b32 s84, v0, 53 -; CHECK-NEXT: v_readlane_b32 s83, v0, 52 -; CHECK-NEXT: v_readlane_b32 s82, v0, 51 -; CHECK-NEXT: v_readlane_b32 s81, v0, 50 -; CHECK-NEXT: v_readlane_b32 s80, v0, 49 -; CHECK-NEXT: v_readlane_b32 s79, v0, 48 -; CHECK-NEXT: v_readlane_b32 s78, v0, 47 -; CHECK-NEXT: v_readlane_b32 s77, v0, 46 -; CHECK-NEXT: v_readlane_b32 s76, v0, 45 -; CHECK-NEXT: v_readlane_b32 s75, v0, 44 -; CHECK-NEXT: v_readlane_b32 s74, v0, 43 -; CHECK-NEXT: v_readlane_b32 s73, v0, 42 -; CHECK-NEXT: v_readlane_b32 s72, v0, 41 -; CHECK-NEXT: v_readlane_b32 s71, v0, 40 -; CHECK-NEXT: v_readlane_b32 s70, v0, 39 -; CHECK-NEXT: v_readlane_b32 s69, v0, 38 -; CHECK-NEXT: v_readlane_b32 s68, v0, 37 -; CHECK-NEXT: v_readlane_b32 s67, v0, 36 -; CHECK-NEXT: v_readlane_b32 s66, v0, 35 -; CHECK-NEXT: v_readlane_b32 s65, v0, 34 -; CHECK-NEXT: v_readlane_b32 s64, v0, 33 -; CHECK-NEXT: v_readlane_b32 s63, v0, 32 -; CHECK-NEXT: v_readlane_b32 s62, v0, 31 -; CHECK-NEXT: v_readlane_b32 s61, v0, 30 -; CHECK-NEXT: v_readlane_b32 s60, v0, 29 -; CHECK-NEXT: v_readlane_b32 s59, v0, 28 -; CHECK-NEXT: v_readlane_b32 s58, v0, 27 -; CHECK-NEXT: v_readlane_b32 s57, v0, 26 -; CHECK-NEXT: v_readlane_b32 s56, v0, 25 -; CHECK-NEXT: v_readlane_b32 s55, v0, 24 -; CHECK-NEXT: v_readlane_b32 s54, v0, 23 -; CHECK-NEXT: v_readlane_b32 s53, v0, 22 -; CHECK-NEXT: v_readlane_b32 s52, v0, 21 -; CHECK-NEXT: v_readlane_b32 s51, v0, 20 -; CHECK-NEXT: v_readlane_b32 s50, v0, 19 -; CHECK-NEXT: v_readlane_b32 s49, v0, 18 -; CHECK-NEXT: v_readlane_b32 s48, v0, 17 -; CHECK-NEXT: v_readlane_b32 s47, v0, 16 -; CHECK-NEXT: v_readlane_b32 s46, v0, 15 -; CHECK-NEXT: v_readlane_b32 s45, v0, 14 -; CHECK-NEXT: v_readlane_b32 s44, v0, 13 -; CHECK-NEXT: v_readlane_b32 s43, v0, 12 -; CHECK-NEXT: v_readlane_b32 s42, v0, 11 -; CHECK-NEXT: v_readlane_b32 s41, v0, 10 -; CHECK-NEXT: v_readlane_b32 s40, v0, 9 +; CHECK-NEXT: v_readlane_b32 s101, v0, 38 +; CHECK-NEXT: v_readlane_b32 s100, v0, 37 +; CHECK-NEXT: v_readlane_b32 s99, v0, 36 +; CHECK-NEXT: v_readlane_b32 s98, v0, 35 +; CHECK-NEXT: v_readlane_b32 s97, v0, 34 +; CHECK-NEXT: v_readlane_b32 s96, v0, 33 +; CHECK-NEXT: v_readlane_b32 s87, v0, 32 +; CHECK-NEXT: v_readlane_b32 s86, v0, 31 +; CHECK-NEXT: v_readlane_b32 s85, v0, 30 +; CHECK-NEXT: v_readlane_b32 s84, v0, 29 +; CHECK-NEXT: v_readlane_b32 s83, v0, 28 +; CHECK-NEXT: v_readlane_b32 s82, v0, 27 +; CHECK-NEXT: v_readlane_b32 s81, v0, 26 +; CHECK-NEXT: v_readlane_b32 s80, v0, 25 +; CHECK-NEXT: v_readlane_b32 s71, v0, 24 +; CHECK-NEXT: v_readlane_b32 s70, v0, 23 +; CHECK-NEXT: v_readlane_b32 s69, v0, 22 +; CHECK-NEXT: v_readlane_b32 s68, v0, 21 +; CHECK-NEXT: v_readlane_b32 s67, v0, 20 +; CHECK-NEXT: v_readlane_b32 s66, v0, 19 +; CHECK-NEXT: v_readlane_b32 s65, v0, 18 +; CHECK-NEXT: v_readlane_b32 s64, v0, 17 +; CHECK-NEXT: v_readlane_b32 s55, v0, 16 +; CHECK-NEXT: v_readlane_b32 s54, v0, 15 +; CHECK-NEXT: v_readlane_b32 s53, v0, 14 +; CHECK-NEXT: v_readlane_b32 s52, v0, 13 +; CHECK-NEXT: v_readlane_b32 s51, v0, 12 +; CHECK-NEXT: v_readlane_b32 s50, v0, 11 +; CHECK-NEXT: v_readlane_b32 s49, v0, 10 +; CHECK-NEXT: v_readlane_b32 s48, v0, 9 ; CHECK-NEXT: v_readlane_b32 s39, v0, 8 ; CHECK-NEXT: v_readlane_b32 s38, v0, 7 ; CHECK-NEXT: v_readlane_b32 s37, v0, 6 @@ -1703,7 +1637,6 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s30, v0, 0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll index d4c50cf2c7e4a..34f4476f7fd6a 100644 --- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll +++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll @@ -1,6 +1,6 @@ -; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s -; CHECK: LLVM ERROR: failed to find free scratch register +; CHECK: illegal VGPR to SGPR copy declare hidden void @external_void_func_a15i32_inreg([15 x i32] inreg) #0 declare hidden void @external_void_func_a16i32_inreg([16 x i32] inreg) #0 diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll index 0b8ad359ccb94..c10cb0ae6d336 100644 --- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll @@ -1385,15 +1385,15 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) # ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s29, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 vcc, -1 +; GFX9-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, vcc +; GFX9-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-NEXT: v_writelane_b32 v40, s29, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 vcc -; GFX9-NEXT: s_add_u32 vcc_lo, vcc_lo, external_void_func_a15i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 vcc_hi, vcc_hi, external_void_func_a15i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_getpc_b64 s[40:41] +; GFX9-NEXT: s_add_u32 s40, s40, external_void_func_a15i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s41, s41, external_void_func_a15i32_inreg@rel32@hi+12 ; GFX9-NEXT: s_mov_b32 s3, s19 ; GFX9-NEXT: s_mov_b32 s2, s18 ; GFX9-NEXT: s_mov_b32 s1, s17 @@ -1408,7 +1408,7 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) # ; GFX9-NEXT: s_mov_b32 s23, s27 ; GFX9-NEXT: s_mov_b32 s24, s28 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], vcc +; GFX9-NEXT: s_swappc_b64 s[30:31], s[40:41] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 9bbecacd6c774..3451e389fef8b 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -4542,19 +4542,19 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %out) #0 { ; VI-LABEL: test_call_external_i32_func_i32_imm: ; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_mov_b32 s42, -1 -; VI-NEXT: s_mov_b32 s43, 0xe80000 -; VI-NEXT: s_add_u32 s40, s40, s5 +; VI-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s50, -1 +; VI-NEXT: s_mov_b32 s51, 0xe80000 +; VI-NEXT: s_add_u32 s48, s48, s5 ; VI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 -; VI-NEXT: s_addc_u32 s41, s41, 0 +; VI-NEXT: s_addc_u32 s49, s49, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[40:41] +; VI-NEXT: s_mov_b64 s[0:1], s[48:49] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; VI-NEXT: s_mov_b64 s[2:3], s[42:43] +; VI-NEXT: s_mov_b64 s[2:3], s[50:51] ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_mov_b32 s39, 0xf000 @@ -4566,19 +4566,19 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; ; CI-LABEL: test_call_external_i32_func_i32_imm: ; CI: ; %bb.0: -; CI-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; CI-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; CI-NEXT: s_mov_b32 s42, -1 -; CI-NEXT: s_mov_b32 s43, 0xe8f000 -; CI-NEXT: s_add_u32 s40, s40, s5 +; CI-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s50, -1 +; CI-NEXT: s_mov_b32 s51, 0xe8f000 +; CI-NEXT: s_add_u32 s48, s48, s5 ; CI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; CI-NEXT: s_addc_u32 s41, s41, 0 +; CI-NEXT: s_addc_u32 s49, s49, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[40:41] +; CI-NEXT: s_mov_b64 s[0:1], s[48:49] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; CI-NEXT: s_mov_b64 s[2:3], s[42:43] +; CI-NEXT: s_mov_b64 s[2:3], s[50:51] ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_mov_b32 s39, 0xf000 @@ -4590,19 +4590,19 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; ; GFX9-LABEL: test_call_external_i32_func_i32_imm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s42, -1 -; GFX9-NEXT: s_mov_b32 s43, 0xe00000 -; GFX9-NEXT: s_add_u32 s40, s40, s5 +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s5 ; GFX9-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 -; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_mov_b32 s39, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll index db9ce56ecc3cc..67a70cdeb1ecc 100644 --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -528,15 +528,16 @@ define void @callee_saved_sgpr_func() #2 { ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; MUBUF-NEXT: v_writelane_b32 v40, s40, 2 +; MUBUF-NEXT: v_writelane_b32 v40, s34, 2 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; def s40 ; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: s_mov_b32 s34, s40 ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: ;;#ASMSTART -; MUBUF-NEXT: ; use s40 +; MUBUF-NEXT: ; use s34 ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: v_readlane_b32 s40, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s34, v40, 2 ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 ; MUBUF-NEXT: s_mov_b32 s32, s33 @@ -563,15 +564,16 @@ define void @callee_saved_sgpr_func() #2 { ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; FLATSCR-NEXT: v_writelane_b32 v40, s40, 2 +; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; def s40 ; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_mov_b32 s34, s40 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; use s40 +; FLATSCR-NEXT: ; use s34 ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: v_readlane_b32 s40, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2 ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 @@ -600,9 +602,10 @@ define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 { ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; def s40 ; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_mov_b32 s33, s40 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; use s40 +; FLATSCR-NEXT: ; use s33 ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: s_endpgm %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 @@ -629,22 +632,23 @@ define void @callee_saved_sgpr_vgpr_func() #2 { ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 ; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; MUBUF-NEXT: v_writelane_b32 v41, s40, 2 +; MUBUF-NEXT: v_writelane_b32 v41, s34, 2 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; def s40 ; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: s_mov_b32 s34, s40 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; def v40 ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: ;;#ASMSTART -; MUBUF-NEXT: ; use s40 +; MUBUF-NEXT: ; use s34 ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; use v40 ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; MUBUF-NEXT: v_readlane_b32 s40, v41, 2 +; MUBUF-NEXT: v_readlane_b32 s34, v41, 2 ; MUBUF-NEXT: v_readlane_b32 s31, v41, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v41, 0 ; MUBUF-NEXT: s_mov_b32 s32, s33 @@ -672,22 +676,23 @@ define void @callee_saved_sgpr_vgpr_func() #2 { ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; FLATSCR-NEXT: v_writelane_b32 v41, s40, 2 +; FLATSCR-NEXT: v_writelane_b32 v41, s34, 2 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; def s40 ; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_mov_b32 s34, s40 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; def v40 ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; use s40 +; FLATSCR-NEXT: ; use s34 ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; use v40 ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload -; FLATSCR-NEXT: v_readlane_b32 s40, v41, 2 +; FLATSCR-NEXT: v_readlane_b32 s34, v41, 2 ; FLATSCR-NEXT: v_readlane_b32 s31, v41, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v41, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 @@ -718,13 +723,14 @@ define amdgpu_kernel void @callee_saved_sgpr_vgpr_kernel() #2 { ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; def s40 ; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_mov_b32 s33, s40 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; def v32 ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: v_mov_b32_e32 v40, v32 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; use s40 +; FLATSCR-NEXT: ; use s33 ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; use v40 diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll index 4c6f2d22080e0..9561aa555c80e 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -257,50 +257,26 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; MUBUF-NEXT: v_writelane_b32 v40, s37, 1 ; MUBUF-NEXT: v_writelane_b32 v40, s38, 2 ; MUBUF-NEXT: v_writelane_b32 v40, s39, 3 -; MUBUF-NEXT: v_writelane_b32 v40, s40, 4 -; MUBUF-NEXT: v_writelane_b32 v40, s41, 5 -; MUBUF-NEXT: v_writelane_b32 v40, s42, 6 -; MUBUF-NEXT: v_writelane_b32 v40, s43, 7 -; MUBUF-NEXT: v_writelane_b32 v40, s44, 8 -; MUBUF-NEXT: v_writelane_b32 v40, s45, 9 -; MUBUF-NEXT: v_writelane_b32 v40, s46, 10 -; MUBUF-NEXT: v_writelane_b32 v40, s47, 11 -; MUBUF-NEXT: v_writelane_b32 v40, s48, 12 -; MUBUF-NEXT: v_writelane_b32 v40, s49, 13 -; MUBUF-NEXT: v_writelane_b32 v40, s50, 14 -; MUBUF-NEXT: v_writelane_b32 v40, s51, 15 -; MUBUF-NEXT: v_writelane_b32 v40, s52, 16 -; MUBUF-NEXT: v_writelane_b32 v40, s53, 17 -; MUBUF-NEXT: v_writelane_b32 v40, s54, 18 -; MUBUF-NEXT: v_writelane_b32 v40, s55, 19 -; MUBUF-NEXT: v_writelane_b32 v40, s56, 20 -; MUBUF-NEXT: v_writelane_b32 v40, s57, 21 -; MUBUF-NEXT: v_writelane_b32 v40, s58, 22 -; MUBUF-NEXT: v_writelane_b32 v40, s59, 23 -; MUBUF-NEXT: v_writelane_b32 v40, s60, 24 -; MUBUF-NEXT: v_writelane_b32 v40, s61, 25 -; MUBUF-NEXT: v_writelane_b32 v40, s62, 26 -; MUBUF-NEXT: v_writelane_b32 v40, s63, 27 -; MUBUF-NEXT: v_writelane_b32 v40, s64, 28 -; MUBUF-NEXT: v_writelane_b32 v40, s65, 29 -; MUBUF-NEXT: v_writelane_b32 v40, s66, 30 -; MUBUF-NEXT: v_writelane_b32 v40, s67, 31 -; MUBUF-NEXT: v_writelane_b32 v40, s68, 32 -; MUBUF-NEXT: v_writelane_b32 v40, s69, 33 -; MUBUF-NEXT: v_writelane_b32 v40, s70, 34 -; MUBUF-NEXT: v_writelane_b32 v40, s71, 35 -; MUBUF-NEXT: v_writelane_b32 v40, s72, 36 -; MUBUF-NEXT: v_writelane_b32 v40, s73, 37 -; MUBUF-NEXT: v_writelane_b32 v40, s74, 38 -; MUBUF-NEXT: v_writelane_b32 v40, s75, 39 -; MUBUF-NEXT: v_writelane_b32 v40, s76, 40 -; MUBUF-NEXT: v_writelane_b32 v40, s77, 41 -; MUBUF-NEXT: v_writelane_b32 v40, s78, 42 -; MUBUF-NEXT: v_writelane_b32 v40, s79, 43 -; MUBUF-NEXT: v_writelane_b32 v40, s80, 44 -; MUBUF-NEXT: v_writelane_b32 v40, s81, 45 -; MUBUF-NEXT: v_writelane_b32 v40, s82, 46 -; MUBUF-NEXT: v_writelane_b32 v40, s83, 47 +; MUBUF-NEXT: v_writelane_b32 v40, s48, 4 +; MUBUF-NEXT: v_writelane_b32 v40, s49, 5 +; MUBUF-NEXT: v_writelane_b32 v40, s50, 6 +; MUBUF-NEXT: v_writelane_b32 v40, s51, 7 +; MUBUF-NEXT: v_writelane_b32 v40, s52, 8 +; MUBUF-NEXT: v_writelane_b32 v40, s53, 9 +; MUBUF-NEXT: v_writelane_b32 v40, s54, 10 +; MUBUF-NEXT: v_writelane_b32 v40, s55, 11 +; MUBUF-NEXT: v_writelane_b32 v40, s64, 12 +; MUBUF-NEXT: v_writelane_b32 v40, s65, 13 +; MUBUF-NEXT: v_writelane_b32 v40, s66, 14 +; MUBUF-NEXT: v_writelane_b32 v40, s67, 15 +; MUBUF-NEXT: v_writelane_b32 v40, s68, 16 +; MUBUF-NEXT: v_writelane_b32 v40, s69, 17 +; MUBUF-NEXT: v_writelane_b32 v40, s70, 18 +; MUBUF-NEXT: v_writelane_b32 v40, s71, 19 +; MUBUF-NEXT: v_writelane_b32 v40, s80, 20 +; MUBUF-NEXT: v_writelane_b32 v40, s81, 21 +; MUBUF-NEXT: v_writelane_b32 v40, s82, 22 +; MUBUF-NEXT: v_writelane_b32 v40, s83, 23 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART @@ -347,50 +323,26 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; use s[4:19] ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: v_readlane_b32 s83, v40, 47 -; MUBUF-NEXT: v_readlane_b32 s82, v40, 46 -; MUBUF-NEXT: v_readlane_b32 s81, v40, 45 -; MUBUF-NEXT: v_readlane_b32 s80, v40, 44 -; MUBUF-NEXT: v_readlane_b32 s79, v40, 43 -; MUBUF-NEXT: v_readlane_b32 s78, v40, 42 -; MUBUF-NEXT: v_readlane_b32 s77, v40, 41 -; MUBUF-NEXT: v_readlane_b32 s76, v40, 40 -; MUBUF-NEXT: v_readlane_b32 s75, v40, 39 -; MUBUF-NEXT: v_readlane_b32 s74, v40, 38 -; MUBUF-NEXT: v_readlane_b32 s73, v40, 37 -; MUBUF-NEXT: v_readlane_b32 s72, v40, 36 -; MUBUF-NEXT: v_readlane_b32 s71, v40, 35 -; MUBUF-NEXT: v_readlane_b32 s70, v40, 34 -; MUBUF-NEXT: v_readlane_b32 s69, v40, 33 -; MUBUF-NEXT: v_readlane_b32 s68, v40, 32 -; MUBUF-NEXT: v_readlane_b32 s67, v40, 31 -; MUBUF-NEXT: v_readlane_b32 s66, v40, 30 -; MUBUF-NEXT: v_readlane_b32 s65, v40, 29 -; MUBUF-NEXT: v_readlane_b32 s64, v40, 28 -; MUBUF-NEXT: v_readlane_b32 s63, v40, 27 -; MUBUF-NEXT: v_readlane_b32 s62, v40, 26 -; MUBUF-NEXT: v_readlane_b32 s61, v40, 25 -; MUBUF-NEXT: v_readlane_b32 s60, v40, 24 -; MUBUF-NEXT: v_readlane_b32 s59, v40, 23 -; MUBUF-NEXT: v_readlane_b32 s58, v40, 22 -; MUBUF-NEXT: v_readlane_b32 s57, v40, 21 -; MUBUF-NEXT: v_readlane_b32 s56, v40, 20 -; MUBUF-NEXT: v_readlane_b32 s55, v40, 19 -; MUBUF-NEXT: v_readlane_b32 s54, v40, 18 -; MUBUF-NEXT: v_readlane_b32 s53, v40, 17 -; MUBUF-NEXT: v_readlane_b32 s52, v40, 16 -; MUBUF-NEXT: v_readlane_b32 s51, v40, 15 -; MUBUF-NEXT: v_readlane_b32 s50, v40, 14 -; MUBUF-NEXT: v_readlane_b32 s49, v40, 13 -; MUBUF-NEXT: v_readlane_b32 s48, v40, 12 -; MUBUF-NEXT: v_readlane_b32 s47, v40, 11 -; MUBUF-NEXT: v_readlane_b32 s46, v40, 10 -; MUBUF-NEXT: v_readlane_b32 s45, v40, 9 -; MUBUF-NEXT: v_readlane_b32 s44, v40, 8 -; MUBUF-NEXT: v_readlane_b32 s43, v40, 7 -; MUBUF-NEXT: v_readlane_b32 s42, v40, 6 -; MUBUF-NEXT: v_readlane_b32 s41, v40, 5 -; MUBUF-NEXT: v_readlane_b32 s40, v40, 4 +; MUBUF-NEXT: v_readlane_b32 s83, v40, 23 +; MUBUF-NEXT: v_readlane_b32 s82, v40, 22 +; MUBUF-NEXT: v_readlane_b32 s81, v40, 21 +; MUBUF-NEXT: v_readlane_b32 s80, v40, 20 +; MUBUF-NEXT: v_readlane_b32 s71, v40, 19 +; MUBUF-NEXT: v_readlane_b32 s70, v40, 18 +; MUBUF-NEXT: v_readlane_b32 s69, v40, 17 +; MUBUF-NEXT: v_readlane_b32 s68, v40, 16 +; MUBUF-NEXT: v_readlane_b32 s67, v40, 15 +; MUBUF-NEXT: v_readlane_b32 s66, v40, 14 +; MUBUF-NEXT: v_readlane_b32 s65, v40, 13 +; MUBUF-NEXT: v_readlane_b32 s64, v40, 12 +; MUBUF-NEXT: v_readlane_b32 s55, v40, 11 +; MUBUF-NEXT: v_readlane_b32 s54, v40, 10 +; MUBUF-NEXT: v_readlane_b32 s53, v40, 9 +; MUBUF-NEXT: v_readlane_b32 s52, v40, 8 +; MUBUF-NEXT: v_readlane_b32 s51, v40, 7 +; MUBUF-NEXT: v_readlane_b32 s50, v40, 6 +; MUBUF-NEXT: v_readlane_b32 s49, v40, 5 +; MUBUF-NEXT: v_readlane_b32 s48, v40, 4 ; MUBUF-NEXT: v_readlane_b32 s39, v40, 3 ; MUBUF-NEXT: v_readlane_b32 s38, v40, 2 ; MUBUF-NEXT: v_readlane_b32 s37, v40, 1 @@ -409,48 +361,22 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] ; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 ; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 -; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2 -; FLATSCR-NEXT: v_writelane_b32 v40, s35, 3 -; FLATSCR-NEXT: v_writelane_b32 v40, s36, 4 -; FLATSCR-NEXT: v_writelane_b32 v40, s37, 5 -; FLATSCR-NEXT: v_writelane_b32 v40, s38, 6 -; FLATSCR-NEXT: v_writelane_b32 v40, s39, 7 -; FLATSCR-NEXT: v_writelane_b32 v40, s40, 8 -; FLATSCR-NEXT: v_writelane_b32 v40, s41, 9 -; FLATSCR-NEXT: v_writelane_b32 v40, s42, 10 -; FLATSCR-NEXT: v_writelane_b32 v40, s43, 11 -; FLATSCR-NEXT: v_writelane_b32 v40, s44, 12 -; FLATSCR-NEXT: v_writelane_b32 v40, s45, 13 -; FLATSCR-NEXT: v_writelane_b32 v40, s46, 14 -; FLATSCR-NEXT: v_writelane_b32 v40, s47, 15 -; FLATSCR-NEXT: v_writelane_b32 v40, s48, 16 -; FLATSCR-NEXT: v_writelane_b32 v40, s49, 17 -; FLATSCR-NEXT: v_writelane_b32 v40, s50, 18 -; FLATSCR-NEXT: v_writelane_b32 v40, s51, 19 -; FLATSCR-NEXT: v_writelane_b32 v40, s52, 20 -; FLATSCR-NEXT: v_writelane_b32 v40, s53, 21 -; FLATSCR-NEXT: v_writelane_b32 v40, s54, 22 -; FLATSCR-NEXT: v_writelane_b32 v40, s55, 23 -; FLATSCR-NEXT: v_writelane_b32 v40, s56, 24 -; FLATSCR-NEXT: v_writelane_b32 v40, s57, 25 -; FLATSCR-NEXT: v_writelane_b32 v40, s58, 26 -; FLATSCR-NEXT: v_writelane_b32 v40, s59, 27 -; FLATSCR-NEXT: v_writelane_b32 v40, s60, 28 -; FLATSCR-NEXT: v_writelane_b32 v40, s61, 29 -; FLATSCR-NEXT: v_writelane_b32 v40, s62, 30 -; FLATSCR-NEXT: v_writelane_b32 v40, s63, 31 -; FLATSCR-NEXT: v_writelane_b32 v40, s64, 32 -; FLATSCR-NEXT: v_writelane_b32 v40, s65, 33 -; FLATSCR-NEXT: v_writelane_b32 v40, s66, 34 -; FLATSCR-NEXT: v_writelane_b32 v40, s67, 35 -; FLATSCR-NEXT: v_writelane_b32 v40, s68, 36 -; FLATSCR-NEXT: v_writelane_b32 v40, s69, 37 -; FLATSCR-NEXT: v_writelane_b32 v40, s70, 38 -; FLATSCR-NEXT: v_writelane_b32 v40, s71, 39 -; FLATSCR-NEXT: v_writelane_b32 v40, s72, 40 -; FLATSCR-NEXT: v_writelane_b32 v40, s73, 41 -; FLATSCR-NEXT: v_writelane_b32 v40, s74, 42 -; FLATSCR-NEXT: v_writelane_b32 v40, s75, 43 +; FLATSCR-NEXT: v_writelane_b32 v40, s36, 2 +; FLATSCR-NEXT: v_writelane_b32 v40, s37, 3 +; FLATSCR-NEXT: v_writelane_b32 v40, s38, 4 +; FLATSCR-NEXT: v_writelane_b32 v40, s39, 5 +; FLATSCR-NEXT: v_writelane_b32 v40, s48, 6 +; FLATSCR-NEXT: v_writelane_b32 v40, s49, 7 +; FLATSCR-NEXT: v_writelane_b32 v40, s50, 8 +; FLATSCR-NEXT: v_writelane_b32 v40, s51, 9 +; FLATSCR-NEXT: v_writelane_b32 v40, s52, 10 +; FLATSCR-NEXT: v_writelane_b32 v40, s53, 11 +; FLATSCR-NEXT: v_writelane_b32 v40, s54, 12 +; FLATSCR-NEXT: v_writelane_b32 v40, s55, 13 +; FLATSCR-NEXT: v_writelane_b32 v40, s64, 14 +; FLATSCR-NEXT: v_writelane_b32 v40, s65, 15 +; FLATSCR-NEXT: v_writelane_b32 v40, s66, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s67, 17 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART @@ -474,10 +400,10 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; FLATSCR-NEXT: ; def s[0:15] ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; def s[68:75] +; FLATSCR-NEXT: ; def s[72:79] ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; def s[34:35] +; FLATSCR-NEXT: ; def s[88:89] ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; use s[52:67] @@ -489,56 +415,30 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; FLATSCR-NEXT: ; use s[16:31] ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; use s[68:75] +; FLATSCR-NEXT: ; use s[72:79] ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; use s[34:35] +; FLATSCR-NEXT: ; use s[88:89] ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; use s[0:15] ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: v_readlane_b32 s75, v40, 43 -; FLATSCR-NEXT: v_readlane_b32 s74, v40, 42 -; FLATSCR-NEXT: v_readlane_b32 s73, v40, 41 -; FLATSCR-NEXT: v_readlane_b32 s72, v40, 40 -; FLATSCR-NEXT: v_readlane_b32 s71, v40, 39 -; FLATSCR-NEXT: v_readlane_b32 s70, v40, 38 -; FLATSCR-NEXT: v_readlane_b32 s69, v40, 37 -; FLATSCR-NEXT: v_readlane_b32 s68, v40, 36 -; FLATSCR-NEXT: v_readlane_b32 s67, v40, 35 -; FLATSCR-NEXT: v_readlane_b32 s66, v40, 34 -; FLATSCR-NEXT: v_readlane_b32 s65, v40, 33 -; FLATSCR-NEXT: v_readlane_b32 s64, v40, 32 -; FLATSCR-NEXT: v_readlane_b32 s63, v40, 31 -; FLATSCR-NEXT: v_readlane_b32 s62, v40, 30 -; FLATSCR-NEXT: v_readlane_b32 s61, v40, 29 -; FLATSCR-NEXT: v_readlane_b32 s60, v40, 28 -; FLATSCR-NEXT: v_readlane_b32 s59, v40, 27 -; FLATSCR-NEXT: v_readlane_b32 s58, v40, 26 -; FLATSCR-NEXT: v_readlane_b32 s57, v40, 25 -; FLATSCR-NEXT: v_readlane_b32 s56, v40, 24 -; FLATSCR-NEXT: v_readlane_b32 s55, v40, 23 -; FLATSCR-NEXT: v_readlane_b32 s54, v40, 22 -; FLATSCR-NEXT: v_readlane_b32 s53, v40, 21 -; FLATSCR-NEXT: v_readlane_b32 s52, v40, 20 -; FLATSCR-NEXT: v_readlane_b32 s51, v40, 19 -; FLATSCR-NEXT: v_readlane_b32 s50, v40, 18 -; FLATSCR-NEXT: v_readlane_b32 s49, v40, 17 -; FLATSCR-NEXT: v_readlane_b32 s48, v40, 16 -; FLATSCR-NEXT: v_readlane_b32 s47, v40, 15 -; FLATSCR-NEXT: v_readlane_b32 s46, v40, 14 -; FLATSCR-NEXT: v_readlane_b32 s45, v40, 13 -; FLATSCR-NEXT: v_readlane_b32 s44, v40, 12 -; FLATSCR-NEXT: v_readlane_b32 s43, v40, 11 -; FLATSCR-NEXT: v_readlane_b32 s42, v40, 10 -; FLATSCR-NEXT: v_readlane_b32 s41, v40, 9 -; FLATSCR-NEXT: v_readlane_b32 s40, v40, 8 -; FLATSCR-NEXT: v_readlane_b32 s39, v40, 7 -; FLATSCR-NEXT: v_readlane_b32 s38, v40, 6 -; FLATSCR-NEXT: v_readlane_b32 s37, v40, 5 -; FLATSCR-NEXT: v_readlane_b32 s36, v40, 4 -; FLATSCR-NEXT: v_readlane_b32 s35, v40, 3 -; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s67, v40, 17 +; FLATSCR-NEXT: v_readlane_b32 s66, v40, 16 +; FLATSCR-NEXT: v_readlane_b32 s65, v40, 15 +; FLATSCR-NEXT: v_readlane_b32 s64, v40, 14 +; FLATSCR-NEXT: v_readlane_b32 s55, v40, 13 +; FLATSCR-NEXT: v_readlane_b32 s54, v40, 12 +; FLATSCR-NEXT: v_readlane_b32 s53, v40, 11 +; FLATSCR-NEXT: v_readlane_b32 s52, v40, 10 +; FLATSCR-NEXT: v_readlane_b32 s51, v40, 9 +; FLATSCR-NEXT: v_readlane_b32 s50, v40, 8 +; FLATSCR-NEXT: v_readlane_b32 s49, v40, 7 +; FLATSCR-NEXT: v_readlane_b32 s48, v40, 6 +; FLATSCR-NEXT: v_readlane_b32 s39, v40, 5 +; FLATSCR-NEXT: v_readlane_b32 s38, v40, 4 +; FLATSCR-NEXT: v_readlane_b32 s37, v40, 3 +; FLATSCR-NEXT: v_readlane_b32 s36, v40, 2 ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -571,39 +471,13 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; Has no spilled CSR VGPRs used for SGPR spilling, so no need to ; enable all lanes and restore. define void @spill_only_csr_sgpr() { -; MUBUF-LABEL: spill_only_csr_sgpr: -; MUBUF: ; %bb.0: -; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; MUBUF-NEXT: s_mov_b64 exec, s[4:5] -; MUBUF-NEXT: v_writelane_b32 v0, s42, 0 -; MUBUF-NEXT: ;;#ASMSTART -; MUBUF-NEXT: ; clobber s42 -; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: v_readlane_b32 s42, v0, 0 -; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; MUBUF-NEXT: s_mov_b64 exec, s[4:5] -; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_setpc_b64 s[30:31] -; -; FLATSCR-LABEL: spill_only_csr_sgpr: -; FLATSCR: ; %bb.0: -; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill -; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] -; FLATSCR-NEXT: v_writelane_b32 v0, s42, 0 -; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; clobber s42 -; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: v_readlane_b32 s42, v0, 0 -; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] -; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: spill_only_csr_sgpr: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber s42 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber s42", "~{s42}"() ret void } @@ -663,143 +537,79 @@ define void @last_lane_vgpr_for_fp_csr() #1 { ; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] -; MUBUF-NEXT: v_writelane_b32 v1, s40, 0 -; MUBUF-NEXT: v_writelane_b32 v1, s41, 1 -; MUBUF-NEXT: v_writelane_b32 v1, s42, 2 -; MUBUF-NEXT: v_writelane_b32 v1, s43, 3 -; MUBUF-NEXT: v_writelane_b32 v1, s44, 4 -; MUBUF-NEXT: v_writelane_b32 v1, s45, 5 -; MUBUF-NEXT: v_writelane_b32 v1, s46, 6 -; MUBUF-NEXT: v_writelane_b32 v1, s47, 7 -; MUBUF-NEXT: v_writelane_b32 v1, s48, 8 -; MUBUF-NEXT: v_writelane_b32 v1, s49, 9 -; MUBUF-NEXT: v_writelane_b32 v1, s50, 10 -; MUBUF-NEXT: v_writelane_b32 v1, s51, 11 -; MUBUF-NEXT: v_writelane_b32 v1, s52, 12 -; MUBUF-NEXT: v_writelane_b32 v1, s53, 13 -; MUBUF-NEXT: v_writelane_b32 v1, s54, 14 -; MUBUF-NEXT: v_writelane_b32 v1, s55, 15 -; MUBUF-NEXT: v_writelane_b32 v1, s56, 16 -; MUBUF-NEXT: v_writelane_b32 v1, s57, 17 -; MUBUF-NEXT: v_writelane_b32 v1, s58, 18 -; MUBUF-NEXT: v_writelane_b32 v1, s59, 19 -; MUBUF-NEXT: v_writelane_b32 v1, s60, 20 -; MUBUF-NEXT: v_writelane_b32 v1, s61, 21 -; MUBUF-NEXT: v_writelane_b32 v1, s62, 22 -; MUBUF-NEXT: v_writelane_b32 v1, s63, 23 -; MUBUF-NEXT: v_writelane_b32 v1, s64, 24 -; MUBUF-NEXT: v_writelane_b32 v1, s65, 25 -; MUBUF-NEXT: v_writelane_b32 v1, s66, 26 -; MUBUF-NEXT: v_writelane_b32 v1, s67, 27 -; MUBUF-NEXT: v_writelane_b32 v1, s68, 28 -; MUBUF-NEXT: v_writelane_b32 v1, s69, 29 -; MUBUF-NEXT: v_writelane_b32 v1, s70, 30 -; MUBUF-NEXT: v_writelane_b32 v1, s71, 31 -; MUBUF-NEXT: v_writelane_b32 v1, s72, 32 -; MUBUF-NEXT: v_writelane_b32 v1, s73, 33 -; MUBUF-NEXT: v_writelane_b32 v1, s74, 34 -; MUBUF-NEXT: v_writelane_b32 v1, s75, 35 -; MUBUF-NEXT: v_writelane_b32 v1, s76, 36 -; MUBUF-NEXT: v_writelane_b32 v1, s77, 37 -; MUBUF-NEXT: v_writelane_b32 v1, s78, 38 -; MUBUF-NEXT: v_writelane_b32 v1, s79, 39 -; MUBUF-NEXT: v_writelane_b32 v1, s80, 40 -; MUBUF-NEXT: v_writelane_b32 v1, s81, 41 -; MUBUF-NEXT: v_writelane_b32 v1, s82, 42 -; MUBUF-NEXT: v_writelane_b32 v1, s83, 43 -; MUBUF-NEXT: v_writelane_b32 v1, s84, 44 -; MUBUF-NEXT: v_writelane_b32 v1, s85, 45 -; MUBUF-NEXT: v_writelane_b32 v1, s86, 46 -; MUBUF-NEXT: v_writelane_b32 v1, s87, 47 -; MUBUF-NEXT: v_writelane_b32 v1, s88, 48 -; MUBUF-NEXT: v_writelane_b32 v1, s89, 49 -; MUBUF-NEXT: v_writelane_b32 v1, s90, 50 -; MUBUF-NEXT: v_writelane_b32 v1, s91, 51 -; MUBUF-NEXT: v_writelane_b32 v1, s92, 52 -; MUBUF-NEXT: v_writelane_b32 v1, s93, 53 -; MUBUF-NEXT: v_writelane_b32 v1, s94, 54 -; MUBUF-NEXT: v_writelane_b32 v1, s95, 55 -; MUBUF-NEXT: v_writelane_b32 v1, s96, 56 -; MUBUF-NEXT: v_writelane_b32 v1, s97, 57 -; MUBUF-NEXT: v_writelane_b32 v1, s98, 58 -; MUBUF-NEXT: v_writelane_b32 v1, s99, 59 -; MUBUF-NEXT: v_writelane_b32 v1, s100, 60 +; MUBUF-NEXT: v_writelane_b32 v1, s48, 0 +; MUBUF-NEXT: v_writelane_b32 v1, s49, 1 +; MUBUF-NEXT: v_writelane_b32 v1, s50, 2 +; MUBUF-NEXT: v_writelane_b32 v1, s51, 3 +; MUBUF-NEXT: v_writelane_b32 v1, s52, 4 +; MUBUF-NEXT: v_writelane_b32 v1, s53, 5 +; MUBUF-NEXT: v_writelane_b32 v1, s54, 6 +; MUBUF-NEXT: v_writelane_b32 v1, s55, 7 +; MUBUF-NEXT: v_writelane_b32 v1, s64, 8 +; MUBUF-NEXT: v_writelane_b32 v1, s65, 9 +; MUBUF-NEXT: v_writelane_b32 v1, s66, 10 +; MUBUF-NEXT: v_writelane_b32 v1, s67, 11 +; MUBUF-NEXT: v_writelane_b32 v1, s68, 12 +; MUBUF-NEXT: v_writelane_b32 v1, s69, 13 +; MUBUF-NEXT: v_writelane_b32 v1, s70, 14 +; MUBUF-NEXT: v_writelane_b32 v1, s71, 15 +; MUBUF-NEXT: v_writelane_b32 v1, s80, 16 +; MUBUF-NEXT: v_writelane_b32 v1, s81, 17 +; MUBUF-NEXT: v_writelane_b32 v1, s82, 18 +; MUBUF-NEXT: v_writelane_b32 v1, s83, 19 +; MUBUF-NEXT: v_writelane_b32 v1, s84, 20 +; MUBUF-NEXT: v_writelane_b32 v1, s85, 21 +; MUBUF-NEXT: v_writelane_b32 v1, s86, 22 +; MUBUF-NEXT: v_writelane_b32 v1, s87, 23 +; MUBUF-NEXT: v_writelane_b32 v1, s96, 24 +; MUBUF-NEXT: v_writelane_b32 v1, s97, 25 +; MUBUF-NEXT: v_writelane_b32 v1, s98, 26 +; MUBUF-NEXT: v_writelane_b32 v1, s99, 27 +; MUBUF-NEXT: v_writelane_b32 v1, s100, 28 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; MUBUF-NEXT: v_writelane_b32 v1, s101, 61 +; MUBUF-NEXT: v_writelane_b32 v1, s101, 29 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber v41 ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: v_writelane_b32 v1, s102, 62 +; MUBUF-NEXT: v_writelane_b32 v1, s102, 30 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; MUBUF-NEXT: s_addk_i32 s32, 0x400 -; MUBUF-NEXT: v_readlane_b32 s102, v1, 62 -; MUBUF-NEXT: v_readlane_b32 s101, v1, 61 -; MUBUF-NEXT: v_readlane_b32 s100, v1, 60 -; MUBUF-NEXT: v_readlane_b32 s99, v1, 59 -; MUBUF-NEXT: v_readlane_b32 s98, v1, 58 -; MUBUF-NEXT: v_readlane_b32 s97, v1, 57 -; MUBUF-NEXT: v_readlane_b32 s96, v1, 56 -; MUBUF-NEXT: v_readlane_b32 s95, v1, 55 -; MUBUF-NEXT: v_readlane_b32 s94, v1, 54 -; MUBUF-NEXT: v_readlane_b32 s93, v1, 53 -; MUBUF-NEXT: v_readlane_b32 s92, v1, 52 -; MUBUF-NEXT: v_readlane_b32 s91, v1, 51 -; MUBUF-NEXT: v_readlane_b32 s90, v1, 50 -; MUBUF-NEXT: v_readlane_b32 s89, v1, 49 -; MUBUF-NEXT: v_readlane_b32 s88, v1, 48 -; MUBUF-NEXT: v_readlane_b32 s87, v1, 47 -; MUBUF-NEXT: v_readlane_b32 s86, v1, 46 -; MUBUF-NEXT: v_readlane_b32 s85, v1, 45 -; MUBUF-NEXT: v_readlane_b32 s84, v1, 44 -; MUBUF-NEXT: v_readlane_b32 s83, v1, 43 -; MUBUF-NEXT: v_readlane_b32 s82, v1, 42 -; MUBUF-NEXT: v_readlane_b32 s81, v1, 41 -; MUBUF-NEXT: v_readlane_b32 s80, v1, 40 -; MUBUF-NEXT: v_readlane_b32 s79, v1, 39 -; MUBUF-NEXT: v_readlane_b32 s78, v1, 38 -; MUBUF-NEXT: v_readlane_b32 s77, v1, 37 -; MUBUF-NEXT: v_readlane_b32 s76, v1, 36 -; MUBUF-NEXT: v_readlane_b32 s75, v1, 35 -; MUBUF-NEXT: v_readlane_b32 s74, v1, 34 -; MUBUF-NEXT: v_readlane_b32 s73, v1, 33 -; MUBUF-NEXT: v_readlane_b32 s72, v1, 32 -; MUBUF-NEXT: v_readlane_b32 s71, v1, 31 -; MUBUF-NEXT: v_readlane_b32 s70, v1, 30 -; MUBUF-NEXT: v_readlane_b32 s69, v1, 29 -; MUBUF-NEXT: v_readlane_b32 s68, v1, 28 -; MUBUF-NEXT: v_readlane_b32 s67, v1, 27 -; MUBUF-NEXT: v_readlane_b32 s66, v1, 26 -; MUBUF-NEXT: v_readlane_b32 s65, v1, 25 -; MUBUF-NEXT: v_readlane_b32 s64, v1, 24 -; MUBUF-NEXT: v_readlane_b32 s63, v1, 23 -; MUBUF-NEXT: v_readlane_b32 s62, v1, 22 -; MUBUF-NEXT: v_readlane_b32 s61, v1, 21 -; MUBUF-NEXT: v_readlane_b32 s60, v1, 20 -; MUBUF-NEXT: v_readlane_b32 s59, v1, 19 -; MUBUF-NEXT: v_readlane_b32 s58, v1, 18 -; MUBUF-NEXT: v_readlane_b32 s57, v1, 17 -; MUBUF-NEXT: v_readlane_b32 s56, v1, 16 -; MUBUF-NEXT: v_readlane_b32 s55, v1, 15 -; MUBUF-NEXT: v_readlane_b32 s54, v1, 14 -; MUBUF-NEXT: v_readlane_b32 s53, v1, 13 -; MUBUF-NEXT: v_readlane_b32 s52, v1, 12 -; MUBUF-NEXT: v_readlane_b32 s51, v1, 11 -; MUBUF-NEXT: v_readlane_b32 s50, v1, 10 -; MUBUF-NEXT: v_readlane_b32 s49, v1, 9 -; MUBUF-NEXT: v_readlane_b32 s48, v1, 8 -; MUBUF-NEXT: v_readlane_b32 s47, v1, 7 -; MUBUF-NEXT: v_readlane_b32 s46, v1, 6 -; MUBUF-NEXT: v_readlane_b32 s45, v1, 5 -; MUBUF-NEXT: v_readlane_b32 s44, v1, 4 -; MUBUF-NEXT: v_readlane_b32 s43, v1, 3 -; MUBUF-NEXT: v_readlane_b32 s42, v1, 2 -; MUBUF-NEXT: v_readlane_b32 s41, v1, 1 -; MUBUF-NEXT: v_readlane_b32 s40, v1, 0 +; MUBUF-NEXT: v_readlane_b32 s102, v1, 30 +; MUBUF-NEXT: v_readlane_b32 s101, v1, 29 +; MUBUF-NEXT: v_readlane_b32 s100, v1, 28 +; MUBUF-NEXT: v_readlane_b32 s99, v1, 27 +; MUBUF-NEXT: v_readlane_b32 s98, v1, 26 +; MUBUF-NEXT: v_readlane_b32 s97, v1, 25 +; MUBUF-NEXT: v_readlane_b32 s96, v1, 24 +; MUBUF-NEXT: v_readlane_b32 s87, v1, 23 +; MUBUF-NEXT: v_readlane_b32 s86, v1, 22 +; MUBUF-NEXT: v_readlane_b32 s85, v1, 21 +; MUBUF-NEXT: v_readlane_b32 s84, v1, 20 +; MUBUF-NEXT: v_readlane_b32 s83, v1, 19 +; MUBUF-NEXT: v_readlane_b32 s82, v1, 18 +; MUBUF-NEXT: v_readlane_b32 s81, v1, 17 +; MUBUF-NEXT: v_readlane_b32 s80, v1, 16 +; MUBUF-NEXT: v_readlane_b32 s71, v1, 15 +; MUBUF-NEXT: v_readlane_b32 s70, v1, 14 +; MUBUF-NEXT: v_readlane_b32 s69, v1, 13 +; MUBUF-NEXT: v_readlane_b32 s68, v1, 12 +; MUBUF-NEXT: v_readlane_b32 s67, v1, 11 +; MUBUF-NEXT: v_readlane_b32 s66, v1, 10 +; MUBUF-NEXT: v_readlane_b32 s65, v1, 9 +; MUBUF-NEXT: v_readlane_b32 s64, v1, 8 +; MUBUF-NEXT: v_readlane_b32 s55, v1, 7 +; MUBUF-NEXT: v_readlane_b32 s54, v1, 6 +; MUBUF-NEXT: v_readlane_b32 s53, v1, 5 +; MUBUF-NEXT: v_readlane_b32 s52, v1, 4 +; MUBUF-NEXT: v_readlane_b32 s51, v1, 3 +; MUBUF-NEXT: v_readlane_b32 s50, v1, 2 +; MUBUF-NEXT: v_readlane_b32 s49, v1, 1 +; MUBUF-NEXT: v_readlane_b32 s48, v1, 0 ; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload @@ -816,143 +626,79 @@ define void @last_lane_vgpr_for_fp_csr() #1 { ; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 ; FLATSCR-NEXT: scratch_store_dword off, v1, s33 offset:8 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] -; FLATSCR-NEXT: v_writelane_b32 v1, s40, 0 -; FLATSCR-NEXT: v_writelane_b32 v1, s41, 1 -; FLATSCR-NEXT: v_writelane_b32 v1, s42, 2 -; FLATSCR-NEXT: v_writelane_b32 v1, s43, 3 -; FLATSCR-NEXT: v_writelane_b32 v1, s44, 4 -; FLATSCR-NEXT: v_writelane_b32 v1, s45, 5 -; FLATSCR-NEXT: v_writelane_b32 v1, s46, 6 -; FLATSCR-NEXT: v_writelane_b32 v1, s47, 7 -; FLATSCR-NEXT: v_writelane_b32 v1, s48, 8 -; FLATSCR-NEXT: v_writelane_b32 v1, s49, 9 -; FLATSCR-NEXT: v_writelane_b32 v1, s50, 10 -; FLATSCR-NEXT: v_writelane_b32 v1, s51, 11 -; FLATSCR-NEXT: v_writelane_b32 v1, s52, 12 -; FLATSCR-NEXT: v_writelane_b32 v1, s53, 13 -; FLATSCR-NEXT: v_writelane_b32 v1, s54, 14 -; FLATSCR-NEXT: v_writelane_b32 v1, s55, 15 -; FLATSCR-NEXT: v_writelane_b32 v1, s56, 16 -; FLATSCR-NEXT: v_writelane_b32 v1, s57, 17 -; FLATSCR-NEXT: v_writelane_b32 v1, s58, 18 -; FLATSCR-NEXT: v_writelane_b32 v1, s59, 19 -; FLATSCR-NEXT: v_writelane_b32 v1, s60, 20 -; FLATSCR-NEXT: v_writelane_b32 v1, s61, 21 -; FLATSCR-NEXT: v_writelane_b32 v1, s62, 22 -; FLATSCR-NEXT: v_writelane_b32 v1, s63, 23 -; FLATSCR-NEXT: v_writelane_b32 v1, s64, 24 -; FLATSCR-NEXT: v_writelane_b32 v1, s65, 25 -; FLATSCR-NEXT: v_writelane_b32 v1, s66, 26 -; FLATSCR-NEXT: v_writelane_b32 v1, s67, 27 -; FLATSCR-NEXT: v_writelane_b32 v1, s68, 28 -; FLATSCR-NEXT: v_writelane_b32 v1, s69, 29 -; FLATSCR-NEXT: v_writelane_b32 v1, s70, 30 -; FLATSCR-NEXT: v_writelane_b32 v1, s71, 31 -; FLATSCR-NEXT: v_writelane_b32 v1, s72, 32 -; FLATSCR-NEXT: v_writelane_b32 v1, s73, 33 -; FLATSCR-NEXT: v_writelane_b32 v1, s74, 34 -; FLATSCR-NEXT: v_writelane_b32 v1, s75, 35 -; FLATSCR-NEXT: v_writelane_b32 v1, s76, 36 -; FLATSCR-NEXT: v_writelane_b32 v1, s77, 37 -; FLATSCR-NEXT: v_writelane_b32 v1, s78, 38 -; FLATSCR-NEXT: v_writelane_b32 v1, s79, 39 -; FLATSCR-NEXT: v_writelane_b32 v1, s80, 40 -; FLATSCR-NEXT: v_writelane_b32 v1, s81, 41 -; FLATSCR-NEXT: v_writelane_b32 v1, s82, 42 -; FLATSCR-NEXT: v_writelane_b32 v1, s83, 43 -; FLATSCR-NEXT: v_writelane_b32 v1, s84, 44 -; FLATSCR-NEXT: v_writelane_b32 v1, s85, 45 -; FLATSCR-NEXT: v_writelane_b32 v1, s86, 46 -; FLATSCR-NEXT: v_writelane_b32 v1, s87, 47 -; FLATSCR-NEXT: v_writelane_b32 v1, s88, 48 -; FLATSCR-NEXT: v_writelane_b32 v1, s89, 49 -; FLATSCR-NEXT: v_writelane_b32 v1, s90, 50 -; FLATSCR-NEXT: v_writelane_b32 v1, s91, 51 -; FLATSCR-NEXT: v_writelane_b32 v1, s92, 52 -; FLATSCR-NEXT: v_writelane_b32 v1, s93, 53 -; FLATSCR-NEXT: v_writelane_b32 v1, s94, 54 -; FLATSCR-NEXT: v_writelane_b32 v1, s95, 55 -; FLATSCR-NEXT: v_writelane_b32 v1, s96, 56 -; FLATSCR-NEXT: v_writelane_b32 v1, s97, 57 -; FLATSCR-NEXT: v_writelane_b32 v1, s98, 58 -; FLATSCR-NEXT: v_writelane_b32 v1, s99, 59 -; FLATSCR-NEXT: v_writelane_b32 v1, s100, 60 +; FLATSCR-NEXT: v_writelane_b32 v1, s48, 0 +; FLATSCR-NEXT: v_writelane_b32 v1, s49, 1 +; FLATSCR-NEXT: v_writelane_b32 v1, s50, 2 +; FLATSCR-NEXT: v_writelane_b32 v1, s51, 3 +; FLATSCR-NEXT: v_writelane_b32 v1, s52, 4 +; FLATSCR-NEXT: v_writelane_b32 v1, s53, 5 +; FLATSCR-NEXT: v_writelane_b32 v1, s54, 6 +; FLATSCR-NEXT: v_writelane_b32 v1, s55, 7 +; FLATSCR-NEXT: v_writelane_b32 v1, s64, 8 +; FLATSCR-NEXT: v_writelane_b32 v1, s65, 9 +; FLATSCR-NEXT: v_writelane_b32 v1, s66, 10 +; FLATSCR-NEXT: v_writelane_b32 v1, s67, 11 +; FLATSCR-NEXT: v_writelane_b32 v1, s68, 12 +; FLATSCR-NEXT: v_writelane_b32 v1, s69, 13 +; FLATSCR-NEXT: v_writelane_b32 v1, s70, 14 +; FLATSCR-NEXT: v_writelane_b32 v1, s71, 15 +; FLATSCR-NEXT: v_writelane_b32 v1, s80, 16 +; FLATSCR-NEXT: v_writelane_b32 v1, s81, 17 +; FLATSCR-NEXT: v_writelane_b32 v1, s82, 18 +; FLATSCR-NEXT: v_writelane_b32 v1, s83, 19 +; FLATSCR-NEXT: v_writelane_b32 v1, s84, 20 +; FLATSCR-NEXT: v_writelane_b32 v1, s85, 21 +; FLATSCR-NEXT: v_writelane_b32 v1, s86, 22 +; FLATSCR-NEXT: v_writelane_b32 v1, s87, 23 +; FLATSCR-NEXT: v_writelane_b32 v1, s96, 24 +; FLATSCR-NEXT: v_writelane_b32 v1, s97, 25 +; FLATSCR-NEXT: v_writelane_b32 v1, s98, 26 +; FLATSCR-NEXT: v_writelane_b32 v1, s99, 27 +; FLATSCR-NEXT: v_writelane_b32 v1, s100, 28 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill -; FLATSCR-NEXT: v_writelane_b32 v1, s101, 61 +; FLATSCR-NEXT: v_writelane_b32 v1, s101, 29 ; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; clobber v41 ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: v_writelane_b32 v1, s102, 62 +; FLATSCR-NEXT: v_writelane_b32 v1, s102, 30 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 -; FLATSCR-NEXT: v_readlane_b32 s102, v1, 62 -; FLATSCR-NEXT: v_readlane_b32 s101, v1, 61 -; FLATSCR-NEXT: v_readlane_b32 s100, v1, 60 -; FLATSCR-NEXT: v_readlane_b32 s99, v1, 59 -; FLATSCR-NEXT: v_readlane_b32 s98, v1, 58 -; FLATSCR-NEXT: v_readlane_b32 s97, v1, 57 -; FLATSCR-NEXT: v_readlane_b32 s96, v1, 56 -; FLATSCR-NEXT: v_readlane_b32 s95, v1, 55 -; FLATSCR-NEXT: v_readlane_b32 s94, v1, 54 -; FLATSCR-NEXT: v_readlane_b32 s93, v1, 53 -; FLATSCR-NEXT: v_readlane_b32 s92, v1, 52 -; FLATSCR-NEXT: v_readlane_b32 s91, v1, 51 -; FLATSCR-NEXT: v_readlane_b32 s90, v1, 50 -; FLATSCR-NEXT: v_readlane_b32 s89, v1, 49 -; FLATSCR-NEXT: v_readlane_b32 s88, v1, 48 -; FLATSCR-NEXT: v_readlane_b32 s87, v1, 47 -; FLATSCR-NEXT: v_readlane_b32 s86, v1, 46 -; FLATSCR-NEXT: v_readlane_b32 s85, v1, 45 -; FLATSCR-NEXT: v_readlane_b32 s84, v1, 44 -; FLATSCR-NEXT: v_readlane_b32 s83, v1, 43 -; FLATSCR-NEXT: v_readlane_b32 s82, v1, 42 -; FLATSCR-NEXT: v_readlane_b32 s81, v1, 41 -; FLATSCR-NEXT: v_readlane_b32 s80, v1, 40 -; FLATSCR-NEXT: v_readlane_b32 s79, v1, 39 -; FLATSCR-NEXT: v_readlane_b32 s78, v1, 38 -; FLATSCR-NEXT: v_readlane_b32 s77, v1, 37 -; FLATSCR-NEXT: v_readlane_b32 s76, v1, 36 -; FLATSCR-NEXT: v_readlane_b32 s75, v1, 35 -; FLATSCR-NEXT: v_readlane_b32 s74, v1, 34 -; FLATSCR-NEXT: v_readlane_b32 s73, v1, 33 -; FLATSCR-NEXT: v_readlane_b32 s72, v1, 32 -; FLATSCR-NEXT: v_readlane_b32 s71, v1, 31 -; FLATSCR-NEXT: v_readlane_b32 s70, v1, 30 -; FLATSCR-NEXT: v_readlane_b32 s69, v1, 29 -; FLATSCR-NEXT: v_readlane_b32 s68, v1, 28 -; FLATSCR-NEXT: v_readlane_b32 s67, v1, 27 -; FLATSCR-NEXT: v_readlane_b32 s66, v1, 26 -; FLATSCR-NEXT: v_readlane_b32 s65, v1, 25 -; FLATSCR-NEXT: v_readlane_b32 s64, v1, 24 -; FLATSCR-NEXT: v_readlane_b32 s63, v1, 23 -; FLATSCR-NEXT: v_readlane_b32 s62, v1, 22 -; FLATSCR-NEXT: v_readlane_b32 s61, v1, 21 -; FLATSCR-NEXT: v_readlane_b32 s60, v1, 20 -; FLATSCR-NEXT: v_readlane_b32 s59, v1, 19 -; FLATSCR-NEXT: v_readlane_b32 s58, v1, 18 -; FLATSCR-NEXT: v_readlane_b32 s57, v1, 17 -; FLATSCR-NEXT: v_readlane_b32 s56, v1, 16 -; FLATSCR-NEXT: v_readlane_b32 s55, v1, 15 -; FLATSCR-NEXT: v_readlane_b32 s54, v1, 14 -; FLATSCR-NEXT: v_readlane_b32 s53, v1, 13 -; FLATSCR-NEXT: v_readlane_b32 s52, v1, 12 -; FLATSCR-NEXT: v_readlane_b32 s51, v1, 11 -; FLATSCR-NEXT: v_readlane_b32 s50, v1, 10 -; FLATSCR-NEXT: v_readlane_b32 s49, v1, 9 -; FLATSCR-NEXT: v_readlane_b32 s48, v1, 8 -; FLATSCR-NEXT: v_readlane_b32 s47, v1, 7 -; FLATSCR-NEXT: v_readlane_b32 s46, v1, 6 -; FLATSCR-NEXT: v_readlane_b32 s45, v1, 5 -; FLATSCR-NEXT: v_readlane_b32 s44, v1, 4 -; FLATSCR-NEXT: v_readlane_b32 s43, v1, 3 -; FLATSCR-NEXT: v_readlane_b32 s42, v1, 2 -; FLATSCR-NEXT: v_readlane_b32 s41, v1, 1 -; FLATSCR-NEXT: v_readlane_b32 s40, v1, 0 +; FLATSCR-NEXT: v_readlane_b32 s102, v1, 30 +; FLATSCR-NEXT: v_readlane_b32 s101, v1, 29 +; FLATSCR-NEXT: v_readlane_b32 s100, v1, 28 +; FLATSCR-NEXT: v_readlane_b32 s99, v1, 27 +; FLATSCR-NEXT: v_readlane_b32 s98, v1, 26 +; FLATSCR-NEXT: v_readlane_b32 s97, v1, 25 +; FLATSCR-NEXT: v_readlane_b32 s96, v1, 24 +; FLATSCR-NEXT: v_readlane_b32 s87, v1, 23 +; FLATSCR-NEXT: v_readlane_b32 s86, v1, 22 +; FLATSCR-NEXT: v_readlane_b32 s85, v1, 21 +; FLATSCR-NEXT: v_readlane_b32 s84, v1, 20 +; FLATSCR-NEXT: v_readlane_b32 s83, v1, 19 +; FLATSCR-NEXT: v_readlane_b32 s82, v1, 18 +; FLATSCR-NEXT: v_readlane_b32 s81, v1, 17 +; FLATSCR-NEXT: v_readlane_b32 s80, v1, 16 +; FLATSCR-NEXT: v_readlane_b32 s71, v1, 15 +; FLATSCR-NEXT: v_readlane_b32 s70, v1, 14 +; FLATSCR-NEXT: v_readlane_b32 s69, v1, 13 +; FLATSCR-NEXT: v_readlane_b32 s68, v1, 12 +; FLATSCR-NEXT: v_readlane_b32 s67, v1, 11 +; FLATSCR-NEXT: v_readlane_b32 s66, v1, 10 +; FLATSCR-NEXT: v_readlane_b32 s65, v1, 9 +; FLATSCR-NEXT: v_readlane_b32 s64, v1, 8 +; FLATSCR-NEXT: v_readlane_b32 s55, v1, 7 +; FLATSCR-NEXT: v_readlane_b32 s54, v1, 6 +; FLATSCR-NEXT: v_readlane_b32 s53, v1, 5 +; FLATSCR-NEXT: v_readlane_b32 s52, v1, 4 +; FLATSCR-NEXT: v_readlane_b32 s51, v1, 3 +; FLATSCR-NEXT: v_readlane_b32 s50, v1, 2 +; FLATSCR-NEXT: v_readlane_b32 s49, v1, 1 +; FLATSCR-NEXT: v_readlane_b32 s48, v1, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 ; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:8 ; 4-byte Folded Reload @@ -986,143 +732,79 @@ define void @no_new_vgpr_for_fp_csr() #1 { ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: v_writelane_b32 v1, s39, 0 -; MUBUF-NEXT: v_writelane_b32 v1, s40, 1 -; MUBUF-NEXT: v_writelane_b32 v1, s41, 2 -; MUBUF-NEXT: v_writelane_b32 v1, s42, 3 -; MUBUF-NEXT: v_writelane_b32 v1, s43, 4 -; MUBUF-NEXT: v_writelane_b32 v1, s44, 5 -; MUBUF-NEXT: v_writelane_b32 v1, s45, 6 -; MUBUF-NEXT: v_writelane_b32 v1, s46, 7 -; MUBUF-NEXT: v_writelane_b32 v1, s47, 8 -; MUBUF-NEXT: v_writelane_b32 v1, s48, 9 -; MUBUF-NEXT: v_writelane_b32 v1, s49, 10 -; MUBUF-NEXT: v_writelane_b32 v1, s50, 11 -; MUBUF-NEXT: v_writelane_b32 v1, s51, 12 -; MUBUF-NEXT: v_writelane_b32 v1, s52, 13 -; MUBUF-NEXT: v_writelane_b32 v1, s53, 14 -; MUBUF-NEXT: v_writelane_b32 v1, s54, 15 -; MUBUF-NEXT: v_writelane_b32 v1, s55, 16 -; MUBUF-NEXT: v_writelane_b32 v1, s56, 17 -; MUBUF-NEXT: v_writelane_b32 v1, s57, 18 -; MUBUF-NEXT: v_writelane_b32 v1, s58, 19 -; MUBUF-NEXT: v_writelane_b32 v1, s59, 20 -; MUBUF-NEXT: v_writelane_b32 v1, s60, 21 -; MUBUF-NEXT: v_writelane_b32 v1, s61, 22 -; MUBUF-NEXT: v_writelane_b32 v1, s62, 23 -; MUBUF-NEXT: v_writelane_b32 v1, s63, 24 -; MUBUF-NEXT: v_writelane_b32 v1, s64, 25 -; MUBUF-NEXT: v_writelane_b32 v1, s65, 26 -; MUBUF-NEXT: v_writelane_b32 v1, s66, 27 -; MUBUF-NEXT: v_writelane_b32 v1, s67, 28 -; MUBUF-NEXT: v_writelane_b32 v1, s68, 29 -; MUBUF-NEXT: v_writelane_b32 v1, s69, 30 -; MUBUF-NEXT: v_writelane_b32 v1, s70, 31 -; MUBUF-NEXT: v_writelane_b32 v1, s71, 32 -; MUBUF-NEXT: v_writelane_b32 v1, s72, 33 -; MUBUF-NEXT: v_writelane_b32 v1, s73, 34 -; MUBUF-NEXT: v_writelane_b32 v1, s74, 35 -; MUBUF-NEXT: v_writelane_b32 v1, s75, 36 -; MUBUF-NEXT: v_writelane_b32 v1, s76, 37 -; MUBUF-NEXT: v_writelane_b32 v1, s77, 38 -; MUBUF-NEXT: v_writelane_b32 v1, s78, 39 -; MUBUF-NEXT: v_writelane_b32 v1, s79, 40 -; MUBUF-NEXT: v_writelane_b32 v1, s80, 41 -; MUBUF-NEXT: v_writelane_b32 v1, s81, 42 -; MUBUF-NEXT: v_writelane_b32 v1, s82, 43 -; MUBUF-NEXT: v_writelane_b32 v1, s83, 44 -; MUBUF-NEXT: v_writelane_b32 v1, s84, 45 -; MUBUF-NEXT: v_writelane_b32 v1, s85, 46 -; MUBUF-NEXT: v_writelane_b32 v1, s86, 47 -; MUBUF-NEXT: v_writelane_b32 v1, s87, 48 -; MUBUF-NEXT: v_writelane_b32 v1, s88, 49 -; MUBUF-NEXT: v_writelane_b32 v1, s89, 50 -; MUBUF-NEXT: v_writelane_b32 v1, s90, 51 -; MUBUF-NEXT: v_writelane_b32 v1, s91, 52 -; MUBUF-NEXT: v_writelane_b32 v1, s92, 53 -; MUBUF-NEXT: v_writelane_b32 v1, s93, 54 -; MUBUF-NEXT: v_writelane_b32 v1, s94, 55 -; MUBUF-NEXT: v_writelane_b32 v1, s95, 56 -; MUBUF-NEXT: v_writelane_b32 v1, s96, 57 -; MUBUF-NEXT: v_writelane_b32 v1, s97, 58 -; MUBUF-NEXT: v_writelane_b32 v1, s98, 59 -; MUBUF-NEXT: v_writelane_b32 v1, s99, 60 -; MUBUF-NEXT: v_writelane_b32 v1, s100, 61 +; MUBUF-NEXT: v_writelane_b32 v1, s48, 1 +; MUBUF-NEXT: v_writelane_b32 v1, s49, 2 +; MUBUF-NEXT: v_writelane_b32 v1, s50, 3 +; MUBUF-NEXT: v_writelane_b32 v1, s51, 4 +; MUBUF-NEXT: v_writelane_b32 v1, s52, 5 +; MUBUF-NEXT: v_writelane_b32 v1, s53, 6 +; MUBUF-NEXT: v_writelane_b32 v1, s54, 7 +; MUBUF-NEXT: v_writelane_b32 v1, s55, 8 +; MUBUF-NEXT: v_writelane_b32 v1, s64, 9 +; MUBUF-NEXT: v_writelane_b32 v1, s65, 10 +; MUBUF-NEXT: v_writelane_b32 v1, s66, 11 +; MUBUF-NEXT: v_writelane_b32 v1, s67, 12 +; MUBUF-NEXT: v_writelane_b32 v1, s68, 13 +; MUBUF-NEXT: v_writelane_b32 v1, s69, 14 +; MUBUF-NEXT: v_writelane_b32 v1, s70, 15 +; MUBUF-NEXT: v_writelane_b32 v1, s71, 16 +; MUBUF-NEXT: v_writelane_b32 v1, s80, 17 +; MUBUF-NEXT: v_writelane_b32 v1, s81, 18 +; MUBUF-NEXT: v_writelane_b32 v1, s82, 19 +; MUBUF-NEXT: v_writelane_b32 v1, s83, 20 +; MUBUF-NEXT: v_writelane_b32 v1, s84, 21 +; MUBUF-NEXT: v_writelane_b32 v1, s85, 22 +; MUBUF-NEXT: v_writelane_b32 v1, s86, 23 +; MUBUF-NEXT: v_writelane_b32 v1, s87, 24 +; MUBUF-NEXT: v_writelane_b32 v1, s96, 25 +; MUBUF-NEXT: v_writelane_b32 v1, s97, 26 +; MUBUF-NEXT: v_writelane_b32 v1, s98, 27 +; MUBUF-NEXT: v_writelane_b32 v1, s99, 28 +; MUBUF-NEXT: v_writelane_b32 v1, s100, 29 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; MUBUF-NEXT: v_writelane_b32 v1, s101, 62 +; MUBUF-NEXT: v_writelane_b32 v1, s101, 30 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber v41 ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: v_writelane_b32 v1, s102, 63 +; MUBUF-NEXT: v_writelane_b32 v1, s102, 31 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; MUBUF-NEXT: s_addk_i32 s32, 0x400 -; MUBUF-NEXT: v_readlane_b32 s102, v1, 63 -; MUBUF-NEXT: v_readlane_b32 s101, v1, 62 -; MUBUF-NEXT: v_readlane_b32 s100, v1, 61 -; MUBUF-NEXT: v_readlane_b32 s99, v1, 60 -; MUBUF-NEXT: v_readlane_b32 s98, v1, 59 -; MUBUF-NEXT: v_readlane_b32 s97, v1, 58 -; MUBUF-NEXT: v_readlane_b32 s96, v1, 57 -; MUBUF-NEXT: v_readlane_b32 s95, v1, 56 -; MUBUF-NEXT: v_readlane_b32 s94, v1, 55 -; MUBUF-NEXT: v_readlane_b32 s93, v1, 54 -; MUBUF-NEXT: v_readlane_b32 s92, v1, 53 -; MUBUF-NEXT: v_readlane_b32 s91, v1, 52 -; MUBUF-NEXT: v_readlane_b32 s90, v1, 51 -; MUBUF-NEXT: v_readlane_b32 s89, v1, 50 -; MUBUF-NEXT: v_readlane_b32 s88, v1, 49 -; MUBUF-NEXT: v_readlane_b32 s87, v1, 48 -; MUBUF-NEXT: v_readlane_b32 s86, v1, 47 -; MUBUF-NEXT: v_readlane_b32 s85, v1, 46 -; MUBUF-NEXT: v_readlane_b32 s84, v1, 45 -; MUBUF-NEXT: v_readlane_b32 s83, v1, 44 -; MUBUF-NEXT: v_readlane_b32 s82, v1, 43 -; MUBUF-NEXT: v_readlane_b32 s81, v1, 42 -; MUBUF-NEXT: v_readlane_b32 s80, v1, 41 -; MUBUF-NEXT: v_readlane_b32 s79, v1, 40 -; MUBUF-NEXT: v_readlane_b32 s78, v1, 39 -; MUBUF-NEXT: v_readlane_b32 s77, v1, 38 -; MUBUF-NEXT: v_readlane_b32 s76, v1, 37 -; MUBUF-NEXT: v_readlane_b32 s75, v1, 36 -; MUBUF-NEXT: v_readlane_b32 s74, v1, 35 -; MUBUF-NEXT: v_readlane_b32 s73, v1, 34 -; MUBUF-NEXT: v_readlane_b32 s72, v1, 33 -; MUBUF-NEXT: v_readlane_b32 s71, v1, 32 -; MUBUF-NEXT: v_readlane_b32 s70, v1, 31 -; MUBUF-NEXT: v_readlane_b32 s69, v1, 30 -; MUBUF-NEXT: v_readlane_b32 s68, v1, 29 -; MUBUF-NEXT: v_readlane_b32 s67, v1, 28 -; MUBUF-NEXT: v_readlane_b32 s66, v1, 27 -; MUBUF-NEXT: v_readlane_b32 s65, v1, 26 -; MUBUF-NEXT: v_readlane_b32 s64, v1, 25 -; MUBUF-NEXT: v_readlane_b32 s63, v1, 24 -; MUBUF-NEXT: v_readlane_b32 s62, v1, 23 -; MUBUF-NEXT: v_readlane_b32 s61, v1, 22 -; MUBUF-NEXT: v_readlane_b32 s60, v1, 21 -; MUBUF-NEXT: v_readlane_b32 s59, v1, 20 -; MUBUF-NEXT: v_readlane_b32 s58, v1, 19 -; MUBUF-NEXT: v_readlane_b32 s57, v1, 18 -; MUBUF-NEXT: v_readlane_b32 s56, v1, 17 -; MUBUF-NEXT: v_readlane_b32 s55, v1, 16 -; MUBUF-NEXT: v_readlane_b32 s54, v1, 15 -; MUBUF-NEXT: v_readlane_b32 s53, v1, 14 -; MUBUF-NEXT: v_readlane_b32 s52, v1, 13 -; MUBUF-NEXT: v_readlane_b32 s51, v1, 12 -; MUBUF-NEXT: v_readlane_b32 s50, v1, 11 -; MUBUF-NEXT: v_readlane_b32 s49, v1, 10 -; MUBUF-NEXT: v_readlane_b32 s48, v1, 9 -; MUBUF-NEXT: v_readlane_b32 s47, v1, 8 -; MUBUF-NEXT: v_readlane_b32 s46, v1, 7 -; MUBUF-NEXT: v_readlane_b32 s45, v1, 6 -; MUBUF-NEXT: v_readlane_b32 s44, v1, 5 -; MUBUF-NEXT: v_readlane_b32 s43, v1, 4 -; MUBUF-NEXT: v_readlane_b32 s42, v1, 3 -; MUBUF-NEXT: v_readlane_b32 s41, v1, 2 -; MUBUF-NEXT: v_readlane_b32 s40, v1, 1 +; MUBUF-NEXT: v_readlane_b32 s102, v1, 31 +; MUBUF-NEXT: v_readlane_b32 s101, v1, 30 +; MUBUF-NEXT: v_readlane_b32 s100, v1, 29 +; MUBUF-NEXT: v_readlane_b32 s99, v1, 28 +; MUBUF-NEXT: v_readlane_b32 s98, v1, 27 +; MUBUF-NEXT: v_readlane_b32 s97, v1, 26 +; MUBUF-NEXT: v_readlane_b32 s96, v1, 25 +; MUBUF-NEXT: v_readlane_b32 s87, v1, 24 +; MUBUF-NEXT: v_readlane_b32 s86, v1, 23 +; MUBUF-NEXT: v_readlane_b32 s85, v1, 22 +; MUBUF-NEXT: v_readlane_b32 s84, v1, 21 +; MUBUF-NEXT: v_readlane_b32 s83, v1, 20 +; MUBUF-NEXT: v_readlane_b32 s82, v1, 19 +; MUBUF-NEXT: v_readlane_b32 s81, v1, 18 +; MUBUF-NEXT: v_readlane_b32 s80, v1, 17 +; MUBUF-NEXT: v_readlane_b32 s71, v1, 16 +; MUBUF-NEXT: v_readlane_b32 s70, v1, 15 +; MUBUF-NEXT: v_readlane_b32 s69, v1, 14 +; MUBUF-NEXT: v_readlane_b32 s68, v1, 13 +; MUBUF-NEXT: v_readlane_b32 s67, v1, 12 +; MUBUF-NEXT: v_readlane_b32 s66, v1, 11 +; MUBUF-NEXT: v_readlane_b32 s65, v1, 10 +; MUBUF-NEXT: v_readlane_b32 s64, v1, 9 +; MUBUF-NEXT: v_readlane_b32 s55, v1, 8 +; MUBUF-NEXT: v_readlane_b32 s54, v1, 7 +; MUBUF-NEXT: v_readlane_b32 s53, v1, 6 +; MUBUF-NEXT: v_readlane_b32 s52, v1, 5 +; MUBUF-NEXT: v_readlane_b32 s51, v1, 4 +; MUBUF-NEXT: v_readlane_b32 s50, v1, 3 +; MUBUF-NEXT: v_readlane_b32 s49, v1, 2 +; MUBUF-NEXT: v_readlane_b32 s48, v1, 1 ; MUBUF-NEXT: v_readlane_b32 s39, v1, 0 ; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 @@ -1141,143 +823,79 @@ define void @no_new_vgpr_for_fp_csr() #1 { ; FLATSCR-NEXT: scratch_store_dword off, v1, s33 offset:8 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: v_writelane_b32 v1, s39, 0 -; FLATSCR-NEXT: v_writelane_b32 v1, s40, 1 -; FLATSCR-NEXT: v_writelane_b32 v1, s41, 2 -; FLATSCR-NEXT: v_writelane_b32 v1, s42, 3 -; FLATSCR-NEXT: v_writelane_b32 v1, s43, 4 -; FLATSCR-NEXT: v_writelane_b32 v1, s44, 5 -; FLATSCR-NEXT: v_writelane_b32 v1, s45, 6 -; FLATSCR-NEXT: v_writelane_b32 v1, s46, 7 -; FLATSCR-NEXT: v_writelane_b32 v1, s47, 8 -; FLATSCR-NEXT: v_writelane_b32 v1, s48, 9 -; FLATSCR-NEXT: v_writelane_b32 v1, s49, 10 -; FLATSCR-NEXT: v_writelane_b32 v1, s50, 11 -; FLATSCR-NEXT: v_writelane_b32 v1, s51, 12 -; FLATSCR-NEXT: v_writelane_b32 v1, s52, 13 -; FLATSCR-NEXT: v_writelane_b32 v1, s53, 14 -; FLATSCR-NEXT: v_writelane_b32 v1, s54, 15 -; FLATSCR-NEXT: v_writelane_b32 v1, s55, 16 -; FLATSCR-NEXT: v_writelane_b32 v1, s56, 17 -; FLATSCR-NEXT: v_writelane_b32 v1, s57, 18 -; FLATSCR-NEXT: v_writelane_b32 v1, s58, 19 -; FLATSCR-NEXT: v_writelane_b32 v1, s59, 20 -; FLATSCR-NEXT: v_writelane_b32 v1, s60, 21 -; FLATSCR-NEXT: v_writelane_b32 v1, s61, 22 -; FLATSCR-NEXT: v_writelane_b32 v1, s62, 23 -; FLATSCR-NEXT: v_writelane_b32 v1, s63, 24 -; FLATSCR-NEXT: v_writelane_b32 v1, s64, 25 -; FLATSCR-NEXT: v_writelane_b32 v1, s65, 26 -; FLATSCR-NEXT: v_writelane_b32 v1, s66, 27 -; FLATSCR-NEXT: v_writelane_b32 v1, s67, 28 -; FLATSCR-NEXT: v_writelane_b32 v1, s68, 29 -; FLATSCR-NEXT: v_writelane_b32 v1, s69, 30 -; FLATSCR-NEXT: v_writelane_b32 v1, s70, 31 -; FLATSCR-NEXT: v_writelane_b32 v1, s71, 32 -; FLATSCR-NEXT: v_writelane_b32 v1, s72, 33 -; FLATSCR-NEXT: v_writelane_b32 v1, s73, 34 -; FLATSCR-NEXT: v_writelane_b32 v1, s74, 35 -; FLATSCR-NEXT: v_writelane_b32 v1, s75, 36 -; FLATSCR-NEXT: v_writelane_b32 v1, s76, 37 -; FLATSCR-NEXT: v_writelane_b32 v1, s77, 38 -; FLATSCR-NEXT: v_writelane_b32 v1, s78, 39 -; FLATSCR-NEXT: v_writelane_b32 v1, s79, 40 -; FLATSCR-NEXT: v_writelane_b32 v1, s80, 41 -; FLATSCR-NEXT: v_writelane_b32 v1, s81, 42 -; FLATSCR-NEXT: v_writelane_b32 v1, s82, 43 -; FLATSCR-NEXT: v_writelane_b32 v1, s83, 44 -; FLATSCR-NEXT: v_writelane_b32 v1, s84, 45 -; FLATSCR-NEXT: v_writelane_b32 v1, s85, 46 -; FLATSCR-NEXT: v_writelane_b32 v1, s86, 47 -; FLATSCR-NEXT: v_writelane_b32 v1, s87, 48 -; FLATSCR-NEXT: v_writelane_b32 v1, s88, 49 -; FLATSCR-NEXT: v_writelane_b32 v1, s89, 50 -; FLATSCR-NEXT: v_writelane_b32 v1, s90, 51 -; FLATSCR-NEXT: v_writelane_b32 v1, s91, 52 -; FLATSCR-NEXT: v_writelane_b32 v1, s92, 53 -; FLATSCR-NEXT: v_writelane_b32 v1, s93, 54 -; FLATSCR-NEXT: v_writelane_b32 v1, s94, 55 -; FLATSCR-NEXT: v_writelane_b32 v1, s95, 56 -; FLATSCR-NEXT: v_writelane_b32 v1, s96, 57 -; FLATSCR-NEXT: v_writelane_b32 v1, s97, 58 -; FLATSCR-NEXT: v_writelane_b32 v1, s98, 59 -; FLATSCR-NEXT: v_writelane_b32 v1, s99, 60 -; FLATSCR-NEXT: v_writelane_b32 v1, s100, 61 +; FLATSCR-NEXT: v_writelane_b32 v1, s48, 1 +; FLATSCR-NEXT: v_writelane_b32 v1, s49, 2 +; FLATSCR-NEXT: v_writelane_b32 v1, s50, 3 +; FLATSCR-NEXT: v_writelane_b32 v1, s51, 4 +; FLATSCR-NEXT: v_writelane_b32 v1, s52, 5 +; FLATSCR-NEXT: v_writelane_b32 v1, s53, 6 +; FLATSCR-NEXT: v_writelane_b32 v1, s54, 7 +; FLATSCR-NEXT: v_writelane_b32 v1, s55, 8 +; FLATSCR-NEXT: v_writelane_b32 v1, s64, 9 +; FLATSCR-NEXT: v_writelane_b32 v1, s65, 10 +; FLATSCR-NEXT: v_writelane_b32 v1, s66, 11 +; FLATSCR-NEXT: v_writelane_b32 v1, s67, 12 +; FLATSCR-NEXT: v_writelane_b32 v1, s68, 13 +; FLATSCR-NEXT: v_writelane_b32 v1, s69, 14 +; FLATSCR-NEXT: v_writelane_b32 v1, s70, 15 +; FLATSCR-NEXT: v_writelane_b32 v1, s71, 16 +; FLATSCR-NEXT: v_writelane_b32 v1, s80, 17 +; FLATSCR-NEXT: v_writelane_b32 v1, s81, 18 +; FLATSCR-NEXT: v_writelane_b32 v1, s82, 19 +; FLATSCR-NEXT: v_writelane_b32 v1, s83, 20 +; FLATSCR-NEXT: v_writelane_b32 v1, s84, 21 +; FLATSCR-NEXT: v_writelane_b32 v1, s85, 22 +; FLATSCR-NEXT: v_writelane_b32 v1, s86, 23 +; FLATSCR-NEXT: v_writelane_b32 v1, s87, 24 +; FLATSCR-NEXT: v_writelane_b32 v1, s96, 25 +; FLATSCR-NEXT: v_writelane_b32 v1, s97, 26 +; FLATSCR-NEXT: v_writelane_b32 v1, s98, 27 +; FLATSCR-NEXT: v_writelane_b32 v1, s99, 28 +; FLATSCR-NEXT: v_writelane_b32 v1, s100, 29 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill -; FLATSCR-NEXT: v_writelane_b32 v1, s101, 62 +; FLATSCR-NEXT: v_writelane_b32 v1, s101, 30 ; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; clobber v41 ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: v_writelane_b32 v1, s102, 63 +; FLATSCR-NEXT: v_writelane_b32 v1, s102, 31 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 -; FLATSCR-NEXT: v_readlane_b32 s102, v1, 63 -; FLATSCR-NEXT: v_readlane_b32 s101, v1, 62 -; FLATSCR-NEXT: v_readlane_b32 s100, v1, 61 -; FLATSCR-NEXT: v_readlane_b32 s99, v1, 60 -; FLATSCR-NEXT: v_readlane_b32 s98, v1, 59 -; FLATSCR-NEXT: v_readlane_b32 s97, v1, 58 -; FLATSCR-NEXT: v_readlane_b32 s96, v1, 57 -; FLATSCR-NEXT: v_readlane_b32 s95, v1, 56 -; FLATSCR-NEXT: v_readlane_b32 s94, v1, 55 -; FLATSCR-NEXT: v_readlane_b32 s93, v1, 54 -; FLATSCR-NEXT: v_readlane_b32 s92, v1, 53 -; FLATSCR-NEXT: v_readlane_b32 s91, v1, 52 -; FLATSCR-NEXT: v_readlane_b32 s90, v1, 51 -; FLATSCR-NEXT: v_readlane_b32 s89, v1, 50 -; FLATSCR-NEXT: v_readlane_b32 s88, v1, 49 -; FLATSCR-NEXT: v_readlane_b32 s87, v1, 48 -; FLATSCR-NEXT: v_readlane_b32 s86, v1, 47 -; FLATSCR-NEXT: v_readlane_b32 s85, v1, 46 -; FLATSCR-NEXT: v_readlane_b32 s84, v1, 45 -; FLATSCR-NEXT: v_readlane_b32 s83, v1, 44 -; FLATSCR-NEXT: v_readlane_b32 s82, v1, 43 -; FLATSCR-NEXT: v_readlane_b32 s81, v1, 42 -; FLATSCR-NEXT: v_readlane_b32 s80, v1, 41 -; FLATSCR-NEXT: v_readlane_b32 s79, v1, 40 -; FLATSCR-NEXT: v_readlane_b32 s78, v1, 39 -; FLATSCR-NEXT: v_readlane_b32 s77, v1, 38 -; FLATSCR-NEXT: v_readlane_b32 s76, v1, 37 -; FLATSCR-NEXT: v_readlane_b32 s75, v1, 36 -; FLATSCR-NEXT: v_readlane_b32 s74, v1, 35 -; FLATSCR-NEXT: v_readlane_b32 s73, v1, 34 -; FLATSCR-NEXT: v_readlane_b32 s72, v1, 33 -; FLATSCR-NEXT: v_readlane_b32 s71, v1, 32 -; FLATSCR-NEXT: v_readlane_b32 s70, v1, 31 -; FLATSCR-NEXT: v_readlane_b32 s69, v1, 30 -; FLATSCR-NEXT: v_readlane_b32 s68, v1, 29 -; FLATSCR-NEXT: v_readlane_b32 s67, v1, 28 -; FLATSCR-NEXT: v_readlane_b32 s66, v1, 27 -; FLATSCR-NEXT: v_readlane_b32 s65, v1, 26 -; FLATSCR-NEXT: v_readlane_b32 s64, v1, 25 -; FLATSCR-NEXT: v_readlane_b32 s63, v1, 24 -; FLATSCR-NEXT: v_readlane_b32 s62, v1, 23 -; FLATSCR-NEXT: v_readlane_b32 s61, v1, 22 -; FLATSCR-NEXT: v_readlane_b32 s60, v1, 21 -; FLATSCR-NEXT: v_readlane_b32 s59, v1, 20 -; FLATSCR-NEXT: v_readlane_b32 s58, v1, 19 -; FLATSCR-NEXT: v_readlane_b32 s57, v1, 18 -; FLATSCR-NEXT: v_readlane_b32 s56, v1, 17 -; FLATSCR-NEXT: v_readlane_b32 s55, v1, 16 -; FLATSCR-NEXT: v_readlane_b32 s54, v1, 15 -; FLATSCR-NEXT: v_readlane_b32 s53, v1, 14 -; FLATSCR-NEXT: v_readlane_b32 s52, v1, 13 -; FLATSCR-NEXT: v_readlane_b32 s51, v1, 12 -; FLATSCR-NEXT: v_readlane_b32 s50, v1, 11 -; FLATSCR-NEXT: v_readlane_b32 s49, v1, 10 -; FLATSCR-NEXT: v_readlane_b32 s48, v1, 9 -; FLATSCR-NEXT: v_readlane_b32 s47, v1, 8 -; FLATSCR-NEXT: v_readlane_b32 s46, v1, 7 -; FLATSCR-NEXT: v_readlane_b32 s45, v1, 6 -; FLATSCR-NEXT: v_readlane_b32 s44, v1, 5 -; FLATSCR-NEXT: v_readlane_b32 s43, v1, 4 -; FLATSCR-NEXT: v_readlane_b32 s42, v1, 3 -; FLATSCR-NEXT: v_readlane_b32 s41, v1, 2 -; FLATSCR-NEXT: v_readlane_b32 s40, v1, 1 +; FLATSCR-NEXT: v_readlane_b32 s102, v1, 31 +; FLATSCR-NEXT: v_readlane_b32 s101, v1, 30 +; FLATSCR-NEXT: v_readlane_b32 s100, v1, 29 +; FLATSCR-NEXT: v_readlane_b32 s99, v1, 28 +; FLATSCR-NEXT: v_readlane_b32 s98, v1, 27 +; FLATSCR-NEXT: v_readlane_b32 s97, v1, 26 +; FLATSCR-NEXT: v_readlane_b32 s96, v1, 25 +; FLATSCR-NEXT: v_readlane_b32 s87, v1, 24 +; FLATSCR-NEXT: v_readlane_b32 s86, v1, 23 +; FLATSCR-NEXT: v_readlane_b32 s85, v1, 22 +; FLATSCR-NEXT: v_readlane_b32 s84, v1, 21 +; FLATSCR-NEXT: v_readlane_b32 s83, v1, 20 +; FLATSCR-NEXT: v_readlane_b32 s82, v1, 19 +; FLATSCR-NEXT: v_readlane_b32 s81, v1, 18 +; FLATSCR-NEXT: v_readlane_b32 s80, v1, 17 +; FLATSCR-NEXT: v_readlane_b32 s71, v1, 16 +; FLATSCR-NEXT: v_readlane_b32 s70, v1, 15 +; FLATSCR-NEXT: v_readlane_b32 s69, v1, 14 +; FLATSCR-NEXT: v_readlane_b32 s68, v1, 13 +; FLATSCR-NEXT: v_readlane_b32 s67, v1, 12 +; FLATSCR-NEXT: v_readlane_b32 s66, v1, 11 +; FLATSCR-NEXT: v_readlane_b32 s65, v1, 10 +; FLATSCR-NEXT: v_readlane_b32 s64, v1, 9 +; FLATSCR-NEXT: v_readlane_b32 s55, v1, 8 +; FLATSCR-NEXT: v_readlane_b32 s54, v1, 7 +; FLATSCR-NEXT: v_readlane_b32 s53, v1, 6 +; FLATSCR-NEXT: v_readlane_b32 s52, v1, 5 +; FLATSCR-NEXT: v_readlane_b32 s51, v1, 4 +; FLATSCR-NEXT: v_readlane_b32 s50, v1, 3 +; FLATSCR-NEXT: v_readlane_b32 s49, v1, 2 +; FLATSCR-NEXT: v_readlane_b32 s48, v1, 1 ; FLATSCR-NEXT: v_readlane_b32 s39, v1, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 @@ -1346,7 +964,7 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { ; MUBUF-LABEL: no_unused_non_csr_sgpr_for_fp: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: s_mov_b32 vcc_lo, s33 +; MUBUF-NEXT: s_mov_b32 s40, s33 ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -1365,14 +983,14 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { ; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[4:5] -; MUBUF-NEXT: s_mov_b32 s33, vcc_lo +; MUBUF-NEXT: s_mov_b32 s33, s40 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; FLATSCR-LABEL: no_unused_non_csr_sgpr_for_fp: ; FLATSCR: ; %bb.0: ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_mov_b32 vcc_lo, s33 +; FLATSCR-NEXT: s_mov_b32 s40, s33 ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_store_dword off, v1, s33 offset:4 ; 4-byte Folded Spill @@ -1391,7 +1009,7 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { ; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] -; FLATSCR-NEXT: s_mov_b32 s33, vcc_lo +; FLATSCR-NEXT: s_mov_b32 s33, s40 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) @@ -1412,7 +1030,7 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; MUBUF-LABEL: no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: s_mov_b32 vcc_lo, s33 +; MUBUF-NEXT: s_mov_b32 s40, s33 ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -1434,14 +1052,14 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[4:5] -; MUBUF-NEXT: s_mov_b32 s33, vcc_lo +; MUBUF-NEXT: s_mov_b32 s33, s40 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; FLATSCR-LABEL: no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr: ; FLATSCR: ; %bb.0: ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_mov_b32 vcc_lo, s33 +; FLATSCR-NEXT: s_mov_b32 s40, s33 ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill @@ -1463,7 +1081,7 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] -; FLATSCR-NEXT: s_mov_b32 s33, vcc_lo +; FLATSCR-NEXT: s_mov_b32 s33, s40 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) @@ -1491,7 +1109,7 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8]) ; MUBUF-LABEL: scratch_reg_needed_mubuf_offset: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: s_mov_b32 vcc_lo, s33 +; MUBUF-NEXT: s_mov_b32 s40, s33 ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: s_add_i32 s6, s33, 0x40100 @@ -1517,14 +1135,14 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8]) ; MUBUF-NEXT: s_add_i32 s6, s33, 0x40100 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s6 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[4:5] -; MUBUF-NEXT: s_mov_b32 s33, vcc_lo +; MUBUF-NEXT: s_mov_b32 s33, s40 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; FLATSCR-LABEL: scratch_reg_needed_mubuf_offset: ; FLATSCR: ; %bb.0: ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_mov_b32 vcc_lo, s33 +; FLATSCR-NEXT: s_mov_b32 s40, s33 ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: s_add_i32 s2, s33, 0x1004 @@ -1550,7 +1168,7 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8]) ; FLATSCR-NEXT: s_add_i32 s2, s33, 0x1004 ; FLATSCR-NEXT: scratch_load_dword v40, off, s2 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] -; FLATSCR-NEXT: s_mov_b32 s33, vcc_lo +; FLATSCR-NEXT: s_mov_b32 s33, s40 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) @@ -1650,22 +1268,15 @@ define void @callee_need_to_spill_fp_to_memory() #3 { ; MUBUF-LABEL: callee_need_to_spill_fp_to_memory: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s40, s33 ; MUBUF-NEXT: s_mov_b32 s33, s32 -; MUBUF-NEXT: v_mov_b32_e32 v0, s4 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber nonpreserved SGPRs ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber all VGPRs ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload -; MUBUF-NEXT: s_addk_i32 s32, 0x200 -; MUBUF-NEXT: s_mov_b32 s32, s33 -; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_readfirstlane_b32 s4, v0 -; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_mov_b32 s33, s40 ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; FLATSCR-LABEL: callee_need_to_spill_fp_to_memory: @@ -1707,147 +1318,80 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 { ; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], s33 ; 4-byte Folded Spill ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v39, s4, 32 ; MUBUF-NEXT: v_writelane_b32 v39, s39, 0 -; MUBUF-NEXT: v_writelane_b32 v39, s40, 1 -; MUBUF-NEXT: v_writelane_b32 v39, s41, 2 -; MUBUF-NEXT: v_writelane_b32 v39, s42, 3 -; MUBUF-NEXT: v_writelane_b32 v39, s43, 4 -; MUBUF-NEXT: v_writelane_b32 v39, s44, 5 -; MUBUF-NEXT: v_writelane_b32 v39, s45, 6 -; MUBUF-NEXT: v_writelane_b32 v39, s46, 7 -; MUBUF-NEXT: v_writelane_b32 v39, s47, 8 -; MUBUF-NEXT: v_writelane_b32 v39, s48, 9 -; MUBUF-NEXT: v_writelane_b32 v39, s49, 10 -; MUBUF-NEXT: v_writelane_b32 v39, s50, 11 -; MUBUF-NEXT: v_writelane_b32 v39, s51, 12 -; MUBUF-NEXT: v_writelane_b32 v39, s52, 13 -; MUBUF-NEXT: v_writelane_b32 v39, s53, 14 -; MUBUF-NEXT: v_writelane_b32 v39, s54, 15 -; MUBUF-NEXT: v_writelane_b32 v39, s55, 16 -; MUBUF-NEXT: v_writelane_b32 v39, s56, 17 -; MUBUF-NEXT: v_writelane_b32 v39, s57, 18 -; MUBUF-NEXT: v_writelane_b32 v39, s58, 19 -; MUBUF-NEXT: v_writelane_b32 v39, s59, 20 -; MUBUF-NEXT: v_writelane_b32 v39, s60, 21 -; MUBUF-NEXT: v_writelane_b32 v39, s61, 22 -; MUBUF-NEXT: v_writelane_b32 v39, s62, 23 -; MUBUF-NEXT: v_writelane_b32 v39, s63, 24 -; MUBUF-NEXT: v_writelane_b32 v39, s64, 25 -; MUBUF-NEXT: v_writelane_b32 v39, s65, 26 -; MUBUF-NEXT: v_writelane_b32 v39, s66, 27 -; MUBUF-NEXT: v_writelane_b32 v39, s67, 28 -; MUBUF-NEXT: v_writelane_b32 v39, s68, 29 -; MUBUF-NEXT: v_writelane_b32 v39, s69, 30 -; MUBUF-NEXT: v_writelane_b32 v39, s70, 31 -; MUBUF-NEXT: v_writelane_b32 v39, s71, 32 -; MUBUF-NEXT: v_writelane_b32 v39, s72, 33 -; MUBUF-NEXT: v_writelane_b32 v39, s73, 34 -; MUBUF-NEXT: v_writelane_b32 v39, s74, 35 -; MUBUF-NEXT: v_writelane_b32 v39, s75, 36 -; MUBUF-NEXT: v_writelane_b32 v39, s76, 37 -; MUBUF-NEXT: v_writelane_b32 v39, s77, 38 -; MUBUF-NEXT: v_writelane_b32 v39, s78, 39 -; MUBUF-NEXT: v_writelane_b32 v39, s79, 40 -; MUBUF-NEXT: v_writelane_b32 v39, s80, 41 -; MUBUF-NEXT: v_writelane_b32 v39, s81, 42 -; MUBUF-NEXT: v_writelane_b32 v39, s82, 43 -; MUBUF-NEXT: v_writelane_b32 v39, s83, 44 -; MUBUF-NEXT: v_writelane_b32 v39, s84, 45 -; MUBUF-NEXT: v_writelane_b32 v39, s85, 46 -; MUBUF-NEXT: v_writelane_b32 v39, s86, 47 -; MUBUF-NEXT: v_writelane_b32 v39, s87, 48 -; MUBUF-NEXT: v_writelane_b32 v39, s88, 49 -; MUBUF-NEXT: v_writelane_b32 v39, s89, 50 -; MUBUF-NEXT: v_writelane_b32 v39, s90, 51 -; MUBUF-NEXT: v_writelane_b32 v39, s91, 52 -; MUBUF-NEXT: v_writelane_b32 v39, s92, 53 -; MUBUF-NEXT: v_writelane_b32 v39, s93, 54 -; MUBUF-NEXT: v_writelane_b32 v39, s94, 55 -; MUBUF-NEXT: v_writelane_b32 v39, s95, 56 -; MUBUF-NEXT: v_writelane_b32 v39, s96, 57 -; MUBUF-NEXT: v_writelane_b32 v39, s97, 58 -; MUBUF-NEXT: v_writelane_b32 v39, s98, 59 -; MUBUF-NEXT: v_writelane_b32 v39, s99, 60 -; MUBUF-NEXT: v_writelane_b32 v39, s100, 61 -; MUBUF-NEXT: v_mov_b32_e32 v0, s4 -; MUBUF-NEXT: v_writelane_b32 v39, s101, 62 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; MUBUF-NEXT: v_writelane_b32 v39, s102, 63 +; MUBUF-NEXT: v_writelane_b32 v39, s48, 1 +; MUBUF-NEXT: v_writelane_b32 v39, s49, 2 +; MUBUF-NEXT: v_writelane_b32 v39, s50, 3 +; MUBUF-NEXT: v_writelane_b32 v39, s51, 4 +; MUBUF-NEXT: v_writelane_b32 v39, s52, 5 +; MUBUF-NEXT: v_writelane_b32 v39, s53, 6 +; MUBUF-NEXT: v_writelane_b32 v39, s54, 7 +; MUBUF-NEXT: v_writelane_b32 v39, s55, 8 +; MUBUF-NEXT: v_writelane_b32 v39, s64, 9 +; MUBUF-NEXT: v_writelane_b32 v39, s65, 10 +; MUBUF-NEXT: v_writelane_b32 v39, s66, 11 +; MUBUF-NEXT: v_writelane_b32 v39, s67, 12 +; MUBUF-NEXT: v_writelane_b32 v39, s68, 13 +; MUBUF-NEXT: v_writelane_b32 v39, s69, 14 +; MUBUF-NEXT: v_writelane_b32 v39, s70, 15 +; MUBUF-NEXT: v_writelane_b32 v39, s71, 16 +; MUBUF-NEXT: v_writelane_b32 v39, s80, 17 +; MUBUF-NEXT: v_writelane_b32 v39, s81, 18 +; MUBUF-NEXT: v_writelane_b32 v39, s82, 19 +; MUBUF-NEXT: v_writelane_b32 v39, s83, 20 +; MUBUF-NEXT: v_writelane_b32 v39, s84, 21 +; MUBUF-NEXT: v_writelane_b32 v39, s85, 22 +; MUBUF-NEXT: v_writelane_b32 v39, s86, 23 +; MUBUF-NEXT: v_writelane_b32 v39, s87, 24 +; MUBUF-NEXT: v_writelane_b32 v39, s96, 25 +; MUBUF-NEXT: v_writelane_b32 v39, s97, 26 +; MUBUF-NEXT: v_writelane_b32 v39, s98, 27 +; MUBUF-NEXT: v_writelane_b32 v39, s99, 28 +; MUBUF-NEXT: v_writelane_b32 v39, s100, 29 +; MUBUF-NEXT: v_writelane_b32 v39, s101, 30 +; MUBUF-NEXT: s_addk_i32 s32, 0x200 +; MUBUF-NEXT: v_writelane_b32 v39, s102, 31 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber all VGPRs except CSR v40 ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; MUBUF-NEXT: s_addk_i32 s32, 0x300 -; MUBUF-NEXT: v_readlane_b32 s102, v39, 63 -; MUBUF-NEXT: v_readlane_b32 s101, v39, 62 -; MUBUF-NEXT: v_readlane_b32 s100, v39, 61 -; MUBUF-NEXT: v_readlane_b32 s99, v39, 60 -; MUBUF-NEXT: v_readlane_b32 s98, v39, 59 -; MUBUF-NEXT: v_readlane_b32 s97, v39, 58 -; MUBUF-NEXT: v_readlane_b32 s96, v39, 57 -; MUBUF-NEXT: v_readlane_b32 s95, v39, 56 -; MUBUF-NEXT: v_readlane_b32 s94, v39, 55 -; MUBUF-NEXT: v_readlane_b32 s93, v39, 54 -; MUBUF-NEXT: v_readlane_b32 s92, v39, 53 -; MUBUF-NEXT: v_readlane_b32 s91, v39, 52 -; MUBUF-NEXT: v_readlane_b32 s90, v39, 51 -; MUBUF-NEXT: v_readlane_b32 s89, v39, 50 -; MUBUF-NEXT: v_readlane_b32 s88, v39, 49 -; MUBUF-NEXT: v_readlane_b32 s87, v39, 48 -; MUBUF-NEXT: v_readlane_b32 s86, v39, 47 -; MUBUF-NEXT: v_readlane_b32 s85, v39, 46 -; MUBUF-NEXT: v_readlane_b32 s84, v39, 45 -; MUBUF-NEXT: v_readlane_b32 s83, v39, 44 -; MUBUF-NEXT: v_readlane_b32 s82, v39, 43 -; MUBUF-NEXT: v_readlane_b32 s81, v39, 42 -; MUBUF-NEXT: v_readlane_b32 s80, v39, 41 -; MUBUF-NEXT: v_readlane_b32 s79, v39, 40 -; MUBUF-NEXT: v_readlane_b32 s78, v39, 39 -; MUBUF-NEXT: v_readlane_b32 s77, v39, 38 -; MUBUF-NEXT: v_readlane_b32 s76, v39, 37 -; MUBUF-NEXT: v_readlane_b32 s75, v39, 36 -; MUBUF-NEXT: v_readlane_b32 s74, v39, 35 -; MUBUF-NEXT: v_readlane_b32 s73, v39, 34 -; MUBUF-NEXT: v_readlane_b32 s72, v39, 33 -; MUBUF-NEXT: v_readlane_b32 s71, v39, 32 -; MUBUF-NEXT: v_readlane_b32 s70, v39, 31 -; MUBUF-NEXT: v_readlane_b32 s69, v39, 30 -; MUBUF-NEXT: v_readlane_b32 s68, v39, 29 -; MUBUF-NEXT: v_readlane_b32 s67, v39, 28 -; MUBUF-NEXT: v_readlane_b32 s66, v39, 27 -; MUBUF-NEXT: v_readlane_b32 s65, v39, 26 -; MUBUF-NEXT: v_readlane_b32 s64, v39, 25 -; MUBUF-NEXT: v_readlane_b32 s63, v39, 24 -; MUBUF-NEXT: v_readlane_b32 s62, v39, 23 -; MUBUF-NEXT: v_readlane_b32 s61, v39, 22 -; MUBUF-NEXT: v_readlane_b32 s60, v39, 21 -; MUBUF-NEXT: v_readlane_b32 s59, v39, 20 -; MUBUF-NEXT: v_readlane_b32 s58, v39, 19 -; MUBUF-NEXT: v_readlane_b32 s57, v39, 18 -; MUBUF-NEXT: v_readlane_b32 s56, v39, 17 -; MUBUF-NEXT: v_readlane_b32 s55, v39, 16 -; MUBUF-NEXT: v_readlane_b32 s54, v39, 15 -; MUBUF-NEXT: v_readlane_b32 s53, v39, 14 -; MUBUF-NEXT: v_readlane_b32 s52, v39, 13 -; MUBUF-NEXT: v_readlane_b32 s51, v39, 12 -; MUBUF-NEXT: v_readlane_b32 s50, v39, 11 -; MUBUF-NEXT: v_readlane_b32 s49, v39, 10 -; MUBUF-NEXT: v_readlane_b32 s48, v39, 9 -; MUBUF-NEXT: v_readlane_b32 s47, v39, 8 -; MUBUF-NEXT: v_readlane_b32 s46, v39, 7 -; MUBUF-NEXT: v_readlane_b32 s45, v39, 6 -; MUBUF-NEXT: v_readlane_b32 s44, v39, 5 -; MUBUF-NEXT: v_readlane_b32 s43, v39, 4 -; MUBUF-NEXT: v_readlane_b32 s42, v39, 3 -; MUBUF-NEXT: v_readlane_b32 s41, v39, 2 -; MUBUF-NEXT: v_readlane_b32 s40, v39, 1 +; MUBUF-NEXT: v_readlane_b32 s102, v39, 31 +; MUBUF-NEXT: v_readlane_b32 s101, v39, 30 +; MUBUF-NEXT: v_readlane_b32 s100, v39, 29 +; MUBUF-NEXT: v_readlane_b32 s99, v39, 28 +; MUBUF-NEXT: v_readlane_b32 s98, v39, 27 +; MUBUF-NEXT: v_readlane_b32 s97, v39, 26 +; MUBUF-NEXT: v_readlane_b32 s96, v39, 25 +; MUBUF-NEXT: v_readlane_b32 s87, v39, 24 +; MUBUF-NEXT: v_readlane_b32 s86, v39, 23 +; MUBUF-NEXT: v_readlane_b32 s85, v39, 22 +; MUBUF-NEXT: v_readlane_b32 s84, v39, 21 +; MUBUF-NEXT: v_readlane_b32 s83, v39, 20 +; MUBUF-NEXT: v_readlane_b32 s82, v39, 19 +; MUBUF-NEXT: v_readlane_b32 s81, v39, 18 +; MUBUF-NEXT: v_readlane_b32 s80, v39, 17 +; MUBUF-NEXT: v_readlane_b32 s71, v39, 16 +; MUBUF-NEXT: v_readlane_b32 s70, v39, 15 +; MUBUF-NEXT: v_readlane_b32 s69, v39, 14 +; MUBUF-NEXT: v_readlane_b32 s68, v39, 13 +; MUBUF-NEXT: v_readlane_b32 s67, v39, 12 +; MUBUF-NEXT: v_readlane_b32 s66, v39, 11 +; MUBUF-NEXT: v_readlane_b32 s65, v39, 10 +; MUBUF-NEXT: v_readlane_b32 s64, v39, 9 +; MUBUF-NEXT: v_readlane_b32 s55, v39, 8 +; MUBUF-NEXT: v_readlane_b32 s54, v39, 7 +; MUBUF-NEXT: v_readlane_b32 s53, v39, 6 +; MUBUF-NEXT: v_readlane_b32 s52, v39, 5 +; MUBUF-NEXT: v_readlane_b32 s51, v39, 4 +; MUBUF-NEXT: v_readlane_b32 s50, v39, 3 +; MUBUF-NEXT: v_readlane_b32 s49, v39, 2 +; MUBUF-NEXT: v_readlane_b32 s48, v39, 1 ; MUBUF-NEXT: v_readlane_b32 s39, v39, 0 ; MUBUF-NEXT: s_mov_b32 s32, s33 -; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_readfirstlane_b32 s4, v0 +; MUBUF-NEXT: v_readlane_b32 s4, v39, 32 ; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_load_dword v39, off, s[0:3], s33 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] @@ -1864,139 +1408,75 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 { ; FLATSCR-NEXT: scratch_store_dword off, v39, s33 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: v_writelane_b32 v39, s39, 0 -; FLATSCR-NEXT: v_writelane_b32 v39, s40, 1 -; FLATSCR-NEXT: v_writelane_b32 v39, s41, 2 -; FLATSCR-NEXT: v_writelane_b32 v39, s42, 3 -; FLATSCR-NEXT: v_writelane_b32 v39, s43, 4 -; FLATSCR-NEXT: v_writelane_b32 v39, s44, 5 -; FLATSCR-NEXT: v_writelane_b32 v39, s45, 6 -; FLATSCR-NEXT: v_writelane_b32 v39, s46, 7 -; FLATSCR-NEXT: v_writelane_b32 v39, s47, 8 -; FLATSCR-NEXT: v_writelane_b32 v39, s48, 9 -; FLATSCR-NEXT: v_writelane_b32 v39, s49, 10 -; FLATSCR-NEXT: v_writelane_b32 v39, s50, 11 -; FLATSCR-NEXT: v_writelane_b32 v39, s51, 12 -; FLATSCR-NEXT: v_writelane_b32 v39, s52, 13 -; FLATSCR-NEXT: v_writelane_b32 v39, s53, 14 -; FLATSCR-NEXT: v_writelane_b32 v39, s54, 15 -; FLATSCR-NEXT: v_writelane_b32 v39, s55, 16 -; FLATSCR-NEXT: v_writelane_b32 v39, s56, 17 -; FLATSCR-NEXT: v_writelane_b32 v39, s57, 18 -; FLATSCR-NEXT: v_writelane_b32 v39, s58, 19 -; FLATSCR-NEXT: v_writelane_b32 v39, s59, 20 -; FLATSCR-NEXT: v_writelane_b32 v39, s60, 21 -; FLATSCR-NEXT: v_writelane_b32 v39, s61, 22 -; FLATSCR-NEXT: v_writelane_b32 v39, s62, 23 -; FLATSCR-NEXT: v_writelane_b32 v39, s63, 24 -; FLATSCR-NEXT: v_writelane_b32 v39, s64, 25 -; FLATSCR-NEXT: v_writelane_b32 v39, s65, 26 -; FLATSCR-NEXT: v_writelane_b32 v39, s66, 27 -; FLATSCR-NEXT: v_writelane_b32 v39, s67, 28 -; FLATSCR-NEXT: v_writelane_b32 v39, s68, 29 -; FLATSCR-NEXT: v_writelane_b32 v39, s69, 30 -; FLATSCR-NEXT: v_writelane_b32 v39, s70, 31 -; FLATSCR-NEXT: v_writelane_b32 v39, s71, 32 -; FLATSCR-NEXT: v_writelane_b32 v39, s72, 33 -; FLATSCR-NEXT: v_writelane_b32 v39, s73, 34 -; FLATSCR-NEXT: v_writelane_b32 v39, s74, 35 -; FLATSCR-NEXT: v_writelane_b32 v39, s75, 36 -; FLATSCR-NEXT: v_writelane_b32 v39, s76, 37 -; FLATSCR-NEXT: v_writelane_b32 v39, s77, 38 -; FLATSCR-NEXT: v_writelane_b32 v39, s78, 39 -; FLATSCR-NEXT: v_writelane_b32 v39, s79, 40 -; FLATSCR-NEXT: v_writelane_b32 v39, s80, 41 -; FLATSCR-NEXT: v_writelane_b32 v39, s81, 42 -; FLATSCR-NEXT: v_writelane_b32 v39, s82, 43 -; FLATSCR-NEXT: v_writelane_b32 v39, s83, 44 -; FLATSCR-NEXT: v_writelane_b32 v39, s84, 45 -; FLATSCR-NEXT: v_writelane_b32 v39, s85, 46 -; FLATSCR-NEXT: v_writelane_b32 v39, s86, 47 -; FLATSCR-NEXT: v_writelane_b32 v39, s87, 48 -; FLATSCR-NEXT: v_writelane_b32 v39, s88, 49 -; FLATSCR-NEXT: v_writelane_b32 v39, s89, 50 -; FLATSCR-NEXT: v_writelane_b32 v39, s90, 51 -; FLATSCR-NEXT: v_writelane_b32 v39, s91, 52 -; FLATSCR-NEXT: v_writelane_b32 v39, s92, 53 -; FLATSCR-NEXT: v_writelane_b32 v39, s93, 54 -; FLATSCR-NEXT: v_writelane_b32 v39, s94, 55 -; FLATSCR-NEXT: v_writelane_b32 v39, s95, 56 -; FLATSCR-NEXT: v_writelane_b32 v39, s96, 57 -; FLATSCR-NEXT: v_writelane_b32 v39, s97, 58 -; FLATSCR-NEXT: v_writelane_b32 v39, s98, 59 -; FLATSCR-NEXT: v_writelane_b32 v39, s99, 60 -; FLATSCR-NEXT: v_writelane_b32 v39, s100, 61 -; FLATSCR-NEXT: v_writelane_b32 v39, s101, 62 +; FLATSCR-NEXT: v_writelane_b32 v39, s48, 1 +; FLATSCR-NEXT: v_writelane_b32 v39, s49, 2 +; FLATSCR-NEXT: v_writelane_b32 v39, s50, 3 +; FLATSCR-NEXT: v_writelane_b32 v39, s51, 4 +; FLATSCR-NEXT: v_writelane_b32 v39, s52, 5 +; FLATSCR-NEXT: v_writelane_b32 v39, s53, 6 +; FLATSCR-NEXT: v_writelane_b32 v39, s54, 7 +; FLATSCR-NEXT: v_writelane_b32 v39, s55, 8 +; FLATSCR-NEXT: v_writelane_b32 v39, s64, 9 +; FLATSCR-NEXT: v_writelane_b32 v39, s65, 10 +; FLATSCR-NEXT: v_writelane_b32 v39, s66, 11 +; FLATSCR-NEXT: v_writelane_b32 v39, s67, 12 +; FLATSCR-NEXT: v_writelane_b32 v39, s68, 13 +; FLATSCR-NEXT: v_writelane_b32 v39, s69, 14 +; FLATSCR-NEXT: v_writelane_b32 v39, s70, 15 +; FLATSCR-NEXT: v_writelane_b32 v39, s71, 16 +; FLATSCR-NEXT: v_writelane_b32 v39, s80, 17 +; FLATSCR-NEXT: v_writelane_b32 v39, s81, 18 +; FLATSCR-NEXT: v_writelane_b32 v39, s82, 19 +; FLATSCR-NEXT: v_writelane_b32 v39, s83, 20 +; FLATSCR-NEXT: v_writelane_b32 v39, s84, 21 +; FLATSCR-NEXT: v_writelane_b32 v39, s85, 22 +; FLATSCR-NEXT: v_writelane_b32 v39, s86, 23 +; FLATSCR-NEXT: v_writelane_b32 v39, s87, 24 +; FLATSCR-NEXT: v_writelane_b32 v39, s96, 25 +; FLATSCR-NEXT: v_writelane_b32 v39, s97, 26 +; FLATSCR-NEXT: v_writelane_b32 v39, s98, 27 +; FLATSCR-NEXT: v_writelane_b32 v39, s99, 28 +; FLATSCR-NEXT: v_writelane_b32 v39, s100, 29 +; FLATSCR-NEXT: v_writelane_b32 v39, s101, 30 ; FLATSCR-NEXT: s_add_i32 s32, s32, 8 -; FLATSCR-NEXT: v_writelane_b32 v39, s102, 63 +; FLATSCR-NEXT: v_writelane_b32 v39, s102, 31 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; clobber all VGPRs except CSR v40 ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: v_readlane_b32 s102, v39, 63 -; FLATSCR-NEXT: v_readlane_b32 s101, v39, 62 -; FLATSCR-NEXT: v_readlane_b32 s100, v39, 61 -; FLATSCR-NEXT: v_readlane_b32 s99, v39, 60 -; FLATSCR-NEXT: v_readlane_b32 s98, v39, 59 -; FLATSCR-NEXT: v_readlane_b32 s97, v39, 58 -; FLATSCR-NEXT: v_readlane_b32 s96, v39, 57 -; FLATSCR-NEXT: v_readlane_b32 s95, v39, 56 -; FLATSCR-NEXT: v_readlane_b32 s94, v39, 55 -; FLATSCR-NEXT: v_readlane_b32 s93, v39, 54 -; FLATSCR-NEXT: v_readlane_b32 s92, v39, 53 -; FLATSCR-NEXT: v_readlane_b32 s91, v39, 52 -; FLATSCR-NEXT: v_readlane_b32 s90, v39, 51 -; FLATSCR-NEXT: v_readlane_b32 s89, v39, 50 -; FLATSCR-NEXT: v_readlane_b32 s88, v39, 49 -; FLATSCR-NEXT: v_readlane_b32 s87, v39, 48 -; FLATSCR-NEXT: v_readlane_b32 s86, v39, 47 -; FLATSCR-NEXT: v_readlane_b32 s85, v39, 46 -; FLATSCR-NEXT: v_readlane_b32 s84, v39, 45 -; FLATSCR-NEXT: v_readlane_b32 s83, v39, 44 -; FLATSCR-NEXT: v_readlane_b32 s82, v39, 43 -; FLATSCR-NEXT: v_readlane_b32 s81, v39, 42 -; FLATSCR-NEXT: v_readlane_b32 s80, v39, 41 -; FLATSCR-NEXT: v_readlane_b32 s79, v39, 40 -; FLATSCR-NEXT: v_readlane_b32 s78, v39, 39 -; FLATSCR-NEXT: v_readlane_b32 s77, v39, 38 -; FLATSCR-NEXT: v_readlane_b32 s76, v39, 37 -; FLATSCR-NEXT: v_readlane_b32 s75, v39, 36 -; FLATSCR-NEXT: v_readlane_b32 s74, v39, 35 -; FLATSCR-NEXT: v_readlane_b32 s73, v39, 34 -; FLATSCR-NEXT: v_readlane_b32 s72, v39, 33 -; FLATSCR-NEXT: v_readlane_b32 s71, v39, 32 -; FLATSCR-NEXT: v_readlane_b32 s70, v39, 31 -; FLATSCR-NEXT: v_readlane_b32 s69, v39, 30 -; FLATSCR-NEXT: v_readlane_b32 s68, v39, 29 -; FLATSCR-NEXT: v_readlane_b32 s67, v39, 28 -; FLATSCR-NEXT: v_readlane_b32 s66, v39, 27 -; FLATSCR-NEXT: v_readlane_b32 s65, v39, 26 -; FLATSCR-NEXT: v_readlane_b32 s64, v39, 25 -; FLATSCR-NEXT: v_readlane_b32 s63, v39, 24 -; FLATSCR-NEXT: v_readlane_b32 s62, v39, 23 -; FLATSCR-NEXT: v_readlane_b32 s61, v39, 22 -; FLATSCR-NEXT: v_readlane_b32 s60, v39, 21 -; FLATSCR-NEXT: v_readlane_b32 s59, v39, 20 -; FLATSCR-NEXT: v_readlane_b32 s58, v39, 19 -; FLATSCR-NEXT: v_readlane_b32 s57, v39, 18 -; FLATSCR-NEXT: v_readlane_b32 s56, v39, 17 -; FLATSCR-NEXT: v_readlane_b32 s55, v39, 16 -; FLATSCR-NEXT: v_readlane_b32 s54, v39, 15 -; FLATSCR-NEXT: v_readlane_b32 s53, v39, 14 -; FLATSCR-NEXT: v_readlane_b32 s52, v39, 13 -; FLATSCR-NEXT: v_readlane_b32 s51, v39, 12 -; FLATSCR-NEXT: v_readlane_b32 s50, v39, 11 -; FLATSCR-NEXT: v_readlane_b32 s49, v39, 10 -; FLATSCR-NEXT: v_readlane_b32 s48, v39, 9 -; FLATSCR-NEXT: v_readlane_b32 s47, v39, 8 -; FLATSCR-NEXT: v_readlane_b32 s46, v39, 7 -; FLATSCR-NEXT: v_readlane_b32 s45, v39, 6 -; FLATSCR-NEXT: v_readlane_b32 s44, v39, 5 -; FLATSCR-NEXT: v_readlane_b32 s43, v39, 4 -; FLATSCR-NEXT: v_readlane_b32 s42, v39, 3 -; FLATSCR-NEXT: v_readlane_b32 s41, v39, 2 -; FLATSCR-NEXT: v_readlane_b32 s40, v39, 1 +; FLATSCR-NEXT: v_readlane_b32 s102, v39, 31 +; FLATSCR-NEXT: v_readlane_b32 s101, v39, 30 +; FLATSCR-NEXT: v_readlane_b32 s100, v39, 29 +; FLATSCR-NEXT: v_readlane_b32 s99, v39, 28 +; FLATSCR-NEXT: v_readlane_b32 s98, v39, 27 +; FLATSCR-NEXT: v_readlane_b32 s97, v39, 26 +; FLATSCR-NEXT: v_readlane_b32 s96, v39, 25 +; FLATSCR-NEXT: v_readlane_b32 s87, v39, 24 +; FLATSCR-NEXT: v_readlane_b32 s86, v39, 23 +; FLATSCR-NEXT: v_readlane_b32 s85, v39, 22 +; FLATSCR-NEXT: v_readlane_b32 s84, v39, 21 +; FLATSCR-NEXT: v_readlane_b32 s83, v39, 20 +; FLATSCR-NEXT: v_readlane_b32 s82, v39, 19 +; FLATSCR-NEXT: v_readlane_b32 s81, v39, 18 +; FLATSCR-NEXT: v_readlane_b32 s80, v39, 17 +; FLATSCR-NEXT: v_readlane_b32 s71, v39, 16 +; FLATSCR-NEXT: v_readlane_b32 s70, v39, 15 +; FLATSCR-NEXT: v_readlane_b32 s69, v39, 14 +; FLATSCR-NEXT: v_readlane_b32 s68, v39, 13 +; FLATSCR-NEXT: v_readlane_b32 s67, v39, 12 +; FLATSCR-NEXT: v_readlane_b32 s66, v39, 11 +; FLATSCR-NEXT: v_readlane_b32 s65, v39, 10 +; FLATSCR-NEXT: v_readlane_b32 s64, v39, 9 +; FLATSCR-NEXT: v_readlane_b32 s55, v39, 8 +; FLATSCR-NEXT: v_readlane_b32 s54, v39, 7 +; FLATSCR-NEXT: v_readlane_b32 s53, v39, 6 +; FLATSCR-NEXT: v_readlane_b32 s52, v39, 5 +; FLATSCR-NEXT: v_readlane_b32 s51, v39, 4 +; FLATSCR-NEXT: v_readlane_b32 s50, v39, 3 +; FLATSCR-NEXT: v_readlane_b32 s49, v39, 2 +; FLATSCR-NEXT: v_readlane_b32 s48, v39, 1 ; FLATSCR-NEXT: v_readlane_b32 s39, v39, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 @@ -2037,149 +1517,83 @@ define void @callee_need_to_spill_fp_to_reg() #1 { ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v40, s4, 32 ; MUBUF-NEXT: v_writelane_b32 v40, s39, 0 -; MUBUF-NEXT: v_writelane_b32 v40, s40, 1 -; MUBUF-NEXT: v_writelane_b32 v40, s41, 2 -; MUBUF-NEXT: v_writelane_b32 v40, s42, 3 -; MUBUF-NEXT: v_writelane_b32 v40, s43, 4 -; MUBUF-NEXT: v_writelane_b32 v40, s44, 5 -; MUBUF-NEXT: v_writelane_b32 v40, s45, 6 -; MUBUF-NEXT: v_writelane_b32 v40, s46, 7 -; MUBUF-NEXT: v_writelane_b32 v40, s47, 8 -; MUBUF-NEXT: v_writelane_b32 v40, s48, 9 -; MUBUF-NEXT: v_writelane_b32 v40, s49, 10 -; MUBUF-NEXT: v_writelane_b32 v40, s50, 11 -; MUBUF-NEXT: v_writelane_b32 v40, s51, 12 -; MUBUF-NEXT: v_writelane_b32 v40, s52, 13 -; MUBUF-NEXT: v_writelane_b32 v40, s53, 14 -; MUBUF-NEXT: v_writelane_b32 v40, s54, 15 -; MUBUF-NEXT: v_writelane_b32 v40, s55, 16 -; MUBUF-NEXT: v_writelane_b32 v40, s56, 17 -; MUBUF-NEXT: v_writelane_b32 v40, s57, 18 -; MUBUF-NEXT: v_writelane_b32 v40, s58, 19 -; MUBUF-NEXT: v_writelane_b32 v40, s59, 20 -; MUBUF-NEXT: v_writelane_b32 v40, s60, 21 -; MUBUF-NEXT: v_writelane_b32 v40, s61, 22 -; MUBUF-NEXT: v_writelane_b32 v40, s62, 23 -; MUBUF-NEXT: v_writelane_b32 v40, s63, 24 -; MUBUF-NEXT: v_writelane_b32 v40, s64, 25 -; MUBUF-NEXT: v_writelane_b32 v40, s65, 26 -; MUBUF-NEXT: v_writelane_b32 v40, s66, 27 -; MUBUF-NEXT: v_writelane_b32 v40, s67, 28 -; MUBUF-NEXT: v_writelane_b32 v40, s68, 29 -; MUBUF-NEXT: v_writelane_b32 v40, s69, 30 -; MUBUF-NEXT: v_writelane_b32 v40, s70, 31 -; MUBUF-NEXT: v_writelane_b32 v40, s71, 32 -; MUBUF-NEXT: v_writelane_b32 v40, s72, 33 -; MUBUF-NEXT: v_writelane_b32 v40, s73, 34 -; MUBUF-NEXT: v_writelane_b32 v40, s74, 35 -; MUBUF-NEXT: v_writelane_b32 v40, s75, 36 -; MUBUF-NEXT: v_writelane_b32 v40, s76, 37 -; MUBUF-NEXT: v_writelane_b32 v40, s77, 38 -; MUBUF-NEXT: v_writelane_b32 v40, s78, 39 -; MUBUF-NEXT: v_writelane_b32 v40, s79, 40 -; MUBUF-NEXT: v_writelane_b32 v40, s80, 41 -; MUBUF-NEXT: v_writelane_b32 v40, s81, 42 -; MUBUF-NEXT: v_writelane_b32 v40, s82, 43 -; MUBUF-NEXT: v_writelane_b32 v40, s83, 44 -; MUBUF-NEXT: v_writelane_b32 v40, s84, 45 -; MUBUF-NEXT: v_writelane_b32 v40, s85, 46 -; MUBUF-NEXT: v_writelane_b32 v40, s86, 47 -; MUBUF-NEXT: v_writelane_b32 v40, s87, 48 -; MUBUF-NEXT: v_writelane_b32 v40, s88, 49 -; MUBUF-NEXT: v_writelane_b32 v40, s89, 50 -; MUBUF-NEXT: v_writelane_b32 v40, s90, 51 -; MUBUF-NEXT: v_writelane_b32 v40, s91, 52 -; MUBUF-NEXT: v_writelane_b32 v40, s92, 53 -; MUBUF-NEXT: v_writelane_b32 v40, s93, 54 -; MUBUF-NEXT: v_writelane_b32 v40, s94, 55 -; MUBUF-NEXT: v_writelane_b32 v40, s95, 56 -; MUBUF-NEXT: v_writelane_b32 v40, s96, 57 -; MUBUF-NEXT: v_writelane_b32 v40, s97, 58 -; MUBUF-NEXT: v_writelane_b32 v40, s98, 59 -; MUBUF-NEXT: v_writelane_b32 v40, s99, 60 -; MUBUF-NEXT: v_writelane_b32 v40, s100, 61 -; MUBUF-NEXT: v_writelane_b32 v40, s101, 62 -; MUBUF-NEXT: v_writelane_b32 v41, s4, 0 -; MUBUF-NEXT: s_addk_i32 s32, 0x300 -; MUBUF-NEXT: v_writelane_b32 v40, s102, 63 +; MUBUF-NEXT: v_writelane_b32 v40, s48, 1 +; MUBUF-NEXT: v_writelane_b32 v40, s49, 2 +; MUBUF-NEXT: v_writelane_b32 v40, s50, 3 +; MUBUF-NEXT: v_writelane_b32 v40, s51, 4 +; MUBUF-NEXT: v_writelane_b32 v40, s52, 5 +; MUBUF-NEXT: v_writelane_b32 v40, s53, 6 +; MUBUF-NEXT: v_writelane_b32 v40, s54, 7 +; MUBUF-NEXT: v_writelane_b32 v40, s55, 8 +; MUBUF-NEXT: v_writelane_b32 v40, s64, 9 +; MUBUF-NEXT: v_writelane_b32 v40, s65, 10 +; MUBUF-NEXT: v_writelane_b32 v40, s66, 11 +; MUBUF-NEXT: v_writelane_b32 v40, s67, 12 +; MUBUF-NEXT: v_writelane_b32 v40, s68, 13 +; MUBUF-NEXT: v_writelane_b32 v40, s69, 14 +; MUBUF-NEXT: v_writelane_b32 v40, s70, 15 +; MUBUF-NEXT: v_writelane_b32 v40, s71, 16 +; MUBUF-NEXT: v_writelane_b32 v40, s80, 17 +; MUBUF-NEXT: v_writelane_b32 v40, s81, 18 +; MUBUF-NEXT: v_writelane_b32 v40, s82, 19 +; MUBUF-NEXT: v_writelane_b32 v40, s83, 20 +; MUBUF-NEXT: v_writelane_b32 v40, s84, 21 +; MUBUF-NEXT: v_writelane_b32 v40, s85, 22 +; MUBUF-NEXT: v_writelane_b32 v40, s86, 23 +; MUBUF-NEXT: v_writelane_b32 v40, s87, 24 +; MUBUF-NEXT: v_writelane_b32 v40, s96, 25 +; MUBUF-NEXT: v_writelane_b32 v40, s97, 26 +; MUBUF-NEXT: v_writelane_b32 v40, s98, 27 +; MUBUF-NEXT: v_writelane_b32 v40, s99, 28 +; MUBUF-NEXT: v_writelane_b32 v40, s100, 29 +; MUBUF-NEXT: v_writelane_b32 v40, s101, 30 +; MUBUF-NEXT: s_addk_i32 s32, 0x200 +; MUBUF-NEXT: v_writelane_b32 v40, s102, 31 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber all VGPRs except CSR v40 ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: v_readlane_b32 s102, v40, 63 -; MUBUF-NEXT: v_readlane_b32 s101, v40, 62 -; MUBUF-NEXT: v_readlane_b32 s100, v40, 61 -; MUBUF-NEXT: v_readlane_b32 s99, v40, 60 -; MUBUF-NEXT: v_readlane_b32 s98, v40, 59 -; MUBUF-NEXT: v_readlane_b32 s97, v40, 58 -; MUBUF-NEXT: v_readlane_b32 s96, v40, 57 -; MUBUF-NEXT: v_readlane_b32 s95, v40, 56 -; MUBUF-NEXT: v_readlane_b32 s94, v40, 55 -; MUBUF-NEXT: v_readlane_b32 s93, v40, 54 -; MUBUF-NEXT: v_readlane_b32 s92, v40, 53 -; MUBUF-NEXT: v_readlane_b32 s91, v40, 52 -; MUBUF-NEXT: v_readlane_b32 s90, v40, 51 -; MUBUF-NEXT: v_readlane_b32 s89, v40, 50 -; MUBUF-NEXT: v_readlane_b32 s88, v40, 49 -; MUBUF-NEXT: v_readlane_b32 s87, v40, 48 -; MUBUF-NEXT: v_readlane_b32 s86, v40, 47 -; MUBUF-NEXT: v_readlane_b32 s85, v40, 46 -; MUBUF-NEXT: v_readlane_b32 s84, v40, 45 -; MUBUF-NEXT: v_readlane_b32 s83, v40, 44 -; MUBUF-NEXT: v_readlane_b32 s82, v40, 43 -; MUBUF-NEXT: v_readlane_b32 s81, v40, 42 -; MUBUF-NEXT: v_readlane_b32 s80, v40, 41 -; MUBUF-NEXT: v_readlane_b32 s79, v40, 40 -; MUBUF-NEXT: v_readlane_b32 s78, v40, 39 -; MUBUF-NEXT: v_readlane_b32 s77, v40, 38 -; MUBUF-NEXT: v_readlane_b32 s76, v40, 37 -; MUBUF-NEXT: v_readlane_b32 s75, v40, 36 -; MUBUF-NEXT: v_readlane_b32 s74, v40, 35 -; MUBUF-NEXT: v_readlane_b32 s73, v40, 34 -; MUBUF-NEXT: v_readlane_b32 s72, v40, 33 -; MUBUF-NEXT: v_readlane_b32 s71, v40, 32 -; MUBUF-NEXT: v_readlane_b32 s70, v40, 31 -; MUBUF-NEXT: v_readlane_b32 s69, v40, 30 -; MUBUF-NEXT: v_readlane_b32 s68, v40, 29 -; MUBUF-NEXT: v_readlane_b32 s67, v40, 28 -; MUBUF-NEXT: v_readlane_b32 s66, v40, 27 -; MUBUF-NEXT: v_readlane_b32 s65, v40, 26 -; MUBUF-NEXT: v_readlane_b32 s64, v40, 25 -; MUBUF-NEXT: v_readlane_b32 s63, v40, 24 -; MUBUF-NEXT: v_readlane_b32 s62, v40, 23 -; MUBUF-NEXT: v_readlane_b32 s61, v40, 22 -; MUBUF-NEXT: v_readlane_b32 s60, v40, 21 -; MUBUF-NEXT: v_readlane_b32 s59, v40, 20 -; MUBUF-NEXT: v_readlane_b32 s58, v40, 19 -; MUBUF-NEXT: v_readlane_b32 s57, v40, 18 -; MUBUF-NEXT: v_readlane_b32 s56, v40, 17 -; MUBUF-NEXT: v_readlane_b32 s55, v40, 16 -; MUBUF-NEXT: v_readlane_b32 s54, v40, 15 -; MUBUF-NEXT: v_readlane_b32 s53, v40, 14 -; MUBUF-NEXT: v_readlane_b32 s52, v40, 13 -; MUBUF-NEXT: v_readlane_b32 s51, v40, 12 -; MUBUF-NEXT: v_readlane_b32 s50, v40, 11 -; MUBUF-NEXT: v_readlane_b32 s49, v40, 10 -; MUBUF-NEXT: v_readlane_b32 s48, v40, 9 -; MUBUF-NEXT: v_readlane_b32 s47, v40, 8 -; MUBUF-NEXT: v_readlane_b32 s46, v40, 7 -; MUBUF-NEXT: v_readlane_b32 s45, v40, 6 -; MUBUF-NEXT: v_readlane_b32 s44, v40, 5 -; MUBUF-NEXT: v_readlane_b32 s43, v40, 4 -; MUBUF-NEXT: v_readlane_b32 s42, v40, 3 -; MUBUF-NEXT: v_readlane_b32 s41, v40, 2 -; MUBUF-NEXT: v_readlane_b32 s40, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s102, v40, 31 +; MUBUF-NEXT: v_readlane_b32 s101, v40, 30 +; MUBUF-NEXT: v_readlane_b32 s100, v40, 29 +; MUBUF-NEXT: v_readlane_b32 s99, v40, 28 +; MUBUF-NEXT: v_readlane_b32 s98, v40, 27 +; MUBUF-NEXT: v_readlane_b32 s97, v40, 26 +; MUBUF-NEXT: v_readlane_b32 s96, v40, 25 +; MUBUF-NEXT: v_readlane_b32 s87, v40, 24 +; MUBUF-NEXT: v_readlane_b32 s86, v40, 23 +; MUBUF-NEXT: v_readlane_b32 s85, v40, 22 +; MUBUF-NEXT: v_readlane_b32 s84, v40, 21 +; MUBUF-NEXT: v_readlane_b32 s83, v40, 20 +; MUBUF-NEXT: v_readlane_b32 s82, v40, 19 +; MUBUF-NEXT: v_readlane_b32 s81, v40, 18 +; MUBUF-NEXT: v_readlane_b32 s80, v40, 17 +; MUBUF-NEXT: v_readlane_b32 s71, v40, 16 +; MUBUF-NEXT: v_readlane_b32 s70, v40, 15 +; MUBUF-NEXT: v_readlane_b32 s69, v40, 14 +; MUBUF-NEXT: v_readlane_b32 s68, v40, 13 +; MUBUF-NEXT: v_readlane_b32 s67, v40, 12 +; MUBUF-NEXT: v_readlane_b32 s66, v40, 11 +; MUBUF-NEXT: v_readlane_b32 s65, v40, 10 +; MUBUF-NEXT: v_readlane_b32 s64, v40, 9 +; MUBUF-NEXT: v_readlane_b32 s55, v40, 8 +; MUBUF-NEXT: v_readlane_b32 s54, v40, 7 +; MUBUF-NEXT: v_readlane_b32 s53, v40, 6 +; MUBUF-NEXT: v_readlane_b32 s52, v40, 5 +; MUBUF-NEXT: v_readlane_b32 s51, v40, 4 +; MUBUF-NEXT: v_readlane_b32 s50, v40, 3 +; MUBUF-NEXT: v_readlane_b32 s49, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s48, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s39, v40, 0 ; MUBUF-NEXT: s_mov_b32 s32, s33 -; MUBUF-NEXT: v_readlane_b32 s4, v41, 0 +; MUBUF-NEXT: v_readlane_b32 s4, v40, 32 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: s_mov_b32 s33, s4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -2194,139 +1608,75 @@ define void @callee_need_to_spill_fp_to_reg() #1 { ; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: v_writelane_b32 v40, s39, 0 -; FLATSCR-NEXT: v_writelane_b32 v40, s40, 1 -; FLATSCR-NEXT: v_writelane_b32 v40, s41, 2 -; FLATSCR-NEXT: v_writelane_b32 v40, s42, 3 -; FLATSCR-NEXT: v_writelane_b32 v40, s43, 4 -; FLATSCR-NEXT: v_writelane_b32 v40, s44, 5 -; FLATSCR-NEXT: v_writelane_b32 v40, s45, 6 -; FLATSCR-NEXT: v_writelane_b32 v40, s46, 7 -; FLATSCR-NEXT: v_writelane_b32 v40, s47, 8 -; FLATSCR-NEXT: v_writelane_b32 v40, s48, 9 -; FLATSCR-NEXT: v_writelane_b32 v40, s49, 10 -; FLATSCR-NEXT: v_writelane_b32 v40, s50, 11 -; FLATSCR-NEXT: v_writelane_b32 v40, s51, 12 -; FLATSCR-NEXT: v_writelane_b32 v40, s52, 13 -; FLATSCR-NEXT: v_writelane_b32 v40, s53, 14 -; FLATSCR-NEXT: v_writelane_b32 v40, s54, 15 -; FLATSCR-NEXT: v_writelane_b32 v40, s55, 16 -; FLATSCR-NEXT: v_writelane_b32 v40, s56, 17 -; FLATSCR-NEXT: v_writelane_b32 v40, s57, 18 -; FLATSCR-NEXT: v_writelane_b32 v40, s58, 19 -; FLATSCR-NEXT: v_writelane_b32 v40, s59, 20 -; FLATSCR-NEXT: v_writelane_b32 v40, s60, 21 -; FLATSCR-NEXT: v_writelane_b32 v40, s61, 22 -; FLATSCR-NEXT: v_writelane_b32 v40, s62, 23 -; FLATSCR-NEXT: v_writelane_b32 v40, s63, 24 -; FLATSCR-NEXT: v_writelane_b32 v40, s64, 25 -; FLATSCR-NEXT: v_writelane_b32 v40, s65, 26 -; FLATSCR-NEXT: v_writelane_b32 v40, s66, 27 -; FLATSCR-NEXT: v_writelane_b32 v40, s67, 28 -; FLATSCR-NEXT: v_writelane_b32 v40, s68, 29 -; FLATSCR-NEXT: v_writelane_b32 v40, s69, 30 -; FLATSCR-NEXT: v_writelane_b32 v40, s70, 31 -; FLATSCR-NEXT: v_writelane_b32 v40, s71, 32 -; FLATSCR-NEXT: v_writelane_b32 v40, s72, 33 -; FLATSCR-NEXT: v_writelane_b32 v40, s73, 34 -; FLATSCR-NEXT: v_writelane_b32 v40, s74, 35 -; FLATSCR-NEXT: v_writelane_b32 v40, s75, 36 -; FLATSCR-NEXT: v_writelane_b32 v40, s76, 37 -; FLATSCR-NEXT: v_writelane_b32 v40, s77, 38 -; FLATSCR-NEXT: v_writelane_b32 v40, s78, 39 -; FLATSCR-NEXT: v_writelane_b32 v40, s79, 40 -; FLATSCR-NEXT: v_writelane_b32 v40, s80, 41 -; FLATSCR-NEXT: v_writelane_b32 v40, s81, 42 -; FLATSCR-NEXT: v_writelane_b32 v40, s82, 43 -; FLATSCR-NEXT: v_writelane_b32 v40, s83, 44 -; FLATSCR-NEXT: v_writelane_b32 v40, s84, 45 -; FLATSCR-NEXT: v_writelane_b32 v40, s85, 46 -; FLATSCR-NEXT: v_writelane_b32 v40, s86, 47 -; FLATSCR-NEXT: v_writelane_b32 v40, s87, 48 -; FLATSCR-NEXT: v_writelane_b32 v40, s88, 49 -; FLATSCR-NEXT: v_writelane_b32 v40, s89, 50 -; FLATSCR-NEXT: v_writelane_b32 v40, s90, 51 -; FLATSCR-NEXT: v_writelane_b32 v40, s91, 52 -; FLATSCR-NEXT: v_writelane_b32 v40, s92, 53 -; FLATSCR-NEXT: v_writelane_b32 v40, s93, 54 -; FLATSCR-NEXT: v_writelane_b32 v40, s94, 55 -; FLATSCR-NEXT: v_writelane_b32 v40, s95, 56 -; FLATSCR-NEXT: v_writelane_b32 v40, s96, 57 -; FLATSCR-NEXT: v_writelane_b32 v40, s97, 58 -; FLATSCR-NEXT: v_writelane_b32 v40, s98, 59 -; FLATSCR-NEXT: v_writelane_b32 v40, s99, 60 -; FLATSCR-NEXT: v_writelane_b32 v40, s100, 61 -; FLATSCR-NEXT: v_writelane_b32 v40, s101, 62 +; FLATSCR-NEXT: v_writelane_b32 v40, s48, 1 +; FLATSCR-NEXT: v_writelane_b32 v40, s49, 2 +; FLATSCR-NEXT: v_writelane_b32 v40, s50, 3 +; FLATSCR-NEXT: v_writelane_b32 v40, s51, 4 +; FLATSCR-NEXT: v_writelane_b32 v40, s52, 5 +; FLATSCR-NEXT: v_writelane_b32 v40, s53, 6 +; FLATSCR-NEXT: v_writelane_b32 v40, s54, 7 +; FLATSCR-NEXT: v_writelane_b32 v40, s55, 8 +; FLATSCR-NEXT: v_writelane_b32 v40, s64, 9 +; FLATSCR-NEXT: v_writelane_b32 v40, s65, 10 +; FLATSCR-NEXT: v_writelane_b32 v40, s66, 11 +; FLATSCR-NEXT: v_writelane_b32 v40, s67, 12 +; FLATSCR-NEXT: v_writelane_b32 v40, s68, 13 +; FLATSCR-NEXT: v_writelane_b32 v40, s69, 14 +; FLATSCR-NEXT: v_writelane_b32 v40, s70, 15 +; FLATSCR-NEXT: v_writelane_b32 v40, s71, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s80, 17 +; FLATSCR-NEXT: v_writelane_b32 v40, s81, 18 +; FLATSCR-NEXT: v_writelane_b32 v40, s82, 19 +; FLATSCR-NEXT: v_writelane_b32 v40, s83, 20 +; FLATSCR-NEXT: v_writelane_b32 v40, s84, 21 +; FLATSCR-NEXT: v_writelane_b32 v40, s85, 22 +; FLATSCR-NEXT: v_writelane_b32 v40, s86, 23 +; FLATSCR-NEXT: v_writelane_b32 v40, s87, 24 +; FLATSCR-NEXT: v_writelane_b32 v40, s96, 25 +; FLATSCR-NEXT: v_writelane_b32 v40, s97, 26 +; FLATSCR-NEXT: v_writelane_b32 v40, s98, 27 +; FLATSCR-NEXT: v_writelane_b32 v40, s99, 28 +; FLATSCR-NEXT: v_writelane_b32 v40, s100, 29 +; FLATSCR-NEXT: v_writelane_b32 v40, s101, 30 ; FLATSCR-NEXT: s_add_i32 s32, s32, 8 -; FLATSCR-NEXT: v_writelane_b32 v40, s102, 63 +; FLATSCR-NEXT: v_writelane_b32 v40, s102, 31 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; clobber all VGPRs except CSR v40 ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: v_readlane_b32 s102, v40, 63 -; FLATSCR-NEXT: v_readlane_b32 s101, v40, 62 -; FLATSCR-NEXT: v_readlane_b32 s100, v40, 61 -; FLATSCR-NEXT: v_readlane_b32 s99, v40, 60 -; FLATSCR-NEXT: v_readlane_b32 s98, v40, 59 -; FLATSCR-NEXT: v_readlane_b32 s97, v40, 58 -; FLATSCR-NEXT: v_readlane_b32 s96, v40, 57 -; FLATSCR-NEXT: v_readlane_b32 s95, v40, 56 -; FLATSCR-NEXT: v_readlane_b32 s94, v40, 55 -; FLATSCR-NEXT: v_readlane_b32 s93, v40, 54 -; FLATSCR-NEXT: v_readlane_b32 s92, v40, 53 -; FLATSCR-NEXT: v_readlane_b32 s91, v40, 52 -; FLATSCR-NEXT: v_readlane_b32 s90, v40, 51 -; FLATSCR-NEXT: v_readlane_b32 s89, v40, 50 -; FLATSCR-NEXT: v_readlane_b32 s88, v40, 49 -; FLATSCR-NEXT: v_readlane_b32 s87, v40, 48 -; FLATSCR-NEXT: v_readlane_b32 s86, v40, 47 -; FLATSCR-NEXT: v_readlane_b32 s85, v40, 46 -; FLATSCR-NEXT: v_readlane_b32 s84, v40, 45 -; FLATSCR-NEXT: v_readlane_b32 s83, v40, 44 -; FLATSCR-NEXT: v_readlane_b32 s82, v40, 43 -; FLATSCR-NEXT: v_readlane_b32 s81, v40, 42 -; FLATSCR-NEXT: v_readlane_b32 s80, v40, 41 -; FLATSCR-NEXT: v_readlane_b32 s79, v40, 40 -; FLATSCR-NEXT: v_readlane_b32 s78, v40, 39 -; FLATSCR-NEXT: v_readlane_b32 s77, v40, 38 -; FLATSCR-NEXT: v_readlane_b32 s76, v40, 37 -; FLATSCR-NEXT: v_readlane_b32 s75, v40, 36 -; FLATSCR-NEXT: v_readlane_b32 s74, v40, 35 -; FLATSCR-NEXT: v_readlane_b32 s73, v40, 34 -; FLATSCR-NEXT: v_readlane_b32 s72, v40, 33 -; FLATSCR-NEXT: v_readlane_b32 s71, v40, 32 -; FLATSCR-NEXT: v_readlane_b32 s70, v40, 31 -; FLATSCR-NEXT: v_readlane_b32 s69, v40, 30 -; FLATSCR-NEXT: v_readlane_b32 s68, v40, 29 -; FLATSCR-NEXT: v_readlane_b32 s67, v40, 28 -; FLATSCR-NEXT: v_readlane_b32 s66, v40, 27 -; FLATSCR-NEXT: v_readlane_b32 s65, v40, 26 -; FLATSCR-NEXT: v_readlane_b32 s64, v40, 25 -; FLATSCR-NEXT: v_readlane_b32 s63, v40, 24 -; FLATSCR-NEXT: v_readlane_b32 s62, v40, 23 -; FLATSCR-NEXT: v_readlane_b32 s61, v40, 22 -; FLATSCR-NEXT: v_readlane_b32 s60, v40, 21 -; FLATSCR-NEXT: v_readlane_b32 s59, v40, 20 -; FLATSCR-NEXT: v_readlane_b32 s58, v40, 19 -; FLATSCR-NEXT: v_readlane_b32 s57, v40, 18 -; FLATSCR-NEXT: v_readlane_b32 s56, v40, 17 -; FLATSCR-NEXT: v_readlane_b32 s55, v40, 16 -; FLATSCR-NEXT: v_readlane_b32 s54, v40, 15 -; FLATSCR-NEXT: v_readlane_b32 s53, v40, 14 -; FLATSCR-NEXT: v_readlane_b32 s52, v40, 13 -; FLATSCR-NEXT: v_readlane_b32 s51, v40, 12 -; FLATSCR-NEXT: v_readlane_b32 s50, v40, 11 -; FLATSCR-NEXT: v_readlane_b32 s49, v40, 10 -; FLATSCR-NEXT: v_readlane_b32 s48, v40, 9 -; FLATSCR-NEXT: v_readlane_b32 s47, v40, 8 -; FLATSCR-NEXT: v_readlane_b32 s46, v40, 7 -; FLATSCR-NEXT: v_readlane_b32 s45, v40, 6 -; FLATSCR-NEXT: v_readlane_b32 s44, v40, 5 -; FLATSCR-NEXT: v_readlane_b32 s43, v40, 4 -; FLATSCR-NEXT: v_readlane_b32 s42, v40, 3 -; FLATSCR-NEXT: v_readlane_b32 s41, v40, 2 -; FLATSCR-NEXT: v_readlane_b32 s40, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s102, v40, 31 +; FLATSCR-NEXT: v_readlane_b32 s101, v40, 30 +; FLATSCR-NEXT: v_readlane_b32 s100, v40, 29 +; FLATSCR-NEXT: v_readlane_b32 s99, v40, 28 +; FLATSCR-NEXT: v_readlane_b32 s98, v40, 27 +; FLATSCR-NEXT: v_readlane_b32 s97, v40, 26 +; FLATSCR-NEXT: v_readlane_b32 s96, v40, 25 +; FLATSCR-NEXT: v_readlane_b32 s87, v40, 24 +; FLATSCR-NEXT: v_readlane_b32 s86, v40, 23 +; FLATSCR-NEXT: v_readlane_b32 s85, v40, 22 +; FLATSCR-NEXT: v_readlane_b32 s84, v40, 21 +; FLATSCR-NEXT: v_readlane_b32 s83, v40, 20 +; FLATSCR-NEXT: v_readlane_b32 s82, v40, 19 +; FLATSCR-NEXT: v_readlane_b32 s81, v40, 18 +; FLATSCR-NEXT: v_readlane_b32 s80, v40, 17 +; FLATSCR-NEXT: v_readlane_b32 s71, v40, 16 +; FLATSCR-NEXT: v_readlane_b32 s70, v40, 15 +; FLATSCR-NEXT: v_readlane_b32 s69, v40, 14 +; FLATSCR-NEXT: v_readlane_b32 s68, v40, 13 +; FLATSCR-NEXT: v_readlane_b32 s67, v40, 12 +; FLATSCR-NEXT: v_readlane_b32 s66, v40, 11 +; FLATSCR-NEXT: v_readlane_b32 s65, v40, 10 +; FLATSCR-NEXT: v_readlane_b32 s64, v40, 9 +; FLATSCR-NEXT: v_readlane_b32 s55, v40, 8 +; FLATSCR-NEXT: v_readlane_b32 s54, v40, 7 +; FLATSCR-NEXT: v_readlane_b32 s53, v40, 6 +; FLATSCR-NEXT: v_readlane_b32 s52, v40, 5 +; FLATSCR-NEXT: v_readlane_b32 s51, v40, 4 +; FLATSCR-NEXT: v_readlane_b32 s50, v40, 3 +; FLATSCR-NEXT: v_readlane_b32 s49, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s48, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s39, v40, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -2367,75 +1717,42 @@ define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5) ; MUBUF-NEXT: s_add_i32 s5, s33, 0x40100 ; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v39, s4, 32 ; MUBUF-NEXT: v_writelane_b32 v39, s39, 0 -; MUBUF-NEXT: v_writelane_b32 v39, s40, 1 -; MUBUF-NEXT: v_writelane_b32 v39, s41, 2 -; MUBUF-NEXT: v_writelane_b32 v39, s42, 3 -; MUBUF-NEXT: v_writelane_b32 v39, s43, 4 -; MUBUF-NEXT: v_writelane_b32 v39, s44, 5 -; MUBUF-NEXT: v_writelane_b32 v39, s45, 6 -; MUBUF-NEXT: v_writelane_b32 v39, s46, 7 -; MUBUF-NEXT: v_writelane_b32 v39, s47, 8 -; MUBUF-NEXT: v_writelane_b32 v39, s48, 9 -; MUBUF-NEXT: v_writelane_b32 v39, s49, 10 -; MUBUF-NEXT: v_writelane_b32 v39, s50, 11 -; MUBUF-NEXT: v_writelane_b32 v39, s51, 12 -; MUBUF-NEXT: v_writelane_b32 v39, s52, 13 -; MUBUF-NEXT: v_writelane_b32 v39, s53, 14 -; MUBUF-NEXT: v_writelane_b32 v39, s54, 15 -; MUBUF-NEXT: v_writelane_b32 v39, s55, 16 -; MUBUF-NEXT: v_writelane_b32 v39, s56, 17 -; MUBUF-NEXT: v_writelane_b32 v39, s57, 18 -; MUBUF-NEXT: v_writelane_b32 v39, s58, 19 -; MUBUF-NEXT: v_writelane_b32 v39, s59, 20 -; MUBUF-NEXT: v_writelane_b32 v39, s60, 21 -; MUBUF-NEXT: v_writelane_b32 v39, s61, 22 -; MUBUF-NEXT: v_writelane_b32 v39, s62, 23 -; MUBUF-NEXT: v_writelane_b32 v39, s63, 24 -; MUBUF-NEXT: v_writelane_b32 v39, s64, 25 -; MUBUF-NEXT: v_writelane_b32 v39, s65, 26 -; MUBUF-NEXT: v_writelane_b32 v39, s66, 27 -; MUBUF-NEXT: v_writelane_b32 v39, s67, 28 -; MUBUF-NEXT: v_writelane_b32 v39, s68, 29 -; MUBUF-NEXT: v_writelane_b32 v39, s69, 30 -; MUBUF-NEXT: v_writelane_b32 v39, s70, 31 -; MUBUF-NEXT: v_writelane_b32 v39, s71, 32 -; MUBUF-NEXT: v_writelane_b32 v39, s72, 33 -; MUBUF-NEXT: v_writelane_b32 v39, s73, 34 -; MUBUF-NEXT: v_writelane_b32 v39, s74, 35 -; MUBUF-NEXT: v_writelane_b32 v39, s75, 36 -; MUBUF-NEXT: v_writelane_b32 v39, s76, 37 -; MUBUF-NEXT: v_writelane_b32 v39, s77, 38 -; MUBUF-NEXT: v_writelane_b32 v39, s78, 39 -; MUBUF-NEXT: v_writelane_b32 v39, s79, 40 -; MUBUF-NEXT: v_writelane_b32 v39, s80, 41 -; MUBUF-NEXT: v_writelane_b32 v39, s81, 42 -; MUBUF-NEXT: v_writelane_b32 v39, s82, 43 -; MUBUF-NEXT: v_writelane_b32 v39, s83, 44 -; MUBUF-NEXT: v_writelane_b32 v39, s84, 45 -; MUBUF-NEXT: v_writelane_b32 v39, s85, 46 -; MUBUF-NEXT: v_writelane_b32 v39, s86, 47 -; MUBUF-NEXT: v_writelane_b32 v39, s87, 48 -; MUBUF-NEXT: v_writelane_b32 v39, s88, 49 -; MUBUF-NEXT: v_writelane_b32 v39, s89, 50 -; MUBUF-NEXT: v_writelane_b32 v39, s90, 51 -; MUBUF-NEXT: v_writelane_b32 v39, s91, 52 -; MUBUF-NEXT: v_writelane_b32 v39, s92, 53 -; MUBUF-NEXT: v_writelane_b32 v39, s93, 54 -; MUBUF-NEXT: v_writelane_b32 v39, s94, 55 -; MUBUF-NEXT: v_writelane_b32 v39, s95, 56 -; MUBUF-NEXT: v_writelane_b32 v39, s96, 57 -; MUBUF-NEXT: v_writelane_b32 v39, s97, 58 -; MUBUF-NEXT: v_writelane_b32 v39, s98, 59 -; MUBUF-NEXT: v_writelane_b32 v39, s99, 60 -; MUBUF-NEXT: v_mov_b32_e32 v0, s4 -; MUBUF-NEXT: s_add_i32 s5, s33, 0x40200 -; MUBUF-NEXT: v_writelane_b32 v39, s100, 61 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; MUBUF-NEXT: v_writelane_b32 v39, s101, 62 +; MUBUF-NEXT: v_writelane_b32 v39, s48, 1 +; MUBUF-NEXT: v_writelane_b32 v39, s49, 2 +; MUBUF-NEXT: v_writelane_b32 v39, s50, 3 +; MUBUF-NEXT: v_writelane_b32 v39, s51, 4 +; MUBUF-NEXT: v_writelane_b32 v39, s52, 5 +; MUBUF-NEXT: v_writelane_b32 v39, s53, 6 +; MUBUF-NEXT: v_writelane_b32 v39, s54, 7 +; MUBUF-NEXT: v_writelane_b32 v39, s55, 8 +; MUBUF-NEXT: v_writelane_b32 v39, s64, 9 +; MUBUF-NEXT: v_writelane_b32 v39, s65, 10 +; MUBUF-NEXT: v_writelane_b32 v39, s66, 11 +; MUBUF-NEXT: v_writelane_b32 v39, s67, 12 +; MUBUF-NEXT: v_writelane_b32 v39, s68, 13 +; MUBUF-NEXT: v_writelane_b32 v39, s69, 14 +; MUBUF-NEXT: v_writelane_b32 v39, s70, 15 +; MUBUF-NEXT: v_writelane_b32 v39, s71, 16 +; MUBUF-NEXT: v_writelane_b32 v39, s80, 17 +; MUBUF-NEXT: v_writelane_b32 v39, s81, 18 +; MUBUF-NEXT: v_writelane_b32 v39, s82, 19 +; MUBUF-NEXT: v_writelane_b32 v39, s83, 20 +; MUBUF-NEXT: v_writelane_b32 v39, s84, 21 +; MUBUF-NEXT: v_writelane_b32 v39, s85, 22 +; MUBUF-NEXT: v_writelane_b32 v39, s86, 23 +; MUBUF-NEXT: v_writelane_b32 v39, s87, 24 +; MUBUF-NEXT: v_writelane_b32 v39, s96, 25 +; MUBUF-NEXT: v_writelane_b32 v39, s97, 26 +; MUBUF-NEXT: v_writelane_b32 v39, s98, 27 +; MUBUF-NEXT: v_writelane_b32 v39, s99, 28 +; MUBUF-NEXT: v_writelane_b32 v39, s100, 29 +; MUBUF-NEXT: v_writelane_b32 v39, s101, 30 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1000 -; MUBUF-NEXT: v_writelane_b32 v39, s102, 63 +; MUBUF-NEXT: s_add_i32 s32, s32, 0x40300 +; MUBUF-NEXT: v_writelane_b32 v39, s102, 31 ; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: ;;#ASMSTART @@ -2444,76 +1761,40 @@ define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5) ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber all VGPRs except CSR v40 ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: s_add_i32 s5, s33, 0x40200 -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload -; MUBUF-NEXT: s_add_i32 s32, s32, 0x40400 -; MUBUF-NEXT: v_readlane_b32 s102, v39, 63 -; MUBUF-NEXT: v_readlane_b32 s101, v39, 62 -; MUBUF-NEXT: v_readlane_b32 s100, v39, 61 -; MUBUF-NEXT: v_readlane_b32 s99, v39, 60 -; MUBUF-NEXT: v_readlane_b32 s98, v39, 59 -; MUBUF-NEXT: v_readlane_b32 s97, v39, 58 -; MUBUF-NEXT: v_readlane_b32 s96, v39, 57 -; MUBUF-NEXT: v_readlane_b32 s95, v39, 56 -; MUBUF-NEXT: v_readlane_b32 s94, v39, 55 -; MUBUF-NEXT: v_readlane_b32 s93, v39, 54 -; MUBUF-NEXT: v_readlane_b32 s92, v39, 53 -; MUBUF-NEXT: v_readlane_b32 s91, v39, 52 -; MUBUF-NEXT: v_readlane_b32 s90, v39, 51 -; MUBUF-NEXT: v_readlane_b32 s89, v39, 50 -; MUBUF-NEXT: v_readlane_b32 s88, v39, 49 -; MUBUF-NEXT: v_readlane_b32 s87, v39, 48 -; MUBUF-NEXT: v_readlane_b32 s86, v39, 47 -; MUBUF-NEXT: v_readlane_b32 s85, v39, 46 -; MUBUF-NEXT: v_readlane_b32 s84, v39, 45 -; MUBUF-NEXT: v_readlane_b32 s83, v39, 44 -; MUBUF-NEXT: v_readlane_b32 s82, v39, 43 -; MUBUF-NEXT: v_readlane_b32 s81, v39, 42 -; MUBUF-NEXT: v_readlane_b32 s80, v39, 41 -; MUBUF-NEXT: v_readlane_b32 s79, v39, 40 -; MUBUF-NEXT: v_readlane_b32 s78, v39, 39 -; MUBUF-NEXT: v_readlane_b32 s77, v39, 38 -; MUBUF-NEXT: v_readlane_b32 s76, v39, 37 -; MUBUF-NEXT: v_readlane_b32 s75, v39, 36 -; MUBUF-NEXT: v_readlane_b32 s74, v39, 35 -; MUBUF-NEXT: v_readlane_b32 s73, v39, 34 -; MUBUF-NEXT: v_readlane_b32 s72, v39, 33 -; MUBUF-NEXT: v_readlane_b32 s71, v39, 32 -; MUBUF-NEXT: v_readlane_b32 s70, v39, 31 -; MUBUF-NEXT: v_readlane_b32 s69, v39, 30 -; MUBUF-NEXT: v_readlane_b32 s68, v39, 29 -; MUBUF-NEXT: v_readlane_b32 s67, v39, 28 -; MUBUF-NEXT: v_readlane_b32 s66, v39, 27 -; MUBUF-NEXT: v_readlane_b32 s65, v39, 26 -; MUBUF-NEXT: v_readlane_b32 s64, v39, 25 -; MUBUF-NEXT: v_readlane_b32 s63, v39, 24 -; MUBUF-NEXT: v_readlane_b32 s62, v39, 23 -; MUBUF-NEXT: v_readlane_b32 s61, v39, 22 -; MUBUF-NEXT: v_readlane_b32 s60, v39, 21 -; MUBUF-NEXT: v_readlane_b32 s59, v39, 20 -; MUBUF-NEXT: v_readlane_b32 s58, v39, 19 -; MUBUF-NEXT: v_readlane_b32 s57, v39, 18 -; MUBUF-NEXT: v_readlane_b32 s56, v39, 17 -; MUBUF-NEXT: v_readlane_b32 s55, v39, 16 -; MUBUF-NEXT: v_readlane_b32 s54, v39, 15 -; MUBUF-NEXT: v_readlane_b32 s53, v39, 14 -; MUBUF-NEXT: v_readlane_b32 s52, v39, 13 -; MUBUF-NEXT: v_readlane_b32 s51, v39, 12 -; MUBUF-NEXT: v_readlane_b32 s50, v39, 11 -; MUBUF-NEXT: v_readlane_b32 s49, v39, 10 -; MUBUF-NEXT: v_readlane_b32 s48, v39, 9 -; MUBUF-NEXT: v_readlane_b32 s47, v39, 8 -; MUBUF-NEXT: v_readlane_b32 s46, v39, 7 -; MUBUF-NEXT: v_readlane_b32 s45, v39, 6 -; MUBUF-NEXT: v_readlane_b32 s44, v39, 5 -; MUBUF-NEXT: v_readlane_b32 s43, v39, 4 -; MUBUF-NEXT: v_readlane_b32 s42, v39, 3 -; MUBUF-NEXT: v_readlane_b32 s41, v39, 2 -; MUBUF-NEXT: v_readlane_b32 s40, v39, 1 +; MUBUF-NEXT: v_readlane_b32 s102, v39, 31 +; MUBUF-NEXT: v_readlane_b32 s101, v39, 30 +; MUBUF-NEXT: v_readlane_b32 s100, v39, 29 +; MUBUF-NEXT: v_readlane_b32 s99, v39, 28 +; MUBUF-NEXT: v_readlane_b32 s98, v39, 27 +; MUBUF-NEXT: v_readlane_b32 s97, v39, 26 +; MUBUF-NEXT: v_readlane_b32 s96, v39, 25 +; MUBUF-NEXT: v_readlane_b32 s87, v39, 24 +; MUBUF-NEXT: v_readlane_b32 s86, v39, 23 +; MUBUF-NEXT: v_readlane_b32 s85, v39, 22 +; MUBUF-NEXT: v_readlane_b32 s84, v39, 21 +; MUBUF-NEXT: v_readlane_b32 s83, v39, 20 +; MUBUF-NEXT: v_readlane_b32 s82, v39, 19 +; MUBUF-NEXT: v_readlane_b32 s81, v39, 18 +; MUBUF-NEXT: v_readlane_b32 s80, v39, 17 +; MUBUF-NEXT: v_readlane_b32 s71, v39, 16 +; MUBUF-NEXT: v_readlane_b32 s70, v39, 15 +; MUBUF-NEXT: v_readlane_b32 s69, v39, 14 +; MUBUF-NEXT: v_readlane_b32 s68, v39, 13 +; MUBUF-NEXT: v_readlane_b32 s67, v39, 12 +; MUBUF-NEXT: v_readlane_b32 s66, v39, 11 +; MUBUF-NEXT: v_readlane_b32 s65, v39, 10 +; MUBUF-NEXT: v_readlane_b32 s64, v39, 9 +; MUBUF-NEXT: v_readlane_b32 s55, v39, 8 +; MUBUF-NEXT: v_readlane_b32 s54, v39, 7 +; MUBUF-NEXT: v_readlane_b32 s53, v39, 6 +; MUBUF-NEXT: v_readlane_b32 s52, v39, 5 +; MUBUF-NEXT: v_readlane_b32 s51, v39, 4 +; MUBUF-NEXT: v_readlane_b32 s50, v39, 3 +; MUBUF-NEXT: v_readlane_b32 s49, v39, 2 +; MUBUF-NEXT: v_readlane_b32 s48, v39, 1 ; MUBUF-NEXT: v_readlane_b32 s39, v39, 0 ; MUBUF-NEXT: s_mov_b32 s32, s33 -; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_readfirstlane_b32 s4, v0 +; MUBUF-NEXT: v_readlane_b32 s4, v39, 32 ; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: s_add_i32 s5, s33, 0x40100 ; MUBUF-NEXT: buffer_load_dword v39, off, s[0:3], s5 ; 4-byte Folded Reload @@ -2532,72 +1813,40 @@ define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5) ; FLATSCR-NEXT: scratch_store_dword off, v39, s1 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: v_writelane_b32 v39, s39, 0 -; FLATSCR-NEXT: v_writelane_b32 v39, s40, 1 -; FLATSCR-NEXT: v_writelane_b32 v39, s41, 2 -; FLATSCR-NEXT: v_writelane_b32 v39, s42, 3 -; FLATSCR-NEXT: v_writelane_b32 v39, s43, 4 -; FLATSCR-NEXT: v_writelane_b32 v39, s44, 5 -; FLATSCR-NEXT: v_writelane_b32 v39, s45, 6 -; FLATSCR-NEXT: v_writelane_b32 v39, s46, 7 -; FLATSCR-NEXT: v_writelane_b32 v39, s47, 8 -; FLATSCR-NEXT: v_writelane_b32 v39, s48, 9 -; FLATSCR-NEXT: v_writelane_b32 v39, s49, 10 -; FLATSCR-NEXT: v_writelane_b32 v39, s50, 11 -; FLATSCR-NEXT: v_writelane_b32 v39, s51, 12 -; FLATSCR-NEXT: v_writelane_b32 v39, s52, 13 -; FLATSCR-NEXT: v_writelane_b32 v39, s53, 14 -; FLATSCR-NEXT: v_writelane_b32 v39, s54, 15 -; FLATSCR-NEXT: v_writelane_b32 v39, s55, 16 -; FLATSCR-NEXT: v_writelane_b32 v39, s56, 17 -; FLATSCR-NEXT: v_writelane_b32 v39, s57, 18 -; FLATSCR-NEXT: v_writelane_b32 v39, s58, 19 -; FLATSCR-NEXT: v_writelane_b32 v39, s59, 20 -; FLATSCR-NEXT: v_writelane_b32 v39, s60, 21 -; FLATSCR-NEXT: v_writelane_b32 v39, s61, 22 -; FLATSCR-NEXT: v_writelane_b32 v39, s62, 23 -; FLATSCR-NEXT: v_writelane_b32 v39, s63, 24 -; FLATSCR-NEXT: v_writelane_b32 v39, s64, 25 -; FLATSCR-NEXT: v_writelane_b32 v39, s65, 26 -; FLATSCR-NEXT: v_writelane_b32 v39, s66, 27 -; FLATSCR-NEXT: v_writelane_b32 v39, s67, 28 -; FLATSCR-NEXT: v_writelane_b32 v39, s68, 29 -; FLATSCR-NEXT: v_writelane_b32 v39, s69, 30 -; FLATSCR-NEXT: v_writelane_b32 v39, s70, 31 -; FLATSCR-NEXT: v_writelane_b32 v39, s71, 32 -; FLATSCR-NEXT: v_writelane_b32 v39, s72, 33 -; FLATSCR-NEXT: v_writelane_b32 v39, s73, 34 -; FLATSCR-NEXT: v_writelane_b32 v39, s74, 35 -; FLATSCR-NEXT: v_writelane_b32 v39, s75, 36 -; FLATSCR-NEXT: v_writelane_b32 v39, s76, 37 -; FLATSCR-NEXT: v_writelane_b32 v39, s77, 38 -; FLATSCR-NEXT: v_writelane_b32 v39, s78, 39 -; FLATSCR-NEXT: v_writelane_b32 v39, s79, 40 -; FLATSCR-NEXT: v_writelane_b32 v39, s80, 41 -; FLATSCR-NEXT: v_writelane_b32 v39, s81, 42 -; FLATSCR-NEXT: v_writelane_b32 v39, s82, 43 -; FLATSCR-NEXT: v_writelane_b32 v39, s83, 44 -; FLATSCR-NEXT: v_writelane_b32 v39, s84, 45 -; FLATSCR-NEXT: v_writelane_b32 v39, s85, 46 -; FLATSCR-NEXT: v_writelane_b32 v39, s86, 47 -; FLATSCR-NEXT: v_writelane_b32 v39, s87, 48 -; FLATSCR-NEXT: v_writelane_b32 v39, s88, 49 -; FLATSCR-NEXT: v_writelane_b32 v39, s89, 50 -; FLATSCR-NEXT: v_writelane_b32 v39, s90, 51 -; FLATSCR-NEXT: v_writelane_b32 v39, s91, 52 -; FLATSCR-NEXT: v_writelane_b32 v39, s92, 53 -; FLATSCR-NEXT: v_writelane_b32 v39, s93, 54 -; FLATSCR-NEXT: v_writelane_b32 v39, s94, 55 -; FLATSCR-NEXT: v_writelane_b32 v39, s95, 56 -; FLATSCR-NEXT: v_writelane_b32 v39, s96, 57 -; FLATSCR-NEXT: v_writelane_b32 v39, s97, 58 -; FLATSCR-NEXT: v_writelane_b32 v39, s98, 59 -; FLATSCR-NEXT: v_writelane_b32 v39, s99, 60 +; FLATSCR-NEXT: v_writelane_b32 v39, s48, 1 +; FLATSCR-NEXT: v_writelane_b32 v39, s49, 2 +; FLATSCR-NEXT: v_writelane_b32 v39, s50, 3 +; FLATSCR-NEXT: v_writelane_b32 v39, s51, 4 +; FLATSCR-NEXT: v_writelane_b32 v39, s52, 5 +; FLATSCR-NEXT: v_writelane_b32 v39, s53, 6 +; FLATSCR-NEXT: v_writelane_b32 v39, s54, 7 +; FLATSCR-NEXT: v_writelane_b32 v39, s55, 8 +; FLATSCR-NEXT: v_writelane_b32 v39, s64, 9 +; FLATSCR-NEXT: v_writelane_b32 v39, s65, 10 +; FLATSCR-NEXT: v_writelane_b32 v39, s66, 11 +; FLATSCR-NEXT: v_writelane_b32 v39, s67, 12 +; FLATSCR-NEXT: v_writelane_b32 v39, s68, 13 +; FLATSCR-NEXT: v_writelane_b32 v39, s69, 14 +; FLATSCR-NEXT: v_writelane_b32 v39, s70, 15 +; FLATSCR-NEXT: v_writelane_b32 v39, s71, 16 +; FLATSCR-NEXT: v_writelane_b32 v39, s80, 17 +; FLATSCR-NEXT: v_writelane_b32 v39, s81, 18 +; FLATSCR-NEXT: v_writelane_b32 v39, s82, 19 +; FLATSCR-NEXT: v_writelane_b32 v39, s83, 20 +; FLATSCR-NEXT: v_writelane_b32 v39, s84, 21 +; FLATSCR-NEXT: v_writelane_b32 v39, s85, 22 +; FLATSCR-NEXT: v_writelane_b32 v39, s86, 23 +; FLATSCR-NEXT: v_writelane_b32 v39, s87, 24 +; FLATSCR-NEXT: v_writelane_b32 v39, s96, 25 +; FLATSCR-NEXT: v_writelane_b32 v39, s97, 26 +; FLATSCR-NEXT: v_writelane_b32 v39, s98, 27 +; FLATSCR-NEXT: v_writelane_b32 v39, s99, 28 ; FLATSCR-NEXT: s_addk_i32 s32, 0x100c -; FLATSCR-NEXT: v_writelane_b32 v39, s100, 61 -; FLATSCR-NEXT: v_writelane_b32 v39, s101, 62 +; FLATSCR-NEXT: v_writelane_b32 v39, s100, 29 +; FLATSCR-NEXT: v_writelane_b32 v39, s101, 30 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: s_add_i32 s1, s33, 0x1000 -; FLATSCR-NEXT: v_writelane_b32 v39, s102, 63 +; FLATSCR-NEXT: v_writelane_b32 v39, s102, 31 ; FLATSCR-NEXT: scratch_store_dword off, v0, s1 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: ;;#ASMSTART @@ -2606,69 +1855,37 @@ define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5) ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; clobber all VGPRs except CSR v40 ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: v_readlane_b32 s102, v39, 63 -; FLATSCR-NEXT: v_readlane_b32 s101, v39, 62 -; FLATSCR-NEXT: v_readlane_b32 s100, v39, 61 -; FLATSCR-NEXT: v_readlane_b32 s99, v39, 60 -; FLATSCR-NEXT: v_readlane_b32 s98, v39, 59 -; FLATSCR-NEXT: v_readlane_b32 s97, v39, 58 -; FLATSCR-NEXT: v_readlane_b32 s96, v39, 57 -; FLATSCR-NEXT: v_readlane_b32 s95, v39, 56 -; FLATSCR-NEXT: v_readlane_b32 s94, v39, 55 -; FLATSCR-NEXT: v_readlane_b32 s93, v39, 54 -; FLATSCR-NEXT: v_readlane_b32 s92, v39, 53 -; FLATSCR-NEXT: v_readlane_b32 s91, v39, 52 -; FLATSCR-NEXT: v_readlane_b32 s90, v39, 51 -; FLATSCR-NEXT: v_readlane_b32 s89, v39, 50 -; FLATSCR-NEXT: v_readlane_b32 s88, v39, 49 -; FLATSCR-NEXT: v_readlane_b32 s87, v39, 48 -; FLATSCR-NEXT: v_readlane_b32 s86, v39, 47 -; FLATSCR-NEXT: v_readlane_b32 s85, v39, 46 -; FLATSCR-NEXT: v_readlane_b32 s84, v39, 45 -; FLATSCR-NEXT: v_readlane_b32 s83, v39, 44 -; FLATSCR-NEXT: v_readlane_b32 s82, v39, 43 -; FLATSCR-NEXT: v_readlane_b32 s81, v39, 42 -; FLATSCR-NEXT: v_readlane_b32 s80, v39, 41 -; FLATSCR-NEXT: v_readlane_b32 s79, v39, 40 -; FLATSCR-NEXT: v_readlane_b32 s78, v39, 39 -; FLATSCR-NEXT: v_readlane_b32 s77, v39, 38 -; FLATSCR-NEXT: v_readlane_b32 s76, v39, 37 -; FLATSCR-NEXT: v_readlane_b32 s75, v39, 36 -; FLATSCR-NEXT: v_readlane_b32 s74, v39, 35 -; FLATSCR-NEXT: v_readlane_b32 s73, v39, 34 -; FLATSCR-NEXT: v_readlane_b32 s72, v39, 33 -; FLATSCR-NEXT: v_readlane_b32 s71, v39, 32 -; FLATSCR-NEXT: v_readlane_b32 s70, v39, 31 -; FLATSCR-NEXT: v_readlane_b32 s69, v39, 30 -; FLATSCR-NEXT: v_readlane_b32 s68, v39, 29 -; FLATSCR-NEXT: v_readlane_b32 s67, v39, 28 -; FLATSCR-NEXT: v_readlane_b32 s66, v39, 27 -; FLATSCR-NEXT: v_readlane_b32 s65, v39, 26 -; FLATSCR-NEXT: v_readlane_b32 s64, v39, 25 -; FLATSCR-NEXT: v_readlane_b32 s63, v39, 24 -; FLATSCR-NEXT: v_readlane_b32 s62, v39, 23 -; FLATSCR-NEXT: v_readlane_b32 s61, v39, 22 -; FLATSCR-NEXT: v_readlane_b32 s60, v39, 21 -; FLATSCR-NEXT: v_readlane_b32 s59, v39, 20 -; FLATSCR-NEXT: v_readlane_b32 s58, v39, 19 -; FLATSCR-NEXT: v_readlane_b32 s57, v39, 18 -; FLATSCR-NEXT: v_readlane_b32 s56, v39, 17 -; FLATSCR-NEXT: v_readlane_b32 s55, v39, 16 -; FLATSCR-NEXT: v_readlane_b32 s54, v39, 15 -; FLATSCR-NEXT: v_readlane_b32 s53, v39, 14 -; FLATSCR-NEXT: v_readlane_b32 s52, v39, 13 -; FLATSCR-NEXT: v_readlane_b32 s51, v39, 12 -; FLATSCR-NEXT: v_readlane_b32 s50, v39, 11 -; FLATSCR-NEXT: v_readlane_b32 s49, v39, 10 -; FLATSCR-NEXT: v_readlane_b32 s48, v39, 9 -; FLATSCR-NEXT: v_readlane_b32 s47, v39, 8 -; FLATSCR-NEXT: v_readlane_b32 s46, v39, 7 -; FLATSCR-NEXT: v_readlane_b32 s45, v39, 6 -; FLATSCR-NEXT: v_readlane_b32 s44, v39, 5 -; FLATSCR-NEXT: v_readlane_b32 s43, v39, 4 -; FLATSCR-NEXT: v_readlane_b32 s42, v39, 3 -; FLATSCR-NEXT: v_readlane_b32 s41, v39, 2 -; FLATSCR-NEXT: v_readlane_b32 s40, v39, 1 +; FLATSCR-NEXT: v_readlane_b32 s102, v39, 31 +; FLATSCR-NEXT: v_readlane_b32 s101, v39, 30 +; FLATSCR-NEXT: v_readlane_b32 s100, v39, 29 +; FLATSCR-NEXT: v_readlane_b32 s99, v39, 28 +; FLATSCR-NEXT: v_readlane_b32 s98, v39, 27 +; FLATSCR-NEXT: v_readlane_b32 s97, v39, 26 +; FLATSCR-NEXT: v_readlane_b32 s96, v39, 25 +; FLATSCR-NEXT: v_readlane_b32 s87, v39, 24 +; FLATSCR-NEXT: v_readlane_b32 s86, v39, 23 +; FLATSCR-NEXT: v_readlane_b32 s85, v39, 22 +; FLATSCR-NEXT: v_readlane_b32 s84, v39, 21 +; FLATSCR-NEXT: v_readlane_b32 s83, v39, 20 +; FLATSCR-NEXT: v_readlane_b32 s82, v39, 19 +; FLATSCR-NEXT: v_readlane_b32 s81, v39, 18 +; FLATSCR-NEXT: v_readlane_b32 s80, v39, 17 +; FLATSCR-NEXT: v_readlane_b32 s71, v39, 16 +; FLATSCR-NEXT: v_readlane_b32 s70, v39, 15 +; FLATSCR-NEXT: v_readlane_b32 s69, v39, 14 +; FLATSCR-NEXT: v_readlane_b32 s68, v39, 13 +; FLATSCR-NEXT: v_readlane_b32 s67, v39, 12 +; FLATSCR-NEXT: v_readlane_b32 s66, v39, 11 +; FLATSCR-NEXT: v_readlane_b32 s65, v39, 10 +; FLATSCR-NEXT: v_readlane_b32 s64, v39, 9 +; FLATSCR-NEXT: v_readlane_b32 s55, v39, 8 +; FLATSCR-NEXT: v_readlane_b32 s54, v39, 7 +; FLATSCR-NEXT: v_readlane_b32 s53, v39, 6 +; FLATSCR-NEXT: v_readlane_b32 s52, v39, 5 +; FLATSCR-NEXT: v_readlane_b32 s51, v39, 4 +; FLATSCR-NEXT: v_readlane_b32 s50, v39, 3 +; FLATSCR-NEXT: v_readlane_b32 s49, v39, 2 +; FLATSCR-NEXT: v_readlane_b32 s48, v39, 1 ; FLATSCR-NEXT: v_readlane_b32 s39, v39, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 diff --git a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir index a14d515688a8b..6504f48333485 100644 --- a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir +++ b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir @@ -14,15 +14,7 @@ body: | ; CHECK-LABEL: name: def_csr_sgpr ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr42, $sgpr43, $sgpr46, $sgpr47 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr42, 0, $vgpr0 - ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr43, 1, $vgpr0 - ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr46, 2, $vgpr0 - ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr47, 3, $vgpr0 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 4d9c85ef99dcd..9b91a3dc9b6e4 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -1321,19 +1321,19 @@ bb: define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspace(3) %arg) { ; CI-LABEL: ds_read_call_read: ; CI: ; %bb.0: -; CI-NEXT: s_getpc_b64 s[40:41] -; CI-NEXT: s_mov_b32 s40, s0 -; CI-NEXT: s_load_dwordx4 s[40:43], s[40:41], 0x0 +; CI-NEXT: s_getpc_b64 s[48:49] +; CI-NEXT: s_mov_b32 s48, s0 +; CI-NEXT: s_load_dwordx4 s[48:51], s[48:49], 0x0 ; CI-NEXT: s_mov_b32 s14, s10 ; CI-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s12, s8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s40, s40, s11 +; CI-NEXT: s_add_u32 s48, s48, s11 ; CI-NEXT: s_mov_b64 s[10:11], s[6:7] ; CI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 ; CI-NEXT: s_load_dword s6, s[4:5], 0x2 -; CI-NEXT: s_addc_u32 s41, s41, 0 +; CI-NEXT: s_addc_u32 s49, s49, 0 ; CI-NEXT: s_add_u32 s8, s4, 12 ; CI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; CI-NEXT: s_mov_b32 s13, s9 @@ -1345,11 +1345,11 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1] ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] -; CI-NEXT: s_mov_b64 s[0:1], s[40:41] +; CI-NEXT: s_mov_b64 s[0:1], s[48:49] ; CI-NEXT: s_mov_b32 s17, void_func_void@abs32@hi ; CI-NEXT: s_mov_b32 s16, void_func_void@abs32@lo ; CI-NEXT: v_or_b32_e32 v31, v0, v2 -; CI-NEXT: s_mov_b64 s[2:3], s[42:43] +; CI-NEXT: s_mov_b64 s[2:3], s[50:51] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_mov_b32 s39, 0xf000 ; CI-NEXT: s_mov_b32 s38, -1 diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll index 8b02bdbb70b7b..40cdfd76d6af6 100644 --- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll @@ -30,59 +30,59 @@ define weak_odr void @test(i32 %0) !dbg !34 { ; CHECK-NEXT: v_writelane_b32 v41, s37, 5 ; CHECK-NEXT: v_writelane_b32 v41, s38, 6 ; CHECK-NEXT: v_writelane_b32 v41, s39, 7 -; CHECK-NEXT: v_writelane_b32 v41, s40, 8 -; CHECK-NEXT: v_writelane_b32 v41, s41, 9 -; CHECK-NEXT: v_writelane_b32 v41, s42, 10 -; CHECK-NEXT: v_writelane_b32 v41, s43, 11 -; CHECK-NEXT: v_writelane_b32 v41, s44, 12 +; CHECK-NEXT: v_writelane_b32 v41, s48, 8 +; CHECK-NEXT: v_writelane_b32 v41, s49, 9 +; CHECK-NEXT: v_writelane_b32 v41, s50, 10 +; CHECK-NEXT: v_writelane_b32 v41, s51, 11 +; CHECK-NEXT: v_writelane_b32 v41, s52, 12 ; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v41, s45, 13 -; CHECK-NEXT: v_writelane_b32 v41, s46, 14 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: v_writelane_b32 v41, s53, 13 +; CHECK-NEXT: v_writelane_b32 v41, s54, 14 +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- undef ; CHECK-NEXT: .Ltmp0: ; CHECK-NEXT: .loc 1 49 9 prologue_end ; dummy:49:9 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, __kmpc_alloc_shared@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, __kmpc_alloc_shared@gotpcrel32@hi+12 -; CHECK-NEXT: v_writelane_b32 v41, s47, 15 -; CHECK-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: v_writelane_b32 v41, s55, 15 +; CHECK-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v40, v31 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b32 s52, s13 +; CHECK-NEXT: s_mov_b32 s53, s12 ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[46:47] -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 +; CHECK-NEXT: s_mov_b32 s14, s51 +; CHECK-NEXT: s_mov_b32 s15, s50 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[46:47] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] ; CHECK-NEXT: .Ltmp1: ; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- [$vgpr0_vgpr1+0] ; CHECK-NEXT: .loc 1 0 9 is_stmt 0 ; dummy:0:9 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: flat_store_dword v[0:1], v2 -; CHECK-NEXT: v_readlane_b32 s47, v41, 15 -; CHECK-NEXT: v_readlane_b32 s46, v41, 14 -; CHECK-NEXT: v_readlane_b32 s45, v41, 13 -; CHECK-NEXT: v_readlane_b32 s44, v41, 12 -; CHECK-NEXT: v_readlane_b32 s43, v41, 11 -; CHECK-NEXT: v_readlane_b32 s42, v41, 10 -; CHECK-NEXT: v_readlane_b32 s41, v41, 9 -; CHECK-NEXT: v_readlane_b32 s40, v41, 8 +; CHECK-NEXT: v_readlane_b32 s55, v41, 15 +; CHECK-NEXT: v_readlane_b32 s54, v41, 14 +; CHECK-NEXT: v_readlane_b32 s53, v41, 13 +; CHECK-NEXT: v_readlane_b32 s52, v41, 12 +; CHECK-NEXT: v_readlane_b32 s51, v41, 11 +; CHECK-NEXT: v_readlane_b32 s50, v41, 10 +; CHECK-NEXT: v_readlane_b32 s49, v41, 9 +; CHECK-NEXT: v_readlane_b32 s48, v41, 8 ; CHECK-NEXT: v_readlane_b32 s39, v41, 7 ; CHECK-NEXT: v_readlane_b32 s38, v41, 6 ; CHECK-NEXT: v_readlane_b32 s37, v41, 5 diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir index 0714def30053d..7f370b2cca658 100644 --- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir +++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir @@ -142,9 +142,8 @@ body: | ; GFX1100-NEXT: renamable $sgpr20 = S_MOV_B32 killed $sgpr22 ; GFX1100-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23 ; GFX1100-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27 - ; GFX1100-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, 8, implicit-def $scc - ; GFX1100-NEXT: renamable $sgpr31 = S_MOV_B32 $sgpr32 - ; GFX1100-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, -8, implicit-def $scc + ; GFX1100-NEXT: $sgpr40 = S_ADD_I32 $sgpr32, 8, implicit-def $scc + ; GFX1100-NEXT: renamable $sgpr31 = S_MOV_B32 killed $sgpr40 ; GFX1100-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec @@ -168,9 +167,8 @@ body: | ; GFX1200-NEXT: renamable $sgpr20 = S_MOV_B32 killed $sgpr22 ; GFX1200-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23 ; GFX1200-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27 - ; GFX1200-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, 8, implicit-def $scc - ; GFX1200-NEXT: renamable $sgpr31 = S_MOV_B32 $sgpr32 - ; GFX1200-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, -8, implicit-def $scc + ; GFX1200-NEXT: $sgpr40 = S_ADD_I32 $sgpr32, 8, implicit-def $scc + ; GFX1200-NEXT: renamable $sgpr31 = S_MOV_B32 killed $sgpr40 ; GFX1200-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec @@ -706,8 +704,9 @@ body: | ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc - ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 24, 64, $sgpr32, 0, implicit $exec - ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $vgpr0, implicit $exec + ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX8-NEXT: $sgpr4 = S_MOV_B32 24 + ; GFX8-NEXT: $vgpr0, dead $sgpr72_sgpr73 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr0, 0, implicit $exec ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec ; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX8-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 @@ -810,10 +809,10 @@ body: | ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ; GFX1100-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec ; GFX1100-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc - ; GFX1100-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 24, implicit-def $scc, implicit $scc - ; GFX1100-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc - ; GFX1100-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi - ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi + ; GFX1100-NEXT: $sgpr72 = S_ADDC_U32 $sgpr32, 24, implicit-def $scc, implicit $scc + ; GFX1100-NEXT: S_BITCMP1_B32 $sgpr72, 0, implicit-def $scc + ; GFX1100-NEXT: $sgpr72 = S_BITSET0_B32 0, $sgpr72 + ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr72 ; GFX1100-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX1100-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX1100-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -837,10 +836,10 @@ body: | ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ; GFX1200-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec ; GFX1200-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc - ; GFX1200-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 24, implicit-def $scc, implicit $scc - ; GFX1200-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc - ; GFX1200-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi - ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi + ; GFX1200-NEXT: $sgpr72 = S_ADDC_U32 $sgpr32, 24, implicit-def $scc, implicit $scc + ; GFX1200-NEXT: S_BITCMP1_B32 $sgpr72, 0, implicit-def $scc + ; GFX1200-NEXT: $sgpr72 = S_BITSET0_B32 0, $sgpr72 + ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr72 ; GFX1200-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX1200-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX1200-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -900,9 +899,9 @@ body: | ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc - ; GFX8-NEXT: $vgpr0 = V_MOV_B32_e32 68, implicit $exec - ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 64, $sgpr32, 0, implicit $exec - ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $vgpr0, implicit $exec + ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX8-NEXT: $sgpr4 = S_MOV_B32 68 + ; GFX8-NEXT: $vgpr0, dead $sgpr72_sgpr73 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr0, 0, implicit $exec ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec ; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX8-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 @@ -1005,10 +1004,10 @@ body: | ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ; GFX1100-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec ; GFX1100-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc - ; GFX1100-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 68, implicit-def $scc, implicit $scc - ; GFX1100-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc - ; GFX1100-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi - ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi + ; GFX1100-NEXT: $sgpr72 = S_ADDC_U32 $sgpr32, 68, implicit-def $scc, implicit $scc + ; GFX1100-NEXT: S_BITCMP1_B32 $sgpr72, 0, implicit-def $scc + ; GFX1100-NEXT: $sgpr72 = S_BITSET0_B32 0, $sgpr72 + ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr72 ; GFX1100-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX1100-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX1100-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -1032,10 +1031,10 @@ body: | ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ; GFX1200-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec ; GFX1200-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc - ; GFX1200-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 68, implicit-def $scc, implicit $scc - ; GFX1200-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc - ; GFX1200-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi - ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi + ; GFX1200-NEXT: $sgpr72 = S_ADDC_U32 $sgpr32, 68, implicit-def $scc, implicit $scc + ; GFX1200-NEXT: S_BITCMP1_B32 $sgpr72, 0, implicit-def $scc + ; GFX1200-NEXT: $sgpr72 = S_BITSET0_B32 0, $sgpr72 + ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr72 ; GFX1200-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX1200-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX1200-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll index 32f255df82499..dc20ae3765069 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll @@ -2060,9 +2060,9 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s29, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 vcc, -1 +; GFX9-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, vcc +; GFX9-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-NEXT: v_mov_b32_e32 v2, s28 ; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:48 ; GFX9-NEXT: v_mov_b32_e32 v5, s27 diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll index 22257d3eba7d6..512d58d3f996d 100644 --- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll @@ -202,18 +202,18 @@ define void @indirect_use_50_vgpr() #0 { } ; GCN-LABEL: {{^}}use_80_sgpr: -; GCN: .set use_80_sgpr.num_vgpr, 1 +; GCN: .set use_80_sgpr.num_vgpr, 0 ; GCN: .set use_80_sgpr.num_agpr, 0 ; GCN: .set use_80_sgpr.numbered_sgpr, 80 -; GCN: .set use_80_sgpr.private_seg_size, 8 +; GCN: .set use_80_sgpr.private_seg_size, 0 ; GCN: .set use_80_sgpr.uses_vcc, 0 ; GCN: .set use_80_sgpr.uses_flat_scratch, 0 ; GCN: .set use_80_sgpr.has_dyn_sized_stack, 0 ; GCN: .set use_80_sgpr.has_recursion, 0 ; GCN: .set use_80_sgpr.has_indirect_call, 0 ; GCN: TotalNumSgprs: 84 -; GCN: NumVgprs: 1 -; GCN: ScratchSize: 8 +; GCN: NumVgprs: 0 +; GCN: ScratchSize: 0 define void @use_80_sgpr() #1 { call void asm sideeffect "", "~{s79}"() #0 ret void @@ -231,7 +231,7 @@ define void @use_80_sgpr() #1 { ; GCN: .set indirect_use_80_sgpr.has_indirect_call, or(0, use_80_sgpr.has_indirect_call) ; GCN: TotalNumSgprs: 84 ; GCN: NumVgprs: 41 -; GCN: ScratchSize: 24 +; GCN: ScratchSize: 16 define void @indirect_use_80_sgpr() #1 { call void @use_80_sgpr() ret void @@ -249,7 +249,7 @@ define void @indirect_use_80_sgpr() #1 { ; GCN: .set indirect_2_level_use_80_sgpr.has_indirect_call, or(0, indirect_use_80_sgpr.has_indirect_call) ; GCN: TotalNumSgprs: 86 ; GCN: NumVgprs: 41 -; GCN: ScratchSize: 24 +; GCN: ScratchSize: 16 define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 { call void @indirect_use_80_sgpr() ret void @@ -336,14 +336,14 @@ define amdgpu_kernel void @indirect_2_level_use_stack() #0 { ; GCN-LABEL: {{^}}multi_call_use_use_stack: ; GCN: .set multi_call_use_use_stack.num_vgpr, max(41, use_stack0.num_vgpr, use_stack1.num_vgpr) ; GCN: .set multi_call_use_use_stack.num_agpr, max(0, use_stack0.num_agpr, use_stack1.num_agpr) -; GCN: .set multi_call_use_use_stack.numbered_sgpr, max(44, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr) +; GCN: .set multi_call_use_use_stack.numbered_sgpr, max(52, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr) ; GCN: .set multi_call_use_use_stack.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size)) ; GCN: .set multi_call_use_use_stack.uses_vcc, or(1, use_stack0.uses_vcc, use_stack1.uses_vcc) ; GCN: .set multi_call_use_use_stack.uses_flat_scratch, or(1, use_stack0.uses_flat_scratch, use_stack1.uses_flat_scratch) ; GCN: .set multi_call_use_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack) ; GCN: .set multi_call_use_use_stack.has_recursion, or(0, use_stack0.has_recursion, use_stack1.has_recursion) ; GCN: .set multi_call_use_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call, use_stack1.has_indirect_call) -; GCN: TotalNumSgprs: 50 +; GCN: TotalNumSgprs: 58 ; GCN: NumVgprs: 41 ; GCN: ScratchSize: 2052 define amdgpu_kernel void @multi_call_use_use_stack() #0 { @@ -357,7 +357,7 @@ declare void @external() #0 ; GCN-LABEL: {{^}}multi_call_with_external: ; GCN: .set multi_call_with_external.num_vgpr, max(41, amdgpu.max_num_vgpr) ; GCN: .set multi_call_with_external.num_agpr, max(0, amdgpu.max_num_agpr) -; GCN: .set multi_call_with_external.numbered_sgpr, max(44, amdgpu.max_num_sgpr) +; GCN: .set multi_call_with_external.numbered_sgpr, max(52, amdgpu.max_num_sgpr) ; GCN: .set multi_call_with_external.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size)) ; GCN: .set multi_call_with_external.uses_vcc, 1 ; GCN: .set multi_call_with_external.uses_flat_scratch, 1 @@ -377,7 +377,7 @@ define amdgpu_kernel void @multi_call_with_external() #0 { ; GCN-LABEL: {{^}}multi_call_with_external_and_duplicates: ; GCN: .set multi_call_with_external_and_duplicates.num_vgpr, max(41, amdgpu.max_num_vgpr) ; GCN: .set multi_call_with_external_and_duplicates.num_agpr, max(0, amdgpu.max_num_agpr) -; GCN: .set multi_call_with_external_and_duplicates.numbered_sgpr, max(46, amdgpu.max_num_sgpr) +; GCN: .set multi_call_with_external_and_duplicates.numbered_sgpr, max(54, amdgpu.max_num_sgpr) ; GCN: .set multi_call_with_external_and_duplicates.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size)) ; GCN: .set multi_call_with_external_and_duplicates.uses_vcc, 1 ; GCN: .set multi_call_with_external_and_duplicates.uses_flat_scratch, 1 @@ -594,7 +594,7 @@ define amdgpu_kernel void @usage_multi_stage_recurse_noattrs(i32 %n) #0 { ; GCN-LABEL: {{^}}multi_call_with_multi_stage_recurse: ; GCN: .set multi_call_with_multi_stage_recurse.num_vgpr, max(41, use_stack0.num_vgpr, use_stack1.num_vgpr, multi_stage_recurse1.num_vgpr) ; GCN: .set multi_call_with_multi_stage_recurse.num_agpr, max(0, use_stack0.num_agpr, use_stack1.num_agpr, multi_stage_recurse1.num_agpr) -; GCN: .set multi_call_with_multi_stage_recurse.numbered_sgpr, max(45, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr, multi_stage_recurse1.numbered_sgpr) +; GCN: .set multi_call_with_multi_stage_recurse.numbered_sgpr, max(53, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr, multi_stage_recurse1.numbered_sgpr) ; GCN: .set multi_call_with_multi_stage_recurse.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size, multi_stage_recurse1.private_seg_size)) ; GCN: .set multi_call_with_multi_stage_recurse.uses_vcc, or(1, use_stack0.uses_vcc, use_stack1.uses_vcc, multi_stage_recurse1.uses_vcc) ; GCN: .set multi_call_with_multi_stage_recurse.uses_flat_scratch, or(1, use_stack0.uses_flat_scratch, use_stack1.uses_flat_scratch, multi_stage_recurse1.uses_flat_scratch) diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll index 1ad365df2e8a8..8ac187eacf1fe 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll @@ -40,12 +40,44 @@ define amdgpu_gfx void @gfx_func() { ; SDAG-NEXT: v_writelane_b32 v40, s28, 24 ; SDAG-NEXT: v_writelane_b32 v40, s29, 25 ; SDAG-NEXT: v_writelane_b32 v40, s30, 26 +; SDAG-NEXT: v_writelane_b32 v40, s31, 27 +; SDAG-NEXT: v_writelane_b32 v40, s72, 28 +; SDAG-NEXT: v_writelane_b32 v40, s73, 29 +; SDAG-NEXT: v_writelane_b32 v40, s74, 30 +; SDAG-NEXT: v_writelane_b32 v40, s75, 31 +; SDAG-NEXT: v_writelane_b32 v40, s76, 32 +; SDAG-NEXT: v_writelane_b32 v40, s77, 33 +; SDAG-NEXT: v_writelane_b32 v40, s78, 34 +; SDAG-NEXT: v_writelane_b32 v40, s79, 35 +; SDAG-NEXT: v_writelane_b32 v40, s88, 36 +; SDAG-NEXT: v_writelane_b32 v40, s89, 37 +; SDAG-NEXT: v_writelane_b32 v40, s90, 38 +; SDAG-NEXT: v_writelane_b32 v40, s91, 39 +; SDAG-NEXT: v_writelane_b32 v40, s92, 40 +; SDAG-NEXT: v_writelane_b32 v40, s93, 41 +; SDAG-NEXT: v_writelane_b32 v40, s94, 42 ; SDAG-NEXT: s_mov_b32 s35, extern_c_func@abs32@hi ; SDAG-NEXT: s_mov_b32 s34, extern_c_func@abs32@lo ; SDAG-NEXT: s_mov_b64 s[8:9], 0 ; SDAG-NEXT: s_addk_i32 s32, 0x400 -; SDAG-NEXT: v_writelane_b32 v40, s31, 27 +; SDAG-NEXT: v_writelane_b32 v40, s95, 43 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[34:35] +; SDAG-NEXT: v_readlane_b32 s95, v40, 43 +; SDAG-NEXT: v_readlane_b32 s94, v40, 42 +; SDAG-NEXT: v_readlane_b32 s93, v40, 41 +; SDAG-NEXT: v_readlane_b32 s92, v40, 40 +; SDAG-NEXT: v_readlane_b32 s91, v40, 39 +; SDAG-NEXT: v_readlane_b32 s90, v40, 38 +; SDAG-NEXT: v_readlane_b32 s89, v40, 37 +; SDAG-NEXT: v_readlane_b32 s88, v40, 36 +; SDAG-NEXT: v_readlane_b32 s79, v40, 35 +; SDAG-NEXT: v_readlane_b32 s78, v40, 34 +; SDAG-NEXT: v_readlane_b32 s77, v40, 33 +; SDAG-NEXT: v_readlane_b32 s76, v40, 32 +; SDAG-NEXT: v_readlane_b32 s75, v40, 31 +; SDAG-NEXT: v_readlane_b32 s74, v40, 30 +; SDAG-NEXT: v_readlane_b32 s73, v40, 29 +; SDAG-NEXT: v_readlane_b32 s72, v40, 28 ; SDAG-NEXT: v_readlane_b32 s31, v40, 27 ; SDAG-NEXT: v_readlane_b32 s30, v40, 26 ; SDAG-NEXT: v_readlane_b32 s29, v40, 25 @@ -117,12 +149,44 @@ define amdgpu_gfx void @gfx_func() { ; GISEL-NEXT: v_writelane_b32 v40, s28, 24 ; GISEL-NEXT: v_writelane_b32 v40, s29, 25 ; GISEL-NEXT: v_writelane_b32 v40, s30, 26 +; GISEL-NEXT: v_writelane_b32 v40, s31, 27 +; GISEL-NEXT: v_writelane_b32 v40, s72, 28 +; GISEL-NEXT: v_writelane_b32 v40, s73, 29 +; GISEL-NEXT: v_writelane_b32 v40, s74, 30 +; GISEL-NEXT: v_writelane_b32 v40, s75, 31 +; GISEL-NEXT: v_writelane_b32 v40, s76, 32 +; GISEL-NEXT: v_writelane_b32 v40, s77, 33 +; GISEL-NEXT: v_writelane_b32 v40, s78, 34 +; GISEL-NEXT: v_writelane_b32 v40, s79, 35 +; GISEL-NEXT: v_writelane_b32 v40, s88, 36 +; GISEL-NEXT: v_writelane_b32 v40, s89, 37 +; GISEL-NEXT: v_writelane_b32 v40, s90, 38 +; GISEL-NEXT: v_writelane_b32 v40, s91, 39 +; GISEL-NEXT: v_writelane_b32 v40, s92, 40 +; GISEL-NEXT: v_writelane_b32 v40, s93, 41 +; GISEL-NEXT: v_writelane_b32 v40, s94, 42 ; GISEL-NEXT: s_mov_b32 s34, extern_c_func@abs32@lo ; GISEL-NEXT: s_mov_b32 s35, extern_c_func@abs32@hi ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: v_writelane_b32 v40, s31, 27 +; GISEL-NEXT: v_writelane_b32 v40, s95, 43 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GISEL-NEXT: v_readlane_b32 s95, v40, 43 +; GISEL-NEXT: v_readlane_b32 s94, v40, 42 +; GISEL-NEXT: v_readlane_b32 s93, v40, 41 +; GISEL-NEXT: v_readlane_b32 s92, v40, 40 +; GISEL-NEXT: v_readlane_b32 s91, v40, 39 +; GISEL-NEXT: v_readlane_b32 s90, v40, 38 +; GISEL-NEXT: v_readlane_b32 s89, v40, 37 +; GISEL-NEXT: v_readlane_b32 s88, v40, 36 +; GISEL-NEXT: v_readlane_b32 s79, v40, 35 +; GISEL-NEXT: v_readlane_b32 s78, v40, 34 +; GISEL-NEXT: v_readlane_b32 s77, v40, 33 +; GISEL-NEXT: v_readlane_b32 s76, v40, 32 +; GISEL-NEXT: v_readlane_b32 s75, v40, 31 +; GISEL-NEXT: v_readlane_b32 s74, v40, 30 +; GISEL-NEXT: v_readlane_b32 s73, v40, 29 +; GISEL-NEXT: v_readlane_b32 s72, v40, 28 ; GISEL-NEXT: v_readlane_b32 s31, v40, 27 ; GISEL-NEXT: v_readlane_b32 s30, v40, 26 ; GISEL-NEXT: v_readlane_b32 s29, v40, 25 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 2e3ca34af4c74..2322b29abaa10 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -9093,62 +9093,30 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s37, 5 ; GFX9-NEXT: v_writelane_b32 v40, s38, 6 ; GFX9-NEXT: v_writelane_b32 v40, s39, 7 -; GFX9-NEXT: v_writelane_b32 v40, s40, 8 -; GFX9-NEXT: v_writelane_b32 v40, s41, 9 -; GFX9-NEXT: v_writelane_b32 v40, s42, 10 -; GFX9-NEXT: v_writelane_b32 v40, s43, 11 -; GFX9-NEXT: v_writelane_b32 v40, s44, 12 -; GFX9-NEXT: v_writelane_b32 v40, s45, 13 -; GFX9-NEXT: v_writelane_b32 v40, s46, 14 -; GFX9-NEXT: v_writelane_b32 v40, s47, 15 -; GFX9-NEXT: v_writelane_b32 v40, s48, 16 -; GFX9-NEXT: v_writelane_b32 v40, s49, 17 -; GFX9-NEXT: v_writelane_b32 v40, s50, 18 -; GFX9-NEXT: v_writelane_b32 v40, s51, 19 -; GFX9-NEXT: v_writelane_b32 v40, s52, 20 -; GFX9-NEXT: v_writelane_b32 v40, s53, 21 -; GFX9-NEXT: v_writelane_b32 v40, s54, 22 -; GFX9-NEXT: v_writelane_b32 v40, s55, 23 -; GFX9-NEXT: v_writelane_b32 v40, s56, 24 -; GFX9-NEXT: v_writelane_b32 v40, s57, 25 -; GFX9-NEXT: v_writelane_b32 v40, s58, 26 -; GFX9-NEXT: v_writelane_b32 v40, s59, 27 -; GFX9-NEXT: v_writelane_b32 v40, s60, 28 -; GFX9-NEXT: v_writelane_b32 v40, s61, 29 +; GFX9-NEXT: v_writelane_b32 v40, s48, 8 +; GFX9-NEXT: v_writelane_b32 v40, s49, 9 +; GFX9-NEXT: v_writelane_b32 v40, s50, 10 +; GFX9-NEXT: v_writelane_b32 v40, s51, 11 +; GFX9-NEXT: v_writelane_b32 v40, s52, 12 +; GFX9-NEXT: v_writelane_b32 v40, s53, 13 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v40, s62, 30 +; GFX9-NEXT: v_writelane_b32 v40, s54, 14 ; GFX9-NEXT: s_mov_b32 s5, byval_align16_f64_arg@abs32@hi ; GFX9-NEXT: s_mov_b32 s4, byval_align16_f64_arg@abs32@lo -; GFX9-NEXT: v_writelane_b32 v40, s63, 31 +; GFX9-NEXT: v_writelane_b32 v40, s55, 15 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s63, v40, 31 -; GFX9-NEXT: v_readlane_b32 s62, v40, 30 -; GFX9-NEXT: v_readlane_b32 s61, v40, 29 -; GFX9-NEXT: v_readlane_b32 s60, v40, 28 -; GFX9-NEXT: v_readlane_b32 s59, v40, 27 -; GFX9-NEXT: v_readlane_b32 s58, v40, 26 -; GFX9-NEXT: v_readlane_b32 s57, v40, 25 -; GFX9-NEXT: v_readlane_b32 s56, v40, 24 -; GFX9-NEXT: v_readlane_b32 s55, v40, 23 -; GFX9-NEXT: v_readlane_b32 s54, v40, 22 -; GFX9-NEXT: v_readlane_b32 s53, v40, 21 -; GFX9-NEXT: v_readlane_b32 s52, v40, 20 -; GFX9-NEXT: v_readlane_b32 s51, v40, 19 -; GFX9-NEXT: v_readlane_b32 s50, v40, 18 -; GFX9-NEXT: v_readlane_b32 s49, v40, 17 -; GFX9-NEXT: v_readlane_b32 s48, v40, 16 -; GFX9-NEXT: v_readlane_b32 s47, v40, 15 -; GFX9-NEXT: v_readlane_b32 s46, v40, 14 -; GFX9-NEXT: v_readlane_b32 s45, v40, 13 -; GFX9-NEXT: v_readlane_b32 s44, v40, 12 -; GFX9-NEXT: v_readlane_b32 s43, v40, 11 -; GFX9-NEXT: v_readlane_b32 s42, v40, 10 -; GFX9-NEXT: v_readlane_b32 s41, v40, 9 -; GFX9-NEXT: v_readlane_b32 s40, v40, 8 +; GFX9-NEXT: v_readlane_b32 s55, v40, 15 +; GFX9-NEXT: v_readlane_b32 s54, v40, 14 +; GFX9-NEXT: v_readlane_b32 s53, v40, 13 +; GFX9-NEXT: v_readlane_b32 s52, v40, 12 +; GFX9-NEXT: v_readlane_b32 s51, v40, 11 +; GFX9-NEXT: v_readlane_b32 s50, v40, 10 +; GFX9-NEXT: v_readlane_b32 s49, v40, 9 +; GFX9-NEXT: v_readlane_b32 s48, v40, 8 ; GFX9-NEXT: v_readlane_b32 s39, v40, 7 ; GFX9-NEXT: v_readlane_b32 s38, v40, 6 ; GFX9-NEXT: v_readlane_b32 s37, v40, 5 @@ -9193,55 +9161,23 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s37, 5 ; GFX10-NEXT: v_writelane_b32 v40, s38, 6 ; GFX10-NEXT: v_writelane_b32 v40, s39, 7 -; GFX10-NEXT: v_writelane_b32 v40, s40, 8 -; GFX10-NEXT: v_writelane_b32 v40, s41, 9 -; GFX10-NEXT: v_writelane_b32 v40, s42, 10 -; GFX10-NEXT: v_writelane_b32 v40, s43, 11 -; GFX10-NEXT: v_writelane_b32 v40, s44, 12 -; GFX10-NEXT: v_writelane_b32 v40, s45, 13 -; GFX10-NEXT: v_writelane_b32 v40, s46, 14 -; GFX10-NEXT: v_writelane_b32 v40, s47, 15 -; GFX10-NEXT: v_writelane_b32 v40, s48, 16 -; GFX10-NEXT: v_writelane_b32 v40, s49, 17 -; GFX10-NEXT: v_writelane_b32 v40, s50, 18 -; GFX10-NEXT: v_writelane_b32 v40, s51, 19 -; GFX10-NEXT: v_writelane_b32 v40, s52, 20 -; GFX10-NEXT: v_writelane_b32 v40, s53, 21 -; GFX10-NEXT: v_writelane_b32 v40, s54, 22 -; GFX10-NEXT: v_writelane_b32 v40, s55, 23 -; GFX10-NEXT: v_writelane_b32 v40, s56, 24 -; GFX10-NEXT: v_writelane_b32 v40, s57, 25 -; GFX10-NEXT: v_writelane_b32 v40, s58, 26 -; GFX10-NEXT: v_writelane_b32 v40, s59, 27 -; GFX10-NEXT: v_writelane_b32 v40, s60, 28 -; GFX10-NEXT: v_writelane_b32 v40, s61, 29 -; GFX10-NEXT: v_writelane_b32 v40, s62, 30 -; GFX10-NEXT: v_writelane_b32 v40, s63, 31 +; GFX10-NEXT: v_writelane_b32 v40, s48, 8 +; GFX10-NEXT: v_writelane_b32 v40, s49, 9 +; GFX10-NEXT: v_writelane_b32 v40, s50, 10 +; GFX10-NEXT: v_writelane_b32 v40, s51, 11 +; GFX10-NEXT: v_writelane_b32 v40, s52, 12 +; GFX10-NEXT: v_writelane_b32 v40, s53, 13 +; GFX10-NEXT: v_writelane_b32 v40, s54, 14 +; GFX10-NEXT: v_writelane_b32 v40, s55, 15 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s63, v40, 31 -; GFX10-NEXT: v_readlane_b32 s62, v40, 30 -; GFX10-NEXT: v_readlane_b32 s61, v40, 29 -; GFX10-NEXT: v_readlane_b32 s60, v40, 28 -; GFX10-NEXT: v_readlane_b32 s59, v40, 27 -; GFX10-NEXT: v_readlane_b32 s58, v40, 26 -; GFX10-NEXT: v_readlane_b32 s57, v40, 25 -; GFX10-NEXT: v_readlane_b32 s56, v40, 24 -; GFX10-NEXT: v_readlane_b32 s55, v40, 23 -; GFX10-NEXT: v_readlane_b32 s54, v40, 22 -; GFX10-NEXT: v_readlane_b32 s53, v40, 21 -; GFX10-NEXT: v_readlane_b32 s52, v40, 20 -; GFX10-NEXT: v_readlane_b32 s51, v40, 19 -; GFX10-NEXT: v_readlane_b32 s50, v40, 18 -; GFX10-NEXT: v_readlane_b32 s49, v40, 17 -; GFX10-NEXT: v_readlane_b32 s48, v40, 16 -; GFX10-NEXT: v_readlane_b32 s47, v40, 15 -; GFX10-NEXT: v_readlane_b32 s46, v40, 14 -; GFX10-NEXT: v_readlane_b32 s45, v40, 13 -; GFX10-NEXT: v_readlane_b32 s44, v40, 12 -; GFX10-NEXT: v_readlane_b32 s43, v40, 11 -; GFX10-NEXT: v_readlane_b32 s42, v40, 10 -; GFX10-NEXT: v_readlane_b32 s41, v40, 9 -; GFX10-NEXT: v_readlane_b32 s40, v40, 8 +; GFX10-NEXT: v_readlane_b32 s55, v40, 15 +; GFX10-NEXT: v_readlane_b32 s54, v40, 14 +; GFX10-NEXT: v_readlane_b32 s53, v40, 13 +; GFX10-NEXT: v_readlane_b32 s52, v40, 12 +; GFX10-NEXT: v_readlane_b32 s51, v40, 11 +; GFX10-NEXT: v_readlane_b32 s50, v40, 10 +; GFX10-NEXT: v_readlane_b32 s49, v40, 9 +; GFX10-NEXT: v_readlane_b32 s48, v40, 8 ; GFX10-NEXT: v_readlane_b32 s39, v40, 7 ; GFX10-NEXT: v_readlane_b32 s38, v40, 6 ; GFX10-NEXT: v_readlane_b32 s37, v40, 5 @@ -9281,57 +9217,25 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s37, 5 ; GFX11-NEXT: v_writelane_b32 v40, s38, 6 ; GFX11-NEXT: v_writelane_b32 v40, s39, 7 -; GFX11-NEXT: v_writelane_b32 v40, s40, 8 -; GFX11-NEXT: v_writelane_b32 v40, s41, 9 -; GFX11-NEXT: v_writelane_b32 v40, s42, 10 -; GFX11-NEXT: v_writelane_b32 v40, s43, 11 -; GFX11-NEXT: v_writelane_b32 v40, s44, 12 -; GFX11-NEXT: v_writelane_b32 v40, s45, 13 -; GFX11-NEXT: v_writelane_b32 v40, s46, 14 -; GFX11-NEXT: v_writelane_b32 v40, s47, 15 -; GFX11-NEXT: v_writelane_b32 v40, s48, 16 -; GFX11-NEXT: v_writelane_b32 v40, s49, 17 -; GFX11-NEXT: v_writelane_b32 v40, s50, 18 -; GFX11-NEXT: v_writelane_b32 v40, s51, 19 -; GFX11-NEXT: v_writelane_b32 v40, s52, 20 -; GFX11-NEXT: v_writelane_b32 v40, s53, 21 -; GFX11-NEXT: v_writelane_b32 v40, s54, 22 -; GFX11-NEXT: v_writelane_b32 v40, s55, 23 -; GFX11-NEXT: v_writelane_b32 v40, s56, 24 -; GFX11-NEXT: v_writelane_b32 v40, s57, 25 -; GFX11-NEXT: v_writelane_b32 v40, s58, 26 -; GFX11-NEXT: v_writelane_b32 v40, s59, 27 -; GFX11-NEXT: v_writelane_b32 v40, s60, 28 -; GFX11-NEXT: v_writelane_b32 v40, s61, 29 -; GFX11-NEXT: v_writelane_b32 v40, s62, 30 -; GFX11-NEXT: v_writelane_b32 v40, s63, 31 +; GFX11-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-NEXT: v_writelane_b32 v40, s50, 10 +; GFX11-NEXT: v_writelane_b32 v40, s51, 11 +; GFX11-NEXT: v_writelane_b32 v40, s52, 12 +; GFX11-NEXT: v_writelane_b32 v40, s53, 13 +; GFX11-NEXT: v_writelane_b32 v40, s54, 14 +; GFX11-NEXT: v_writelane_b32 v40, s55, 15 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_readlane_b32 s63, v40, 31 -; GFX11-NEXT: v_readlane_b32 s62, v40, 30 -; GFX11-NEXT: v_readlane_b32 s61, v40, 29 -; GFX11-NEXT: v_readlane_b32 s60, v40, 28 -; GFX11-NEXT: v_readlane_b32 s59, v40, 27 -; GFX11-NEXT: v_readlane_b32 s58, v40, 26 -; GFX11-NEXT: v_readlane_b32 s57, v40, 25 -; GFX11-NEXT: v_readlane_b32 s56, v40, 24 -; GFX11-NEXT: v_readlane_b32 s55, v40, 23 -; GFX11-NEXT: v_readlane_b32 s54, v40, 22 -; GFX11-NEXT: v_readlane_b32 s53, v40, 21 -; GFX11-NEXT: v_readlane_b32 s52, v40, 20 -; GFX11-NEXT: v_readlane_b32 s51, v40, 19 -; GFX11-NEXT: v_readlane_b32 s50, v40, 18 -; GFX11-NEXT: v_readlane_b32 s49, v40, 17 -; GFX11-NEXT: v_readlane_b32 s48, v40, 16 -; GFX11-NEXT: v_readlane_b32 s47, v40, 15 -; GFX11-NEXT: v_readlane_b32 s46, v40, 14 -; GFX11-NEXT: v_readlane_b32 s45, v40, 13 -; GFX11-NEXT: v_readlane_b32 s44, v40, 12 -; GFX11-NEXT: v_readlane_b32 s43, v40, 11 -; GFX11-NEXT: v_readlane_b32 s42, v40, 10 -; GFX11-NEXT: v_readlane_b32 s41, v40, 9 -; GFX11-NEXT: v_readlane_b32 s40, v40, 8 +; GFX11-NEXT: v_readlane_b32 s55, v40, 15 +; GFX11-NEXT: v_readlane_b32 s54, v40, 14 +; GFX11-NEXT: v_readlane_b32 s53, v40, 13 +; GFX11-NEXT: v_readlane_b32 s52, v40, 12 +; GFX11-NEXT: v_readlane_b32 s51, v40, 11 +; GFX11-NEXT: v_readlane_b32 s50, v40, 10 +; GFX11-NEXT: v_readlane_b32 s49, v40, 9 +; GFX11-NEXT: v_readlane_b32 s48, v40, 8 ; GFX11-NEXT: v_readlane_b32 s39, v40, 7 ; GFX11-NEXT: v_readlane_b32 s38, v40, 6 ; GFX11-NEXT: v_readlane_b32 s37, v40, 5 @@ -9371,57 +9275,25 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 5 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s38, 6 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s39, 7 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s40, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s41, 9 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s42, 10 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s43, 11 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s44, 12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s45, 13 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s46, 14 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s47, 15 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s48, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s49, 17 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 18 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 19 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s52, 20 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s53, 21 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s54, 22 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s55, 23 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s56, 24 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s57, 25 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s58, 26 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s59, 27 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s60, 28 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s61, 29 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s62, 30 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s63, 31 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s48, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s49, 9 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 11 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s52, 12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s53, 13 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s54, 14 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s55, 15 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(1) ; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s63, v40, 31 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s62, v40, 30 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s61, v40, 29 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s60, v40, 28 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s59, v40, 27 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s58, v40, 26 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s57, v40, 25 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s56, v40, 24 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s55, v40, 23 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s54, v40, 22 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s53, v40, 21 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s52, v40, 20 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s51, v40, 19 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s50, v40, 18 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s49, v40, 17 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s48, v40, 16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s47, v40, 15 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s46, v40, 14 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s45, v40, 13 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s44, v40, 12 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s43, v40, 11 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s42, v40, 10 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s41, v40, 9 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s40, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s55, v40, 15 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s54, v40, 14 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s53, v40, 13 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s52, v40, 12 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s51, v40, 11 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s50, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s49, v40, 9 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s48, v40, 8 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s39, v40, 7 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s38, v40, 6 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s37, v40, 5 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 15be44a335a1d..4f5c46d5f424f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -735,12 +735,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -760,8 +760,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -1955,12 +1955,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -1980,8 +1980,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -3235,12 +3235,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -3260,8 +3260,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -4011,12 +4011,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -4036,8 +4036,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -5316,12 +5316,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -5341,8 +5341,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -5747,14 +5747,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 @@ -5763,16 +5763,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 ; GFX7LESS-NEXT: v_or_b32_e32 v4, v0, v4 @@ -5785,11 +5785,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -5802,42 +5802,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_add_u32 s64, s64, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 @@ -5845,21 +5845,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s10 ; GFX9-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-NEXT: s_mov_b32 s42, s9 -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-NEXT: s_mov_b32 s50, s9 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start @@ -5872,68 +5872,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v3, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v4, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-NEXT: .LBB9_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b32 s33, s10 ; GFX1064-NEXT: s_mov_b64 s[10:11], exec -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, s10, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s11, v3 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[10:11] -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -5950,69 +5950,69 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-NEXT: .LBB9_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_mov_b32 s9, exec_lo -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s9 ; GFX1032-NEXT: s_mov_b32 s33, s10 ; GFX1032-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6029,37 +6029,37 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-NEXT: .LBB9_3: ; GFX1032-NEXT: s_endpgm @@ -6070,7 +6070,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6079,16 +6079,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[10:11] -; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-NEXT: s_mov_b32 s42, s9 -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s50, s9 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -6111,18 +6111,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6130,8 +6130,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-NEXT: .LBB9_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6142,24 +6142,24 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_mov_b32 s8, exec_lo ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -6179,25 +6179,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-NEXT: .LBB9_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6206,14 +6206,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 @@ -6222,16 +6222,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v4, v0, v4 @@ -6244,11 +6244,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -6261,42 +6261,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-DPP-NEXT: .LBB9_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s66, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX9-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 @@ -6304,21 +6304,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 ; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start @@ -6331,68 +6331,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[64:67], 0 +; GFX9-DPP-NEXT: buffer_load_dword v4, off, s[64:67], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], exec -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s10, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s11, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[10:11] -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6409,69 +6409,69 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_mov_b32 s9, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s9 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6488,37 +6488,37 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -6529,7 +6529,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6538,16 +6538,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[10:11] -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6570,18 +6570,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6589,8 +6589,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6601,24 +6601,24 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s8, exec_lo ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6638,25 +6638,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6669,19 +6669,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -6692,15 +6692,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec @@ -6725,21 +6725,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_5 ; GFX7LESS-NEXT: ; %bb.3: -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -6752,44 +6752,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_4 ; GFX7LESS-NEXT: .LBB10_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -6801,14 +6801,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -6833,11 +6833,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB10_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6848,53 +6848,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB10_4 ; GFX9-NEXT: .LBB10_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -6906,14 +6906,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s12, s51 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -6938,11 +6938,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB10_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -6952,55 +6952,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1064-NEXT: .LBB10_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7012,14 +7012,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s12, s51 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7038,16 +7038,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB10_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -7057,37 +7057,37 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1032-NEXT: .LBB10_5: ; GFX1032-NEXT: s_endpgm @@ -7095,11 +7095,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s50, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7107,11 +7107,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b32 s33, s10 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 @@ -7143,11 +7143,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB10_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start @@ -7164,18 +7164,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7183,8 +7183,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1164-NEXT: .LBB10_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7193,7 +7193,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -7202,9 +7202,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 @@ -7233,17 +7233,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB10_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start @@ -7258,25 +7258,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1132-NEXT: .LBB10_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7285,22 +7285,22 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] @@ -7311,30 +7311,30 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -7347,44 +7347,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s54, -1 -; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 -; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s70, -1 +; GFX9-DPP-NEXT: s_mov_b32 s71, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s68, s68, s11 +; GFX9-DPP-NEXT: s_addc_u32 s69, s69, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7396,14 +7396,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7450,74 +7450,74 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s53, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s52, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[54:55], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[64:65], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[54:55] ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], s[44:45] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], s[52:53] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[68:71], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[68:71], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[68:71], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s54 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s55 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[68:71], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] +; GFX9-DPP-NEXT: s_or_b64 s[64:65], vcc, s[64:65] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[64:65] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7529,14 +7529,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7581,10 +7581,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) @@ -7594,55 +7594,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7654,14 +7654,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7696,14 +7696,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -7713,37 +7713,37 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -7751,11 +7751,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7763,11 +7763,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 @@ -7825,10 +7825,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -7845,18 +7845,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7864,8 +7864,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7874,7 +7874,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -7883,9 +7883,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 @@ -7936,14 +7936,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -7958,25 +7958,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8922,12 +8922,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -8947,8 +8947,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 @@ -10355,12 +10355,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -10380,8 +10380,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 @@ -11270,12 +11270,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -11295,8 +11295,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 @@ -11771,13 +11771,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000 @@ -11791,15 +11791,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -11811,11 +11811,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -11828,40 +11828,40 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX9-NEXT: v_mov_b32_e32 v4, 0xc3300000 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000 @@ -11874,20 +11874,20 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s10 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start @@ -11900,48 +11900,48 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-NEXT: .LBB16_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 @@ -11953,19 +11953,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-NEXT: s_mov_b32 s33, s10 -; GFX1064-NEXT: s_mov_b32 s42, s9 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b32 s50, s9 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -11978,53 +11978,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-NEXT: .LBB16_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] @@ -12033,18 +12033,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-NEXT: s_mov_b32 s33, s10 -; GFX1032-NEXT: s_mov_b32 s42, s9 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b32 s50, s9 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -12057,44 +12057,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-NEXT: .LBB16_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 @@ -12115,16 +12115,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s10 -; GFX1164-NEXT: s_mov_b32 s42, s9 -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s50, s9 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -12145,18 +12145,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -12164,8 +12164,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-NEXT: .LBB16_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12173,12 +12173,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 @@ -12193,15 +12193,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -12219,25 +12219,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-NEXT: .LBB16_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12246,13 +12246,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX7LESS-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -12266,15 +12266,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 @@ -12286,11 +12286,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -12303,40 +12303,40 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-DPP-NEXT: .LBB16_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX9-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s66, -1 +; GFX9-DPP-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -12349,20 +12349,20 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start @@ -12375,48 +12375,48 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-DPP-NEXT: .LBB16_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 @@ -12428,19 +12428,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12453,53 +12453,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-DPP-NEXT: .LBB16_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] @@ -12508,18 +12508,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12532,44 +12532,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-DPP-NEXT: .LBB16_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 @@ -12590,16 +12590,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12620,18 +12620,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -12639,8 +12639,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-DPP-NEXT: .LBB16_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12648,12 +12648,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 @@ -12668,15 +12668,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -12694,25 +12694,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-DPP-NEXT: .LBB16_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12725,19 +12725,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -12748,15 +12748,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec @@ -12781,21 +12781,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execz .LBB17_5 ; GFX7LESS-NEXT: ; %bb.3: -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -12808,44 +12808,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_4 ; GFX7LESS-NEXT: .LBB17_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -12857,14 +12857,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -12889,11 +12889,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB17_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -12904,53 +12904,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB17_4 ; GFX9-NEXT: .LBB17_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -12962,14 +12962,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s12, s51 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -12994,11 +12994,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB17_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -13008,55 +13008,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1064-NEXT: .LBB17_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13068,14 +13068,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s12, s51 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13094,16 +13094,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB17_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -13113,37 +13113,37 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1032-NEXT: .LBB17_5: ; GFX1032-NEXT: s_endpgm @@ -13151,11 +13151,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s50, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13163,11 +13163,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b32 s33, s10 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 @@ -13199,11 +13199,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB17_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start @@ -13220,18 +13220,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -13239,8 +13239,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1164-NEXT: .LBB17_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13249,7 +13249,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -13258,9 +13258,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 @@ -13289,17 +13289,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB17_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start @@ -13314,25 +13314,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1132-NEXT: .LBB17_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13341,22 +13341,22 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] @@ -13367,30 +13367,30 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -13403,44 +13403,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s54, -1 -; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 -; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s70, -1 +; GFX9-DPP-NEXT: s_mov_b32 s71, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s68, s68, s11 +; GFX9-DPP-NEXT: s_addc_u32 s69, s69, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13452,14 +13452,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13506,74 +13506,74 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s53, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s52, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[54:55], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[64:65], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[54:55] ; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], s[44:45] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], s[52:53] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[68:71], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[68:71], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[68:71], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s54 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s55 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[68:71], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] +; GFX9-DPP-NEXT: s_or_b64 s[64:65], vcc, s[64:65] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[64:65] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX9-DPP-NEXT: .LBB17_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13585,14 +13585,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13637,10 +13637,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) @@ -13650,55 +13650,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1064-DPP-NEXT: .LBB17_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13710,14 +13710,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13752,14 +13752,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -13769,37 +13769,37 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1032-DPP-NEXT: .LBB17_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -13807,11 +13807,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13819,11 +13819,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 @@ -13881,10 +13881,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start @@ -13901,18 +13901,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -13920,8 +13920,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1164-DPP-NEXT: .LBB17_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13930,7 +13930,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -13939,9 +13939,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 @@ -13992,14 +13992,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start @@ -14014,25 +14014,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1132-DPP-NEXT: .LBB17_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index a4410bb9ed2d0..e1ba4a2b0bf2a 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -635,12 +635,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -660,8 +660,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 @@ -1674,12 +1674,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -1699,8 +1699,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 @@ -2713,12 +2713,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -2738,8 +2738,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 @@ -3135,13 +3135,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -3149,15 +3149,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -3169,8 +3169,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 @@ -3178,8 +3178,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 @@ -3188,59 +3188,59 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_add_u32 s64, s64, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s10 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start @@ -3253,36 +3253,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-NEXT: .LBB6_3: ; GFX9-NEXT: s_endpgm @@ -3290,32 +3290,32 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-NEXT: s_mov_b32 s33, s10 -; GFX1064-NEXT: s_mov_b32 s42, s9 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b32 s50, s9 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -3328,38 +3328,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1064-NEXT: .LBB6_3: ; GFX1064-NEXT: s_endpgm @@ -3367,31 +3367,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-NEXT: s_mov_b32 s33, s10 -; GFX1032-NEXT: s_mov_b32 s42, s9 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b32 s50, s9 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -3404,38 +3404,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1032-NEXT: .LBB6_3: ; GFX1032-NEXT: s_endpgm @@ -3444,7 +3444,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3452,16 +3452,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s10 -; GFX1164-NEXT: s_mov_b32 s42, s9 -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s50, s9 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -3483,18 +3483,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -3502,8 +3502,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-NEXT: .LBB6_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3513,22 +3513,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -3547,26 +3547,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-NEXT: .LBB6_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3575,13 +3575,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -3589,15 +3589,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 @@ -3609,8 +3609,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 @@ -3618,8 +3618,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 @@ -3628,59 +3628,59 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-DPP-NEXT: .LBB6_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s66, -1 +; GFX9-DPP-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX9-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start @@ -3693,36 +3693,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm @@ -3730,32 +3730,32 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3768,38 +3768,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -3807,31 +3807,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3844,38 +3844,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -3884,7 +3884,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3892,16 +3892,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3923,18 +3923,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -3942,8 +3942,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-DPP-NEXT: .LBB6_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3953,22 +3953,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -3987,26 +3987,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-DPP-NEXT: .LBB6_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4019,19 +4019,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -4042,15 +4042,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec @@ -4077,19 +4077,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_5 ; GFX7LESS-NEXT: ; %bb.3: -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[41:42] @@ -4097,8 +4097,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 @@ -4107,43 +4107,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_4 ; GFX7LESS-NEXT: .LBB7_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4155,14 +4155,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4189,12 +4189,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB7_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4205,54 +4205,54 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB7_4 ; GFX9-NEXT: .LBB7_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4264,14 +4264,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s12, s51 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4298,12 +4298,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB7_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -4313,56 +4313,56 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1064-NEXT: .LBB7_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4374,14 +4374,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s12, s51 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4402,17 +4402,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB7_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -4422,38 +4422,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1032-NEXT: .LBB7_5: ; GFX1032-NEXT: s_endpgm @@ -4461,11 +4461,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s50, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4473,11 +4473,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b32 s33, s10 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 @@ -4511,12 +4511,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB7_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[52:53] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start @@ -4530,15 +4530,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off @@ -4553,8 +4553,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1164-NEXT: .LBB7_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4563,7 +4563,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -4572,9 +4572,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 @@ -4606,19 +4606,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB7_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[52:53] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start @@ -4632,16 +4632,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_mov_b32_e32 v3, s45 +; GFX1132-NEXT: v_mov_b32_e32 v3, s53 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s44 +; GFX1132-NEXT: v_mov_b32_e32 v2, s52 ; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 @@ -4653,8 +4653,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1132-NEXT: .LBB7_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4663,22 +4663,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] @@ -4689,34 +4689,34 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[52:55], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 @@ -4725,43 +4725,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s54, -1 -; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 -; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s70, -1 +; GFX9-DPP-NEXT: s_mov_b32 s71, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s68, s68, s11 +; GFX9-DPP-NEXT: s_addc_u32 s69, s69, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4773,14 +4773,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4834,20 +4834,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s53, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s52, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[54:55], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[64:65], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[54:55] ; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[44:45], s[44:45] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[52:53], s[52:53] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -4856,54 +4856,54 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[68:71], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[68:71], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[68:71], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s54 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s55 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[68:71], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] +; GFX9-DPP-NEXT: s_or_b64 s[64:65], vcc, s[64:65] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[64:65] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4915,14 +4915,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4975,10 +4975,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] @@ -4989,56 +4989,56 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1064-DPP-NEXT: .LBB7_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -5050,14 +5050,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -5098,15 +5098,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -5116,38 +5116,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -5155,11 +5155,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -5167,11 +5167,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 @@ -5239,11 +5239,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start @@ -5261,18 +5261,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -5280,8 +5280,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -5290,7 +5290,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5299,9 +5299,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 @@ -5356,16 +5356,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start @@ -5381,26 +5381,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6162,12 +6162,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -6187,8 +6187,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 @@ -6683,13 +6683,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -6697,15 +6697,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -6717,8 +6717,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 @@ -6726,8 +6726,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 @@ -6736,59 +6736,59 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-NEXT: .LBB10_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_add_u32 s64, s64, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s10 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start @@ -6801,36 +6801,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-NEXT: .LBB10_3: ; GFX9-NEXT: s_endpgm @@ -6838,32 +6838,32 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-NEXT: s_mov_b32 s33, s10 -; GFX1064-NEXT: s_mov_b32 s42, s9 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b32 s50, s9 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -6876,38 +6876,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-NEXT: .LBB10_3: ; GFX1064-NEXT: s_endpgm @@ -6915,31 +6915,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-NEXT: s_mov_b32 s33, s10 -; GFX1032-NEXT: s_mov_b32 s42, s9 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b32 s50, s9 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -6952,38 +6952,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-NEXT: .LBB10_3: ; GFX1032-NEXT: s_endpgm @@ -6992,7 +6992,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7000,16 +7000,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s10 -; GFX1164-NEXT: s_mov_b32 s42, s9 -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s50, s9 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -7031,18 +7031,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7050,8 +7050,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-NEXT: .LBB10_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7061,22 +7061,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -7095,26 +7095,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-NEXT: .LBB10_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7123,13 +7123,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -7137,15 +7137,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 @@ -7157,8 +7157,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 @@ -7166,8 +7166,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 @@ -7176,59 +7176,59 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-DPP-NEXT: .LBB10_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s66, -1 +; GFX9-DPP-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX9-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -7241,36 +7241,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm @@ -7278,32 +7278,32 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7316,38 +7316,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -7355,31 +7355,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7392,38 +7392,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -7432,7 +7432,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7440,16 +7440,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7471,18 +7471,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7490,8 +7490,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7501,22 +7501,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -7535,26 +7535,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7567,19 +7567,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -7590,15 +7590,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec @@ -7625,19 +7625,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_5 ; GFX7LESS-NEXT: ; %bb.3: -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[41:42] @@ -7645,8 +7645,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 @@ -7655,43 +7655,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_4 ; GFX7LESS-NEXT: .LBB11_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -7703,14 +7703,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7737,12 +7737,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB11_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7753,54 +7753,54 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB11_4 ; GFX9-NEXT: .LBB11_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -7812,14 +7812,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s12, s51 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7846,12 +7846,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB11_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -7861,56 +7861,56 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1064-NEXT: .LBB11_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -7922,14 +7922,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s12, s51 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7950,17 +7950,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB11_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -7970,38 +7970,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1032-NEXT: .LBB11_5: ; GFX1032-NEXT: s_endpgm @@ -8009,11 +8009,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s50, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8021,11 +8021,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b32 s33, s10 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 @@ -8059,12 +8059,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB11_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[52:53] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start @@ -8078,15 +8078,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off @@ -8101,8 +8101,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1164-NEXT: .LBB11_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8111,7 +8111,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -8120,9 +8120,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 @@ -8154,19 +8154,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB11_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[52:53] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start @@ -8180,16 +8180,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_mov_b32_e32 v3, s45 +; GFX1132-NEXT: v_mov_b32_e32 v3, s53 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s44 +; GFX1132-NEXT: v_mov_b32_e32 v2, s52 ; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 @@ -8201,8 +8201,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1132-NEXT: .LBB11_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8211,22 +8211,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8237,34 +8237,34 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[52:55], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 @@ -8273,43 +8273,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s54, -1 -; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 -; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s70, -1 +; GFX9-DPP-NEXT: s_mov_b32 s71, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s68, s68, s11 +; GFX9-DPP-NEXT: s_addc_u32 s69, s69, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8321,14 +8321,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -8382,20 +8382,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s53, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s52, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[54:55], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[64:65], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[54:55] ; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[44:45], s[44:45] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[52:53], s[52:53] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -8404,54 +8404,54 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[68:71], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[68:71], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[68:71], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s54 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s55 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[68:71], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] +; GFX9-DPP-NEXT: s_or_b64 s[64:65], vcc, s[64:65] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[64:65] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8463,14 +8463,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -8523,10 +8523,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] @@ -8537,56 +8537,56 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8598,14 +8598,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -8646,15 +8646,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -8664,38 +8664,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -8703,11 +8703,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8715,11 +8715,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 @@ -8787,11 +8787,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start @@ -8809,18 +8809,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8828,8 +8828,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8838,7 +8838,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8847,9 +8847,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 @@ -8904,16 +8904,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start @@ -8929,26 +8929,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 68d7dcc60506c..6b1d5253e178f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -635,12 +635,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -660,8 +660,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 @@ -1674,12 +1674,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -1699,8 +1699,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 @@ -2713,12 +2713,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -2738,8 +2738,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 @@ -3135,13 +3135,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -3149,15 +3149,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -3169,8 +3169,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 @@ -3178,8 +3178,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 @@ -3188,59 +3188,59 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_add_u32 s64, s64, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s10 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start @@ -3253,36 +3253,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-NEXT: .LBB6_3: ; GFX9-NEXT: s_endpgm @@ -3290,32 +3290,32 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-NEXT: s_mov_b32 s33, s10 -; GFX1064-NEXT: s_mov_b32 s42, s9 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b32 s50, s9 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -3328,38 +3328,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1064-NEXT: .LBB6_3: ; GFX1064-NEXT: s_endpgm @@ -3367,31 +3367,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-NEXT: s_mov_b32 s33, s10 -; GFX1032-NEXT: s_mov_b32 s42, s9 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b32 s50, s9 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -3404,38 +3404,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1032-NEXT: .LBB6_3: ; GFX1032-NEXT: s_endpgm @@ -3444,7 +3444,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3452,16 +3452,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s10 -; GFX1164-NEXT: s_mov_b32 s42, s9 -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s50, s9 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -3483,18 +3483,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -3502,8 +3502,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-NEXT: .LBB6_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3513,22 +3513,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -3547,26 +3547,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-NEXT: .LBB6_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3575,13 +3575,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -3589,15 +3589,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 @@ -3609,8 +3609,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 @@ -3618,8 +3618,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 @@ -3628,59 +3628,59 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-DPP-NEXT: .LBB6_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s66, -1 +; GFX9-DPP-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX9-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start @@ -3693,36 +3693,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm @@ -3730,32 +3730,32 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3768,38 +3768,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -3807,31 +3807,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3844,38 +3844,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -3884,7 +3884,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3892,16 +3892,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3923,18 +3923,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -3942,8 +3942,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-DPP-NEXT: .LBB6_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3953,22 +3953,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -3987,26 +3987,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-DPP-NEXT: .LBB6_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4019,19 +4019,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -4042,15 +4042,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec @@ -4077,19 +4077,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_5 ; GFX7LESS-NEXT: ; %bb.3: -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[41:42] @@ -4097,8 +4097,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 @@ -4107,43 +4107,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_4 ; GFX7LESS-NEXT: .LBB7_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4155,14 +4155,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4189,12 +4189,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB7_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4205,54 +4205,54 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB7_4 ; GFX9-NEXT: .LBB7_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4264,14 +4264,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s12, s51 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4298,12 +4298,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB7_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -4313,56 +4313,56 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1064-NEXT: .LBB7_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4374,14 +4374,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s12, s51 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4402,17 +4402,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB7_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -4422,38 +4422,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1032-NEXT: .LBB7_5: ; GFX1032-NEXT: s_endpgm @@ -4461,11 +4461,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s50, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4473,11 +4473,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b32 s33, s10 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 @@ -4511,12 +4511,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB7_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[52:53] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start @@ -4530,15 +4530,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off @@ -4553,8 +4553,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1164-NEXT: .LBB7_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4563,7 +4563,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -4572,9 +4572,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 @@ -4606,19 +4606,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB7_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[52:53] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start @@ -4632,16 +4632,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_mov_b32_e32 v3, s45 +; GFX1132-NEXT: v_mov_b32_e32 v3, s53 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s44 +; GFX1132-NEXT: v_mov_b32_e32 v2, s52 ; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 @@ -4653,8 +4653,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1132-NEXT: .LBB7_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4663,22 +4663,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] @@ -4689,34 +4689,34 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[52:55], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 @@ -4725,43 +4725,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s54, -1 -; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 -; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s70, -1 +; GFX9-DPP-NEXT: s_mov_b32 s71, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s68, s68, s11 +; GFX9-DPP-NEXT: s_addc_u32 s69, s69, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4773,14 +4773,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4834,20 +4834,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s53, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s52, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[54:55], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[64:65], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[54:55] ; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[44:45], s[44:45] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[52:53], s[52:53] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -4856,54 +4856,54 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[68:71], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[68:71], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[68:71], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s54 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s55 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[68:71], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] +; GFX9-DPP-NEXT: s_or_b64 s[64:65], vcc, s[64:65] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[64:65] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4915,14 +4915,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4975,10 +4975,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] @@ -4989,56 +4989,56 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1064-DPP-NEXT: .LBB7_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -5050,14 +5050,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -5098,15 +5098,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -5116,38 +5116,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -5155,11 +5155,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -5167,11 +5167,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 @@ -5239,11 +5239,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start @@ -5261,18 +5261,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -5280,8 +5280,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -5290,7 +5290,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5299,9 +5299,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 @@ -5356,16 +5356,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start @@ -5381,26 +5381,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6162,12 +6162,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -6187,8 +6187,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 @@ -6683,13 +6683,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -6697,15 +6697,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -6717,8 +6717,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 @@ -6726,8 +6726,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 @@ -6736,59 +6736,59 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-NEXT: .LBB10_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_add_u32 s64, s64, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s10 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start @@ -6801,36 +6801,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-NEXT: .LBB10_3: ; GFX9-NEXT: s_endpgm @@ -6838,32 +6838,32 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-NEXT: s_mov_b32 s33, s10 -; GFX1064-NEXT: s_mov_b32 s42, s9 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b32 s50, s9 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -6876,38 +6876,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-NEXT: .LBB10_3: ; GFX1064-NEXT: s_endpgm @@ -6915,31 +6915,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-NEXT: s_mov_b32 s33, s10 -; GFX1032-NEXT: s_mov_b32 s42, s9 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b32 s50, s9 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -6952,38 +6952,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-NEXT: .LBB10_3: ; GFX1032-NEXT: s_endpgm @@ -6992,7 +6992,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7000,16 +7000,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s10 -; GFX1164-NEXT: s_mov_b32 s42, s9 -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s50, s9 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -7031,18 +7031,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7050,8 +7050,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-NEXT: .LBB10_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7061,22 +7061,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -7095,26 +7095,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-NEXT: .LBB10_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7123,13 +7123,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -7137,15 +7137,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 @@ -7157,8 +7157,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 @@ -7166,8 +7166,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 @@ -7176,59 +7176,59 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-DPP-NEXT: .LBB10_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s66, -1 +; GFX9-DPP-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX9-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -7241,36 +7241,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm @@ -7278,32 +7278,32 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7316,38 +7316,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -7355,31 +7355,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7392,38 +7392,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -7432,7 +7432,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7440,16 +7440,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7471,18 +7471,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7490,8 +7490,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7501,22 +7501,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -7535,26 +7535,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7567,19 +7567,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -7590,15 +7590,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec @@ -7625,19 +7625,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_5 ; GFX7LESS-NEXT: ; %bb.3: -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[41:42] @@ -7645,8 +7645,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 @@ -7655,43 +7655,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_4 ; GFX7LESS-NEXT: .LBB11_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -7703,14 +7703,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7737,12 +7737,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB11_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7753,54 +7753,54 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB11_4 ; GFX9-NEXT: .LBB11_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -7812,14 +7812,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s12, s51 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7846,12 +7846,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB11_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -7861,56 +7861,56 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1064-NEXT: .LBB11_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -7922,14 +7922,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s12, s51 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7950,17 +7950,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB11_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -7970,38 +7970,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1032-NEXT: .LBB11_5: ; GFX1032-NEXT: s_endpgm @@ -8009,11 +8009,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s50, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8021,11 +8021,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b32 s33, s10 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 @@ -8059,12 +8059,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB11_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[52:53] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start @@ -8078,15 +8078,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off @@ -8101,8 +8101,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1164-NEXT: .LBB11_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8111,7 +8111,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -8120,9 +8120,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 @@ -8154,19 +8154,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB11_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[52:53] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start @@ -8180,16 +8180,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_mov_b32_e32 v3, s45 +; GFX1132-NEXT: v_mov_b32_e32 v3, s53 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s44 +; GFX1132-NEXT: v_mov_b32_e32 v2, s52 ; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 @@ -8201,8 +8201,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1132-NEXT: .LBB11_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8211,22 +8211,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8237,34 +8237,34 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[52:55], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 @@ -8273,43 +8273,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s54, -1 -; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 -; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s70, -1 +; GFX9-DPP-NEXT: s_mov_b32 s71, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s68, s68, s11 +; GFX9-DPP-NEXT: s_addc_u32 s69, s69, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8321,14 +8321,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -8382,20 +8382,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s53, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s52, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[54:55], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[64:65], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[54:55] ; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[44:45], s[44:45] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[52:53], s[52:53] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -8404,54 +8404,54 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[68:71], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[68:71], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[68:71], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s54 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s55 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[68:71], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] +; GFX9-DPP-NEXT: s_or_b64 s[64:65], vcc, s[64:65] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[64:65] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8463,14 +8463,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -8523,10 +8523,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] @@ -8537,56 +8537,56 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8598,14 +8598,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -8646,15 +8646,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -8664,38 +8664,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -8703,11 +8703,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8715,11 +8715,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 @@ -8787,11 +8787,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start @@ -8809,18 +8809,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8828,8 +8828,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8838,7 +8838,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8847,9 +8847,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 @@ -8904,16 +8904,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start @@ -8929,26 +8929,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 7126680525b87..d575605f102b7 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -821,12 +821,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -846,8 +846,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -2153,12 +2153,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -2178,8 +2178,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -3485,12 +3485,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -3510,8 +3510,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -4313,12 +4313,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -4338,8 +4338,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -5644,12 +5644,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -5669,8 +5669,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -6075,14 +6075,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 @@ -6091,16 +6091,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 ; GFX7LESS-NEXT: v_or_b32_e32 v4, v0, v4 @@ -6113,11 +6113,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -6130,42 +6130,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_add_u32 s64, s64, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 @@ -6173,21 +6173,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s10 ; GFX9-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-NEXT: s_mov_b32 s42, s9 -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-NEXT: s_mov_b32 s50, s9 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start @@ -6200,68 +6200,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v3, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v4, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-NEXT: .LBB9_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b32 s33, s10 ; GFX1064-NEXT: s_mov_b64 s[10:11], exec -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, s10, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s11, v3 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[10:11] -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6278,69 +6278,69 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-NEXT: .LBB9_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_mov_b32 s9, exec_lo -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s9 ; GFX1032-NEXT: s_mov_b32 s33, s10 ; GFX1032-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6357,37 +6357,37 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-NEXT: .LBB9_3: ; GFX1032-NEXT: s_endpgm @@ -6398,7 +6398,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6407,16 +6407,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[10:11] -; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-NEXT: s_mov_b32 s42, s9 -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s50, s9 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -6439,18 +6439,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6458,8 +6458,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-NEXT: .LBB9_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6470,24 +6470,24 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_mov_b32 s8, exec_lo ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -6507,25 +6507,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-NEXT: .LBB9_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6534,14 +6534,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 @@ -6550,16 +6550,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v4, v0, v4 @@ -6572,11 +6572,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -6589,42 +6589,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-DPP-NEXT: .LBB9_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s66, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX9-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 @@ -6632,21 +6632,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 ; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start @@ -6659,68 +6659,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[64:67], 0 +; GFX9-DPP-NEXT: buffer_load_dword v4, off, s[64:67], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], exec -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s10, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s11, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[10:11] -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6737,69 +6737,69 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_mov_b32 s9, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s9 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6816,37 +6816,37 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -6857,7 +6857,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6866,16 +6866,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[10:11] -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6898,18 +6898,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6917,8 +6917,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6929,24 +6929,24 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s8, exec_lo ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6966,25 +6966,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6997,19 +6997,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -7020,15 +7020,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec @@ -7053,21 +7053,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_5 ; GFX7LESS-NEXT: ; %bb.3: -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -7080,44 +7080,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_4 ; GFX7LESS-NEXT: .LBB10_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7129,14 +7129,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7161,11 +7161,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB10_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7176,53 +7176,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB10_4 ; GFX9-NEXT: .LBB10_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7234,14 +7234,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s12, s51 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7266,11 +7266,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB10_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -7280,55 +7280,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1064-NEXT: .LBB10_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7340,14 +7340,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s12, s51 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7366,16 +7366,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB10_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -7385,37 +7385,37 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1032-NEXT: .LBB10_5: ; GFX1032-NEXT: s_endpgm @@ -7423,11 +7423,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s50, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7435,11 +7435,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b32 s33, s10 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 @@ -7471,11 +7471,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB10_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start @@ -7492,18 +7492,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7511,8 +7511,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1164-NEXT: .LBB10_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7521,7 +7521,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -7530,9 +7530,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 @@ -7561,17 +7561,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB10_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start @@ -7586,25 +7586,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1132-NEXT: .LBB10_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7613,22 +7613,22 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] @@ -7639,30 +7639,30 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -7675,44 +7675,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s54, -1 -; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 -; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s70, -1 +; GFX9-DPP-NEXT: s_mov_b32 s71, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s68, s68, s11 +; GFX9-DPP-NEXT: s_addc_u32 s69, s69, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7724,14 +7724,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7778,74 +7778,74 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s53, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s52, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[54:55], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[64:65], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[54:55] ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -s[44:45] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -s[52:53] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[68:71], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[68:71], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[68:71], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s54 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s55 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[68:71], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] +; GFX9-DPP-NEXT: s_or_b64 s[64:65], vcc, s[64:65] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[64:65] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7857,14 +7857,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7909,10 +7909,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) @@ -7922,55 +7922,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7982,14 +7982,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -8024,14 +8024,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -8041,37 +8041,37 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -8079,11 +8079,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -8091,11 +8091,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 @@ -8153,10 +8153,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -8173,18 +8173,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8192,8 +8192,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8202,7 +8202,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8211,9 +8211,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 @@ -8264,14 +8264,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -8286,25 +8286,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -9249,12 +9249,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -9274,8 +9274,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 @@ -10682,12 +10682,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -10707,8 +10707,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 @@ -11597,12 +11597,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -11622,8 +11622,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 @@ -12097,13 +12097,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000 @@ -12117,15 +12117,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -12137,11 +12137,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -12154,40 +12154,40 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX9-NEXT: v_mov_b32_e32 v4, 0xc3300000 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000 @@ -12200,20 +12200,20 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s10 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start @@ -12226,48 +12226,48 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-NEXT: .LBB16_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 @@ -12279,19 +12279,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-NEXT: s_mov_b32 s33, s10 -; GFX1064-NEXT: s_mov_b32 s42, s9 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b32 s50, s9 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -12304,53 +12304,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-NEXT: .LBB16_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] @@ -12359,18 +12359,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-NEXT: s_mov_b32 s33, s10 -; GFX1032-NEXT: s_mov_b32 s42, s9 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b32 s50, s9 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -12383,44 +12383,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-NEXT: .LBB16_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 @@ -12441,16 +12441,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s10 -; GFX1164-NEXT: s_mov_b32 s42, s9 -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s50, s9 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -12471,18 +12471,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -12490,8 +12490,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-NEXT: .LBB16_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12499,12 +12499,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 @@ -12519,15 +12519,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -12545,25 +12545,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-NEXT: .LBB16_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12572,13 +12572,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX7LESS-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -12592,15 +12592,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 @@ -12612,11 +12612,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -12629,40 +12629,40 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-DPP-NEXT: .LBB16_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX9-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s66, -1 +; GFX9-DPP-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -12675,20 +12675,20 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start @@ -12701,48 +12701,48 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-DPP-NEXT: .LBB16_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 @@ -12754,19 +12754,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12779,53 +12779,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-DPP-NEXT: .LBB16_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] @@ -12834,18 +12834,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12858,44 +12858,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-DPP-NEXT: .LBB16_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 @@ -12916,16 +12916,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12946,18 +12946,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -12965,8 +12965,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-DPP-NEXT: .LBB16_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12974,12 +12974,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 @@ -12994,15 +12994,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -13020,25 +13020,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-DPP-NEXT: .LBB16_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13051,19 +13051,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -13074,15 +13074,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec @@ -13107,21 +13107,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execz .LBB17_5 ; GFX7LESS-NEXT: ; %bb.3: -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -13134,44 +13134,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_4 ; GFX7LESS-NEXT: .LBB17_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13183,14 +13183,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13215,11 +13215,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB17_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -13230,53 +13230,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB17_4 ; GFX9-NEXT: .LBB17_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13288,14 +13288,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s12, s51 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13320,11 +13320,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB17_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -13334,55 +13334,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1064-NEXT: .LBB17_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13394,14 +13394,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s12, s51 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13420,16 +13420,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB17_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -13439,37 +13439,37 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1032-NEXT: .LBB17_5: ; GFX1032-NEXT: s_endpgm @@ -13477,11 +13477,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s50, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13489,11 +13489,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b32 s33, s10 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 @@ -13525,11 +13525,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB17_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start @@ -13546,18 +13546,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -13565,8 +13565,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1164-NEXT: .LBB17_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13575,7 +13575,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -13584,9 +13584,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 @@ -13615,17 +13615,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB17_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start @@ -13640,25 +13640,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1132-NEXT: .LBB17_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13667,22 +13667,22 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] @@ -13693,30 +13693,30 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -13729,44 +13729,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s54, -1 -; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 -; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s70, -1 +; GFX9-DPP-NEXT: s_mov_b32 s71, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s68, s68, s11 +; GFX9-DPP-NEXT: s_addc_u32 s69, s69, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13778,14 +13778,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13832,74 +13832,74 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s53, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s52, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[54:55], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[64:65], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[54:55] ; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -s[44:45] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -s[52:53] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[68:71], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[68:71], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[68:71], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s54 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s55 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[68:71], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] +; GFX9-DPP-NEXT: s_or_b64 s[64:65], vcc, s[64:65] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[64:65] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX9-DPP-NEXT: .LBB17_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13911,14 +13911,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13963,10 +13963,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) @@ -13976,55 +13976,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1064-DPP-NEXT: .LBB17_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -14036,14 +14036,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -14078,14 +14078,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -14095,37 +14095,37 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1032-DPP-NEXT: .LBB17_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -14133,11 +14133,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -14145,11 +14145,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 @@ -14207,10 +14207,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start @@ -14227,18 +14227,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -14246,8 +14246,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1164-DPP-NEXT: .LBB17_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -14256,7 +14256,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -14265,9 +14265,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 @@ -14318,14 +14318,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start @@ -14340,25 +14340,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1132-DPP-NEXT: .LBB17_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir index dde84af57ed25..da1175c02e94a 100644 --- a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir @@ -31,102 +31,105 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr34_sgpr35 = COPY $sgpr8_sgpr9 ; CHECK-NEXT: renamable $sgpr33 = COPY $sgpr15 - ; CHECK-NEXT: renamable $sgpr42 = COPY $sgpr14 + ; CHECK-NEXT: renamable $sgpr50 = COPY $sgpr14 ; CHECK-NEXT: renamable $sgpr36_sgpr37 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: renamable $sgpr38_sgpr39 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: renamable $sgpr40_sgpr41 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: renamable $sgpr66_sgpr67 = S_LOAD_DWORDX2_IMM renamable $sgpr34_sgpr35, 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4) - ; CHECK-NEXT: renamable $sgpr44 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr45 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr46 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr47 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr48 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr49 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr50 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr51 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr52 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr53 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr54 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr55 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr56 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr57 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr58 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr59 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr60 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr61 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr62 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr63 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr64 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr68_sgpr69 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $sgpr48_sgpr49 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: renamable $sgpr64_sgpr65 = S_LOAD_DWORDX2_IMM renamable $sgpr34_sgpr35, 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4) + ; CHECK-NEXT: renamable $sgpr68 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr69 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr70 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr71 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr72 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr73 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr74 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr75 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr76 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr77 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr78 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr79 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr80 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr81 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr82 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr83 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr84 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr85 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr86 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr87 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr88 = S_MOV_B32 0 + ; CHECK-NEXT: SI_SPILL_S1024_SAVE renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr52_sgpr53 = IMPLICIT_DEF ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL renamable $sgpr68_sgpr69, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL renamable $sgpr52_sgpr53, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY killed renamable $sgpr40_sgpr41 + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY killed renamable $sgpr48_sgpr49 ; CHECK-NEXT: $sgpr6_sgpr7 = COPY killed renamable $sgpr38_sgpr39 ; CHECK-NEXT: $sgpr8_sgpr9 = COPY killed renamable $sgpr34_sgpr35 ; CHECK-NEXT: $sgpr10_sgpr11 = COPY killed renamable $sgpr36_sgpr37 - ; CHECK-NEXT: $sgpr12 = COPY killed renamable $sgpr42 + ; CHECK-NEXT: $sgpr12 = COPY killed renamable $sgpr50 ; CHECK-NEXT: $sgpr13 = COPY killed renamable $sgpr33 - ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr68_sgpr69, 0, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr52_sgpr53, 0, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: renamable $sgpr4_sgpr5 = COPY $exec, implicit-def $exec ; CHECK-NEXT: dead renamable $sgpr6_sgpr7 = IMPLICIT_DEF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) - ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr66_sgpr67:0x000000000000000F, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x000003FFFFFFFFFF + ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr64_sgpr65:0x000000000000000F ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr6_sgpr7 = COPY $exec, implicit-def $exec ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) - ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr66_sgpr67:0x000000000000000F, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x000003FFFFFFFFFF + ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr64_sgpr65:0x000000000000000F ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 - ; CHECK-NEXT: renamable $sgpr6 = S_LSHL_B32 renamable $sgpr67, 1, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 + ; CHECK-NEXT: renamable $sgpr6 = S_LSHL_B32 renamable $sgpr65, 1, implicit-def dead $scc ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vreg_1024 = V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 [[COPY]], 0, killed $sgpr6, 3, implicit-def $m0, implicit $m0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.1(0x40000000) - ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr66_sgpr67:0x000000000000000F, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x000003FFFFFFFFFF + ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr64_sgpr65:0x000000000000000F, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0000000000000003 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr6_sgpr7 = S_OR_SAVEEXEC_B64 renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK-NEXT: renamable $sgpr68 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr69 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr70 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr71 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr72 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr73 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr74 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr75 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr76 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr77 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr78 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr79 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr80 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr81 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr82 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr83 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr84 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr85 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr86 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr87 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr88 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr89 = COPY renamable $sgpr44 - ; CHECK-NEXT: dead [[COPY1:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, implicit $exec + ; CHECK-NEXT: renamable $sgpr36 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr37 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr38 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr39 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr40 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr41 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr42 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr43 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr44 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr45 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr46 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr47 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr48 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr49 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr50 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr51 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr52 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr53 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr54 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr55 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr56 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr57 = COPY killed renamable $sgpr68 + ; CHECK-NEXT: dead [[COPY1:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) - ; CHECK-NEXT: liveins: $sgpr6_sgpr7, $sgpr66_sgpr67:0x0000000000000003, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x000003FFFFFFFFFF + ; CHECK-NEXT: liveins: $sgpr6_sgpr7, $sgpr64_sgpr65:0x0000000000000003 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc - ; CHECK-NEXT: dead renamable $sgpr4 = S_LSHL_B32 killed renamable $sgpr66, 1, implicit-def dead $scc - ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 + ; CHECK-NEXT: dead renamable $sgpr4 = S_LSHL_B32 killed renamable $sgpr64, 1, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_1024 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: bb.0: @@ -211,7 +214,6 @@ body: | %15.sub19:sgpr_1024 = COPY %7.sub0 %15.sub20:sgpr_1024 = COPY %7.sub0 %15.sub21:sgpr_1024 = COPY %7.sub0 - ; Spill code ends up getting inserted here, and we end up with many unspillable sgpr1024 ranges %16:vreg_1024 = COPY %15, implicit $exec $exec = S_XOR_B64_term $exec, %14, implicit-def $scc S_CBRANCH_EXECZ %bb.5, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 81eac63ae5bdf..8dbd6c5d133ea 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -16,28 +16,20 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v5, s36, 4 ; CHECK-NEXT: v_writelane_b32 v5, s37, 5 ; CHECK-NEXT: v_writelane_b32 v5, s38, 6 -; CHECK-NEXT: v_writelane_b32 v5, s39, 7 -; CHECK-NEXT: v_writelane_b32 v5, s40, 8 -; CHECK-NEXT: v_writelane_b32 v5, s41, 9 -; CHECK-NEXT: v_writelane_b32 v5, s42, 10 -; CHECK-NEXT: v_writelane_b32 v5, s43, 11 -; CHECK-NEXT: v_writelane_b32 v5, s44, 12 -; CHECK-NEXT: v_writelane_b32 v5, s45, 13 -; CHECK-NEXT: v_writelane_b32 v5, s46, 14 ; CHECK-NEXT: s_getpc_b64 s[24:25] -; CHECK-NEXT: v_writelane_b32 v5, s47, 15 +; CHECK-NEXT: v_writelane_b32 v5, s39, 7 ; CHECK-NEXT: s_movk_i32 s20, 0xf0 ; CHECK-NEXT: s_mov_b32 s21, s24 -; CHECK-NEXT: v_writelane_b32 v5, s48, 16 +; CHECK-NEXT: v_writelane_b32 v5, s48, 8 ; CHECK-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 ; CHECK-NEXT: s_mov_b64 s[20:21], 0 -; CHECK-NEXT: v_writelane_b32 v5, s49, 17 +; CHECK-NEXT: v_writelane_b32 v5, s49, 9 ; CHECK-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 -; CHECK-NEXT: v_writelane_b32 v5, s50, 18 +; CHECK-NEXT: v_writelane_b32 v5, s50, 10 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_movk_i32 s22, 0x130 ; CHECK-NEXT: s_mov_b32 s23, s24 -; CHECK-NEXT: v_writelane_b32 v5, s51, 19 +; CHECK-NEXT: v_writelane_b32 v5, s51, 11 ; CHECK-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0 ; CHECK-NEXT: s_mov_b32 s28, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 @@ -49,60 +41,51 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: image_sample_lz v3, v[2:3], s[12:19], s[28:31] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 ; CHECK-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane -; CHECK-NEXT: v_writelane_b32 v5, s52, 20 +; CHECK-NEXT: v_writelane_b32 v5, s52, 12 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_writelane_b32 v6, s36, 0 -; CHECK-NEXT: v_writelane_b32 v5, s53, 21 -; CHECK-NEXT: v_writelane_b32 v5, s54, 22 -; CHECK-NEXT: v_writelane_b32 v5, s55, 23 -; CHECK-NEXT: v_writelane_b32 v5, s56, 24 ; CHECK-NEXT: v_writelane_b32 v6, s37, 1 -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[28:31] dmask:0x1 -; CHECK-NEXT: v_writelane_b32 v5, s57, 25 ; CHECK-NEXT: v_writelane_b32 v6, s38, 2 -; CHECK-NEXT: v_writelane_b32 v5, s58, 26 ; CHECK-NEXT: v_writelane_b32 v6, s39, 3 -; CHECK-NEXT: v_writelane_b32 v5, s59, 27 ; CHECK-NEXT: v_writelane_b32 v6, s40, 4 -; CHECK-NEXT: v_writelane_b32 v5, s60, 28 ; CHECK-NEXT: v_writelane_b32 v6, s41, 5 -; CHECK-NEXT: v_writelane_b32 v5, s61, 29 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[28:31] dmask:0x1 ; CHECK-NEXT: v_writelane_b32 v6, s42, 6 -; CHECK-NEXT: v_writelane_b32 v5, s62, 30 ; CHECK-NEXT: v_writelane_b32 v6, s43, 7 -; CHECK-NEXT: v_writelane_b32 v5, s63, 31 ; CHECK-NEXT: v_writelane_b32 v6, s44, 8 -; CHECK-NEXT: v_writelane_b32 v5, s64, 32 ; CHECK-NEXT: v_writelane_b32 v6, s45, 9 -; CHECK-NEXT: v_writelane_b32 v5, s65, 33 +; CHECK-NEXT: v_writelane_b32 v5, s53, 13 ; CHECK-NEXT: v_writelane_b32 v6, s46, 10 -; CHECK-NEXT: v_writelane_b32 v5, s66, 34 +; CHECK-NEXT: v_writelane_b32 v5, s54, 14 ; CHECK-NEXT: v_writelane_b32 v6, s47, 11 -; CHECK-NEXT: v_writelane_b32 v5, s67, 35 +; CHECK-NEXT: v_writelane_b32 v5, s55, 15 ; CHECK-NEXT: v_writelane_b32 v6, s48, 12 -; CHECK-NEXT: v_writelane_b32 v5, s68, 36 +; CHECK-NEXT: v_writelane_b32 v5, s64, 16 ; CHECK-NEXT: v_writelane_b32 v6, s49, 13 -; CHECK-NEXT: v_writelane_b32 v5, s69, 37 +; CHECK-NEXT: v_writelane_b32 v5, s65, 17 ; CHECK-NEXT: v_writelane_b32 v6, s50, 14 -; CHECK-NEXT: s_mov_b32 s34, 48 -; CHECK-NEXT: s_movk_i32 s52, 0x1f0 -; CHECK-NEXT: s_movk_i32 s68, 0x2f0 -; CHECK-NEXT: s_mov_b32 s35, s24 -; CHECK-NEXT: s_mov_b32 s53, s24 -; CHECK-NEXT: s_mov_b32 s69, s24 +; CHECK-NEXT: v_writelane_b32 v5, s66, 18 ; CHECK-NEXT: v_writelane_b32 v6, s51, 15 -; CHECK-NEXT: s_load_dwordx8 s[20:27], s[34:35], 0x0 -; CHECK-NEXT: s_load_dwordx16 s[36:51], s[52:53], 0x0 +; CHECK-NEXT: s_mov_b32 s40, 48 +; CHECK-NEXT: s_movk_i32 s56, 0x1f0 +; CHECK-NEXT: s_movk_i32 s34, 0x2f0 +; CHECK-NEXT: s_mov_b32 s41, s24 +; CHECK-NEXT: s_mov_b32 s57, s24 +; CHECK-NEXT: s_mov_b32 s35, s24 +; CHECK-NEXT: v_writelane_b32 v5, s67, 19 +; CHECK-NEXT: s_load_dwordx8 s[20:27], s[40:41], 0x0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_load_dwordx16 s[36:51], s[56:57], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: s_load_dwordx16 s[52:67], s[68:69], 0x0 +; CHECK-NEXT: s_load_dwordx16 s[52:67], s[34:35], 0x0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; CHECK-NEXT: v_writelane_b32 v5, s70, 38 -; CHECK-NEXT: s_xor_b64 s[34:35], vcc, -1 -; CHECK-NEXT: v_writelane_b32 v5, s71, 39 +; CHECK-NEXT: v_writelane_b32 v5, s68, 20 +; CHECK-NEXT: s_xor_b64 s[72:73], vcc, -1 +; CHECK-NEXT: v_writelane_b32 v5, s69, 21 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3 -; CHECK-NEXT: s_and_saveexec_b64 vcc, s[34:35] -; CHECK-NEXT: s_xor_b64 s[68:69], exec, vcc +; CHECK-NEXT: s_and_saveexec_b64 vcc, s[72:73] +; CHECK-NEXT: s_xor_b64 s[34:35], exec, vcc ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb48 ; CHECK-NEXT: image_sample_lz v3, v[1:2], s[12:19], s[28:31] dmask:0x1 @@ -124,10 +107,10 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: .LBB0_3: ; %Flow14 -; CHECK-NEXT: s_andn2_saveexec_b64 s[12:13], s[68:69] +; CHECK-NEXT: s_andn2_saveexec_b64 s[12:13], s[34:35] ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.4: ; %bb32 -; CHECK-NEXT: s_and_saveexec_b64 s[14:15], s[34:35] +; CHECK-NEXT: s_and_saveexec_b64 s[14:15], s[72:73] ; CHECK-NEXT: s_xor_b64 s[14:15], exec, s[14:15] ; CHECK-NEXT: s_cbranch_execz .LBB0_6 ; CHECK-NEXT: ; %bb.5: ; %bb43 @@ -201,39 +184,21 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock ; CHECK-NEXT: s_or_b64 exec, exec, s[12:13] -; CHECK-NEXT: v_readlane_b32 s71, v5, 39 -; CHECK-NEXT: v_readlane_b32 s70, v5, 38 -; CHECK-NEXT: v_readlane_b32 s69, v5, 37 -; CHECK-NEXT: v_readlane_b32 s68, v5, 36 +; CHECK-NEXT: v_readlane_b32 s69, v5, 21 +; CHECK-NEXT: v_readlane_b32 s68, v5, 20 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_readlane_b32 s67, v5, 35 -; CHECK-NEXT: v_readlane_b32 s66, v5, 34 -; CHECK-NEXT: v_readlane_b32 s65, v5, 33 -; CHECK-NEXT: v_readlane_b32 s64, v5, 32 -; CHECK-NEXT: v_readlane_b32 s63, v5, 31 -; CHECK-NEXT: v_readlane_b32 s62, v5, 30 -; CHECK-NEXT: v_readlane_b32 s61, v5, 29 -; CHECK-NEXT: v_readlane_b32 s60, v5, 28 -; CHECK-NEXT: v_readlane_b32 s59, v5, 27 -; CHECK-NEXT: v_readlane_b32 s58, v5, 26 -; CHECK-NEXT: v_readlane_b32 s57, v5, 25 -; CHECK-NEXT: v_readlane_b32 s56, v5, 24 -; CHECK-NEXT: v_readlane_b32 s55, v5, 23 -; CHECK-NEXT: v_readlane_b32 s54, v5, 22 -; CHECK-NEXT: v_readlane_b32 s53, v5, 21 -; CHECK-NEXT: v_readlane_b32 s52, v5, 20 -; CHECK-NEXT: v_readlane_b32 s51, v5, 19 -; CHECK-NEXT: v_readlane_b32 s50, v5, 18 -; CHECK-NEXT: v_readlane_b32 s49, v5, 17 -; CHECK-NEXT: v_readlane_b32 s48, v5, 16 -; CHECK-NEXT: v_readlane_b32 s47, v5, 15 -; CHECK-NEXT: v_readlane_b32 s46, v5, 14 -; CHECK-NEXT: v_readlane_b32 s45, v5, 13 -; CHECK-NEXT: v_readlane_b32 s44, v5, 12 -; CHECK-NEXT: v_readlane_b32 s43, v5, 11 -; CHECK-NEXT: v_readlane_b32 s42, v5, 10 -; CHECK-NEXT: v_readlane_b32 s41, v5, 9 -; CHECK-NEXT: v_readlane_b32 s40, v5, 8 +; CHECK-NEXT: v_readlane_b32 s67, v5, 19 +; CHECK-NEXT: v_readlane_b32 s66, v5, 18 +; CHECK-NEXT: v_readlane_b32 s65, v5, 17 +; CHECK-NEXT: v_readlane_b32 s64, v5, 16 +; CHECK-NEXT: v_readlane_b32 s55, v5, 15 +; CHECK-NEXT: v_readlane_b32 s54, v5, 14 +; CHECK-NEXT: v_readlane_b32 s53, v5, 13 +; CHECK-NEXT: v_readlane_b32 s52, v5, 12 +; CHECK-NEXT: v_readlane_b32 s51, v5, 11 +; CHECK-NEXT: v_readlane_b32 s50, v5, 10 +; CHECK-NEXT: v_readlane_b32 s49, v5, 9 +; CHECK-NEXT: v_readlane_b32 s48, v5, 8 ; CHECK-NEXT: v_readlane_b32 s39, v5, 7 ; CHECK-NEXT: v_readlane_b32 s38, v5, 6 ; CHECK-NEXT: v_readlane_b32 s37, v5, 5 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index 55da485b91f67..d7c4f6afbdade 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -136,55 +136,55 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s37, 5 ; GCN-NEXT: v_writelane_b32 v40, s38, 6 ; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 -; GCN-NEXT: s_mov_b32 s42, s15 -; GCN-NEXT: s_mov_b32 s43, s14 -; GCN-NEXT: s_mov_b32 s44, s13 -; GCN-NEXT: s_mov_b32 s45, s12 +; GCN-NEXT: v_writelane_b32 v40, s48, 8 +; GCN-NEXT: v_writelane_b32 v40, s49, 9 +; GCN-NEXT: v_writelane_b32 v40, s50, 10 +; GCN-NEXT: v_writelane_b32 v40, s51, 11 +; GCN-NEXT: v_writelane_b32 v40, s52, 12 +; GCN-NEXT: v_writelane_b32 v40, s53, 13 +; GCN-NEXT: v_writelane_b32 v40, s54, 14 +; GCN-NEXT: v_writelane_b32 v40, s55, 15 +; GCN-NEXT: v_writelane_b32 v40, s64, 16 +; GCN-NEXT: v_writelane_b32 v40, s65, 17 +; GCN-NEXT: s_mov_b32 s50, s15 +; GCN-NEXT: s_mov_b32 s51, s14 +; GCN-NEXT: s_mov_b32 s52, s13 +; GCN-NEXT: s_mov_b32 s53, s12 ; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] ; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] ; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] -; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] -; GCN-NEXT: s_mov_b64 s[46:47], exec +; GCN-NEXT: s_mov_b64 s[48:49], s[4:5] +; GCN-NEXT: s_mov_b64 s[54:55], exec ; GCN-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s16, v0 ; GCN-NEXT: v_readfirstlane_b32 s17, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc -; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_and_saveexec_b64 s[64:65], vcc +; GCN-NEXT: s_mov_b64 s[4:5], s[48:49] ; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] ; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] ; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] -; GCN-NEXT: s_mov_b32 s12, s45 -; GCN-NEXT: s_mov_b32 s13, s44 -; GCN-NEXT: s_mov_b32 s14, s43 -; GCN-NEXT: s_mov_b32 s15, s42 +; GCN-NEXT: s_mov_b32 s12, s53 +; GCN-NEXT: s_mov_b32 s13, s52 +; GCN-NEXT: s_mov_b32 s14, s51 +; GCN-NEXT: s_mov_b32 s15, s50 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] +; GCN-NEXT: s_xor_b64 exec, exec, s[64:65] ; GCN-NEXT: s_cbranch_execnz .LBB2_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s49, v40, 17 -; GCN-NEXT: v_readlane_b32 s48, v40, 16 -; GCN-NEXT: v_readlane_b32 s47, v40, 15 -; GCN-NEXT: v_readlane_b32 s46, v40, 14 -; GCN-NEXT: v_readlane_b32 s45, v40, 13 -; GCN-NEXT: v_readlane_b32 s44, v40, 12 -; GCN-NEXT: v_readlane_b32 s43, v40, 11 -; GCN-NEXT: v_readlane_b32 s42, v40, 10 -; GCN-NEXT: v_readlane_b32 s41, v40, 9 -; GCN-NEXT: v_readlane_b32 s40, v40, 8 +; GCN-NEXT: s_mov_b64 exec, s[54:55] +; GCN-NEXT: v_readlane_b32 s65, v40, 17 +; GCN-NEXT: v_readlane_b32 s64, v40, 16 +; GCN-NEXT: v_readlane_b32 s55, v40, 15 +; GCN-NEXT: v_readlane_b32 s54, v40, 14 +; GCN-NEXT: v_readlane_b32 s53, v40, 13 +; GCN-NEXT: v_readlane_b32 s52, v40, 12 +; GCN-NEXT: v_readlane_b32 s51, v40, 11 +; GCN-NEXT: v_readlane_b32 s50, v40, 10 +; GCN-NEXT: v_readlane_b32 s49, v40, 9 +; GCN-NEXT: v_readlane_b32 s48, v40, 8 ; GCN-NEXT: v_readlane_b32 s39, v40, 7 ; GCN-NEXT: v_readlane_b32 s38, v40, 6 ; GCN-NEXT: v_readlane_b32 s37, v40, 5 @@ -220,55 +220,55 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s37, 5 ; GISEL-NEXT: v_writelane_b32 v40, s38, 6 ; GISEL-NEXT: v_writelane_b32 v40, s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s40, 8 -; GISEL-NEXT: v_writelane_b32 v40, s41, 9 -; GISEL-NEXT: v_writelane_b32 v40, s42, 10 -; GISEL-NEXT: v_writelane_b32 v40, s43, 11 -; GISEL-NEXT: v_writelane_b32 v40, s44, 12 -; GISEL-NEXT: v_writelane_b32 v40, s45, 13 -; GISEL-NEXT: v_writelane_b32 v40, s46, 14 -; GISEL-NEXT: v_writelane_b32 v40, s47, 15 -; GISEL-NEXT: v_writelane_b32 v40, s48, 16 -; GISEL-NEXT: v_writelane_b32 v40, s49, 17 -; GISEL-NEXT: s_mov_b32 s42, s15 -; GISEL-NEXT: s_mov_b32 s43, s14 -; GISEL-NEXT: s_mov_b32 s44, s13 -; GISEL-NEXT: s_mov_b32 s45, s12 +; GISEL-NEXT: v_writelane_b32 v40, s48, 8 +; GISEL-NEXT: v_writelane_b32 v40, s49, 9 +; GISEL-NEXT: v_writelane_b32 v40, s50, 10 +; GISEL-NEXT: v_writelane_b32 v40, s51, 11 +; GISEL-NEXT: v_writelane_b32 v40, s52, 12 +; GISEL-NEXT: v_writelane_b32 v40, s53, 13 +; GISEL-NEXT: v_writelane_b32 v40, s54, 14 +; GISEL-NEXT: v_writelane_b32 v40, s55, 15 +; GISEL-NEXT: v_writelane_b32 v40, s64, 16 +; GISEL-NEXT: v_writelane_b32 v40, s65, 17 +; GISEL-NEXT: s_mov_b32 s50, s15 +; GISEL-NEXT: s_mov_b32 s51, s14 +; GISEL-NEXT: s_mov_b32 s52, s13 +; GISEL-NEXT: s_mov_b32 s53, s12 ; GISEL-NEXT: s_mov_b64 s[34:35], s[10:11] ; GISEL-NEXT: s_mov_b64 s[36:37], s[8:9] ; GISEL-NEXT: s_mov_b64 s[38:39], s[6:7] -; GISEL-NEXT: s_mov_b64 s[40:41], s[4:5] -; GISEL-NEXT: s_mov_b64 s[46:47], exec +; GISEL-NEXT: s_mov_b64 s[48:49], s[4:5] +; GISEL-NEXT: s_mov_b64 s[54:55], exec ; GISEL-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s16, v0 ; GISEL-NEXT: v_readfirstlane_b32 s17, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[48:49], vcc -; GISEL-NEXT: s_mov_b64 s[4:5], s[40:41] +; GISEL-NEXT: s_and_saveexec_b64 s[64:65], vcc +; GISEL-NEXT: s_mov_b64 s[4:5], s[48:49] ; GISEL-NEXT: s_mov_b64 s[6:7], s[38:39] ; GISEL-NEXT: s_mov_b64 s[8:9], s[36:37] ; GISEL-NEXT: s_mov_b64 s[10:11], s[34:35] -; GISEL-NEXT: s_mov_b32 s12, s45 -; GISEL-NEXT: s_mov_b32 s13, s44 -; GISEL-NEXT: s_mov_b32 s14, s43 -; GISEL-NEXT: s_mov_b32 s15, s42 +; GISEL-NEXT: s_mov_b32 s12, s53 +; GISEL-NEXT: s_mov_b32 s13, s52 +; GISEL-NEXT: s_mov_b32 s14, s51 +; GISEL-NEXT: s_mov_b32 s15, s50 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr31 -; GISEL-NEXT: s_xor_b64 exec, exec, s[48:49] +; GISEL-NEXT: s_xor_b64 exec, exec, s[64:65] ; GISEL-NEXT: s_cbranch_execnz .LBB2_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[46:47] -; GISEL-NEXT: v_readlane_b32 s49, v40, 17 -; GISEL-NEXT: v_readlane_b32 s48, v40, 16 -; GISEL-NEXT: v_readlane_b32 s47, v40, 15 -; GISEL-NEXT: v_readlane_b32 s46, v40, 14 -; GISEL-NEXT: v_readlane_b32 s45, v40, 13 -; GISEL-NEXT: v_readlane_b32 s44, v40, 12 -; GISEL-NEXT: v_readlane_b32 s43, v40, 11 -; GISEL-NEXT: v_readlane_b32 s42, v40, 10 -; GISEL-NEXT: v_readlane_b32 s41, v40, 9 -; GISEL-NEXT: v_readlane_b32 s40, v40, 8 +; GISEL-NEXT: s_mov_b64 exec, s[54:55] +; GISEL-NEXT: v_readlane_b32 s65, v40, 17 +; GISEL-NEXT: v_readlane_b32 s64, v40, 16 +; GISEL-NEXT: v_readlane_b32 s55, v40, 15 +; GISEL-NEXT: v_readlane_b32 s54, v40, 14 +; GISEL-NEXT: v_readlane_b32 s53, v40, 13 +; GISEL-NEXT: v_readlane_b32 s52, v40, 12 +; GISEL-NEXT: v_readlane_b32 s51, v40, 11 +; GISEL-NEXT: v_readlane_b32 s50, v40, 10 +; GISEL-NEXT: v_readlane_b32 s49, v40, 9 +; GISEL-NEXT: v_readlane_b32 s48, v40, 8 ; GISEL-NEXT: v_readlane_b32 s39, v40, 7 ; GISEL-NEXT: v_readlane_b32 s38, v40, 6 ; GISEL-NEXT: v_readlane_b32 s37, v40, 5 @@ -308,58 +308,58 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s37, 5 ; GCN-NEXT: v_writelane_b32 v40, s38, 6 ; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 -; GCN-NEXT: s_mov_b32 s42, s15 -; GCN-NEXT: s_mov_b32 s43, s14 -; GCN-NEXT: s_mov_b32 s44, s13 -; GCN-NEXT: s_mov_b32 s45, s12 +; GCN-NEXT: v_writelane_b32 v40, s48, 8 +; GCN-NEXT: v_writelane_b32 v40, s49, 9 +; GCN-NEXT: v_writelane_b32 v40, s50, 10 +; GCN-NEXT: v_writelane_b32 v40, s51, 11 +; GCN-NEXT: v_writelane_b32 v40, s52, 12 +; GCN-NEXT: v_writelane_b32 v40, s53, 13 +; GCN-NEXT: v_writelane_b32 v40, s54, 14 +; GCN-NEXT: v_writelane_b32 v40, s55, 15 +; GCN-NEXT: v_writelane_b32 v40, s64, 16 +; GCN-NEXT: v_writelane_b32 v40, s65, 17 +; GCN-NEXT: s_mov_b32 s50, s15 +; GCN-NEXT: s_mov_b32 s51, s14 +; GCN-NEXT: s_mov_b32 s52, s13 +; GCN-NEXT: s_mov_b32 s53, s12 ; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] ; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] ; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] -; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] -; GCN-NEXT: s_mov_b64 s[46:47], exec +; GCN-NEXT: s_mov_b64 s[48:49], s[4:5] +; GCN-NEXT: s_mov_b64 s[54:55], exec ; GCN-NEXT: v_mov_b32_e32 v2, 0x7b ; GCN-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s16, v0 ; GCN-NEXT: v_readfirstlane_b32 s17, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc -; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_and_saveexec_b64 s[64:65], vcc +; GCN-NEXT: s_mov_b64 s[4:5], s[48:49] ; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] ; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] ; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] -; GCN-NEXT: s_mov_b32 s12, s45 -; GCN-NEXT: s_mov_b32 s13, s44 -; GCN-NEXT: s_mov_b32 s14, s43 -; GCN-NEXT: s_mov_b32 s15, s42 +; GCN-NEXT: s_mov_b32 s12, s53 +; GCN-NEXT: s_mov_b32 s13, s52 +; GCN-NEXT: s_mov_b32 s14, s51 +; GCN-NEXT: s_mov_b32 s15, s50 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 ; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] +; GCN-NEXT: s_xor_b64 exec, exec, s[64:65] ; GCN-NEXT: s_cbranch_execnz .LBB3_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s49, v40, 17 -; GCN-NEXT: v_readlane_b32 s48, v40, 16 -; GCN-NEXT: v_readlane_b32 s47, v40, 15 -; GCN-NEXT: v_readlane_b32 s46, v40, 14 -; GCN-NEXT: v_readlane_b32 s45, v40, 13 -; GCN-NEXT: v_readlane_b32 s44, v40, 12 -; GCN-NEXT: v_readlane_b32 s43, v40, 11 -; GCN-NEXT: v_readlane_b32 s42, v40, 10 -; GCN-NEXT: v_readlane_b32 s41, v40, 9 -; GCN-NEXT: v_readlane_b32 s40, v40, 8 +; GCN-NEXT: s_mov_b64 exec, s[54:55] +; GCN-NEXT: v_readlane_b32 s65, v40, 17 +; GCN-NEXT: v_readlane_b32 s64, v40, 16 +; GCN-NEXT: v_readlane_b32 s55, v40, 15 +; GCN-NEXT: v_readlane_b32 s54, v40, 14 +; GCN-NEXT: v_readlane_b32 s53, v40, 13 +; GCN-NEXT: v_readlane_b32 s52, v40, 12 +; GCN-NEXT: v_readlane_b32 s51, v40, 11 +; GCN-NEXT: v_readlane_b32 s50, v40, 10 +; GCN-NEXT: v_readlane_b32 s49, v40, 9 +; GCN-NEXT: v_readlane_b32 s48, v40, 8 ; GCN-NEXT: v_readlane_b32 s39, v40, 7 ; GCN-NEXT: v_readlane_b32 s38, v40, 6 ; GCN-NEXT: v_readlane_b32 s37, v40, 5 @@ -395,56 +395,56 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s37, 5 ; GISEL-NEXT: v_writelane_b32 v40, s38, 6 ; GISEL-NEXT: v_writelane_b32 v40, s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s40, 8 -; GISEL-NEXT: v_writelane_b32 v40, s41, 9 -; GISEL-NEXT: v_writelane_b32 v40, s42, 10 -; GISEL-NEXT: v_writelane_b32 v40, s43, 11 -; GISEL-NEXT: v_writelane_b32 v40, s44, 12 -; GISEL-NEXT: v_writelane_b32 v40, s45, 13 -; GISEL-NEXT: v_writelane_b32 v40, s46, 14 -; GISEL-NEXT: v_writelane_b32 v40, s47, 15 -; GISEL-NEXT: v_writelane_b32 v40, s48, 16 -; GISEL-NEXT: v_writelane_b32 v40, s49, 17 -; GISEL-NEXT: s_mov_b32 s42, s15 -; GISEL-NEXT: s_mov_b32 s43, s14 -; GISEL-NEXT: s_mov_b32 s44, s13 -; GISEL-NEXT: s_mov_b32 s45, s12 +; GISEL-NEXT: v_writelane_b32 v40, s48, 8 +; GISEL-NEXT: v_writelane_b32 v40, s49, 9 +; GISEL-NEXT: v_writelane_b32 v40, s50, 10 +; GISEL-NEXT: v_writelane_b32 v40, s51, 11 +; GISEL-NEXT: v_writelane_b32 v40, s52, 12 +; GISEL-NEXT: v_writelane_b32 v40, s53, 13 +; GISEL-NEXT: v_writelane_b32 v40, s54, 14 +; GISEL-NEXT: v_writelane_b32 v40, s55, 15 +; GISEL-NEXT: v_writelane_b32 v40, s64, 16 +; GISEL-NEXT: v_writelane_b32 v40, s65, 17 +; GISEL-NEXT: s_mov_b32 s50, s15 +; GISEL-NEXT: s_mov_b32 s51, s14 +; GISEL-NEXT: s_mov_b32 s52, s13 +; GISEL-NEXT: s_mov_b32 s53, s12 ; GISEL-NEXT: s_mov_b64 s[34:35], s[10:11] ; GISEL-NEXT: s_mov_b64 s[36:37], s[8:9] ; GISEL-NEXT: s_mov_b64 s[38:39], s[6:7] -; GISEL-NEXT: s_mov_b64 s[40:41], s[4:5] -; GISEL-NEXT: s_mov_b64 s[46:47], exec +; GISEL-NEXT: s_mov_b64 s[48:49], s[4:5] +; GISEL-NEXT: s_mov_b64 s[54:55], exec ; GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s16, v0 ; GISEL-NEXT: v_readfirstlane_b32 s17, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[48:49], vcc +; GISEL-NEXT: s_and_saveexec_b64 s[64:65], vcc ; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b -; GISEL-NEXT: s_mov_b64 s[4:5], s[40:41] +; GISEL-NEXT: s_mov_b64 s[4:5], s[48:49] ; GISEL-NEXT: s_mov_b64 s[6:7], s[38:39] ; GISEL-NEXT: s_mov_b64 s[8:9], s[36:37] ; GISEL-NEXT: s_mov_b64 s[10:11], s[34:35] -; GISEL-NEXT: s_mov_b32 s12, s45 -; GISEL-NEXT: s_mov_b32 s13, s44 -; GISEL-NEXT: s_mov_b32 s14, s43 -; GISEL-NEXT: s_mov_b32 s15, s42 +; GISEL-NEXT: s_mov_b32 s12, s53 +; GISEL-NEXT: s_mov_b32 s13, s52 +; GISEL-NEXT: s_mov_b32 s14, s51 +; GISEL-NEXT: s_mov_b32 s15, s50 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr31 -; GISEL-NEXT: s_xor_b64 exec, exec, s[48:49] +; GISEL-NEXT: s_xor_b64 exec, exec, s[64:65] ; GISEL-NEXT: s_cbranch_execnz .LBB3_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[46:47] -; GISEL-NEXT: v_readlane_b32 s49, v40, 17 -; GISEL-NEXT: v_readlane_b32 s48, v40, 16 -; GISEL-NEXT: v_readlane_b32 s47, v40, 15 -; GISEL-NEXT: v_readlane_b32 s46, v40, 14 -; GISEL-NEXT: v_readlane_b32 s45, v40, 13 -; GISEL-NEXT: v_readlane_b32 s44, v40, 12 -; GISEL-NEXT: v_readlane_b32 s43, v40, 11 -; GISEL-NEXT: v_readlane_b32 s42, v40, 10 -; GISEL-NEXT: v_readlane_b32 s41, v40, 9 -; GISEL-NEXT: v_readlane_b32 s40, v40, 8 +; GISEL-NEXT: s_mov_b64 exec, s[54:55] +; GISEL-NEXT: v_readlane_b32 s65, v40, 17 +; GISEL-NEXT: v_readlane_b32 s64, v40, 16 +; GISEL-NEXT: v_readlane_b32 s55, v40, 15 +; GISEL-NEXT: v_readlane_b32 s54, v40, 14 +; GISEL-NEXT: v_readlane_b32 s53, v40, 13 +; GISEL-NEXT: v_readlane_b32 s52, v40, 12 +; GISEL-NEXT: v_readlane_b32 s51, v40, 11 +; GISEL-NEXT: v_readlane_b32 s50, v40, 10 +; GISEL-NEXT: v_readlane_b32 s49, v40, 9 +; GISEL-NEXT: v_readlane_b32 s48, v40, 8 ; GISEL-NEXT: v_readlane_b32 s39, v40, 7 ; GISEL-NEXT: v_readlane_b32 s38, v40, 6 ; GISEL-NEXT: v_readlane_b32 s37, v40, 5 @@ -484,57 +484,57 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s37, 5 ; GCN-NEXT: v_writelane_b32 v40, s38, 6 ; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 -; GCN-NEXT: s_mov_b32 s42, s15 -; GCN-NEXT: s_mov_b32 s43, s14 -; GCN-NEXT: s_mov_b32 s44, s13 -; GCN-NEXT: s_mov_b32 s45, s12 +; GCN-NEXT: v_writelane_b32 v40, s48, 8 +; GCN-NEXT: v_writelane_b32 v40, s49, 9 +; GCN-NEXT: v_writelane_b32 v40, s50, 10 +; GCN-NEXT: v_writelane_b32 v40, s51, 11 +; GCN-NEXT: v_writelane_b32 v40, s52, 12 +; GCN-NEXT: v_writelane_b32 v40, s53, 13 +; GCN-NEXT: v_writelane_b32 v40, s54, 14 +; GCN-NEXT: v_writelane_b32 v40, s55, 15 +; GCN-NEXT: v_writelane_b32 v40, s64, 16 +; GCN-NEXT: v_writelane_b32 v40, s65, 17 +; GCN-NEXT: s_mov_b32 s50, s15 +; GCN-NEXT: s_mov_b32 s51, s14 +; GCN-NEXT: s_mov_b32 s52, s13 +; GCN-NEXT: s_mov_b32 s53, s12 ; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] ; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] ; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] -; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] -; GCN-NEXT: s_mov_b64 s[46:47], exec +; GCN-NEXT: s_mov_b64 s[48:49], s[4:5] +; GCN-NEXT: s_mov_b64 s[54:55], exec ; GCN-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s16, v0 ; GCN-NEXT: v_readfirstlane_b32 s17, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc -; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_and_saveexec_b64 s[64:65], vcc +; GCN-NEXT: s_mov_b64 s[4:5], s[48:49] ; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] ; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] ; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] -; GCN-NEXT: s_mov_b32 s12, s45 -; GCN-NEXT: s_mov_b32 s13, s44 -; GCN-NEXT: s_mov_b32 s14, s43 -; GCN-NEXT: s_mov_b32 s15, s42 +; GCN-NEXT: s_mov_b32 s12, s53 +; GCN-NEXT: s_mov_b32 s13, s52 +; GCN-NEXT: s_mov_b32 s14, s51 +; GCN-NEXT: s_mov_b32 s15, s50 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] +; GCN-NEXT: s_xor_b64 exec, exec, s[64:65] ; GCN-NEXT: s_cbranch_execnz .LBB4_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[46:47] +; GCN-NEXT: s_mov_b64 exec, s[54:55] ; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v2 -; GCN-NEXT: v_readlane_b32 s49, v40, 17 -; GCN-NEXT: v_readlane_b32 s48, v40, 16 -; GCN-NEXT: v_readlane_b32 s47, v40, 15 -; GCN-NEXT: v_readlane_b32 s46, v40, 14 -; GCN-NEXT: v_readlane_b32 s45, v40, 13 -; GCN-NEXT: v_readlane_b32 s44, v40, 12 -; GCN-NEXT: v_readlane_b32 s43, v40, 11 -; GCN-NEXT: v_readlane_b32 s42, v40, 10 -; GCN-NEXT: v_readlane_b32 s41, v40, 9 -; GCN-NEXT: v_readlane_b32 s40, v40, 8 +; GCN-NEXT: v_readlane_b32 s65, v40, 17 +; GCN-NEXT: v_readlane_b32 s64, v40, 16 +; GCN-NEXT: v_readlane_b32 s55, v40, 15 +; GCN-NEXT: v_readlane_b32 s54, v40, 14 +; GCN-NEXT: v_readlane_b32 s53, v40, 13 +; GCN-NEXT: v_readlane_b32 s52, v40, 12 +; GCN-NEXT: v_readlane_b32 s51, v40, 11 +; GCN-NEXT: v_readlane_b32 s50, v40, 10 +; GCN-NEXT: v_readlane_b32 s49, v40, 9 +; GCN-NEXT: v_readlane_b32 s48, v40, 8 ; GCN-NEXT: v_readlane_b32 s39, v40, 7 ; GCN-NEXT: v_readlane_b32 s38, v40, 6 ; GCN-NEXT: v_readlane_b32 s37, v40, 5 @@ -570,57 +570,57 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s37, 5 ; GISEL-NEXT: v_writelane_b32 v40, s38, 6 ; GISEL-NEXT: v_writelane_b32 v40, s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s40, 8 -; GISEL-NEXT: v_writelane_b32 v40, s41, 9 -; GISEL-NEXT: v_writelane_b32 v40, s42, 10 -; GISEL-NEXT: v_writelane_b32 v40, s43, 11 -; GISEL-NEXT: v_writelane_b32 v40, s44, 12 -; GISEL-NEXT: v_writelane_b32 v40, s45, 13 -; GISEL-NEXT: v_writelane_b32 v40, s46, 14 -; GISEL-NEXT: v_writelane_b32 v40, s47, 15 -; GISEL-NEXT: v_writelane_b32 v40, s48, 16 -; GISEL-NEXT: v_writelane_b32 v40, s49, 17 -; GISEL-NEXT: s_mov_b32 s42, s15 -; GISEL-NEXT: s_mov_b32 s43, s14 -; GISEL-NEXT: s_mov_b32 s44, s13 -; GISEL-NEXT: s_mov_b32 s45, s12 +; GISEL-NEXT: v_writelane_b32 v40, s48, 8 +; GISEL-NEXT: v_writelane_b32 v40, s49, 9 +; GISEL-NEXT: v_writelane_b32 v40, s50, 10 +; GISEL-NEXT: v_writelane_b32 v40, s51, 11 +; GISEL-NEXT: v_writelane_b32 v40, s52, 12 +; GISEL-NEXT: v_writelane_b32 v40, s53, 13 +; GISEL-NEXT: v_writelane_b32 v40, s54, 14 +; GISEL-NEXT: v_writelane_b32 v40, s55, 15 +; GISEL-NEXT: v_writelane_b32 v40, s64, 16 +; GISEL-NEXT: v_writelane_b32 v40, s65, 17 +; GISEL-NEXT: s_mov_b32 s50, s15 +; GISEL-NEXT: s_mov_b32 s51, s14 +; GISEL-NEXT: s_mov_b32 s52, s13 +; GISEL-NEXT: s_mov_b32 s53, s12 ; GISEL-NEXT: s_mov_b64 s[34:35], s[10:11] ; GISEL-NEXT: s_mov_b64 s[36:37], s[8:9] ; GISEL-NEXT: s_mov_b64 s[38:39], s[6:7] -; GISEL-NEXT: s_mov_b64 s[40:41], s[4:5] -; GISEL-NEXT: s_mov_b64 s[46:47], exec +; GISEL-NEXT: s_mov_b64 s[48:49], s[4:5] +; GISEL-NEXT: s_mov_b64 s[54:55], exec ; GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s16, v0 ; GISEL-NEXT: v_readfirstlane_b32 s17, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[48:49], vcc -; GISEL-NEXT: s_mov_b64 s[4:5], s[40:41] +; GISEL-NEXT: s_and_saveexec_b64 s[64:65], vcc +; GISEL-NEXT: s_mov_b64 s[4:5], s[48:49] ; GISEL-NEXT: s_mov_b64 s[6:7], s[38:39] ; GISEL-NEXT: s_mov_b64 s[8:9], s[36:37] ; GISEL-NEXT: s_mov_b64 s[10:11], s[34:35] -; GISEL-NEXT: s_mov_b32 s12, s45 -; GISEL-NEXT: s_mov_b32 s13, s44 -; GISEL-NEXT: s_mov_b32 s14, s43 -; GISEL-NEXT: s_mov_b32 s15, s42 +; GISEL-NEXT: s_mov_b32 s12, s53 +; GISEL-NEXT: s_mov_b32 s13, s52 +; GISEL-NEXT: s_mov_b32 s14, s51 +; GISEL-NEXT: s_mov_b32 s15, s50 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GISEL-NEXT: v_mov_b32_e32 v1, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr31 -; GISEL-NEXT: s_xor_b64 exec, exec, s[48:49] +; GISEL-NEXT: s_xor_b64 exec, exec, s[64:65] ; GISEL-NEXT: s_cbranch_execnz .LBB4_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[46:47] +; GISEL-NEXT: s_mov_b64 exec, s[54:55] ; GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v1 -; GISEL-NEXT: v_readlane_b32 s49, v40, 17 -; GISEL-NEXT: v_readlane_b32 s48, v40, 16 -; GISEL-NEXT: v_readlane_b32 s47, v40, 15 -; GISEL-NEXT: v_readlane_b32 s46, v40, 14 -; GISEL-NEXT: v_readlane_b32 s45, v40, 13 -; GISEL-NEXT: v_readlane_b32 s44, v40, 12 -; GISEL-NEXT: v_readlane_b32 s43, v40, 11 -; GISEL-NEXT: v_readlane_b32 s42, v40, 10 -; GISEL-NEXT: v_readlane_b32 s41, v40, 9 -; GISEL-NEXT: v_readlane_b32 s40, v40, 8 +; GISEL-NEXT: v_readlane_b32 s65, v40, 17 +; GISEL-NEXT: v_readlane_b32 s64, v40, 16 +; GISEL-NEXT: v_readlane_b32 s55, v40, 15 +; GISEL-NEXT: v_readlane_b32 s54, v40, 14 +; GISEL-NEXT: v_readlane_b32 s53, v40, 13 +; GISEL-NEXT: v_readlane_b32 s52, v40, 12 +; GISEL-NEXT: v_readlane_b32 s51, v40, 11 +; GISEL-NEXT: v_readlane_b32 s50, v40, 10 +; GISEL-NEXT: v_readlane_b32 s49, v40, 9 +; GISEL-NEXT: v_readlane_b32 s48, v40, 8 ; GISEL-NEXT: v_readlane_b32 s39, v40, 7 ; GISEL-NEXT: v_readlane_b32 s38, v40, 6 ; GISEL-NEXT: v_readlane_b32 s37, v40, 5 @@ -661,66 +661,66 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GCN-NEXT: v_writelane_b32 v40, s37, 5 ; GCN-NEXT: v_writelane_b32 v40, s38, 6 ; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 -; GCN-NEXT: v_writelane_b32 v40, s50, 18 -; GCN-NEXT: v_writelane_b32 v40, s51, 19 -; GCN-NEXT: s_mov_b32 s42, s15 -; GCN-NEXT: s_mov_b32 s43, s14 -; GCN-NEXT: s_mov_b32 s44, s13 -; GCN-NEXT: s_mov_b32 s45, s12 +; GCN-NEXT: v_writelane_b32 v40, s48, 8 +; GCN-NEXT: v_writelane_b32 v40, s49, 9 +; GCN-NEXT: v_writelane_b32 v40, s50, 10 +; GCN-NEXT: v_writelane_b32 v40, s51, 11 +; GCN-NEXT: v_writelane_b32 v40, s52, 12 +; GCN-NEXT: v_writelane_b32 v40, s53, 13 +; GCN-NEXT: v_writelane_b32 v40, s54, 14 +; GCN-NEXT: v_writelane_b32 v40, s55, 15 +; GCN-NEXT: v_writelane_b32 v40, s64, 16 +; GCN-NEXT: v_writelane_b32 v40, s65, 17 +; GCN-NEXT: v_writelane_b32 v40, s66, 18 +; GCN-NEXT: v_writelane_b32 v40, s67, 19 +; GCN-NEXT: s_mov_b32 s50, s15 +; GCN-NEXT: s_mov_b32 s51, s14 +; GCN-NEXT: s_mov_b32 s52, s13 +; GCN-NEXT: s_mov_b32 s53, s12 ; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] ; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] ; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] -; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] +; GCN-NEXT: s_mov_b64 s[48:49], s[4:5] ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GCN-NEXT: s_and_saveexec_b64 s[46:47], vcc +; GCN-NEXT: s_and_saveexec_b64 s[54:55], vcc ; GCN-NEXT: s_cbranch_execz .LBB5_4 ; GCN-NEXT: ; %bb.1: ; %bb1 -; GCN-NEXT: s_mov_b64 s[48:49], exec +; GCN-NEXT: s_mov_b64 s[64:65], exec ; GCN-NEXT: .LBB5_2: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s16, v0 ; GCN-NEXT: v_readfirstlane_b32 s17, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[50:51], vcc -; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_and_saveexec_b64 s[66:67], vcc +; GCN-NEXT: s_mov_b64 s[4:5], s[48:49] ; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] ; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] ; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] -; GCN-NEXT: s_mov_b32 s12, s45 -; GCN-NEXT: s_mov_b32 s13, s44 -; GCN-NEXT: s_mov_b32 s14, s43 -; GCN-NEXT: s_mov_b32 s15, s42 +; GCN-NEXT: s_mov_b32 s12, s53 +; GCN-NEXT: s_mov_b32 s13, s52 +; GCN-NEXT: s_mov_b32 s14, s51 +; GCN-NEXT: s_mov_b32 s15, s50 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_xor_b64 exec, exec, s[50:51] +; GCN-NEXT: s_xor_b64 exec, exec, s[66:67] ; GCN-NEXT: s_cbranch_execnz .LBB5_2 ; GCN-NEXT: ; %bb.3: -; GCN-NEXT: s_mov_b64 exec, s[48:49] +; GCN-NEXT: s_mov_b64 exec, s[64:65] ; GCN-NEXT: .LBB5_4: ; %bb2 -; GCN-NEXT: s_or_b64 exec, exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s51, v40, 19 -; GCN-NEXT: v_readlane_b32 s50, v40, 18 -; GCN-NEXT: v_readlane_b32 s49, v40, 17 -; GCN-NEXT: v_readlane_b32 s48, v40, 16 -; GCN-NEXT: v_readlane_b32 s47, v40, 15 -; GCN-NEXT: v_readlane_b32 s46, v40, 14 -; GCN-NEXT: v_readlane_b32 s45, v40, 13 -; GCN-NEXT: v_readlane_b32 s44, v40, 12 -; GCN-NEXT: v_readlane_b32 s43, v40, 11 -; GCN-NEXT: v_readlane_b32 s42, v40, 10 -; GCN-NEXT: v_readlane_b32 s41, v40, 9 -; GCN-NEXT: v_readlane_b32 s40, v40, 8 +; GCN-NEXT: s_or_b64 exec, exec, s[54:55] +; GCN-NEXT: v_readlane_b32 s67, v40, 19 +; GCN-NEXT: v_readlane_b32 s66, v40, 18 +; GCN-NEXT: v_readlane_b32 s65, v40, 17 +; GCN-NEXT: v_readlane_b32 s64, v40, 16 +; GCN-NEXT: v_readlane_b32 s55, v40, 15 +; GCN-NEXT: v_readlane_b32 s54, v40, 14 +; GCN-NEXT: v_readlane_b32 s53, v40, 13 +; GCN-NEXT: v_readlane_b32 s52, v40, 12 +; GCN-NEXT: v_readlane_b32 s51, v40, 11 +; GCN-NEXT: v_readlane_b32 s50, v40, 10 +; GCN-NEXT: v_readlane_b32 s49, v40, 9 +; GCN-NEXT: v_readlane_b32 s48, v40, 8 ; GCN-NEXT: v_readlane_b32 s39, v40, 7 ; GCN-NEXT: v_readlane_b32 s38, v40, 6 ; GCN-NEXT: v_readlane_b32 s37, v40, 5 @@ -756,66 +756,66 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: v_writelane_b32 v40, s37, 5 ; GISEL-NEXT: v_writelane_b32 v40, s38, 6 ; GISEL-NEXT: v_writelane_b32 v40, s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s40, 8 -; GISEL-NEXT: v_writelane_b32 v40, s41, 9 -; GISEL-NEXT: v_writelane_b32 v40, s42, 10 -; GISEL-NEXT: v_writelane_b32 v40, s43, 11 -; GISEL-NEXT: v_writelane_b32 v40, s44, 12 -; GISEL-NEXT: v_writelane_b32 v40, s45, 13 -; GISEL-NEXT: v_writelane_b32 v40, s46, 14 -; GISEL-NEXT: v_writelane_b32 v40, s47, 15 -; GISEL-NEXT: v_writelane_b32 v40, s48, 16 -; GISEL-NEXT: v_writelane_b32 v40, s49, 17 -; GISEL-NEXT: v_writelane_b32 v40, s50, 18 -; GISEL-NEXT: v_writelane_b32 v40, s51, 19 -; GISEL-NEXT: s_mov_b32 s42, s15 -; GISEL-NEXT: s_mov_b32 s43, s14 -; GISEL-NEXT: s_mov_b32 s44, s13 -; GISEL-NEXT: s_mov_b32 s45, s12 +; GISEL-NEXT: v_writelane_b32 v40, s48, 8 +; GISEL-NEXT: v_writelane_b32 v40, s49, 9 +; GISEL-NEXT: v_writelane_b32 v40, s50, 10 +; GISEL-NEXT: v_writelane_b32 v40, s51, 11 +; GISEL-NEXT: v_writelane_b32 v40, s52, 12 +; GISEL-NEXT: v_writelane_b32 v40, s53, 13 +; GISEL-NEXT: v_writelane_b32 v40, s54, 14 +; GISEL-NEXT: v_writelane_b32 v40, s55, 15 +; GISEL-NEXT: v_writelane_b32 v40, s64, 16 +; GISEL-NEXT: v_writelane_b32 v40, s65, 17 +; GISEL-NEXT: v_writelane_b32 v40, s66, 18 +; GISEL-NEXT: v_writelane_b32 v40, s67, 19 +; GISEL-NEXT: s_mov_b32 s50, s15 +; GISEL-NEXT: s_mov_b32 s51, s14 +; GISEL-NEXT: s_mov_b32 s52, s13 +; GISEL-NEXT: s_mov_b32 s53, s12 ; GISEL-NEXT: s_mov_b64 s[34:35], s[10:11] ; GISEL-NEXT: s_mov_b64 s[36:37], s[8:9] ; GISEL-NEXT: s_mov_b64 s[38:39], s[6:7] -; GISEL-NEXT: s_mov_b64 s[40:41], s[4:5] +; GISEL-NEXT: s_mov_b64 s[48:49], s[4:5] ; GISEL-NEXT: v_and_b32_e32 v2, 1, v2 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: s_and_saveexec_b64 s[46:47], vcc +; GISEL-NEXT: s_and_saveexec_b64 s[54:55], vcc ; GISEL-NEXT: s_cbranch_execz .LBB5_4 ; GISEL-NEXT: ; %bb.1: ; %bb1 -; GISEL-NEXT: s_mov_b64 s[48:49], exec +; GISEL-NEXT: s_mov_b64 s[64:65], exec ; GISEL-NEXT: .LBB5_2: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s16, v0 ; GISEL-NEXT: v_readfirstlane_b32 s17, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[50:51], vcc -; GISEL-NEXT: s_mov_b64 s[4:5], s[40:41] +; GISEL-NEXT: s_and_saveexec_b64 s[66:67], vcc +; GISEL-NEXT: s_mov_b64 s[4:5], s[48:49] ; GISEL-NEXT: s_mov_b64 s[6:7], s[38:39] ; GISEL-NEXT: s_mov_b64 s[8:9], s[36:37] ; GISEL-NEXT: s_mov_b64 s[10:11], s[34:35] -; GISEL-NEXT: s_mov_b32 s12, s45 -; GISEL-NEXT: s_mov_b32 s13, s44 -; GISEL-NEXT: s_mov_b32 s14, s43 -; GISEL-NEXT: s_mov_b32 s15, s42 +; GISEL-NEXT: s_mov_b32 s12, s53 +; GISEL-NEXT: s_mov_b32 s13, s52 +; GISEL-NEXT: s_mov_b32 s14, s51 +; GISEL-NEXT: s_mov_b32 s15, s50 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr31 -; GISEL-NEXT: s_xor_b64 exec, exec, s[50:51] +; GISEL-NEXT: s_xor_b64 exec, exec, s[66:67] ; GISEL-NEXT: s_cbranch_execnz .LBB5_2 ; GISEL-NEXT: ; %bb.3: -; GISEL-NEXT: s_mov_b64 exec, s[48:49] +; GISEL-NEXT: s_mov_b64 exec, s[64:65] ; GISEL-NEXT: .LBB5_4: ; %bb2 -; GISEL-NEXT: s_or_b64 exec, exec, s[46:47] -; GISEL-NEXT: v_readlane_b32 s51, v40, 19 -; GISEL-NEXT: v_readlane_b32 s50, v40, 18 -; GISEL-NEXT: v_readlane_b32 s49, v40, 17 -; GISEL-NEXT: v_readlane_b32 s48, v40, 16 -; GISEL-NEXT: v_readlane_b32 s47, v40, 15 -; GISEL-NEXT: v_readlane_b32 s46, v40, 14 -; GISEL-NEXT: v_readlane_b32 s45, v40, 13 -; GISEL-NEXT: v_readlane_b32 s44, v40, 12 -; GISEL-NEXT: v_readlane_b32 s43, v40, 11 -; GISEL-NEXT: v_readlane_b32 s42, v40, 10 -; GISEL-NEXT: v_readlane_b32 s41, v40, 9 -; GISEL-NEXT: v_readlane_b32 s40, v40, 8 +; GISEL-NEXT: s_or_b64 exec, exec, s[54:55] +; GISEL-NEXT: v_readlane_b32 s67, v40, 19 +; GISEL-NEXT: v_readlane_b32 s66, v40, 18 +; GISEL-NEXT: v_readlane_b32 s65, v40, 17 +; GISEL-NEXT: v_readlane_b32 s64, v40, 16 +; GISEL-NEXT: v_readlane_b32 s55, v40, 15 +; GISEL-NEXT: v_readlane_b32 s54, v40, 14 +; GISEL-NEXT: v_readlane_b32 s53, v40, 13 +; GISEL-NEXT: v_readlane_b32 s52, v40, 12 +; GISEL-NEXT: v_readlane_b32 s51, v40, 11 +; GISEL-NEXT: v_readlane_b32 s50, v40, 10 +; GISEL-NEXT: v_readlane_b32 s49, v40, 9 +; GISEL-NEXT: v_readlane_b32 s48, v40, 8 ; GISEL-NEXT: v_readlane_b32 s39, v40, 7 ; GISEL-NEXT: v_readlane_b32 s38, v40, 6 ; GISEL-NEXT: v_readlane_b32 s37, v40, 5 @@ -861,30 +861,14 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s37, 5 ; GCN-NEXT: v_writelane_b32 v40, s38, 6 ; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 -; GCN-NEXT: v_writelane_b32 v40, s50, 18 -; GCN-NEXT: v_writelane_b32 v40, s51, 19 -; GCN-NEXT: v_writelane_b32 v40, s52, 20 -; GCN-NEXT: v_writelane_b32 v40, s53, 21 -; GCN-NEXT: v_writelane_b32 v40, s54, 22 -; GCN-NEXT: v_writelane_b32 v40, s55, 23 -; GCN-NEXT: v_writelane_b32 v40, s56, 24 -; GCN-NEXT: v_writelane_b32 v40, s57, 25 -; GCN-NEXT: v_writelane_b32 v40, s58, 26 -; GCN-NEXT: v_writelane_b32 v40, s59, 27 -; GCN-NEXT: v_writelane_b32 v40, s60, 28 -; GCN-NEXT: v_writelane_b32 v40, s61, 29 -; GCN-NEXT: v_writelane_b32 v40, s62, 30 -; GCN-NEXT: v_writelane_b32 v40, s63, 31 +; GCN-NEXT: v_writelane_b32 v40, s48, 8 +; GCN-NEXT: v_writelane_b32 v40, s49, 9 +; GCN-NEXT: v_writelane_b32 v40, s50, 10 +; GCN-NEXT: v_writelane_b32 v40, s51, 11 +; GCN-NEXT: v_writelane_b32 v40, s52, 12 +; GCN-NEXT: v_writelane_b32 v40, s53, 13 +; GCN-NEXT: v_writelane_b32 v40, s54, 14 +; GCN-NEXT: v_writelane_b32 v40, s55, 15 ; GCN-NEXT: s_mov_b64 s[6:7], exec ; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 @@ -898,30 +882,14 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: s_cbranch_execnz .LBB6_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_readlane_b32 s63, v40, 31 -; GCN-NEXT: v_readlane_b32 s62, v40, 30 -; GCN-NEXT: v_readlane_b32 s61, v40, 29 -; GCN-NEXT: v_readlane_b32 s60, v40, 28 -; GCN-NEXT: v_readlane_b32 s59, v40, 27 -; GCN-NEXT: v_readlane_b32 s58, v40, 26 -; GCN-NEXT: v_readlane_b32 s57, v40, 25 -; GCN-NEXT: v_readlane_b32 s56, v40, 24 -; GCN-NEXT: v_readlane_b32 s55, v40, 23 -; GCN-NEXT: v_readlane_b32 s54, v40, 22 -; GCN-NEXT: v_readlane_b32 s53, v40, 21 -; GCN-NEXT: v_readlane_b32 s52, v40, 20 -; GCN-NEXT: v_readlane_b32 s51, v40, 19 -; GCN-NEXT: v_readlane_b32 s50, v40, 18 -; GCN-NEXT: v_readlane_b32 s49, v40, 17 -; GCN-NEXT: v_readlane_b32 s48, v40, 16 -; GCN-NEXT: v_readlane_b32 s47, v40, 15 -; GCN-NEXT: v_readlane_b32 s46, v40, 14 -; GCN-NEXT: v_readlane_b32 s45, v40, 13 -; GCN-NEXT: v_readlane_b32 s44, v40, 12 -; GCN-NEXT: v_readlane_b32 s43, v40, 11 -; GCN-NEXT: v_readlane_b32 s42, v40, 10 -; GCN-NEXT: v_readlane_b32 s41, v40, 9 -; GCN-NEXT: v_readlane_b32 s40, v40, 8 +; GCN-NEXT: v_readlane_b32 s55, v40, 15 +; GCN-NEXT: v_readlane_b32 s54, v40, 14 +; GCN-NEXT: v_readlane_b32 s53, v40, 13 +; GCN-NEXT: v_readlane_b32 s52, v40, 12 +; GCN-NEXT: v_readlane_b32 s51, v40, 11 +; GCN-NEXT: v_readlane_b32 s50, v40, 10 +; GCN-NEXT: v_readlane_b32 s49, v40, 9 +; GCN-NEXT: v_readlane_b32 s48, v40, 8 ; GCN-NEXT: v_readlane_b32 s39, v40, 7 ; GCN-NEXT: v_readlane_b32 s38, v40, 6 ; GCN-NEXT: v_readlane_b32 s37, v40, 5 @@ -955,30 +923,14 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s37, 5 ; GISEL-NEXT: v_writelane_b32 v40, s38, 6 ; GISEL-NEXT: v_writelane_b32 v40, s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s40, 8 -; GISEL-NEXT: v_writelane_b32 v40, s41, 9 -; GISEL-NEXT: v_writelane_b32 v40, s42, 10 -; GISEL-NEXT: v_writelane_b32 v40, s43, 11 -; GISEL-NEXT: v_writelane_b32 v40, s44, 12 -; GISEL-NEXT: v_writelane_b32 v40, s45, 13 -; GISEL-NEXT: v_writelane_b32 v40, s46, 14 -; GISEL-NEXT: v_writelane_b32 v40, s47, 15 -; GISEL-NEXT: v_writelane_b32 v40, s48, 16 -; GISEL-NEXT: v_writelane_b32 v40, s49, 17 -; GISEL-NEXT: v_writelane_b32 v40, s50, 18 -; GISEL-NEXT: v_writelane_b32 v40, s51, 19 -; GISEL-NEXT: v_writelane_b32 v40, s52, 20 -; GISEL-NEXT: v_writelane_b32 v40, s53, 21 -; GISEL-NEXT: v_writelane_b32 v40, s54, 22 -; GISEL-NEXT: v_writelane_b32 v40, s55, 23 -; GISEL-NEXT: v_writelane_b32 v40, s56, 24 -; GISEL-NEXT: v_writelane_b32 v40, s57, 25 -; GISEL-NEXT: v_writelane_b32 v40, s58, 26 -; GISEL-NEXT: v_writelane_b32 v40, s59, 27 -; GISEL-NEXT: v_writelane_b32 v40, s60, 28 -; GISEL-NEXT: v_writelane_b32 v40, s61, 29 -; GISEL-NEXT: v_writelane_b32 v40, s62, 30 -; GISEL-NEXT: v_writelane_b32 v40, s63, 31 +; GISEL-NEXT: v_writelane_b32 v40, s48, 8 +; GISEL-NEXT: v_writelane_b32 v40, s49, 9 +; GISEL-NEXT: v_writelane_b32 v40, s50, 10 +; GISEL-NEXT: v_writelane_b32 v40, s51, 11 +; GISEL-NEXT: v_writelane_b32 v40, s52, 12 +; GISEL-NEXT: v_writelane_b32 v40, s53, 13 +; GISEL-NEXT: v_writelane_b32 v40, s54, 14 +; GISEL-NEXT: v_writelane_b32 v40, s55, 15 ; GISEL-NEXT: s_mov_b64 s[6:7], exec ; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s8, v0 @@ -992,30 +944,14 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: s_cbranch_execnz .LBB6_1 ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: v_readlane_b32 s63, v40, 31 -; GISEL-NEXT: v_readlane_b32 s62, v40, 30 -; GISEL-NEXT: v_readlane_b32 s61, v40, 29 -; GISEL-NEXT: v_readlane_b32 s60, v40, 28 -; GISEL-NEXT: v_readlane_b32 s59, v40, 27 -; GISEL-NEXT: v_readlane_b32 s58, v40, 26 -; GISEL-NEXT: v_readlane_b32 s57, v40, 25 -; GISEL-NEXT: v_readlane_b32 s56, v40, 24 -; GISEL-NEXT: v_readlane_b32 s55, v40, 23 -; GISEL-NEXT: v_readlane_b32 s54, v40, 22 -; GISEL-NEXT: v_readlane_b32 s53, v40, 21 -; GISEL-NEXT: v_readlane_b32 s52, v40, 20 -; GISEL-NEXT: v_readlane_b32 s51, v40, 19 -; GISEL-NEXT: v_readlane_b32 s50, v40, 18 -; GISEL-NEXT: v_readlane_b32 s49, v40, 17 -; GISEL-NEXT: v_readlane_b32 s48, v40, 16 -; GISEL-NEXT: v_readlane_b32 s47, v40, 15 -; GISEL-NEXT: v_readlane_b32 s46, v40, 14 -; GISEL-NEXT: v_readlane_b32 s45, v40, 13 -; GISEL-NEXT: v_readlane_b32 s44, v40, 12 -; GISEL-NEXT: v_readlane_b32 s43, v40, 11 -; GISEL-NEXT: v_readlane_b32 s42, v40, 10 -; GISEL-NEXT: v_readlane_b32 s41, v40, 9 -; GISEL-NEXT: v_readlane_b32 s40, v40, 8 +; GISEL-NEXT: v_readlane_b32 s55, v40, 15 +; GISEL-NEXT: v_readlane_b32 s54, v40, 14 +; GISEL-NEXT: v_readlane_b32 s53, v40, 13 +; GISEL-NEXT: v_readlane_b32 s52, v40, 12 +; GISEL-NEXT: v_readlane_b32 s51, v40, 11 +; GISEL-NEXT: v_readlane_b32 s50, v40, 10 +; GISEL-NEXT: v_readlane_b32 s49, v40, 9 +; GISEL-NEXT: v_readlane_b32 s48, v40, 8 ; GISEL-NEXT: v_readlane_b32 s39, v40, 7 ; GISEL-NEXT: v_readlane_b32 s38, v40, 6 ; GISEL-NEXT: v_readlane_b32 s37, v40, 5 @@ -1054,30 +990,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v41, s37, 5 ; GCN-NEXT: v_writelane_b32 v41, s38, 6 ; GCN-NEXT: v_writelane_b32 v41, s39, 7 -; GCN-NEXT: v_writelane_b32 v41, s40, 8 -; GCN-NEXT: v_writelane_b32 v41, s41, 9 -; GCN-NEXT: v_writelane_b32 v41, s42, 10 -; GCN-NEXT: v_writelane_b32 v41, s43, 11 -; GCN-NEXT: v_writelane_b32 v41, s44, 12 -; GCN-NEXT: v_writelane_b32 v41, s45, 13 -; GCN-NEXT: v_writelane_b32 v41, s46, 14 -; GCN-NEXT: v_writelane_b32 v41, s47, 15 -; GCN-NEXT: v_writelane_b32 v41, s48, 16 -; GCN-NEXT: v_writelane_b32 v41, s49, 17 -; GCN-NEXT: v_writelane_b32 v41, s50, 18 -; GCN-NEXT: v_writelane_b32 v41, s51, 19 -; GCN-NEXT: v_writelane_b32 v41, s52, 20 -; GCN-NEXT: v_writelane_b32 v41, s53, 21 -; GCN-NEXT: v_writelane_b32 v41, s54, 22 -; GCN-NEXT: v_writelane_b32 v41, s55, 23 -; GCN-NEXT: v_writelane_b32 v41, s56, 24 -; GCN-NEXT: v_writelane_b32 v41, s57, 25 -; GCN-NEXT: v_writelane_b32 v41, s58, 26 -; GCN-NEXT: v_writelane_b32 v41, s59, 27 -; GCN-NEXT: v_writelane_b32 v41, s60, 28 -; GCN-NEXT: v_writelane_b32 v41, s61, 29 -; GCN-NEXT: v_writelane_b32 v41, s62, 30 -; GCN-NEXT: v_writelane_b32 v41, s63, 31 +; GCN-NEXT: v_writelane_b32 v41, s48, 8 +; GCN-NEXT: v_writelane_b32 v41, s49, 9 +; GCN-NEXT: v_writelane_b32 v41, s50, 10 +; GCN-NEXT: v_writelane_b32 v41, s51, 11 +; GCN-NEXT: v_writelane_b32 v41, s52, 12 +; GCN-NEXT: v_writelane_b32 v41, s53, 13 +; GCN-NEXT: v_writelane_b32 v41, s54, 14 +; GCN-NEXT: v_writelane_b32 v41, s55, 15 ; GCN-NEXT: v_mov_b32_e32 v40, v0 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 @@ -1093,30 +1013,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v40 -; GCN-NEXT: v_readlane_b32 s63, v41, 31 -; GCN-NEXT: v_readlane_b32 s62, v41, 30 -; GCN-NEXT: v_readlane_b32 s61, v41, 29 -; GCN-NEXT: v_readlane_b32 s60, v41, 28 -; GCN-NEXT: v_readlane_b32 s59, v41, 27 -; GCN-NEXT: v_readlane_b32 s58, v41, 26 -; GCN-NEXT: v_readlane_b32 s57, v41, 25 -; GCN-NEXT: v_readlane_b32 s56, v41, 24 -; GCN-NEXT: v_readlane_b32 s55, v41, 23 -; GCN-NEXT: v_readlane_b32 s54, v41, 22 -; GCN-NEXT: v_readlane_b32 s53, v41, 21 -; GCN-NEXT: v_readlane_b32 s52, v41, 20 -; GCN-NEXT: v_readlane_b32 s51, v41, 19 -; GCN-NEXT: v_readlane_b32 s50, v41, 18 -; GCN-NEXT: v_readlane_b32 s49, v41, 17 -; GCN-NEXT: v_readlane_b32 s48, v41, 16 -; GCN-NEXT: v_readlane_b32 s47, v41, 15 -; GCN-NEXT: v_readlane_b32 s46, v41, 14 -; GCN-NEXT: v_readlane_b32 s45, v41, 13 -; GCN-NEXT: v_readlane_b32 s44, v41, 12 -; GCN-NEXT: v_readlane_b32 s43, v41, 11 -; GCN-NEXT: v_readlane_b32 s42, v41, 10 -; GCN-NEXT: v_readlane_b32 s41, v41, 9 -; GCN-NEXT: v_readlane_b32 s40, v41, 8 +; GCN-NEXT: v_readlane_b32 s55, v41, 15 +; GCN-NEXT: v_readlane_b32 s54, v41, 14 +; GCN-NEXT: v_readlane_b32 s53, v41, 13 +; GCN-NEXT: v_readlane_b32 s52, v41, 12 +; GCN-NEXT: v_readlane_b32 s51, v41, 11 +; GCN-NEXT: v_readlane_b32 s50, v41, 10 +; GCN-NEXT: v_readlane_b32 s49, v41, 9 +; GCN-NEXT: v_readlane_b32 s48, v41, 8 ; GCN-NEXT: v_readlane_b32 s39, v41, 7 ; GCN-NEXT: v_readlane_b32 s38, v41, 6 ; GCN-NEXT: v_readlane_b32 s37, v41, 5 @@ -1152,30 +1056,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v41, s37, 5 ; GISEL-NEXT: v_writelane_b32 v41, s38, 6 ; GISEL-NEXT: v_writelane_b32 v41, s39, 7 -; GISEL-NEXT: v_writelane_b32 v41, s40, 8 -; GISEL-NEXT: v_writelane_b32 v41, s41, 9 -; GISEL-NEXT: v_writelane_b32 v41, s42, 10 -; GISEL-NEXT: v_writelane_b32 v41, s43, 11 -; GISEL-NEXT: v_writelane_b32 v41, s44, 12 -; GISEL-NEXT: v_writelane_b32 v41, s45, 13 -; GISEL-NEXT: v_writelane_b32 v41, s46, 14 -; GISEL-NEXT: v_writelane_b32 v41, s47, 15 -; GISEL-NEXT: v_writelane_b32 v41, s48, 16 -; GISEL-NEXT: v_writelane_b32 v41, s49, 17 -; GISEL-NEXT: v_writelane_b32 v41, s50, 18 -; GISEL-NEXT: v_writelane_b32 v41, s51, 19 -; GISEL-NEXT: v_writelane_b32 v41, s52, 20 -; GISEL-NEXT: v_writelane_b32 v41, s53, 21 -; GISEL-NEXT: v_writelane_b32 v41, s54, 22 -; GISEL-NEXT: v_writelane_b32 v41, s55, 23 -; GISEL-NEXT: v_writelane_b32 v41, s56, 24 -; GISEL-NEXT: v_writelane_b32 v41, s57, 25 -; GISEL-NEXT: v_writelane_b32 v41, s58, 26 -; GISEL-NEXT: v_writelane_b32 v41, s59, 27 -; GISEL-NEXT: v_writelane_b32 v41, s60, 28 -; GISEL-NEXT: v_writelane_b32 v41, s61, 29 -; GISEL-NEXT: v_writelane_b32 v41, s62, 30 -; GISEL-NEXT: v_writelane_b32 v41, s63, 31 +; GISEL-NEXT: v_writelane_b32 v41, s48, 8 +; GISEL-NEXT: v_writelane_b32 v41, s49, 9 +; GISEL-NEXT: v_writelane_b32 v41, s50, 10 +; GISEL-NEXT: v_writelane_b32 v41, s51, 11 +; GISEL-NEXT: v_writelane_b32 v41, s52, 12 +; GISEL-NEXT: v_writelane_b32 v41, s53, 13 +; GISEL-NEXT: v_writelane_b32 v41, s54, 14 +; GISEL-NEXT: v_writelane_b32 v41, s55, 15 ; GISEL-NEXT: v_mov_b32_e32 v40, v0 ; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 @@ -1191,30 +1079,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, v40 -; GISEL-NEXT: v_readlane_b32 s63, v41, 31 -; GISEL-NEXT: v_readlane_b32 s62, v41, 30 -; GISEL-NEXT: v_readlane_b32 s61, v41, 29 -; GISEL-NEXT: v_readlane_b32 s60, v41, 28 -; GISEL-NEXT: v_readlane_b32 s59, v41, 27 -; GISEL-NEXT: v_readlane_b32 s58, v41, 26 -; GISEL-NEXT: v_readlane_b32 s57, v41, 25 -; GISEL-NEXT: v_readlane_b32 s56, v41, 24 -; GISEL-NEXT: v_readlane_b32 s55, v41, 23 -; GISEL-NEXT: v_readlane_b32 s54, v41, 22 -; GISEL-NEXT: v_readlane_b32 s53, v41, 21 -; GISEL-NEXT: v_readlane_b32 s52, v41, 20 -; GISEL-NEXT: v_readlane_b32 s51, v41, 19 -; GISEL-NEXT: v_readlane_b32 s50, v41, 18 -; GISEL-NEXT: v_readlane_b32 s49, v41, 17 -; GISEL-NEXT: v_readlane_b32 s48, v41, 16 -; GISEL-NEXT: v_readlane_b32 s47, v41, 15 -; GISEL-NEXT: v_readlane_b32 s46, v41, 14 -; GISEL-NEXT: v_readlane_b32 s45, v41, 13 -; GISEL-NEXT: v_readlane_b32 s44, v41, 12 -; GISEL-NEXT: v_readlane_b32 s43, v41, 11 -; GISEL-NEXT: v_readlane_b32 s42, v41, 10 -; GISEL-NEXT: v_readlane_b32 s41, v41, 9 -; GISEL-NEXT: v_readlane_b32 s40, v41, 8 +; GISEL-NEXT: v_readlane_b32 s55, v41, 15 +; GISEL-NEXT: v_readlane_b32 s54, v41, 14 +; GISEL-NEXT: v_readlane_b32 s53, v41, 13 +; GISEL-NEXT: v_readlane_b32 s52, v41, 12 +; GISEL-NEXT: v_readlane_b32 s51, v41, 11 +; GISEL-NEXT: v_readlane_b32 s50, v41, 10 +; GISEL-NEXT: v_readlane_b32 s49, v41, 9 +; GISEL-NEXT: v_readlane_b32 s48, v41, 8 ; GISEL-NEXT: v_readlane_b32 s39, v41, 7 ; GISEL-NEXT: v_readlane_b32 s38, v41, 6 ; GISEL-NEXT: v_readlane_b32 s37, v41, 5 @@ -1257,30 +1129,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s37, 5 ; GCN-NEXT: v_writelane_b32 v40, s38, 6 ; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 -; GCN-NEXT: v_writelane_b32 v40, s50, 18 -; GCN-NEXT: v_writelane_b32 v40, s51, 19 -; GCN-NEXT: v_writelane_b32 v40, s52, 20 -; GCN-NEXT: v_writelane_b32 v40, s53, 21 -; GCN-NEXT: v_writelane_b32 v40, s54, 22 -; GCN-NEXT: v_writelane_b32 v40, s55, 23 -; GCN-NEXT: v_writelane_b32 v40, s56, 24 -; GCN-NEXT: v_writelane_b32 v40, s57, 25 -; GCN-NEXT: v_writelane_b32 v40, s58, 26 -; GCN-NEXT: v_writelane_b32 v40, s59, 27 -; GCN-NEXT: v_writelane_b32 v40, s60, 28 -; GCN-NEXT: v_writelane_b32 v40, s61, 29 -; GCN-NEXT: v_writelane_b32 v40, s62, 30 -; GCN-NEXT: v_writelane_b32 v40, s63, 31 +; GCN-NEXT: v_writelane_b32 v40, s48, 8 +; GCN-NEXT: v_writelane_b32 v40, s49, 9 +; GCN-NEXT: v_writelane_b32 v40, s50, 10 +; GCN-NEXT: v_writelane_b32 v40, s51, 11 +; GCN-NEXT: v_writelane_b32 v40, s52, 12 +; GCN-NEXT: v_writelane_b32 v40, s53, 13 +; GCN-NEXT: v_writelane_b32 v40, s54, 14 +; GCN-NEXT: v_writelane_b32 v40, s55, 15 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s8, v1 @@ -1296,30 +1152,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v3 -; GCN-NEXT: v_readlane_b32 s63, v40, 31 -; GCN-NEXT: v_readlane_b32 s62, v40, 30 -; GCN-NEXT: v_readlane_b32 s61, v40, 29 -; GCN-NEXT: v_readlane_b32 s60, v40, 28 -; GCN-NEXT: v_readlane_b32 s59, v40, 27 -; GCN-NEXT: v_readlane_b32 s58, v40, 26 -; GCN-NEXT: v_readlane_b32 s57, v40, 25 -; GCN-NEXT: v_readlane_b32 s56, v40, 24 -; GCN-NEXT: v_readlane_b32 s55, v40, 23 -; GCN-NEXT: v_readlane_b32 s54, v40, 22 -; GCN-NEXT: v_readlane_b32 s53, v40, 21 -; GCN-NEXT: v_readlane_b32 s52, v40, 20 -; GCN-NEXT: v_readlane_b32 s51, v40, 19 -; GCN-NEXT: v_readlane_b32 s50, v40, 18 -; GCN-NEXT: v_readlane_b32 s49, v40, 17 -; GCN-NEXT: v_readlane_b32 s48, v40, 16 -; GCN-NEXT: v_readlane_b32 s47, v40, 15 -; GCN-NEXT: v_readlane_b32 s46, v40, 14 -; GCN-NEXT: v_readlane_b32 s45, v40, 13 -; GCN-NEXT: v_readlane_b32 s44, v40, 12 -; GCN-NEXT: v_readlane_b32 s43, v40, 11 -; GCN-NEXT: v_readlane_b32 s42, v40, 10 -; GCN-NEXT: v_readlane_b32 s41, v40, 9 -; GCN-NEXT: v_readlane_b32 s40, v40, 8 +; GCN-NEXT: v_readlane_b32 s55, v40, 15 +; GCN-NEXT: v_readlane_b32 s54, v40, 14 +; GCN-NEXT: v_readlane_b32 s53, v40, 13 +; GCN-NEXT: v_readlane_b32 s52, v40, 12 +; GCN-NEXT: v_readlane_b32 s51, v40, 11 +; GCN-NEXT: v_readlane_b32 s50, v40, 10 +; GCN-NEXT: v_readlane_b32 s49, v40, 9 +; GCN-NEXT: v_readlane_b32 s48, v40, 8 ; GCN-NEXT: v_readlane_b32 s39, v40, 7 ; GCN-NEXT: v_readlane_b32 s38, v40, 6 ; GCN-NEXT: v_readlane_b32 s37, v40, 5 @@ -1353,30 +1193,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s37, 5 ; GISEL-NEXT: v_writelane_b32 v40, s38, 6 ; GISEL-NEXT: v_writelane_b32 v40, s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s40, 8 -; GISEL-NEXT: v_writelane_b32 v40, s41, 9 -; GISEL-NEXT: v_writelane_b32 v40, s42, 10 -; GISEL-NEXT: v_writelane_b32 v40, s43, 11 -; GISEL-NEXT: v_writelane_b32 v40, s44, 12 -; GISEL-NEXT: v_writelane_b32 v40, s45, 13 -; GISEL-NEXT: v_writelane_b32 v40, s46, 14 -; GISEL-NEXT: v_writelane_b32 v40, s47, 15 -; GISEL-NEXT: v_writelane_b32 v40, s48, 16 -; GISEL-NEXT: v_writelane_b32 v40, s49, 17 -; GISEL-NEXT: v_writelane_b32 v40, s50, 18 -; GISEL-NEXT: v_writelane_b32 v40, s51, 19 -; GISEL-NEXT: v_writelane_b32 v40, s52, 20 -; GISEL-NEXT: v_writelane_b32 v40, s53, 21 -; GISEL-NEXT: v_writelane_b32 v40, s54, 22 -; GISEL-NEXT: v_writelane_b32 v40, s55, 23 -; GISEL-NEXT: v_writelane_b32 v40, s56, 24 -; GISEL-NEXT: v_writelane_b32 v40, s57, 25 -; GISEL-NEXT: v_writelane_b32 v40, s58, 26 -; GISEL-NEXT: v_writelane_b32 v40, s59, 27 -; GISEL-NEXT: v_writelane_b32 v40, s60, 28 -; GISEL-NEXT: v_writelane_b32 v40, s61, 29 -; GISEL-NEXT: v_writelane_b32 v40, s62, 30 -; GISEL-NEXT: v_writelane_b32 v40, s63, 31 +; GISEL-NEXT: v_writelane_b32 v40, s48, 8 +; GISEL-NEXT: v_writelane_b32 v40, s49, 9 +; GISEL-NEXT: v_writelane_b32 v40, s50, 10 +; GISEL-NEXT: v_writelane_b32 v40, s51, 11 +; GISEL-NEXT: v_writelane_b32 v40, s52, 12 +; GISEL-NEXT: v_writelane_b32 v40, s53, 13 +; GISEL-NEXT: v_writelane_b32 v40, s54, 14 +; GISEL-NEXT: v_writelane_b32 v40, s55, 15 ; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s8, v1 @@ -1392,30 +1216,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, v2 -; GISEL-NEXT: v_readlane_b32 s63, v40, 31 -; GISEL-NEXT: v_readlane_b32 s62, v40, 30 -; GISEL-NEXT: v_readlane_b32 s61, v40, 29 -; GISEL-NEXT: v_readlane_b32 s60, v40, 28 -; GISEL-NEXT: v_readlane_b32 s59, v40, 27 -; GISEL-NEXT: v_readlane_b32 s58, v40, 26 -; GISEL-NEXT: v_readlane_b32 s57, v40, 25 -; GISEL-NEXT: v_readlane_b32 s56, v40, 24 -; GISEL-NEXT: v_readlane_b32 s55, v40, 23 -; GISEL-NEXT: v_readlane_b32 s54, v40, 22 -; GISEL-NEXT: v_readlane_b32 s53, v40, 21 -; GISEL-NEXT: v_readlane_b32 s52, v40, 20 -; GISEL-NEXT: v_readlane_b32 s51, v40, 19 -; GISEL-NEXT: v_readlane_b32 s50, v40, 18 -; GISEL-NEXT: v_readlane_b32 s49, v40, 17 -; GISEL-NEXT: v_readlane_b32 s48, v40, 16 -; GISEL-NEXT: v_readlane_b32 s47, v40, 15 -; GISEL-NEXT: v_readlane_b32 s46, v40, 14 -; GISEL-NEXT: v_readlane_b32 s45, v40, 13 -; GISEL-NEXT: v_readlane_b32 s44, v40, 12 -; GISEL-NEXT: v_readlane_b32 s43, v40, 11 -; GISEL-NEXT: v_readlane_b32 s42, v40, 10 -; GISEL-NEXT: v_readlane_b32 s41, v40, 9 -; GISEL-NEXT: v_readlane_b32 s40, v40, 8 +; GISEL-NEXT: v_readlane_b32 s55, v40, 15 +; GISEL-NEXT: v_readlane_b32 s54, v40, 14 +; GISEL-NEXT: v_readlane_b32 s53, v40, 13 +; GISEL-NEXT: v_readlane_b32 s52, v40, 12 +; GISEL-NEXT: v_readlane_b32 s51, v40, 11 +; GISEL-NEXT: v_readlane_b32 s50, v40, 10 +; GISEL-NEXT: v_readlane_b32 s49, v40, 9 +; GISEL-NEXT: v_readlane_b32 s48, v40, 8 ; GISEL-NEXT: v_readlane_b32 s39, v40, 7 ; GISEL-NEXT: v_readlane_b32 s38, v40, 6 ; GISEL-NEXT: v_readlane_b32 s37, v40, 5 @@ -1454,30 +1262,14 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s37, 5 ; GCN-NEXT: v_writelane_b32 v40, s38, 6 ; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 -; GCN-NEXT: v_writelane_b32 v40, s50, 18 -; GCN-NEXT: v_writelane_b32 v40, s51, 19 -; GCN-NEXT: v_writelane_b32 v40, s52, 20 -; GCN-NEXT: v_writelane_b32 v40, s53, 21 -; GCN-NEXT: v_writelane_b32 v40, s54, 22 -; GCN-NEXT: v_writelane_b32 v40, s55, 23 -; GCN-NEXT: v_writelane_b32 v40, s56, 24 -; GCN-NEXT: v_writelane_b32 v40, s57, 25 -; GCN-NEXT: v_writelane_b32 v40, s58, 26 -; GCN-NEXT: v_writelane_b32 v40, s59, 27 -; GCN-NEXT: v_writelane_b32 v40, s60, 28 -; GCN-NEXT: v_writelane_b32 v40, s61, 29 -; GCN-NEXT: v_writelane_b32 v40, s62, 30 -; GCN-NEXT: v_writelane_b32 v40, s63, 31 +; GCN-NEXT: v_writelane_b32 v40, s48, 8 +; GCN-NEXT: v_writelane_b32 v40, s49, 9 +; GCN-NEXT: v_writelane_b32 v40, s50, 10 +; GCN-NEXT: v_writelane_b32 v40, s51, 11 +; GCN-NEXT: v_writelane_b32 v40, s52, 12 +; GCN-NEXT: v_writelane_b32 v40, s53, 13 +; GCN-NEXT: v_writelane_b32 v40, s54, 14 +; GCN-NEXT: v_writelane_b32 v40, s55, 15 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s6, v0 @@ -1490,30 +1282,14 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: s_cbranch_execnz .LBB9_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_readlane_b32 s63, v40, 31 -; GCN-NEXT: v_readlane_b32 s62, v40, 30 -; GCN-NEXT: v_readlane_b32 s61, v40, 29 -; GCN-NEXT: v_readlane_b32 s60, v40, 28 -; GCN-NEXT: v_readlane_b32 s59, v40, 27 -; GCN-NEXT: v_readlane_b32 s58, v40, 26 -; GCN-NEXT: v_readlane_b32 s57, v40, 25 -; GCN-NEXT: v_readlane_b32 s56, v40, 24 -; GCN-NEXT: v_readlane_b32 s55, v40, 23 -; GCN-NEXT: v_readlane_b32 s54, v40, 22 -; GCN-NEXT: v_readlane_b32 s53, v40, 21 -; GCN-NEXT: v_readlane_b32 s52, v40, 20 -; GCN-NEXT: v_readlane_b32 s51, v40, 19 -; GCN-NEXT: v_readlane_b32 s50, v40, 18 -; GCN-NEXT: v_readlane_b32 s49, v40, 17 -; GCN-NEXT: v_readlane_b32 s48, v40, 16 -; GCN-NEXT: v_readlane_b32 s47, v40, 15 -; GCN-NEXT: v_readlane_b32 s46, v40, 14 -; GCN-NEXT: v_readlane_b32 s45, v40, 13 -; GCN-NEXT: v_readlane_b32 s44, v40, 12 -; GCN-NEXT: v_readlane_b32 s43, v40, 11 -; GCN-NEXT: v_readlane_b32 s42, v40, 10 -; GCN-NEXT: v_readlane_b32 s41, v40, 9 -; GCN-NEXT: v_readlane_b32 s40, v40, 8 +; GCN-NEXT: v_readlane_b32 s55, v40, 15 +; GCN-NEXT: v_readlane_b32 s54, v40, 14 +; GCN-NEXT: v_readlane_b32 s53, v40, 13 +; GCN-NEXT: v_readlane_b32 s52, v40, 12 +; GCN-NEXT: v_readlane_b32 s51, v40, 11 +; GCN-NEXT: v_readlane_b32 s50, v40, 10 +; GCN-NEXT: v_readlane_b32 s49, v40, 9 +; GCN-NEXT: v_readlane_b32 s48, v40, 8 ; GCN-NEXT: v_readlane_b32 s39, v40, 7 ; GCN-NEXT: v_readlane_b32 s38, v40, 6 ; GCN-NEXT: v_readlane_b32 s37, v40, 5 @@ -1547,30 +1323,14 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s37, 5 ; GISEL-NEXT: v_writelane_b32 v40, s38, 6 ; GISEL-NEXT: v_writelane_b32 v40, s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s40, 8 -; GISEL-NEXT: v_writelane_b32 v40, s41, 9 -; GISEL-NEXT: v_writelane_b32 v40, s42, 10 -; GISEL-NEXT: v_writelane_b32 v40, s43, 11 -; GISEL-NEXT: v_writelane_b32 v40, s44, 12 -; GISEL-NEXT: v_writelane_b32 v40, s45, 13 -; GISEL-NEXT: v_writelane_b32 v40, s46, 14 -; GISEL-NEXT: v_writelane_b32 v40, s47, 15 -; GISEL-NEXT: v_writelane_b32 v40, s48, 16 -; GISEL-NEXT: v_writelane_b32 v40, s49, 17 -; GISEL-NEXT: v_writelane_b32 v40, s50, 18 -; GISEL-NEXT: v_writelane_b32 v40, s51, 19 -; GISEL-NEXT: v_writelane_b32 v40, s52, 20 -; GISEL-NEXT: v_writelane_b32 v40, s53, 21 -; GISEL-NEXT: v_writelane_b32 v40, s54, 22 -; GISEL-NEXT: v_writelane_b32 v40, s55, 23 -; GISEL-NEXT: v_writelane_b32 v40, s56, 24 -; GISEL-NEXT: v_writelane_b32 v40, s57, 25 -; GISEL-NEXT: v_writelane_b32 v40, s58, 26 -; GISEL-NEXT: v_writelane_b32 v40, s59, 27 -; GISEL-NEXT: v_writelane_b32 v40, s60, 28 -; GISEL-NEXT: v_writelane_b32 v40, s61, 29 -; GISEL-NEXT: v_writelane_b32 v40, s62, 30 -; GISEL-NEXT: v_writelane_b32 v40, s63, 31 +; GISEL-NEXT: v_writelane_b32 v40, s48, 8 +; GISEL-NEXT: v_writelane_b32 v40, s49, 9 +; GISEL-NEXT: v_writelane_b32 v40, s50, 10 +; GISEL-NEXT: v_writelane_b32 v40, s51, 11 +; GISEL-NEXT: v_writelane_b32 v40, s52, 12 +; GISEL-NEXT: v_writelane_b32 v40, s53, 13 +; GISEL-NEXT: v_writelane_b32 v40, s54, 14 +; GISEL-NEXT: v_writelane_b32 v40, s55, 15 ; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s6, v0 @@ -1583,30 +1343,14 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: s_cbranch_execnz .LBB9_1 ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: v_readlane_b32 s63, v40, 31 -; GISEL-NEXT: v_readlane_b32 s62, v40, 30 -; GISEL-NEXT: v_readlane_b32 s61, v40, 29 -; GISEL-NEXT: v_readlane_b32 s60, v40, 28 -; GISEL-NEXT: v_readlane_b32 s59, v40, 27 -; GISEL-NEXT: v_readlane_b32 s58, v40, 26 -; GISEL-NEXT: v_readlane_b32 s57, v40, 25 -; GISEL-NEXT: v_readlane_b32 s56, v40, 24 -; GISEL-NEXT: v_readlane_b32 s55, v40, 23 -; GISEL-NEXT: v_readlane_b32 s54, v40, 22 -; GISEL-NEXT: v_readlane_b32 s53, v40, 21 -; GISEL-NEXT: v_readlane_b32 s52, v40, 20 -; GISEL-NEXT: v_readlane_b32 s51, v40, 19 -; GISEL-NEXT: v_readlane_b32 s50, v40, 18 -; GISEL-NEXT: v_readlane_b32 s49, v40, 17 -; GISEL-NEXT: v_readlane_b32 s48, v40, 16 -; GISEL-NEXT: v_readlane_b32 s47, v40, 15 -; GISEL-NEXT: v_readlane_b32 s46, v40, 14 -; GISEL-NEXT: v_readlane_b32 s45, v40, 13 -; GISEL-NEXT: v_readlane_b32 s44, v40, 12 -; GISEL-NEXT: v_readlane_b32 s43, v40, 11 -; GISEL-NEXT: v_readlane_b32 s42, v40, 10 -; GISEL-NEXT: v_readlane_b32 s41, v40, 9 -; GISEL-NEXT: v_readlane_b32 s40, v40, 8 +; GISEL-NEXT: v_readlane_b32 s55, v40, 15 +; GISEL-NEXT: v_readlane_b32 s54, v40, 14 +; GISEL-NEXT: v_readlane_b32 s53, v40, 13 +; GISEL-NEXT: v_readlane_b32 s52, v40, 12 +; GISEL-NEXT: v_readlane_b32 s51, v40, 11 +; GISEL-NEXT: v_readlane_b32 s50, v40, 10 +; GISEL-NEXT: v_readlane_b32 s49, v40, 9 +; GISEL-NEXT: v_readlane_b32 s48, v40, 8 ; GISEL-NEXT: v_readlane_b32 s39, v40, 7 ; GISEL-NEXT: v_readlane_b32 s38, v40, 6 ; GISEL-NEXT: v_readlane_b32 s37, v40, 5 diff --git a/llvm/test/CodeGen/AMDGPU/issue48473.mir b/llvm/test/CodeGen/AMDGPU/issue48473.mir index dd73e65de7cb6..654461a62fa9f 100644 --- a/llvm/test/CodeGen/AMDGPU/issue48473.mir +++ b/llvm/test/CodeGen/AMDGPU/issue48473.mir @@ -43,7 +43,7 @@ # %25 to $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 # CHECK-LABEL: name: issue48473 -# CHECK: S_NOP 0, implicit killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed renamable $sgpr12_sgpr13_sgpr14_sgpr15, implicit killed renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, implicit killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit killed renamable $sgpr84_sgpr85_sgpr86_sgpr87, implicit killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, implicit killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, implicit killed renamable $sgpr88_sgpr89_sgpr90_sgpr91, implicit killed renamable $sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59, implicit killed renamable $sgpr92_sgpr93_sgpr94_sgpr95, implicit killed renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, implicit renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, implicit killed renamable $sgpr96_sgpr97_sgpr98_sgpr99, implicit killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, implicit killed renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +# CHECK: S_NOP 0, implicit killed renamable $sgpr20_sgpr21_sgpr22_sgpr23, implicit killed renamable $sgpr88_sgpr89_sgpr90_sgpr91, implicit killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit killed renamable $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit killed renamable $sgpr40_sgpr41_sgpr42_sgpr43, implicit killed renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit killed renamable $sgpr36_sgpr37_sgpr38_sgpr39, implicit killed renamable $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, implicit killed renamable $sgpr80_sgpr81_sgpr82_sgpr83, implicit killed renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, implicit killed renamable $sgpr16_sgpr17_sgpr18_sgpr19, implicit killed renamable $sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, implicit killed renamable $sgpr84_sgpr85_sgpr86_sgpr87, implicit killed renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55, implicit renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed renamable $sgpr44_sgpr45_sgpr46_sgpr47, implicit killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 --- name: issue48473 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index a3bd0aabd5c3f..55fa02a0c582c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -815,50 +815,27 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) ; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s38, 2 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s39, 3 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s40, 4 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s41, 5 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s42, 6 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s43, 7 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s44, 8 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s45, 9 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s46, 10 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s47, 11 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s48, 12 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s49, 13 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s50, 14 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s51, 15 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s52, 16 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s53, 17 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s54, 18 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s55, 19 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s56, 20 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s57, 21 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s58, 22 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s59, 23 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s60, 24 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s61, 25 ; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27 ; CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s62, 26 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s63, 27 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s64, 28 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s65, 29 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s66, 30 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s67, 31 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s38, 2 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s39, 3 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s48, 4 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s49, 5 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s50, 6 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s51, 7 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s52, 8 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s53, 9 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s54, 10 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s55, 11 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s64, 12 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s65, 13 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s66, 14 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s67, 15 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s64, v30 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s63, v29 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s62, v28 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s60, v26 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s59, v25 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s58, v24 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s57, v23 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s56, v22 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s55, v21 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s54, v20 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s53, v19 @@ -867,6 +844,17 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s50, v16 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s49, v15 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s48, v14 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s39, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s38, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s37, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s36, v2 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s63, v29 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s62, v28 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s60, v26 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s59, v25 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s58, v24 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s57, v23 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s56, v22 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s47, v13 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s46, v12 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s45, v11 @@ -875,10 +863,6 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s42, v8 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s41, v7 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s40, v6 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s39, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s38, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s37, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s36, v2 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(2) ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v0 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(1) @@ -888,34 +872,18 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s[36:67] ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_readlane_b32 s67, v31, 31 -; CHECK-SDAG-NEXT: v_readlane_b32 s66, v31, 30 -; CHECK-SDAG-NEXT: v_readlane_b32 s65, v31, 29 -; CHECK-SDAG-NEXT: v_readlane_b32 s64, v31, 28 -; CHECK-SDAG-NEXT: v_readlane_b32 s63, v31, 27 -; CHECK-SDAG-NEXT: v_readlane_b32 s62, v31, 26 -; CHECK-SDAG-NEXT: v_readlane_b32 s61, v31, 25 -; CHECK-SDAG-NEXT: v_readlane_b32 s60, v31, 24 -; CHECK-SDAG-NEXT: v_readlane_b32 s59, v31, 23 -; CHECK-SDAG-NEXT: v_readlane_b32 s58, v31, 22 -; CHECK-SDAG-NEXT: v_readlane_b32 s57, v31, 21 -; CHECK-SDAG-NEXT: v_readlane_b32 s56, v31, 20 -; CHECK-SDAG-NEXT: v_readlane_b32 s55, v31, 19 -; CHECK-SDAG-NEXT: v_readlane_b32 s54, v31, 18 -; CHECK-SDAG-NEXT: v_readlane_b32 s53, v31, 17 -; CHECK-SDAG-NEXT: v_readlane_b32 s52, v31, 16 -; CHECK-SDAG-NEXT: v_readlane_b32 s51, v31, 15 -; CHECK-SDAG-NEXT: v_readlane_b32 s50, v31, 14 -; CHECK-SDAG-NEXT: v_readlane_b32 s49, v31, 13 -; CHECK-SDAG-NEXT: v_readlane_b32 s48, v31, 12 -; CHECK-SDAG-NEXT: v_readlane_b32 s47, v31, 11 -; CHECK-SDAG-NEXT: v_readlane_b32 s46, v31, 10 -; CHECK-SDAG-NEXT: v_readlane_b32 s45, v31, 9 -; CHECK-SDAG-NEXT: v_readlane_b32 s44, v31, 8 -; CHECK-SDAG-NEXT: v_readlane_b32 s43, v31, 7 -; CHECK-SDAG-NEXT: v_readlane_b32 s42, v31, 6 -; CHECK-SDAG-NEXT: v_readlane_b32 s41, v31, 5 -; CHECK-SDAG-NEXT: v_readlane_b32 s40, v31, 4 +; CHECK-SDAG-NEXT: v_readlane_b32 s67, v31, 15 +; CHECK-SDAG-NEXT: v_readlane_b32 s66, v31, 14 +; CHECK-SDAG-NEXT: v_readlane_b32 s65, v31, 13 +; CHECK-SDAG-NEXT: v_readlane_b32 s64, v31, 12 +; CHECK-SDAG-NEXT: v_readlane_b32 s55, v31, 11 +; CHECK-SDAG-NEXT: v_readlane_b32 s54, v31, 10 +; CHECK-SDAG-NEXT: v_readlane_b32 s53, v31, 9 +; CHECK-SDAG-NEXT: v_readlane_b32 s52, v31, 8 +; CHECK-SDAG-NEXT: v_readlane_b32 s51, v31, 7 +; CHECK-SDAG-NEXT: v_readlane_b32 s50, v31, 6 +; CHECK-SDAG-NEXT: v_readlane_b32 s49, v31, 5 +; CHECK-SDAG-NEXT: v_readlane_b32 s48, v31, 4 ; CHECK-SDAG-NEXT: v_readlane_b32 s39, v31, 3 ; CHECK-SDAG-NEXT: v_readlane_b32 s38, v31, 2 ; CHECK-SDAG-NEXT: v_readlane_b32 s37, v31, 1 @@ -940,45 +908,21 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) ; CHECK-GISEL-NEXT: v_writelane_b32 v31, s37, 1 ; CHECK-GISEL-NEXT: v_writelane_b32 v31, s38, 2 ; CHECK-GISEL-NEXT: v_writelane_b32 v31, s39, 3 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s40, 4 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s41, 5 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s42, 6 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s43, 7 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s44, 8 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s45, 9 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s46, 10 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s47, 11 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s48, 12 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s49, 13 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s50, 14 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s51, 15 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s52, 16 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s53, 17 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s54, 18 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s55, 19 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s56, 20 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s57, 21 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s58, 22 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s59, 23 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s60, 24 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s61, 25 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s62, 26 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s63, 27 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s64, 28 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s65, 29 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s66, 30 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s67, 31 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s48, 4 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s49, 5 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s50, 6 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s51, 7 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s52, 8 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s53, 9 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s54, 10 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s55, 11 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s64, 12 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s65, 13 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s66, 14 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s67, 15 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s37, v3 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s38, v4 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s39, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s40, v6 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s41, v7 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s42, v8 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s43, v9 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s44, v10 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s45, v11 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s46, v12 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s47, v13 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s48, v14 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s49, v15 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s50, v16 @@ -987,6 +931,15 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s53, v19 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s54, v20 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s55, v21 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s64, v30 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s40, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s41, v7 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s42, v8 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s43, v9 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s44, v10 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s45, v11 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s46, v12 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s47, v13 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s56, v22 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s57, v23 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s58, v24 @@ -995,7 +948,6 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s61, v27 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s62, v28 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s63, v29 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s64, v30 ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(2) ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s65, v0 ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1005,34 +957,18 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; use s[36:67] ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_readlane_b32 s67, v31, 31 -; CHECK-GISEL-NEXT: v_readlane_b32 s66, v31, 30 -; CHECK-GISEL-NEXT: v_readlane_b32 s65, v31, 29 -; CHECK-GISEL-NEXT: v_readlane_b32 s64, v31, 28 -; CHECK-GISEL-NEXT: v_readlane_b32 s63, v31, 27 -; CHECK-GISEL-NEXT: v_readlane_b32 s62, v31, 26 -; CHECK-GISEL-NEXT: v_readlane_b32 s61, v31, 25 -; CHECK-GISEL-NEXT: v_readlane_b32 s60, v31, 24 -; CHECK-GISEL-NEXT: v_readlane_b32 s59, v31, 23 -; CHECK-GISEL-NEXT: v_readlane_b32 s58, v31, 22 -; CHECK-GISEL-NEXT: v_readlane_b32 s57, v31, 21 -; CHECK-GISEL-NEXT: v_readlane_b32 s56, v31, 20 -; CHECK-GISEL-NEXT: v_readlane_b32 s55, v31, 19 -; CHECK-GISEL-NEXT: v_readlane_b32 s54, v31, 18 -; CHECK-GISEL-NEXT: v_readlane_b32 s53, v31, 17 -; CHECK-GISEL-NEXT: v_readlane_b32 s52, v31, 16 -; CHECK-GISEL-NEXT: v_readlane_b32 s51, v31, 15 -; CHECK-GISEL-NEXT: v_readlane_b32 s50, v31, 14 -; CHECK-GISEL-NEXT: v_readlane_b32 s49, v31, 13 -; CHECK-GISEL-NEXT: v_readlane_b32 s48, v31, 12 -; CHECK-GISEL-NEXT: v_readlane_b32 s47, v31, 11 -; CHECK-GISEL-NEXT: v_readlane_b32 s46, v31, 10 -; CHECK-GISEL-NEXT: v_readlane_b32 s45, v31, 9 -; CHECK-GISEL-NEXT: v_readlane_b32 s44, v31, 8 -; CHECK-GISEL-NEXT: v_readlane_b32 s43, v31, 7 -; CHECK-GISEL-NEXT: v_readlane_b32 s42, v31, 6 -; CHECK-GISEL-NEXT: v_readlane_b32 s41, v31, 5 -; CHECK-GISEL-NEXT: v_readlane_b32 s40, v31, 4 +; CHECK-GISEL-NEXT: v_readlane_b32 s67, v31, 15 +; CHECK-GISEL-NEXT: v_readlane_b32 s66, v31, 14 +; CHECK-GISEL-NEXT: v_readlane_b32 s65, v31, 13 +; CHECK-GISEL-NEXT: v_readlane_b32 s64, v31, 12 +; CHECK-GISEL-NEXT: v_readlane_b32 s55, v31, 11 +; CHECK-GISEL-NEXT: v_readlane_b32 s54, v31, 10 +; CHECK-GISEL-NEXT: v_readlane_b32 s53, v31, 9 +; CHECK-GISEL-NEXT: v_readlane_b32 s52, v31, 8 +; CHECK-GISEL-NEXT: v_readlane_b32 s51, v31, 7 +; CHECK-GISEL-NEXT: v_readlane_b32 s50, v31, 6 +; CHECK-GISEL-NEXT: v_readlane_b32 s49, v31, 5 +; CHECK-GISEL-NEXT: v_readlane_b32 s48, v31, 4 ; CHECK-GISEL-NEXT: v_readlane_b32 s39, v31, 3 ; CHECK-GISEL-NEXT: v_readlane_b32 s38, v31, 2 ; CHECK-GISEL-NEXT: v_readlane_b32 s37, v31, 1 @@ -1324,50 +1260,27 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) { ; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s38, 2 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s39, 3 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s40, 4 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s41, 5 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s42, 6 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s43, 7 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s44, 8 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s45, 9 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s46, 10 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s47, 11 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s48, 12 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s49, 13 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s50, 14 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s51, 15 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s52, 16 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s53, 17 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s54, 18 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s55, 19 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s56, 20 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s57, 21 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s58, 22 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s59, 23 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s60, 24 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s61, 25 ; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27 ; CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s62, 26 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s63, 27 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s64, 28 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s65, 29 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s66, 30 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s67, 31 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s38, 2 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s39, 3 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s48, 4 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s49, 5 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s50, 6 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s51, 7 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s52, 8 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s53, 9 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s54, 10 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s55, 11 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s64, 12 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s65, 13 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s66, 14 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s67, 15 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s64, v30 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s63, v29 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s62, v28 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s60, v26 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s59, v25 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s58, v24 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s57, v23 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s56, v22 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s55, v21 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s54, v20 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s53, v19 @@ -1376,6 +1289,17 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) { ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s50, v16 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s49, v15 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s48, v14 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s39, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s38, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s37, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s36, v2 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s63, v29 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s62, v28 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s60, v26 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s59, v25 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s58, v24 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s57, v23 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s56, v22 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s47, v13 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s46, v12 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s45, v11 @@ -1384,10 +1308,6 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) { ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s42, v8 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s41, v7 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s40, v6 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s39, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s38, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s37, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s36, v2 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(2) ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v0 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(1) @@ -1397,34 +1317,18 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) { ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s[36:67] ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_readlane_b32 s67, v31, 31 -; CHECK-SDAG-NEXT: v_readlane_b32 s66, v31, 30 -; CHECK-SDAG-NEXT: v_readlane_b32 s65, v31, 29 -; CHECK-SDAG-NEXT: v_readlane_b32 s64, v31, 28 -; CHECK-SDAG-NEXT: v_readlane_b32 s63, v31, 27 -; CHECK-SDAG-NEXT: v_readlane_b32 s62, v31, 26 -; CHECK-SDAG-NEXT: v_readlane_b32 s61, v31, 25 -; CHECK-SDAG-NEXT: v_readlane_b32 s60, v31, 24 -; CHECK-SDAG-NEXT: v_readlane_b32 s59, v31, 23 -; CHECK-SDAG-NEXT: v_readlane_b32 s58, v31, 22 -; CHECK-SDAG-NEXT: v_readlane_b32 s57, v31, 21 -; CHECK-SDAG-NEXT: v_readlane_b32 s56, v31, 20 -; CHECK-SDAG-NEXT: v_readlane_b32 s55, v31, 19 -; CHECK-SDAG-NEXT: v_readlane_b32 s54, v31, 18 -; CHECK-SDAG-NEXT: v_readlane_b32 s53, v31, 17 -; CHECK-SDAG-NEXT: v_readlane_b32 s52, v31, 16 -; CHECK-SDAG-NEXT: v_readlane_b32 s51, v31, 15 -; CHECK-SDAG-NEXT: v_readlane_b32 s50, v31, 14 -; CHECK-SDAG-NEXT: v_readlane_b32 s49, v31, 13 -; CHECK-SDAG-NEXT: v_readlane_b32 s48, v31, 12 -; CHECK-SDAG-NEXT: v_readlane_b32 s47, v31, 11 -; CHECK-SDAG-NEXT: v_readlane_b32 s46, v31, 10 -; CHECK-SDAG-NEXT: v_readlane_b32 s45, v31, 9 -; CHECK-SDAG-NEXT: v_readlane_b32 s44, v31, 8 -; CHECK-SDAG-NEXT: v_readlane_b32 s43, v31, 7 -; CHECK-SDAG-NEXT: v_readlane_b32 s42, v31, 6 -; CHECK-SDAG-NEXT: v_readlane_b32 s41, v31, 5 -; CHECK-SDAG-NEXT: v_readlane_b32 s40, v31, 4 +; CHECK-SDAG-NEXT: v_readlane_b32 s67, v31, 15 +; CHECK-SDAG-NEXT: v_readlane_b32 s66, v31, 14 +; CHECK-SDAG-NEXT: v_readlane_b32 s65, v31, 13 +; CHECK-SDAG-NEXT: v_readlane_b32 s64, v31, 12 +; CHECK-SDAG-NEXT: v_readlane_b32 s55, v31, 11 +; CHECK-SDAG-NEXT: v_readlane_b32 s54, v31, 10 +; CHECK-SDAG-NEXT: v_readlane_b32 s53, v31, 9 +; CHECK-SDAG-NEXT: v_readlane_b32 s52, v31, 8 +; CHECK-SDAG-NEXT: v_readlane_b32 s51, v31, 7 +; CHECK-SDAG-NEXT: v_readlane_b32 s50, v31, 6 +; CHECK-SDAG-NEXT: v_readlane_b32 s49, v31, 5 +; CHECK-SDAG-NEXT: v_readlane_b32 s48, v31, 4 ; CHECK-SDAG-NEXT: v_readlane_b32 s39, v31, 3 ; CHECK-SDAG-NEXT: v_readlane_b32 s38, v31, 2 ; CHECK-SDAG-NEXT: v_readlane_b32 s37, v31, 1 @@ -1449,45 +1353,21 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) { ; CHECK-GISEL-NEXT: v_writelane_b32 v31, s37, 1 ; CHECK-GISEL-NEXT: v_writelane_b32 v31, s38, 2 ; CHECK-GISEL-NEXT: v_writelane_b32 v31, s39, 3 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s40, 4 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s41, 5 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s42, 6 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s43, 7 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s44, 8 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s45, 9 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s46, 10 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s47, 11 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s48, 12 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s49, 13 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s50, 14 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s51, 15 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s52, 16 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s53, 17 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s54, 18 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s55, 19 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s56, 20 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s57, 21 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s58, 22 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s59, 23 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s60, 24 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s61, 25 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s62, 26 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s63, 27 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s64, 28 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s65, 29 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s66, 30 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s67, 31 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s48, 4 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s49, 5 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s50, 6 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s51, 7 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s52, 8 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s53, 9 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s54, 10 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s55, 11 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s64, 12 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s65, 13 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s66, 14 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s67, 15 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s37, v3 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s38, v4 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s39, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s40, v6 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s41, v7 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s42, v8 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s43, v9 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s44, v10 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s45, v11 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s46, v12 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s47, v13 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s48, v14 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s49, v15 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s50, v16 @@ -1496,6 +1376,15 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) { ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s53, v19 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s54, v20 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s55, v21 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s64, v30 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s40, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s41, v7 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s42, v8 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s43, v9 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s44, v10 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s45, v11 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s46, v12 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s47, v13 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s56, v22 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s57, v23 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s58, v24 @@ -1504,7 +1393,6 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) { ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s61, v27 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s62, v28 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s63, v29 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s64, v30 ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(2) ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s65, v0 ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1514,34 +1402,18 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) { ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; use s[36:67] ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_readlane_b32 s67, v31, 31 -; CHECK-GISEL-NEXT: v_readlane_b32 s66, v31, 30 -; CHECK-GISEL-NEXT: v_readlane_b32 s65, v31, 29 -; CHECK-GISEL-NEXT: v_readlane_b32 s64, v31, 28 -; CHECK-GISEL-NEXT: v_readlane_b32 s63, v31, 27 -; CHECK-GISEL-NEXT: v_readlane_b32 s62, v31, 26 -; CHECK-GISEL-NEXT: v_readlane_b32 s61, v31, 25 -; CHECK-GISEL-NEXT: v_readlane_b32 s60, v31, 24 -; CHECK-GISEL-NEXT: v_readlane_b32 s59, v31, 23 -; CHECK-GISEL-NEXT: v_readlane_b32 s58, v31, 22 -; CHECK-GISEL-NEXT: v_readlane_b32 s57, v31, 21 -; CHECK-GISEL-NEXT: v_readlane_b32 s56, v31, 20 -; CHECK-GISEL-NEXT: v_readlane_b32 s55, v31, 19 -; CHECK-GISEL-NEXT: v_readlane_b32 s54, v31, 18 -; CHECK-GISEL-NEXT: v_readlane_b32 s53, v31, 17 -; CHECK-GISEL-NEXT: v_readlane_b32 s52, v31, 16 -; CHECK-GISEL-NEXT: v_readlane_b32 s51, v31, 15 -; CHECK-GISEL-NEXT: v_readlane_b32 s50, v31, 14 -; CHECK-GISEL-NEXT: v_readlane_b32 s49, v31, 13 -; CHECK-GISEL-NEXT: v_readlane_b32 s48, v31, 12 -; CHECK-GISEL-NEXT: v_readlane_b32 s47, v31, 11 -; CHECK-GISEL-NEXT: v_readlane_b32 s46, v31, 10 -; CHECK-GISEL-NEXT: v_readlane_b32 s45, v31, 9 -; CHECK-GISEL-NEXT: v_readlane_b32 s44, v31, 8 -; CHECK-GISEL-NEXT: v_readlane_b32 s43, v31, 7 -; CHECK-GISEL-NEXT: v_readlane_b32 s42, v31, 6 -; CHECK-GISEL-NEXT: v_readlane_b32 s41, v31, 5 -; CHECK-GISEL-NEXT: v_readlane_b32 s40, v31, 4 +; CHECK-GISEL-NEXT: v_readlane_b32 s67, v31, 15 +; CHECK-GISEL-NEXT: v_readlane_b32 s66, v31, 14 +; CHECK-GISEL-NEXT: v_readlane_b32 s65, v31, 13 +; CHECK-GISEL-NEXT: v_readlane_b32 s64, v31, 12 +; CHECK-GISEL-NEXT: v_readlane_b32 s55, v31, 11 +; CHECK-GISEL-NEXT: v_readlane_b32 s54, v31, 10 +; CHECK-GISEL-NEXT: v_readlane_b32 s53, v31, 9 +; CHECK-GISEL-NEXT: v_readlane_b32 s52, v31, 8 +; CHECK-GISEL-NEXT: v_readlane_b32 s51, v31, 7 +; CHECK-GISEL-NEXT: v_readlane_b32 s50, v31, 6 +; CHECK-GISEL-NEXT: v_readlane_b32 s49, v31, 5 +; CHECK-GISEL-NEXT: v_readlane_b32 s48, v31, 4 ; CHECK-GISEL-NEXT: v_readlane_b32 s39, v31, 3 ; CHECK-GISEL-NEXT: v_readlane_b32 s38, v31, 2 ; CHECK-GISEL-NEXT: v_readlane_b32 s37, v31, 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll index 584dd2700c419..8b1ba393c8de8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll @@ -1727,14 +1727,9 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-LABEL: v_maximum_v16f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v17 ; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX7-NEXT: v_writelane_b32 v31, s30, 0 -; GFX7-NEXT: v_writelane_b32 v31, s31, 1 ; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v18 ; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 @@ -1743,7 +1738,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: v_max_f32_e32 v19, v0, v16 ; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 ; GFX7-NEXT: v_max_f32_e32 v16, v14, v30 -; GFX7-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX7-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 ; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1765,7 +1760,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 ; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[40:41] ; GFX7-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] ; GFX7-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] @@ -1779,29 +1774,18 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] ; GFX7-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] ; GFX7-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] -; GFX7-NEXT: v_readlane_b32 s31, v31, 1 -; GFX7-NEXT: v_readlane_b32 s30, v31, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_max_f32_e32 v16, v15, v17 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 ; GFX7-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v16f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v17 ; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX8-NEXT: v_writelane_b32 v31, s30, 0 -; GFX8-NEXT: v_writelane_b32 v31, s31, 1 ; GFX8-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v18 ; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 @@ -1810,7 +1794,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: v_max_f32_e32 v19, v0, v16 ; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 ; GFX8-NEXT: v_max_f32_e32 v16, v14, v30 -; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX8-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 ; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1832,7 +1816,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 ; GFX8-NEXT: v_max_f32_e32 v13, v13, v29 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[40:41] ; GFX8-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] @@ -1846,29 +1830,18 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] ; GFX8-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] ; GFX8-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] -; GFX8-NEXT: v_readlane_b32 s31, v31, 1 -; GFX8-NEXT: v_readlane_b32 s30, v31, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f32_e32 v16, v15, v17 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v16f32: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] ; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX900-NEXT: v_max_f32_e32 v1, v1, v17 ; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX900-NEXT: v_writelane_b32 v31, s30, 0 -; GFX900-NEXT: v_writelane_b32 v31, s31, 1 ; GFX900-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX900-NEXT: v_max_f32_e32 v2, v2, v18 ; GFX900-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 @@ -1877,7 +1850,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: v_max_f32_e32 v19, v0, v16 ; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 ; GFX900-NEXT: v_max_f32_e32 v16, v14, v30 -; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX900-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 ; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX900-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1899,7 +1872,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 ; GFX900-NEXT: v_max_f32_e32 v13, v13, v29 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc -; GFX900-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[40:41] ; GFX900-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] ; GFX900-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX900-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] @@ -1913,16 +1886,10 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] ; GFX900-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] ; GFX900-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] -; GFX900-NEXT: v_readlane_b32 s31, v31, 1 -; GFX900-NEXT: v_readlane_b32 s30, v31, 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_max_f32_e32 v16, v15, v17 ; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 ; GFX900-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_maximum_v16f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll index e354ec6fb3dd7..3344c73f9eb6f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll @@ -2008,15 +2008,8 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX7-LABEL: v_maximum_v16f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_writelane_b32 v34, s30, 0 -; GFX7-NEXT: v_writelane_b32 v34, s31, 1 -; GFX7-NEXT: v_writelane_b32 v34, s34, 2 -; GFX7-NEXT: v_writelane_b32 v34, s35, 3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] ; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[31:32] @@ -2102,14 +2095,14 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; GFX7-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX7-NEXT: v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32] ; GFX7-NEXT: v_max_f64 v[28:29], v[28:29], v[31:32] ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX7-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX7-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[40:41] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX7-NEXT: v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33] ; GFX7-NEXT: v_max_f64 v[30:31], v[30:31], v[32:33] ; GFX7-NEXT: v_mov_b32_e32 v32, 0x7ff80000 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc @@ -2126,31 +2119,16 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX7-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] ; GFX7-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] ; GFX7-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] -; GFX7-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] -; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] -; GFX7-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] -; GFX7-NEXT: v_readlane_b32 s35, v34, 3 -; GFX7-NEXT: v_readlane_b32 s34, v34, 2 -; GFX7-NEXT: v_readlane_b32 s31, v34, 1 -; GFX7-NEXT: v_readlane_b32 s30, v34, 0 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[40:41] +; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[42:43] +; GFX7-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[42:43] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v16f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX8-NEXT: v_writelane_b32 v34, s30, 0 -; GFX8-NEXT: v_writelane_b32 v34, s31, 1 -; GFX8-NEXT: v_writelane_b32 v34, s34, 2 -; GFX8-NEXT: v_writelane_b32 v34, s35, 3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[31:32] @@ -2236,14 +2214,14 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX8-NEXT: v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32] ; GFX8-NEXT: v_max_f64 v[28:29], v[28:29], v[31:32] ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX8-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[40:41] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX8-NEXT: v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33] ; GFX8-NEXT: v_max_f64 v[30:31], v[30:31], v[32:33] ; GFX8-NEXT: v_mov_b32_e32 v32, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc @@ -2260,31 +2238,16 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX8-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] ; GFX8-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] ; GFX8-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] -; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] -; GFX8-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] -; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] -; GFX8-NEXT: v_readlane_b32 s35, v34, 3 -; GFX8-NEXT: v_readlane_b32 s34, v34, 2 -; GFX8-NEXT: v_readlane_b32 s31, v34, 1 -; GFX8-NEXT: v_readlane_b32 s30, v34, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[40:41] +; GFX8-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[42:43] +; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[42:43] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v16f64: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] ; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX900-NEXT: v_writelane_b32 v34, s30, 0 -; GFX900-NEXT: v_writelane_b32 v34, s31, 1 -; GFX900-NEXT: v_writelane_b32 v34, s34, 2 -; GFX900-NEXT: v_writelane_b32 v34, s35, 3 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] ; GFX900-NEXT: v_max_f64 v[0:1], v[0:1], v[31:32] @@ -2370,14 +2333,14 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; GFX900-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX900-NEXT: v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32] ; GFX900-NEXT: v_max_f64 v[28:29], v[28:29], v[31:32] ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[40:41] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX900-NEXT: v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33] ; GFX900-NEXT: v_max_f64 v[30:31], v[30:31], v[32:33] ; GFX900-NEXT: v_mov_b32_e32 v32, 0x7ff80000 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc @@ -2394,17 +2357,9 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX900-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] ; GFX900-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] ; GFX900-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] -; GFX900-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] -; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] -; GFX900-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] -; GFX900-NEXT: v_readlane_b32 s35, v34, 3 -; GFX900-NEXT: v_readlane_b32 s34, v34, 2 -; GFX900-NEXT: v_readlane_b32 s31, v34, 1 -; GFX900-NEXT: v_readlane_b32 s30, v34, 0 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[40:41] +; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[42:43] +; GFX900-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[42:43] ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_maximum_v16f64: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll index 9962433134073..7b2998cbd242f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll @@ -1727,14 +1727,9 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-LABEL: v_minimum_v16f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v17 ; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX7-NEXT: v_writelane_b32 v31, s30, 0 -; GFX7-NEXT: v_writelane_b32 v31, s31, 1 ; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v18 ; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 @@ -1743,7 +1738,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: v_min_f32_e32 v19, v0, v16 ; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 ; GFX7-NEXT: v_min_f32_e32 v16, v14, v30 -; GFX7-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX7-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 ; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1765,7 +1760,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 ; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[40:41] ; GFX7-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] ; GFX7-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] @@ -1779,29 +1774,18 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] ; GFX7-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] ; GFX7-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] -; GFX7-NEXT: v_readlane_b32 s31, v31, 1 -; GFX7-NEXT: v_readlane_b32 s30, v31, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_min_f32_e32 v16, v15, v17 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 ; GFX7-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v16f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v17 ; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX8-NEXT: v_writelane_b32 v31, s30, 0 -; GFX8-NEXT: v_writelane_b32 v31, s31, 1 ; GFX8-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v18 ; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 @@ -1810,7 +1794,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: v_min_f32_e32 v19, v0, v16 ; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 ; GFX8-NEXT: v_min_f32_e32 v16, v14, v30 -; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX8-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 ; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1832,7 +1816,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 ; GFX8-NEXT: v_min_f32_e32 v13, v13, v29 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[40:41] ; GFX8-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] @@ -1846,29 +1830,18 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] ; GFX8-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] ; GFX8-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] -; GFX8-NEXT: v_readlane_b32 s31, v31, 1 -; GFX8-NEXT: v_readlane_b32 s30, v31, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_min_f32_e32 v16, v15, v17 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v16f32: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] ; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX900-NEXT: v_min_f32_e32 v1, v1, v17 ; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX900-NEXT: v_writelane_b32 v31, s30, 0 -; GFX900-NEXT: v_writelane_b32 v31, s31, 1 ; GFX900-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX900-NEXT: v_min_f32_e32 v2, v2, v18 ; GFX900-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 @@ -1877,7 +1850,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: v_min_f32_e32 v19, v0, v16 ; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 ; GFX900-NEXT: v_min_f32_e32 v16, v14, v30 -; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX900-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 ; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX900-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1899,7 +1872,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 ; GFX900-NEXT: v_min_f32_e32 v13, v13, v29 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc -; GFX900-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[40:41] ; GFX900-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] ; GFX900-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX900-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] @@ -1913,16 +1886,10 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] ; GFX900-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] ; GFX900-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] -; GFX900-NEXT: v_readlane_b32 s31, v31, 1 -; GFX900-NEXT: v_readlane_b32 s30, v31, 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_min_f32_e32 v16, v15, v17 ; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 ; GFX900-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_minimum_v16f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll index 71fdd691a1512..1d1673315f6ff 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll @@ -2008,15 +2008,8 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX7-LABEL: v_minimum_v16f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_writelane_b32 v34, s30, 0 -; GFX7-NEXT: v_writelane_b32 v34, s31, 1 -; GFX7-NEXT: v_writelane_b32 v34, s34, 2 -; GFX7-NEXT: v_writelane_b32 v34, s35, 3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] ; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[31:32] @@ -2102,14 +2095,14 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; GFX7-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX7-NEXT: v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32] ; GFX7-NEXT: v_min_f64 v[28:29], v[28:29], v[31:32] ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX7-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX7-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[40:41] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX7-NEXT: v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33] ; GFX7-NEXT: v_min_f64 v[30:31], v[30:31], v[32:33] ; GFX7-NEXT: v_mov_b32_e32 v32, 0x7ff80000 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc @@ -2126,31 +2119,16 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX7-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] ; GFX7-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] ; GFX7-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] -; GFX7-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] -; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] -; GFX7-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] -; GFX7-NEXT: v_readlane_b32 s35, v34, 3 -; GFX7-NEXT: v_readlane_b32 s34, v34, 2 -; GFX7-NEXT: v_readlane_b32 s31, v34, 1 -; GFX7-NEXT: v_readlane_b32 s30, v34, 0 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[40:41] +; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[42:43] +; GFX7-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[42:43] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v16f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX8-NEXT: v_writelane_b32 v34, s30, 0 -; GFX8-NEXT: v_writelane_b32 v34, s31, 1 -; GFX8-NEXT: v_writelane_b32 v34, s34, 2 -; GFX8-NEXT: v_writelane_b32 v34, s35, 3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] ; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[31:32] @@ -2236,14 +2214,14 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX8-NEXT: v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32] ; GFX8-NEXT: v_min_f64 v[28:29], v[28:29], v[31:32] ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX8-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[40:41] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX8-NEXT: v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33] ; GFX8-NEXT: v_min_f64 v[30:31], v[30:31], v[32:33] ; GFX8-NEXT: v_mov_b32_e32 v32, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc @@ -2260,31 +2238,16 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX8-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] ; GFX8-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] ; GFX8-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] -; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] -; GFX8-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] -; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] -; GFX8-NEXT: v_readlane_b32 s35, v34, 3 -; GFX8-NEXT: v_readlane_b32 s34, v34, 2 -; GFX8-NEXT: v_readlane_b32 s31, v34, 1 -; GFX8-NEXT: v_readlane_b32 s30, v34, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[40:41] +; GFX8-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[42:43] +; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[42:43] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v16f64: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] ; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX900-NEXT: v_writelane_b32 v34, s30, 0 -; GFX900-NEXT: v_writelane_b32 v34, s31, 1 -; GFX900-NEXT: v_writelane_b32 v34, s34, 2 -; GFX900-NEXT: v_writelane_b32 v34, s35, 3 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] ; GFX900-NEXT: v_min_f64 v[0:1], v[0:1], v[31:32] @@ -2370,14 +2333,14 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; GFX900-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX900-NEXT: v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32] ; GFX900-NEXT: v_min_f64 v[28:29], v[28:29], v[31:32] ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[40:41] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX900-NEXT: v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33] ; GFX900-NEXT: v_min_f64 v[30:31], v[30:31], v[32:33] ; GFX900-NEXT: v_mov_b32_e32 v32, 0x7ff80000 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc @@ -2394,17 +2357,9 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX900-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] ; GFX900-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] ; GFX900-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] -; GFX900-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] -; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] -; GFX900-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] -; GFX900-NEXT: v_readlane_b32 s35, v34, 3 -; GFX900-NEXT: v_readlane_b32 s34, v34, 2 -; GFX900-NEXT: v_readlane_b32 s31, v34, 1 -; GFX900-NEXT: v_readlane_b32 s30, v34, 0 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[40:41] +; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[42:43] +; GFX900-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[42:43] ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_minimum_v16f64: diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index acb706cee04d0..4fb6a0114b499 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -43,28 +43,28 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_addc_u32 s13, s13, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; CHECK-NEXT: s_load_dwordx8 s[48:55], s[8:9], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[64:71], s[8:9], 0x0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v40, v0 -; CHECK-NEXT: s_add_u32 s44, s34, 40 +; CHECK-NEXT: s_add_u32 s52, s34, 40 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_mov_b32 s33, s16 -; CHECK-NEXT: s_addc_u32 s45, s35, 0 -; CHECK-NEXT: s_mov_b32 s43, s14 +; CHECK-NEXT: s_addc_u32 s53, s35, 0 +; CHECK-NEXT: s_mov_b32 s51, s14 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z13get_global_idj@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b32 s12, s14 ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_mov_b32 s42, s15 +; CHECK-NEXT: s_mov_b32 s50, s15 ; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v45, 0 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v43, v0 @@ -73,12 +73,12 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z12get_local_idj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z12get_local_idj@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v41, v0 @@ -87,12 +87,12 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: ds_write_b32 v45, v45 offset:15360 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -102,22 +102,22 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z3minjj@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0 ; CHECK-NEXT: v_and_b32_e32 v1, 28, v1 ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: global_load_dword v0, v0, s[52:53] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: global_load_dword v0, v0, s[68:69] +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4 ; CHECK-NEXT: v_mov_b32_e32 v1, 12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v42, v0 -; CHECK-NEXT: s_mov_b32 s44, exec_lo +; CHECK-NEXT: s_mov_b32 s52, exec_lo ; CHECK-NEXT: v_cmpx_ne_u32_e32 0, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_25 ; CHECK-NEXT: ; %bb.1: ; %.preheader5 @@ -136,7 +136,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42 -; CHECK-NEXT: s_mov_b32 s45, 0 +; CHECK-NEXT: s_mov_b32 s53, 0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45 ; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB0_25 @@ -144,46 +144,46 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43 ; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0 ; CHECK-NEXT: v_mov_b32_e32 v47, 0 -; CHECK-NEXT: s_mov_b32 s47, 0 +; CHECK-NEXT: s_mov_b32 s55, 0 ; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB0_8 Depth 2 ; CHECK-NEXT: ; Child Loop BB0_20 Depth 2 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s47, v44 -; CHECK-NEXT: s_lshl_b32 s4, s47, 5 -; CHECK-NEXT: s_add_i32 s46, s47, 1 -; CHECK-NEXT: s_add_i32 s5, s47, 5 -; CHECK-NEXT: v_or3_b32 v57, s4, v43, s46 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s55, v44 +; CHECK-NEXT: s_lshl_b32 s4, s55, 5 +; CHECK-NEXT: s_add_i32 s54, s55, 1 +; CHECK-NEXT: s_add_i32 s5, s55, 5 +; CHECK-NEXT: v_or3_b32 v57, s4, v43, s54 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: ds_read_u8 v56, v0 -; CHECK-NEXT: v_mov_b32_e32 v58, s46 -; CHECK-NEXT: s_mov_b32 s52, exec_lo +; CHECK-NEXT: v_mov_b32_e32 v58, s54 +; CHECK-NEXT: s_mov_b32 s68, exec_lo ; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_17 ; CHECK-NEXT: ; %bb.6: ; %.preheader2 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_mov_b32 s53, 0 -; CHECK-NEXT: s_mov_b32 s56, 0 +; CHECK-NEXT: s_mov_b32 s69, 0 +; CHECK-NEXT: s_mov_b32 s80, 0 ; CHECK-NEXT: s_branch .LBB0_8 ; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57 -; CHECK-NEXT: s_add_i32 s56, s56, 4 -; CHECK-NEXT: s_add_i32 s4, s47, s56 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s56, v57 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 +; CHECK-NEXT: s_add_i32 s80, s80, 4 +; CHECK-NEXT: s_add_i32 s4, s55, s80 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s80, v57 ; CHECK-NEXT: s_add_i32 s5, s4, 5 ; CHECK-NEXT: s_add_i32 s4, s4, 1 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42 ; CHECK-NEXT: v_mov_b32_e32 v58, s4 -; CHECK-NEXT: s_or_b32 s53, vcc_lo, s53 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53 +; CHECK-NEXT: s_or_b32 s69, vcc_lo, s69 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s69 ; CHECK-NEXT: s_cbranch_execz .LBB0_16 ; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_add_nc_u32_e32 v59, s56, v46 -; CHECK-NEXT: v_add_nc_u32_e32 v58, s56, v57 +; CHECK-NEXT: v_add_nc_u32_e32 v59, s80, v46 +; CHECK-NEXT: v_add_nc_u32_e32 v58, s80, v57 ; CHECK-NEXT: ds_read_u8 v0, v59 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s57, s4 +; CHECK-NEXT: s_and_saveexec_b32 s81, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -193,22 +193,22 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v58 ; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s57, s4 +; CHECK-NEXT: s_and_saveexec_b32 s81, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_12 ; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -218,11 +218,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v60, 1, v58 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 @@ -230,11 +230,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v60 ; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s57, s4 +; CHECK-NEXT: s_and_saveexec_b32 s81, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_14 ; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -244,11 +244,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v60, 2, v58 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 @@ -256,11 +256,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v60 ; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s57, s4 +; CHECK-NEXT: s_and_saveexec_b32 s81, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_7 ; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -270,11 +270,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v58 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 @@ -284,27 +284,27 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_branch .LBB0_7 ; CHECK-NEXT: .LBB0_16: ; %Flow45 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s69 ; CHECK-NEXT: v_mov_b32_e32 v57, v0 ; CHECK-NEXT: .LBB0_17: ; %Flow46 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 -; CHECK-NEXT: s_mov_b32 s47, exec_lo +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68 +; CHECK-NEXT: s_mov_b32 s55, exec_lo ; CHECK-NEXT: v_cmpx_lt_u32_e64 v58, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_23 ; CHECK-NEXT: ; %bb.18: ; %.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_mov_b32 s52, 0 +; CHECK-NEXT: s_mov_b32 s68, 0 ; CHECK-NEXT: s_inst_prefetch 0x1 ; CHECK-NEXT: s_branch .LBB0_20 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_19: ; in Loop: Header=BB0_20 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s69 ; CHECK-NEXT: v_add_nc_u32_e32 v58, 1, v58 ; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v58, v42 -; CHECK-NEXT: s_or_b32 s52, vcc_lo, s52 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52 +; CHECK-NEXT: s_or_b32 s68, vcc_lo, s68 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s68 ; CHECK-NEXT: s_cbranch_execz .LBB0_22 ; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 @@ -312,7 +312,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: ds_read_u8 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s53, s4 +; CHECK-NEXT: s_and_saveexec_b32 s69, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_19 ; CHECK-NEXT: ; %bb.21: ; in Loop: Header=BB0_20 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -322,11 +322,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -336,22 +336,22 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: .LBB0_22: ; %Flow43 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: s_inst_prefetch 0x2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68 ; CHECK-NEXT: .LBB0_23: ; %Flow44 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s47 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s46, v45 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s54, v45 ; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47 ; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46 -; CHECK-NEXT: s_mov_b32 s47, s46 +; CHECK-NEXT: s_mov_b32 s55, s54 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 -; CHECK-NEXT: s_or_b32 s45, s4, s45 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s45 +; CHECK-NEXT: s_or_b32 s53, s4, s53 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53 ; CHECK-NEXT: s_cbranch_execnz .LBB0_5 ; CHECK-NEXT: .LBB0_25: ; %Flow51 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s44 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_add_u32 s8, s34, 40 @@ -359,11 +359,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 @@ -373,10 +373,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_cmpx_gt_u32_e64 v47, v41 ; CHECK-NEXT: s_cbranch_execz .LBB0_33 ; CHECK-NEXT: ; %bb.26: -; CHECK-NEXT: s_mov_b32 s44, 0 +; CHECK-NEXT: s_mov_b32 s52, 0 ; CHECK-NEXT: s_branch .LBB0_28 ; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s45 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_add_u32 s8, s34, 40 @@ -384,21 +384,21 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z14get_local_sizej@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z14get_local_sizej@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41 -; CHECK-NEXT: s_or_b32 s44, vcc_lo, s44 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; CHECK-NEXT: s_or_b32 s52, vcc_lo, s52 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52 ; CHECK-NEXT: s_cbranch_execz .LBB0_33 ; CHECK-NEXT: .LBB0_28: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v41 -; CHECK-NEXT: s_mov_b32 s45, exec_lo +; CHECK-NEXT: s_mov_b32 s53, exec_lo ; CHECK-NEXT: ds_read_b32 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_lshrrev_b32_e32 v63, 10, v0 @@ -407,8 +407,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x180, v63 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v62 ; CHECK-NEXT: v_lshlrev_b32_e32 v4, 5, v72 -; CHECK-NEXT: v_add_co_u32 v2, s4, s48, v1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s49, 0, s4 +; CHECK-NEXT: v_add_co_u32 v2, s4, s64, v1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s65, 0, s4 ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -445,8 +445,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_or3_b32 v2, v3, v2, v4 ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: v_or3_b32 v73, v2, v0, v1 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v73 @@ -454,11 +454,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffc, v0 ; CHECK-NEXT: v_lshlrev_b32_e64 v44, v1, 1 ; CHECK-NEXT: v_and_b32_e32 v74, 28, v1 -; CHECK-NEXT: v_add_co_u32 v42, s4, s54, v0 -; CHECK-NEXT: v_add_co_ci_u32_e64 v43, null, s55, 0, s4 +; CHECK-NEXT: v_add_co_u32 v42, s4, s70, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v43, null, s71, 0, s4 ; CHECK-NEXT: v_mov_b32_e32 v2, v44 ; CHECK-NEXT: v_mov_b32_e32 v0, v42 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_mov_b32_e32 v1, v43 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_bfe_u32 v0, v0, v74, 4 @@ -469,7 +469,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_28 Depth=1 ; CHECK-NEXT: v_xor_b32_e32 v4, v60, v58 ; CHECK-NEXT: v_lshrrev_b64 v[2:3], 16, v[56:57] -; CHECK-NEXT: v_mad_u64_u32 v[6:7], null, 0x180, v73, s[50:51] +; CHECK-NEXT: v_mad_u64_u32 v[6:7], null, 0x180, v73, s[66:67] ; CHECK-NEXT: v_lshlrev_b32_e32 v10, 5, v0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; CHECK-NEXT: v_lshlrev_b32_e32 v8, 6, v72 @@ -503,11 +503,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_subPU3AS1Vjj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_subPU3AS1Vjj@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_branch .LBB0_27 @@ -792,28 +792,28 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_addc_u32 s13, s13, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; CHECK-NEXT: s_load_dwordx2 s[46:47], s[8:9], 0x10 +; CHECK-NEXT: s_load_dwordx2 s[54:55], s[8:9], 0x10 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v40, v0 -; CHECK-NEXT: s_add_u32 s44, s38, 40 +; CHECK-NEXT: s_add_u32 s52, s38, 40 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_mov_b32 s33, s16 -; CHECK-NEXT: s_addc_u32 s45, s39, 0 -; CHECK-NEXT: s_mov_b32 s43, s14 +; CHECK-NEXT: s_addc_u32 s53, s39, 0 +; CHECK-NEXT: s_mov_b32 s51, s14 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z13get_global_idj@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b32 s12, s14 ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_mov_b32 s42, s15 +; CHECK-NEXT: s_mov_b32 s50, s15 ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7] -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v43, 0 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v42, v0 @@ -822,12 +822,12 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z12get_local_idj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z12get_local_idj@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] -; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mul_lo_u32 v46, v0, 14 @@ -836,12 +836,12 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] -; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: ds_write_b32 v43, v43 offset:15360 ; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v46 @@ -852,15 +852,15 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z3minjj@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0 ; CHECK-NEXT: v_and_b32_e32 v1, 28, v1 ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] -; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: global_load_dword v0, v0, s[46:47] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: global_load_dword v0, v0, s[54:55] +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4 @@ -868,7 +868,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v41, v0 ; CHECK-NEXT: v_lshlrev_b32_e32 v42, 10, v42 -; CHECK-NEXT: s_mov_b32 s44, 0 +; CHECK-NEXT: s_mov_b32 s52, 0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: ds_write_b8 v46, v43 offset:15364 ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v41 @@ -878,12 +878,12 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: ; Child Loop BB1_8 Depth 2 ; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44 ; CHECK-NEXT: s_lshl_b32 s5, s4, 5 -; CHECK-NEXT: s_add_i32 s45, s4, 1 +; CHECK-NEXT: s_add_i32 s53, s4, 1 ; CHECK-NEXT: s_add_i32 s6, s4, 5 -; CHECK-NEXT: v_or3_b32 v47, s5, v42, s45 +; CHECK-NEXT: v_or3_b32 v47, s5, v42, s53 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: ds_read_u8 v46, v0 -; CHECK-NEXT: v_mov_b32_e32 v56, s45 +; CHECK-NEXT: v_mov_b32_e32 v56, s53 ; CHECK-NEXT: s_mov_b32 s5, exec_lo ; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41 ; CHECK-NEXT: s_cbranch_execz .LBB1_5 @@ -912,23 +912,23 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: .LBB1_5: ; %Flow4 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; CHECK-NEXT: s_mov_b32 s46, exec_lo +; CHECK-NEXT: s_mov_b32 s54, exec_lo ; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41 ; CHECK-NEXT: s_cbranch_execz .LBB1_11 ; CHECK-NEXT: ; %bb.6: ; %.103.preheader ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: s_mov_b32 s47, 0 +; CHECK-NEXT: s_mov_b32 s55, 0 ; CHECK-NEXT: s_inst_prefetch 0x1 ; CHECK-NEXT: s_branch .LBB1_8 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB1_7: ; %.114 ; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s64 ; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41 -; CHECK-NEXT: s_or_b32 s47, vcc_lo, s47 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s47 +; CHECK-NEXT: s_or_b32 s55, vcc_lo, s55 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: s_cbranch_execz .LBB1_10 ; CHECK-NEXT: .LBB1_8: ; %.103 ; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1 @@ -937,7 +937,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: ds_read_u8 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s48, s4 +; CHECK-NEXT: s_and_saveexec_b32 s64, s4 ; CHECK-NEXT: s_cbranch_execz .LBB1_7 ; CHECK-NEXT: ; %bb.9: ; %.110 ; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2 @@ -948,11 +948,11 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -962,22 +962,22 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: .LBB1_10: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_inst_prefetch 0x2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s47 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: .LBB1_11: ; %Flow2 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s46 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s54 ; CHECK-NEXT: ; %bb.12: ; %.32 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s45, v45 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v45 ; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 -; CHECK-NEXT: s_or_b32 s44, s4, s44 -; CHECK-NEXT: s_mov_b32 s4, s45 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; CHECK-NEXT: s_or_b32 s52, s4, s52 +; CHECK-NEXT: s_mov_b32 s4, s53 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52 ; CHECK-NEXT: s_cbranch_execnz .LBB1_1 ; CHECK-NEXT: ; %bb.13: ; %.119 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s44 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_add_u32 s8, s38, 40 @@ -985,11 +985,11 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll index b4682dfb8a26d..4ca00f2daf97a 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll @@ -12,13 +12,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: ; GFX10_1: ; %bb.0: ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880 -; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 -; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_1-NEXT: ;;#ASMSTART @@ -30,24 +24,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880 -; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: ; GFX10_3: ; %bb.0: ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880 -; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 -; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_3-NEXT: ;;#ASMSTART @@ -59,23 +41,13 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880 -; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x4044 -; GFX11-NEXT: scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 -; GFX11-NEXT: v_writelane_b32 v1, s59, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo ; GFX11-NEXT: s_addc_u32 s0, s32, 0x4040 @@ -89,12 +61,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v1, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x4044 -; GFX11-NEXT: scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: @@ -104,13 +70,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo -; GFX12-NEXT: v_writelane_b32 v1, s59, 0 ; GFX12-NEXT: s_add_co_ci_u32 s0, s32, 0x4000 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -124,50 +84,30 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s59, v1, 0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x101100 -; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0 -; GFX8-NEXT: v_writelane_b32 v1, s59, 0 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use alloca0 v0 ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX8-NEXT: s_movk_i32 s59, 0x4040 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s59, v0 -; GFX8-NEXT: v_readfirstlane_b32 s59, v0 ; GFX8-NEXT: s_and_b64 s[4:5], 0, exec +; GFX8-NEXT: v_readfirstlane_b32 s59, v0 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v1, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x101100 -; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x101100 -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: ;;#ASMSTART @@ -175,47 +115,29 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0 -; GFX900-NEXT: v_writelane_b32 v1, s59, 0 -; GFX900-NEXT: v_readfirstlane_b32 s59, v0 ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec +; GFX900-NEXT: v_readfirstlane_b32 s59, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v1, 0 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x101100 -; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX942-NEXT: s_add_i32 s2, s32, 0x4044 -; GFX942-NEXT: scratch_store_dword off, v1, s2 ; 4-byte Folded Spill -; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: s_add_i32 s0, s32, 64 ; GFX942-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NEXT: s_and_b64 s[0:1], 0, exec ; GFX942-NEXT: s_addc_u32 s0, s32, 0x4040 ; GFX942-NEXT: s_bitcmp1_b32 s0, 0 ; GFX942-NEXT: s_bitset0_b32 s0, 0 -; GFX942-NEXT: v_writelane_b32 v1, s59, 0 -; GFX942-NEXT: s_mov_b32 s59, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use alloca0 v0 ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s59, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s59, scc ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_readlane_b32 s59, v1, 0 -; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX942-NEXT: s_add_i32 s2, s32, 0x4044 -; GFX942-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload -; GFX942-NEXT: s_mov_b64 exec, s[0:1] -; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) %alloca1 = alloca i32, align 4, addrspace(5) @@ -230,12 +152,6 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 { ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_dead_scc: ; GFX10_1: ; %bb.0: ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880 -; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_lshr_b32 s59, s32, 5 ; GFX10_1-NEXT: s_addk_i32 s59, 0x4040 @@ -246,23 +162,11 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 { ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59 ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880 -; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_dead_scc: ; GFX10_3: ; %bb.0: ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880 -; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_lshr_b32 s59, s32, 5 ; GFX10_3-NEXT: s_addk_i32 s59, 0x4040 @@ -273,22 +177,11 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 { ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59 ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880 -; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_dead_scc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x4044 -; GFX11-NEXT: scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v1, s59, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 @@ -300,12 +193,6 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 { ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v1, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x4044 -; GFX11-NEXT: scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_dead_scc: @@ -315,105 +202,62 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: v_writelane_b32 v1, s59, 0 ; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 s59, s0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s59, s0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s59, v1, 0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_dead_scc: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x101100 -; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v1, s59, 0 -; GFX8-NEXT: s_lshr_b32 s59, s32, 6 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; GFX8-NEXT: s_addk_i32 s59, 0x4040 +; GFX8-NEXT: s_lshr_b32 s59, s32, 6 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use alloca0 v0 ; GFX8-NEXT: ;;#ASMEND +; GFX8-NEXT: s_addk_i32 s59, 0x4040 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59 ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v1, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x101100 -; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_dead_scc: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x101100 -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_writelane_b32 v1, s59, 0 -; GFX900-NEXT: s_lshr_b32 s59, s32, 6 ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; GFX900-NEXT: s_addk_i32 s59, 0x4040 +; GFX900-NEXT: s_lshr_b32 s59, s32, 6 ; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use alloca0 v0 ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_addk_i32 s59, 0x4040 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59 ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v1, 0 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x101100 -; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: scalar_mov_materializes_frame_index_dead_scc: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX942-NEXT: s_add_i32 s2, s32, 0x4044 -; GFX942-NEXT: scratch_store_dword off, v1, s2 ; 4-byte Folded Spill -; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: s_add_i32 s0, s32, 64 ; GFX942-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NEXT: s_add_i32 s0, s32, 0x4040 -; GFX942-NEXT: v_writelane_b32 v1, s59, 0 -; GFX942-NEXT: s_mov_b32 s59, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use alloca0 v0 ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s59, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s59 ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_readlane_b32 s59, v1, 0 -; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX942-NEXT: s_add_i32 s2, s32, 0x4044 -; GFX942-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload -; GFX942-NEXT: s_mov_b64 exec, s[0:1] -; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) %alloca1 = alloca i32, align 4, addrspace(5) @@ -428,14 +272,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_1-NEXT: s_mov_b32 s5, s33 ; GFX10_1-NEXT: s_mov_b32 s33, s32 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80880 -; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000 +; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s33 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: s_mov_b32 s32, s33 ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 @@ -443,19 +281,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_1-NEXT: ; use alloca0 v0 ; GFX10_1-NEXT: ;;#ASMEND ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10_1-NEXT: s_mov_b32 s33, s5 ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0 ; GFX10_1-NEXT: v_readfirstlane_b32 s59, v0 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80880 -; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_mov_b32 s33, s5 -; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp: @@ -463,13 +294,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_3-NEXT: s_mov_b32 s5, s33 ; GFX10_3-NEXT: s_mov_b32 s33, s32 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80880 -; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000 +; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s33 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: s_mov_b32 s32, s33 ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 @@ -477,18 +303,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_3-NEXT: ; use alloca0 v0 ; GFX10_3-NEXT: ;;#ASMEND ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10_3-NEXT: s_mov_b32 s33, s5 ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0 ; GFX10_3-NEXT: v_readfirstlane_b32 s59, v0 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80880 -; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_mov_b32 s33, s5 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp: @@ -496,13 +316,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s1, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s2, s33, 0x4044 -; GFX11-NEXT: scratch_store_b32 off, v1, s2 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0x4080 ; GFX11-NEXT: s_add_i32 s0, s33, 64 -; GFX11-NEXT: v_writelane_b32 v1, s59, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo ; GFX11-NEXT: s_addc_u32 s0, s33, 0x4040 @@ -511,18 +327,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_bitcmp1_b32 s0, 0 ; GFX11-NEXT: s_bitset0_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s32, s33 +; GFX11-NEXT: s_mov_b32 s33, s1 ; GFX11-NEXT: s_mov_b32 s59, s0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v1, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s2, s33, 0x4044 -; GFX11-NEXT: scratch_load_b32 v1, off, s2 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_mov_b32 s33, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp: @@ -534,13 +343,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s1, s33 ; GFX12-NEXT: s_mov_b32 s33, s32 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v1, s33 offset:16388 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_addk_co_i32 s32, 0x4040 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo -; GFX12-NEXT: v_writelane_b32 v1, s59, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_co_ci_u32 s0, s33, 0x4000 ; GFX12-NEXT: v_mov_b32_e32 v0, s33 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -554,14 +359,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s59, v1, 0 ; GFX12-NEXT: s_mov_b32 s32, s33 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v1, off, s33 offset:16388 ; 4-byte Folded Reload -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_mov_b32 s33, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -570,33 +369,22 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s7, s33, 0x101100 -; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s7 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0 -; GFX8-NEXT: v_writelane_b32 v1, s59, 0 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use alloca0 v0 ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX8-NEXT: s_movk_i32 s59, 0x4040 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s59, v0 ; GFX8-NEXT: s_add_i32 s32, s32, 0x102000 -; GFX8-NEXT: v_readfirstlane_b32 s59, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s59, v0 ; GFX8-NEXT: s_and_b64 s[4:5], 0, exec +; GFX8-NEXT: v_readfirstlane_b32 s59, v0 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v1, 0 ; GFX8-NEXT: s_mov_b32 s32, s33 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s7, s33, 0x101100 -; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_mov_b32 s33, s6 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp: @@ -604,32 +392,21 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_mov_b32 s6, s33 ; GFX900-NEXT: s_mov_b32 s33, s32 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s7, s33, 0x101100 -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s7 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use alloca0 v0 ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s33 -; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0 ; GFX900-NEXT: s_add_i32 s32, s32, 0x102000 -; GFX900-NEXT: v_writelane_b32 v1, s59, 0 -; GFX900-NEXT: v_readfirstlane_b32 s59, v0 +; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0 ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec +; GFX900-NEXT: v_readfirstlane_b32 s59, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v1, 0 ; GFX900-NEXT: s_mov_b32 s32, s33 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s7, s33, 0x101100 -; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] ; GFX900-NEXT: s_mov_b32 s33, s6 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp: @@ -637,10 +414,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b32 s2, s33 ; GFX942-NEXT: s_mov_b32 s33, s32 -; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX942-NEXT: s_add_i32 s3, s33, 0x4044 -; GFX942-NEXT: scratch_store_dword off, v1, s3 ; 4-byte Folded Spill -; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: s_addk_i32 s32, 0x4080 ; GFX942-NEXT: s_add_i32 s0, s33, 64 ; GFX942-NEXT: v_mov_b32_e32 v0, s0 @@ -648,22 +421,15 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX942-NEXT: s_addc_u32 s0, s33, 0x4040 ; GFX942-NEXT: s_bitcmp1_b32 s0, 0 ; GFX942-NEXT: s_bitset0_b32 s0, 0 -; GFX942-NEXT: v_writelane_b32 v1, s59, 0 -; GFX942-NEXT: s_mov_b32 s59, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use alloca0 v0 ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s59, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s59, scc ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_readlane_b32 s59, v1, 0 ; GFX942-NEXT: s_mov_b32 s32, s33 -; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX942-NEXT: s_add_i32 s3, s33, 0x4044 -; GFX942-NEXT: scratch_load_dword v1, off, s3 ; 4-byte Folded Reload -; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: s_mov_b32 s33, s2 -; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) %alloca1 = alloca i32, align 4, addrspace(5) @@ -676,75 +442,39 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset() ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: ; GFX10_1: ; %bb.0: ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80800 -; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s32 -; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0 +; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo -; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1 -; GFX10_1-NEXT: v_readfirstlane_b32 s59, v1 +; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 +; GFX10_1-NEXT: v_readfirstlane_b32 s59, v0 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v0, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80800 -; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: ; GFX10_3: ; %bb.0: ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80800 -; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s32 -; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0 +; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo -; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1 -; GFX10_3-NEXT: v_readfirstlane_b32 s59, v1 +; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 +; GFX10_3-NEXT: v_readfirstlane_b32 s59, v0 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v0, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80800 -; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x4040 -; GFX11-NEXT: scratch_store_b32 off, v0, s1 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo -; GFX11-NEXT: v_writelane_b32 v0, s59, 0 ; GFX11-NEXT: s_addc_u32 s0, s32, 64 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_bitcmp1_b32 s0, 0 ; GFX11-NEXT: s_bitset0_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s59, s0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x4040 -; GFX11-NEXT: scratch_load_b32 v0, off, s1 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: @@ -754,94 +484,50 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset() ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: v_writelane_b32 v0, s59, 0 -; GFX12-NEXT: s_mov_b32 s59, s32 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo +; GFX12-NEXT: s_mov_b32 s59, s32 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_readlane_b32 s59, v0, 0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x101000 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v0, s59, 0 -; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32 +; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX8-NEXT: s_mov_b32 s59, 64 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s59, v1 -; GFX8-NEXT: v_readfirstlane_b32 s59, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s59, v0 ; GFX8-NEXT: s_and_b64 s[4:5], 0, exec +; GFX8-NEXT: v_readfirstlane_b32 s59, v0 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v0, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x101000 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x101000 -; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_lshrrev_b32_e64 v1, 6, s32 -; GFX900-NEXT: v_add_u32_e32 v1, 64, v1 -; GFX900-NEXT: v_writelane_b32 v0, s59, 0 -; GFX900-NEXT: v_readfirstlane_b32 s59, v1 +; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec +; GFX900-NEXT: v_readfirstlane_b32 s59, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v0, 0 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x101000 -; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX942-NEXT: s_add_i32 s2, s32, 0x4040 -; GFX942-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill -; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: s_and_b64 s[0:1], 0, exec ; GFX942-NEXT: s_addc_u32 s0, s32, 64 ; GFX942-NEXT: s_bitcmp1_b32 s0, 0 ; GFX942-NEXT: s_bitset0_b32 s0, 0 -; GFX942-NEXT: v_writelane_b32 v0, s59, 0 ; GFX942-NEXT: s_mov_b32 s59, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s59, scc ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_readlane_b32 s59, v0, 0 -; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX942-NEXT: s_add_i32 s2, s32, 0x4040 -; GFX942-NEXT: scratch_load_dword v0, off, s2 ; 4-byte Folded Reload -; GFX942-NEXT: s_mov_b64 exec, s[0:1] -; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca0, i32 0) @@ -852,67 +538,32 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0 ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: ; GFX10_1: ; %bb.0: ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80800 -; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0 ; GFX10_1-NEXT: s_lshr_b32 s59, s32, 5 ; GFX10_1-NEXT: s_add_i32 s59, s59, 64 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59 ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v0, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80800 -; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: ; GFX10_3: ; %bb.0: ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80800 -; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0 ; GFX10_3-NEXT: s_lshr_b32 s59, s32, 5 ; GFX10_3-NEXT: s_add_i32 s59, s59, 64 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59 ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v0, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80800 -; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x4040 -; GFX11-NEXT: scratch_store_b32 off, v0, s1 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v0, s59, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s59, s0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x4040 -; GFX11-NEXT: scratch_load_b32 v0, off, s1 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: @@ -922,85 +573,41 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: v_writelane_b32 v0, s59, 0 ; GFX12-NEXT: s_mov_b32 s59, s32 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_readlane_b32 s59, v0, 0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x101000 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v0, s59, 0 ; GFX8-NEXT: s_lshr_b32 s59, s32, 6 ; GFX8-NEXT: s_add_i32 s59, s59, 64 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59 ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v0, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x101000 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x101000 -; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_writelane_b32 v0, s59, 0 ; GFX900-NEXT: s_lshr_b32 s59, s32, 6 ; GFX900-NEXT: s_add_i32 s59, s59, 64 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59 ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v0, 0 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x101000 -; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX942-NEXT: s_add_i32 s2, s32, 0x4040 -; GFX942-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill -; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: s_add_i32 s0, s32, 64 -; GFX942-NEXT: v_writelane_b32 v0, s59, 0 ; GFX942-NEXT: s_mov_b32 s59, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s59 ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_readlane_b32 s59, v0, 0 -; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX942-NEXT: s_add_i32 s2, s32, 0x4040 -; GFX942-NEXT: scratch_load_dword v0, off, s2 ; 4-byte Folded Reload -; GFX942-NEXT: s_mov_b64 exec, s[0:1] -; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) call void asm sideeffect "; use $0", "{s59}"(ptr addrspace(5) %alloca0) @@ -1013,29 +620,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_1-NEXT: s_mov_b32 s5, s33 ; GFX10_1-NEXT: s_mov_b32 s33, s32 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80800 -; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s33 -; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0 -; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000 +; GFX10_1-NEXT: s_add_i32 s32, s32, 0x80800 +; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s33 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: s_mov_b32 s32, s33 -; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1 -; GFX10_1-NEXT: v_readfirstlane_b32 s59, v1 +; GFX10_1-NEXT: s_mov_b32 s33, s5 +; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 +; GFX10_1-NEXT: v_readfirstlane_b32 s59, v0 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v0, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80800 -; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_mov_b32 s33, s5 -; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp: @@ -1043,27 +637,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_3-NEXT: s_mov_b32 s5, s33 ; GFX10_3-NEXT: s_mov_b32 s33, s32 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800 -; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s33 -; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0 -; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000 +; GFX10_3-NEXT: s_add_i32 s32, s32, 0x80800 +; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s33 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: s_mov_b32 s32, s33 -; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1 -; GFX10_3-NEXT: v_readfirstlane_b32 s59, v1 +; GFX10_3-NEXT: s_mov_b32 s33, s5 +; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 +; GFX10_3-NEXT: v_readfirstlane_b32 s59, v0 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v0, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800 -; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_mov_b32 s33, s5 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp: @@ -1071,29 +654,17 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s1, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s2, s33, 0x4040 -; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0x4080 +; GFX11-NEXT: s_addk_i32 s32, 0x4040 ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo -; GFX11-NEXT: v_writelane_b32 v0, s59, 0 ; GFX11-NEXT: s_addc_u32 s0, s33, 64 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_bitcmp1_b32 s0, 0 ; GFX11-NEXT: s_bitset0_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s33, s1 ; GFX11-NEXT: s_mov_b32 s59, s0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s2, s33, 0x4040 -; GFX11-NEXT: scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_mov_b32 s33, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp: @@ -1105,25 +676,15 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s1, s33 ; GFX12-NEXT: s_mov_b32 s33, s32 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: v_writelane_b32 v0, s59, 0 ; GFX12-NEXT: s_addk_co_i32 s32, 0x4040 -; GFX12-NEXT: s_mov_b32 s59, s33 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s59, s33 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s59, v0, 0 ; GFX12-NEXT: s_mov_b32 s32, s33 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_mov_b32 s33, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -1132,28 +693,17 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s7, s33, 0x101000 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s7 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v0, s59, 0 -; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s33 +; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX8-NEXT: s_mov_b32 s59, 64 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s59, v1 -; GFX8-NEXT: s_add_i32 s32, s32, 0x102000 -; GFX8-NEXT: v_readfirstlane_b32 s59, v1 +; GFX8-NEXT: s_add_i32 s32, s32, 0x101000 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s59, v0 ; GFX8-NEXT: s_and_b64 s[4:5], 0, exec +; GFX8-NEXT: v_readfirstlane_b32 s59, v0 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v0, 0 ; GFX8-NEXT: s_mov_b32 s32, s33 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s7, s33, 0x101000 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_mov_b32 s33, s6 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp: @@ -1161,27 +711,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_mov_b32 s6, s33 ; GFX900-NEXT: s_mov_b32 s33, s32 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s7, s33, 0x101000 -; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s7 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_lshrrev_b32_e64 v1, 6, s33 -; GFX900-NEXT: v_add_u32_e32 v1, 64, v1 -; GFX900-NEXT: s_add_i32 s32, s32, 0x102000 -; GFX900-NEXT: v_writelane_b32 v0, s59, 0 -; GFX900-NEXT: v_readfirstlane_b32 s59, v1 +; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s33 +; GFX900-NEXT: s_add_i32 s32, s32, 0x101000 +; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec +; GFX900-NEXT: v_readfirstlane_b32 s59, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v0, 0 ; GFX900-NEXT: s_mov_b32 s32, s33 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s7, s33, 0x101000 -; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] ; GFX900-NEXT: s_mov_b32 s33, s6 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp: @@ -1189,28 +728,17 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b32 s2, s33 ; GFX942-NEXT: s_mov_b32 s33, s32 -; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX942-NEXT: s_add_i32 s3, s33, 0x4040 -; GFX942-NEXT: scratch_store_dword off, v0, s3 ; 4-byte Folded Spill -; GFX942-NEXT: s_mov_b64 exec, s[0:1] -; GFX942-NEXT: s_addk_i32 s32, 0x4080 +; GFX942-NEXT: s_addk_i32 s32, 0x4040 ; GFX942-NEXT: s_and_b64 s[0:1], 0, exec ; GFX942-NEXT: s_addc_u32 s0, s33, 64 ; GFX942-NEXT: s_bitcmp1_b32 s0, 0 ; GFX942-NEXT: s_bitset0_b32 s0, 0 -; GFX942-NEXT: v_writelane_b32 v0, s59, 0 ; GFX942-NEXT: s_mov_b32 s59, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s59, scc ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_readlane_b32 s59, v0, 0 ; GFX942-NEXT: s_mov_b32 s32, s33 -; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX942-NEXT: s_add_i32 s3, s33, 0x4040 -; GFX942-NEXT: scratch_load_dword v0, off, s3 ; 4-byte Folded Reload -; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: s_mov_b32 s33, s2 -; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca0, i32 0) @@ -1223,27 +751,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_1-NEXT: s_mov_b32 s4, s33 ; GFX10_1-NEXT: s_mov_b32 s33, s32 -; GFX10_1-NEXT: s_xor_saveexec_b32 s5, -1 -; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80800 -; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s5 -; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0 -; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000 +; GFX10_1-NEXT: s_add_i32 s32, s32, 0x80800 ; GFX10_1-NEXT: s_lshr_b32 s59, s33, 5 ; GFX10_1-NEXT: s_mov_b32 s32, s33 ; GFX10_1-NEXT: s_add_i32 s59, s59, 64 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59 ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v0, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s5, -1 -; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80800 -; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s5 ; GFX10_1-NEXT: s_mov_b32 s33, s4 -; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp: @@ -1251,25 +766,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_3-NEXT: s_mov_b32 s4, s33 ; GFX10_3-NEXT: s_mov_b32 s33, s32 -; GFX10_3-NEXT: s_xor_saveexec_b32 s5, -1 -; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800 -; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s5 -; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0 -; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000 +; GFX10_3-NEXT: s_add_i32 s32, s32, 0x80800 ; GFX10_3-NEXT: s_lshr_b32 s59, s33, 5 ; GFX10_3-NEXT: s_mov_b32 s32, s33 ; GFX10_3-NEXT: s_add_i32 s59, s59, 64 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59 ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v0, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s5, -1 -; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800 -; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s5 ; GFX10_3-NEXT: s_mov_b32 s33, s4 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp: @@ -1277,25 +781,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 -; GFX11-NEXT: s_add_i32 s2, s33, 0x4040 -; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v0, s59, 0 -; GFX11-NEXT: s_addk_i32 s32, 0x4080 +; GFX11-NEXT: s_addk_i32 s32, 0x4040 ; GFX11-NEXT: s_add_i32 s1, s33, 64 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_mov_b32 s59, s1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 -; GFX11-NEXT: s_add_i32 s2, s33, 0x4040 -; GFX11-NEXT: scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_mov_b32 s33, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp: @@ -1307,24 +800,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, s33 ; GFX12-NEXT: s_mov_b32 s33, s32 -; GFX12-NEXT: s_xor_saveexec_b32 s1, -1 -; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_writelane_b32 v0, s59, 0 ; GFX12-NEXT: s_addk_co_i32 s32, 0x4040 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 s59, s33 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59 ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: s_mov_b32 s32, s33 -; GFX12-NEXT: v_readlane_b32 s59, v0, 0 -; GFX12-NEXT: s_xor_saveexec_b32 s1, -1 -; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: s_mov_b32 s33, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -1333,25 +816,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GFX8-NEXT: s_add_i32 s5, s33, 0x101000 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: s_add_i32 s32, s32, 0x102000 -; GFX8-NEXT: v_writelane_b32 v0, s59, 0 +; GFX8-NEXT: s_add_i32 s32, s32, 0x101000 ; GFX8-NEXT: s_lshr_b32 s59, s33, 6 ; GFX8-NEXT: s_add_i32 s59, s59, 64 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59 ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v0, 0 ; GFX8-NEXT: s_mov_b32 s32, s33 -; GFX8-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GFX8-NEXT: s_add_i32 s5, s33, 0x101000 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b32 s33, s4 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp: @@ -1359,25 +831,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_mov_b32 s4, s33 ; GFX900-NEXT: s_mov_b32 s33, s32 -; GFX900-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GFX900-NEXT: s_add_i32 s5, s33, 0x101000 -; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[6:7] -; GFX900-NEXT: s_add_i32 s32, s32, 0x102000 -; GFX900-NEXT: v_writelane_b32 v0, s59, 0 +; GFX900-NEXT: s_add_i32 s32, s32, 0x101000 ; GFX900-NEXT: s_lshr_b32 s59, s33, 6 ; GFX900-NEXT: s_add_i32 s59, s59, 64 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59 ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v0, 0 ; GFX900-NEXT: s_mov_b32 s32, s33 -; GFX900-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GFX900-NEXT: s_add_i32 s5, s33, 0x101000 -; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[6:7] ; GFX900-NEXT: s_mov_b32 s33, s4 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp: @@ -1385,25 +846,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b32 s0, s33 ; GFX942-NEXT: s_mov_b32 s33, s32 -; GFX942-NEXT: s_xor_saveexec_b64 s[2:3], -1 -; GFX942-NEXT: s_add_i32 s1, s33, 0x4040 -; GFX942-NEXT: scratch_store_dword off, v0, s1 ; 4-byte Folded Spill -; GFX942-NEXT: s_mov_b64 exec, s[2:3] -; GFX942-NEXT: s_addk_i32 s32, 0x4080 +; GFX942-NEXT: s_addk_i32 s32, 0x4040 ; GFX942-NEXT: s_add_i32 s1, s33, 64 -; GFX942-NEXT: v_writelane_b32 v0, s59, 0 ; GFX942-NEXT: s_mov_b32 s59, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s59 ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_readlane_b32 s59, v0, 0 ; GFX942-NEXT: s_mov_b32 s32, s33 -; GFX942-NEXT: s_xor_saveexec_b64 s[2:3], -1 -; GFX942-NEXT: s_add_i32 s1, s33, 0x4040 -; GFX942-NEXT: scratch_load_dword v0, off, s1 ; 4-byte Folded Reload -; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b32 s33, s0 -; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) call void asm sideeffect "; use $0", "{s59}"(ptr addrspace(5) %alloca0) @@ -1414,12 +864,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: ; GFX10_1: ; %bb.0: ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5 ; GFX10_1-NEXT: s_add_i32 s59, s4, 0x442c @@ -1431,23 +875,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: ; GFX10_3: ; %bb.0: ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5 ; GFX10_3-NEXT: s_add_i32 s59, s4, 0x442c @@ -1459,22 +891,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x8040 -; GFX11-NEXT: scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v1, s59, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 ; GFX11-NEXT: s_add_i32 s59, s32, 0x442c ; GFX11-NEXT: v_mov_b32_e32 v0, s0 @@ -1485,12 +906,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v1, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x8040 -; GFX11-NEXT: scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: @@ -1500,11 +915,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:32768 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: v_writelane_b32 v1, s59, 0 ; GFX12-NEXT: s_add_co_i32 s59, s32, 0x43ec ; GFX12-NEXT: v_mov_b32_e32 v0, s32 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo @@ -1514,23 +924,13 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s59, v1, 0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:32768 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_lshr_b32 s4, s32, 6 -; GFX8-NEXT: v_writelane_b32 v1, s59, 0 ; GFX8-NEXT: s_add_i32 s59, s4, 0x442c ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0 @@ -1541,23 +941,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v1, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] ; GFX900-NEXT: s_lshr_b32 s4, s32, 6 -; GFX900-NEXT: v_writelane_b32 v1, s59, 0 ; GFX900-NEXT: s_add_i32 s59, s4, 0x442c ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 @@ -1568,22 +957,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v1, 0 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX942-NEXT: s_add_i32 s2, s32, 0x8040 -; GFX942-NEXT: scratch_store_dword off, v1, s2 ; 4-byte Folded Spill -; GFX942-NEXT: s_mov_b64 exec, s[0:1] -; GFX942-NEXT: v_writelane_b32 v1, s59, 0 ; GFX942-NEXT: s_add_i32 s59, s32, 0x442c ; GFX942-NEXT: s_add_i32 s0, s32, 64 ; GFX942-NEXT: v_mov_b32_e32 v0, s0 @@ -1594,12 +972,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s59, scc ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_readlane_b32 s59, v1, 0 -; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX942-NEXT: s_add_i32 s2, s32, 0x8040 -; GFX942-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload -; GFX942-NEXT: s_mov_b64 exec, s[0:1] -; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) %alloca1 = alloca [4096 x i32], align 4, addrspace(5) @@ -1613,12 +985,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: ; GFX10_1: ; %bb.0: ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_lshl_b32 s4, s16, 2 ; GFX10_1-NEXT: s_lshr_b32 s59, s32, 5 @@ -1632,23 +998,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: ; GFX10_3: ; %bb.0: ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_lshl_b32 s4, s16, 2 ; GFX10_3-NEXT: s_lshr_b32 s59, s32, 5 @@ -1662,23 +1016,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 -; GFX11-NEXT: s_add_i32 s2, s32, 0x8040 -; GFX11-NEXT: scratch_store_b32 off, v1, s2 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s1, s32, 64 -; GFX11-NEXT: v_writelane_b32 v1, s59, 0 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-NEXT: s_add_i32 s59, s32, s0 @@ -1690,12 +1033,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v1, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x8040 -; GFX11-NEXT: scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: @@ -1705,11 +1042,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_xor_saveexec_b32 s1, -1 -; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:32768 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_writelane_b32 v1, s59, 0 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1723,22 +1055,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s59, v1, 0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:32768 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v1, s59, 0 ; GFX8-NEXT: s_lshl_b32 s4, s16, 2 ; GFX8-NEXT: s_lshr_b32 s59, s32, 6 ; GFX8-NEXT: s_add_i32 s59, s59, s4 @@ -1752,22 +1074,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v1, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_writelane_b32 v1, s59, 0 ; GFX900-NEXT: s_lshl_b32 s4, s16, 2 ; GFX900-NEXT: s_lshr_b32 s59, s32, 6 ; GFX900-NEXT: s_add_i32 s59, s59, s4 @@ -1781,23 +1092,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v1, 0 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_xor_saveexec_b64 s[2:3], -1 -; GFX942-NEXT: s_add_i32 s1, s32, 0x8040 -; GFX942-NEXT: scratch_store_dword off, v1, s1 ; 4-byte Folded Spill -; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_lshl_b32 s0, s0, 2 -; GFX942-NEXT: v_writelane_b32 v1, s59, 0 ; GFX942-NEXT: s_add_i32 s59, s32, s0 ; GFX942-NEXT: s_addk_i32 s59, 0x4040 ; GFX942-NEXT: s_add_i32 s0, s32, 64 @@ -1809,12 +1109,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s59, scc ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_readlane_b32 s59, v1, 0 -; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX942-NEXT: s_add_i32 s2, s32, 0x8040 -; GFX942-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload -; GFX942-NEXT: s_mov_b64 exec, s[0:1] -; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) %alloca1 = alloca [4096 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll index 2420393b63ba9..e8dacc93a8f3c 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll @@ -46,28 +46,17 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX7-NEXT: v_writelane_b32 v23, s37, 6 ; GFX7-NEXT: v_writelane_b32 v23, s38, 7 ; GFX7-NEXT: v_writelane_b32 v23, s39, 8 -; GFX7-NEXT: v_writelane_b32 v23, s40, 9 -; GFX7-NEXT: v_writelane_b32 v23, s41, 10 -; GFX7-NEXT: v_writelane_b32 v23, s42, 11 -; GFX7-NEXT: v_writelane_b32 v23, s43, 12 -; GFX7-NEXT: v_writelane_b32 v23, s44, 13 -; GFX7-NEXT: v_writelane_b32 v23, s45, 14 -; GFX7-NEXT: v_writelane_b32 v23, s46, 15 -; GFX7-NEXT: v_writelane_b32 v23, s47, 16 -; GFX7-NEXT: v_writelane_b32 v23, s48, 17 -; GFX7-NEXT: v_writelane_b32 v23, s49, 18 -; GFX7-NEXT: v_writelane_b32 v23, s50, 19 -; GFX7-NEXT: v_writelane_b32 v23, s51, 20 -; GFX7-NEXT: v_writelane_b32 v23, s52, 21 -; GFX7-NEXT: v_writelane_b32 v23, s53, 22 -; GFX7-NEXT: v_writelane_b32 v23, s54, 23 -; GFX7-NEXT: v_writelane_b32 v23, s55, 24 -; GFX7-NEXT: v_writelane_b32 v23, s56, 25 +; GFX7-NEXT: v_writelane_b32 v23, s48, 9 +; GFX7-NEXT: v_writelane_b32 v23, s49, 10 +; GFX7-NEXT: v_writelane_b32 v23, s50, 11 +; GFX7-NEXT: v_writelane_b32 v23, s51, 12 +; GFX7-NEXT: v_writelane_b32 v23, s52, 13 +; GFX7-NEXT: v_writelane_b32 v23, s53, 14 ; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6 -; GFX7-NEXT: v_writelane_b32 v23, s57, 26 +; GFX7-NEXT: v_writelane_b32 v23, s54, 15 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 64, v0 ; GFX7-NEXT: s_and_b64 s[4:5], 0, exec -; GFX7-NEXT: v_writelane_b32 v23, s58, 27 +; GFX7-NEXT: v_writelane_b32 v23, s55, 16 ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use alloca0 v0 ; GFX7-NEXT: ;;#ASMEND @@ -78,33 +67,20 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x4040 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, 64, s32 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 6, v0 -; GFX7-NEXT: v_writelane_b32 v23, s59, 28 ; GFX7-NEXT: v_readfirstlane_b32 s59, v0 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc ; GFX7-NEXT: ;;#ASMEND -; GFX7-NEXT: v_readlane_b32 s59, v23, 28 -; GFX7-NEXT: v_readlane_b32 s58, v23, 27 -; GFX7-NEXT: v_readlane_b32 s57, v23, 26 -; GFX7-NEXT: v_readlane_b32 s56, v23, 25 -; GFX7-NEXT: v_readlane_b32 s55, v23, 24 -; GFX7-NEXT: v_readlane_b32 s54, v23, 23 -; GFX7-NEXT: v_readlane_b32 s53, v23, 22 -; GFX7-NEXT: v_readlane_b32 s52, v23, 21 -; GFX7-NEXT: v_readlane_b32 s51, v23, 20 -; GFX7-NEXT: v_readlane_b32 s50, v23, 19 -; GFX7-NEXT: v_readlane_b32 s49, v23, 18 -; GFX7-NEXT: v_readlane_b32 s48, v23, 17 -; GFX7-NEXT: v_readlane_b32 s47, v23, 16 -; GFX7-NEXT: v_readlane_b32 s46, v23, 15 -; GFX7-NEXT: v_readlane_b32 s45, v23, 14 -; GFX7-NEXT: v_readlane_b32 s44, v23, 13 -; GFX7-NEXT: v_readlane_b32 s43, v23, 12 -; GFX7-NEXT: v_readlane_b32 s42, v23, 11 -; GFX7-NEXT: v_readlane_b32 s41, v23, 10 -; GFX7-NEXT: v_readlane_b32 s40, v23, 9 +; GFX7-NEXT: v_readlane_b32 s55, v23, 16 +; GFX7-NEXT: v_readlane_b32 s54, v23, 15 +; GFX7-NEXT: v_readlane_b32 s53, v23, 14 +; GFX7-NEXT: v_readlane_b32 s52, v23, 13 +; GFX7-NEXT: v_readlane_b32 s51, v23, 12 +; GFX7-NEXT: v_readlane_b32 s50, v23, 11 +; GFX7-NEXT: v_readlane_b32 s49, v23, 10 +; GFX7-NEXT: v_readlane_b32 s48, v23, 9 ; GFX7-NEXT: v_readlane_b32 s39, v23, 8 ; GFX7-NEXT: v_readlane_b32 s38, v23, 7 ; GFX7-NEXT: v_readlane_b32 s37, v23, 6 @@ -137,28 +113,17 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX8-NEXT: v_writelane_b32 v23, s37, 6 ; GFX8-NEXT: v_writelane_b32 v23, s38, 7 ; GFX8-NEXT: v_writelane_b32 v23, s39, 8 -; GFX8-NEXT: v_writelane_b32 v23, s40, 9 -; GFX8-NEXT: v_writelane_b32 v23, s41, 10 -; GFX8-NEXT: v_writelane_b32 v23, s42, 11 -; GFX8-NEXT: v_writelane_b32 v23, s43, 12 -; GFX8-NEXT: v_writelane_b32 v23, s44, 13 -; GFX8-NEXT: v_writelane_b32 v23, s45, 14 -; GFX8-NEXT: v_writelane_b32 v23, s46, 15 -; GFX8-NEXT: v_writelane_b32 v23, s47, 16 -; GFX8-NEXT: v_writelane_b32 v23, s48, 17 -; GFX8-NEXT: v_writelane_b32 v23, s49, 18 -; GFX8-NEXT: v_writelane_b32 v23, s50, 19 -; GFX8-NEXT: v_writelane_b32 v23, s51, 20 -; GFX8-NEXT: v_writelane_b32 v23, s52, 21 -; GFX8-NEXT: v_writelane_b32 v23, s53, 22 -; GFX8-NEXT: v_writelane_b32 v23, s54, 23 -; GFX8-NEXT: v_writelane_b32 v23, s55, 24 -; GFX8-NEXT: v_writelane_b32 v23, s56, 25 +; GFX8-NEXT: v_writelane_b32 v23, s48, 9 +; GFX8-NEXT: v_writelane_b32 v23, s49, 10 +; GFX8-NEXT: v_writelane_b32 v23, s50, 11 +; GFX8-NEXT: v_writelane_b32 v23, s51, 12 +; GFX8-NEXT: v_writelane_b32 v23, s52, 13 +; GFX8-NEXT: v_writelane_b32 v23, s53, 14 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; GFX8-NEXT: v_writelane_b32 v23, s57, 26 +; GFX8-NEXT: v_writelane_b32 v23, s54, 15 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0 ; GFX8-NEXT: s_and_b64 s[4:5], 0, exec -; GFX8-NEXT: v_writelane_b32 v23, s58, 27 +; GFX8-NEXT: v_writelane_b32 v23, s55, 16 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use alloca0 v0 ; GFX8-NEXT: ;;#ASMEND @@ -169,33 +134,20 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x4040 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, 64, s32 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 6, v0 -; GFX8-NEXT: v_writelane_b32 v23, s59, 28 ; GFX8-NEXT: v_readfirstlane_b32 s59, v0 ; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v23, 28 -; GFX8-NEXT: v_readlane_b32 s58, v23, 27 -; GFX8-NEXT: v_readlane_b32 s57, v23, 26 -; GFX8-NEXT: v_readlane_b32 s56, v23, 25 -; GFX8-NEXT: v_readlane_b32 s55, v23, 24 -; GFX8-NEXT: v_readlane_b32 s54, v23, 23 -; GFX8-NEXT: v_readlane_b32 s53, v23, 22 -; GFX8-NEXT: v_readlane_b32 s52, v23, 21 -; GFX8-NEXT: v_readlane_b32 s51, v23, 20 -; GFX8-NEXT: v_readlane_b32 s50, v23, 19 -; GFX8-NEXT: v_readlane_b32 s49, v23, 18 -; GFX8-NEXT: v_readlane_b32 s48, v23, 17 -; GFX8-NEXT: v_readlane_b32 s47, v23, 16 -; GFX8-NEXT: v_readlane_b32 s46, v23, 15 -; GFX8-NEXT: v_readlane_b32 s45, v23, 14 -; GFX8-NEXT: v_readlane_b32 s44, v23, 13 -; GFX8-NEXT: v_readlane_b32 s43, v23, 12 -; GFX8-NEXT: v_readlane_b32 s42, v23, 11 -; GFX8-NEXT: v_readlane_b32 s41, v23, 10 -; GFX8-NEXT: v_readlane_b32 s40, v23, 9 +; GFX8-NEXT: v_readlane_b32 s55, v23, 16 +; GFX8-NEXT: v_readlane_b32 s54, v23, 15 +; GFX8-NEXT: v_readlane_b32 s53, v23, 14 +; GFX8-NEXT: v_readlane_b32 s52, v23, 13 +; GFX8-NEXT: v_readlane_b32 s51, v23, 12 +; GFX8-NEXT: v_readlane_b32 s50, v23, 11 +; GFX8-NEXT: v_readlane_b32 s49, v23, 10 +; GFX8-NEXT: v_readlane_b32 s48, v23, 9 ; GFX8-NEXT: v_readlane_b32 s39, v23, 8 ; GFX8-NEXT: v_readlane_b32 s38, v23, 7 ; GFX8-NEXT: v_readlane_b32 s37, v23, 6 @@ -228,28 +180,17 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX900-NEXT: v_writelane_b32 v23, s37, 6 ; GFX900-NEXT: v_writelane_b32 v23, s38, 7 ; GFX900-NEXT: v_writelane_b32 v23, s39, 8 -; GFX900-NEXT: v_writelane_b32 v23, s40, 9 -; GFX900-NEXT: v_writelane_b32 v23, s41, 10 -; GFX900-NEXT: v_writelane_b32 v23, s42, 11 -; GFX900-NEXT: v_writelane_b32 v23, s43, 12 -; GFX900-NEXT: v_writelane_b32 v23, s44, 13 -; GFX900-NEXT: v_writelane_b32 v23, s45, 14 -; GFX900-NEXT: v_writelane_b32 v23, s46, 15 -; GFX900-NEXT: v_writelane_b32 v23, s47, 16 -; GFX900-NEXT: v_writelane_b32 v23, s48, 17 -; GFX900-NEXT: v_writelane_b32 v23, s49, 18 -; GFX900-NEXT: v_writelane_b32 v23, s50, 19 -; GFX900-NEXT: v_writelane_b32 v23, s51, 20 -; GFX900-NEXT: v_writelane_b32 v23, s52, 21 -; GFX900-NEXT: v_writelane_b32 v23, s53, 22 -; GFX900-NEXT: v_writelane_b32 v23, s54, 23 -; GFX900-NEXT: v_writelane_b32 v23, s55, 24 -; GFX900-NEXT: v_writelane_b32 v23, s56, 25 +; GFX900-NEXT: v_writelane_b32 v23, s48, 9 +; GFX900-NEXT: v_writelane_b32 v23, s49, 10 +; GFX900-NEXT: v_writelane_b32 v23, s50, 11 +; GFX900-NEXT: v_writelane_b32 v23, s51, 12 +; GFX900-NEXT: v_writelane_b32 v23, s52, 13 +; GFX900-NEXT: v_writelane_b32 v23, s53, 14 ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; GFX900-NEXT: v_writelane_b32 v23, s57, 26 +; GFX900-NEXT: v_writelane_b32 v23, s54, 15 ; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec -; GFX900-NEXT: v_writelane_b32 v23, s58, 27 +; GFX900-NEXT: v_writelane_b32 v23, s55, 16 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use alloca0 v0 ; GFX900-NEXT: ;;#ASMEND @@ -259,33 +200,20 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0 -; GFX900-NEXT: v_writelane_b32 v23, s59, 28 ; GFX900-NEXT: v_readfirstlane_b32 s59, v0 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v23, 28 -; GFX900-NEXT: v_readlane_b32 s58, v23, 27 -; GFX900-NEXT: v_readlane_b32 s57, v23, 26 -; GFX900-NEXT: v_readlane_b32 s56, v23, 25 -; GFX900-NEXT: v_readlane_b32 s55, v23, 24 -; GFX900-NEXT: v_readlane_b32 s54, v23, 23 -; GFX900-NEXT: v_readlane_b32 s53, v23, 22 -; GFX900-NEXT: v_readlane_b32 s52, v23, 21 -; GFX900-NEXT: v_readlane_b32 s51, v23, 20 -; GFX900-NEXT: v_readlane_b32 s50, v23, 19 -; GFX900-NEXT: v_readlane_b32 s49, v23, 18 -; GFX900-NEXT: v_readlane_b32 s48, v23, 17 -; GFX900-NEXT: v_readlane_b32 s47, v23, 16 -; GFX900-NEXT: v_readlane_b32 s46, v23, 15 -; GFX900-NEXT: v_readlane_b32 s45, v23, 14 -; GFX900-NEXT: v_readlane_b32 s44, v23, 13 -; GFX900-NEXT: v_readlane_b32 s43, v23, 12 -; GFX900-NEXT: v_readlane_b32 s42, v23, 11 -; GFX900-NEXT: v_readlane_b32 s41, v23, 10 -; GFX900-NEXT: v_readlane_b32 s40, v23, 9 +; GFX900-NEXT: v_readlane_b32 s55, v23, 16 +; GFX900-NEXT: v_readlane_b32 s54, v23, 15 +; GFX900-NEXT: v_readlane_b32 s53, v23, 14 +; GFX900-NEXT: v_readlane_b32 s52, v23, 13 +; GFX900-NEXT: v_readlane_b32 s51, v23, 12 +; GFX900-NEXT: v_readlane_b32 s50, v23, 11 +; GFX900-NEXT: v_readlane_b32 s49, v23, 10 +; GFX900-NEXT: v_readlane_b32 s48, v23, 9 ; GFX900-NEXT: v_readlane_b32 s39, v23, 8 ; GFX900-NEXT: v_readlane_b32 s38, v23, 7 ; GFX900-NEXT: v_readlane_b32 s37, v23, 6 @@ -318,31 +246,17 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX942-NEXT: v_writelane_b32 v23, s37, 6 ; GFX942-NEXT: v_writelane_b32 v23, s38, 7 ; GFX942-NEXT: v_writelane_b32 v23, s39, 8 -; GFX942-NEXT: v_writelane_b32 v23, s40, 9 -; GFX942-NEXT: v_writelane_b32 v23, s41, 10 -; GFX942-NEXT: v_writelane_b32 v23, s42, 11 -; GFX942-NEXT: v_writelane_b32 v23, s43, 12 -; GFX942-NEXT: v_writelane_b32 v23, s44, 13 -; GFX942-NEXT: v_writelane_b32 v23, s45, 14 -; GFX942-NEXT: v_writelane_b32 v23, s46, 15 -; GFX942-NEXT: v_writelane_b32 v23, s47, 16 -; GFX942-NEXT: v_writelane_b32 v23, s48, 17 -; GFX942-NEXT: v_writelane_b32 v23, s49, 18 -; GFX942-NEXT: v_writelane_b32 v23, s50, 19 -; GFX942-NEXT: v_writelane_b32 v23, s51, 20 -; GFX942-NEXT: v_writelane_b32 v23, s52, 21 -; GFX942-NEXT: v_writelane_b32 v23, s53, 22 -; GFX942-NEXT: v_writelane_b32 v23, s54, 23 -; GFX942-NEXT: v_writelane_b32 v23, s55, 24 -; GFX942-NEXT: v_writelane_b32 v23, s56, 25 -; GFX942-NEXT: v_writelane_b32 v23, s57, 26 -; GFX942-NEXT: v_writelane_b32 v23, s58, 27 -; GFX942-NEXT: v_writelane_b32 v23, s59, 28 -; GFX942-NEXT: v_writelane_b32 v23, s60, 29 +; GFX942-NEXT: v_writelane_b32 v23, s48, 9 +; GFX942-NEXT: v_writelane_b32 v23, s49, 10 +; GFX942-NEXT: v_writelane_b32 v23, s50, 11 +; GFX942-NEXT: v_writelane_b32 v23, s51, 12 +; GFX942-NEXT: v_writelane_b32 v23, s52, 13 +; GFX942-NEXT: v_writelane_b32 v23, s53, 14 ; GFX942-NEXT: s_add_i32 s0, s32, 64 -; GFX942-NEXT: v_writelane_b32 v23, s61, 30 +; GFX942-NEXT: v_writelane_b32 v23, s54, 15 ; GFX942-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NEXT: s_and_b64 s[60:61], 0, exec +; GFX942-NEXT: v_writelane_b32 v23, s55, 16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use alloca0 v0 ; GFX942-NEXT: ;;#ASMEND @@ -356,28 +270,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_readlane_b32 s61, v23, 30 -; GFX942-NEXT: v_readlane_b32 s60, v23, 29 -; GFX942-NEXT: v_readlane_b32 s59, v23, 28 -; GFX942-NEXT: v_readlane_b32 s58, v23, 27 -; GFX942-NEXT: v_readlane_b32 s57, v23, 26 -; GFX942-NEXT: v_readlane_b32 s56, v23, 25 -; GFX942-NEXT: v_readlane_b32 s55, v23, 24 -; GFX942-NEXT: v_readlane_b32 s54, v23, 23 -; GFX942-NEXT: v_readlane_b32 s53, v23, 22 -; GFX942-NEXT: v_readlane_b32 s52, v23, 21 -; GFX942-NEXT: v_readlane_b32 s51, v23, 20 -; GFX942-NEXT: v_readlane_b32 s50, v23, 19 -; GFX942-NEXT: v_readlane_b32 s49, v23, 18 -; GFX942-NEXT: v_readlane_b32 s48, v23, 17 -; GFX942-NEXT: v_readlane_b32 s47, v23, 16 -; GFX942-NEXT: v_readlane_b32 s46, v23, 15 -; GFX942-NEXT: v_readlane_b32 s45, v23, 14 -; GFX942-NEXT: v_readlane_b32 s44, v23, 13 -; GFX942-NEXT: v_readlane_b32 s43, v23, 12 -; GFX942-NEXT: v_readlane_b32 s42, v23, 11 -; GFX942-NEXT: v_readlane_b32 s41, v23, 10 -; GFX942-NEXT: v_readlane_b32 s40, v23, 9 +; GFX942-NEXT: v_readlane_b32 s55, v23, 16 +; GFX942-NEXT: v_readlane_b32 s54, v23, 15 +; GFX942-NEXT: v_readlane_b32 s53, v23, 14 +; GFX942-NEXT: v_readlane_b32 s52, v23, 13 +; GFX942-NEXT: v_readlane_b32 s51, v23, 12 +; GFX942-NEXT: v_readlane_b32 s50, v23, 11 +; GFX942-NEXT: v_readlane_b32 s49, v23, 10 +; GFX942-NEXT: v_readlane_b32 s48, v23, 9 ; GFX942-NEXT: v_readlane_b32 s39, v23, 8 ; GFX942-NEXT: v_readlane_b32 s38, v23, 7 ; GFX942-NEXT: v_readlane_b32 s37, v23, 6 @@ -417,55 +317,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX10_1-NEXT: v_writelane_b32 v23, s37, 6 ; GFX10_1-NEXT: v_writelane_b32 v23, s38, 7 ; GFX10_1-NEXT: v_writelane_b32 v23, s39, 8 -; GFX10_1-NEXT: v_writelane_b32 v23, s40, 9 -; GFX10_1-NEXT: v_writelane_b32 v23, s41, 10 -; GFX10_1-NEXT: v_writelane_b32 v23, s42, 11 -; GFX10_1-NEXT: v_writelane_b32 v23, s43, 12 -; GFX10_1-NEXT: v_writelane_b32 v23, s44, 13 -; GFX10_1-NEXT: v_writelane_b32 v23, s45, 14 -; GFX10_1-NEXT: v_writelane_b32 v23, s46, 15 -; GFX10_1-NEXT: v_writelane_b32 v23, s47, 16 -; GFX10_1-NEXT: v_writelane_b32 v23, s48, 17 -; GFX10_1-NEXT: v_writelane_b32 v23, s49, 18 -; GFX10_1-NEXT: v_writelane_b32 v23, s50, 19 -; GFX10_1-NEXT: v_writelane_b32 v23, s51, 20 -; GFX10_1-NEXT: v_writelane_b32 v23, s52, 21 -; GFX10_1-NEXT: v_writelane_b32 v23, s53, 22 -; GFX10_1-NEXT: v_writelane_b32 v23, s54, 23 -; GFX10_1-NEXT: v_writelane_b32 v23, s55, 24 -; GFX10_1-NEXT: v_writelane_b32 v23, s56, 25 -; GFX10_1-NEXT: v_writelane_b32 v23, s57, 26 -; GFX10_1-NEXT: v_writelane_b32 v23, s58, 27 +; GFX10_1-NEXT: v_writelane_b32 v23, s48, 9 +; GFX10_1-NEXT: v_writelane_b32 v23, s49, 10 +; GFX10_1-NEXT: v_writelane_b32 v23, s50, 11 +; GFX10_1-NEXT: v_writelane_b32 v23, s51, 12 +; GFX10_1-NEXT: v_writelane_b32 v23, s52, 13 +; GFX10_1-NEXT: v_writelane_b32 v23, s53, 14 +; GFX10_1-NEXT: v_writelane_b32 v23, s54, 15 +; GFX10_1-NEXT: v_writelane_b32 v23, s55, 16 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc ; GFX10_1-NEXT: ;;#ASMEND ; GFX10_1-NEXT: v_lshrrev_b32_e64 v24, 5, s32 -; GFX10_1-NEXT: v_writelane_b32 v23, s59, 28 ; GFX10_1-NEXT: v_add_nc_u32_e32 v24, 0x4040, v24 ; GFX10_1-NEXT: v_readfirstlane_b32 s59, v24 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v23, 28 -; GFX10_1-NEXT: v_readlane_b32 s58, v23, 27 -; GFX10_1-NEXT: v_readlane_b32 s57, v23, 26 -; GFX10_1-NEXT: v_readlane_b32 s56, v23, 25 -; GFX10_1-NEXT: v_readlane_b32 s55, v23, 24 -; GFX10_1-NEXT: v_readlane_b32 s54, v23, 23 -; GFX10_1-NEXT: v_readlane_b32 s53, v23, 22 -; GFX10_1-NEXT: v_readlane_b32 s52, v23, 21 -; GFX10_1-NEXT: v_readlane_b32 s51, v23, 20 -; GFX10_1-NEXT: v_readlane_b32 s50, v23, 19 -; GFX10_1-NEXT: v_readlane_b32 s49, v23, 18 -; GFX10_1-NEXT: v_readlane_b32 s48, v23, 17 -; GFX10_1-NEXT: v_readlane_b32 s47, v23, 16 -; GFX10_1-NEXT: v_readlane_b32 s46, v23, 15 -; GFX10_1-NEXT: v_readlane_b32 s45, v23, 14 -; GFX10_1-NEXT: v_readlane_b32 s44, v23, 13 -; GFX10_1-NEXT: v_readlane_b32 s43, v23, 12 -; GFX10_1-NEXT: v_readlane_b32 s42, v23, 11 -; GFX10_1-NEXT: v_readlane_b32 s41, v23, 10 -; GFX10_1-NEXT: v_readlane_b32 s40, v23, 9 +; GFX10_1-NEXT: v_readlane_b32 s55, v23, 16 +; GFX10_1-NEXT: v_readlane_b32 s54, v23, 15 +; GFX10_1-NEXT: v_readlane_b32 s53, v23, 14 +; GFX10_1-NEXT: v_readlane_b32 s52, v23, 13 +; GFX10_1-NEXT: v_readlane_b32 s51, v23, 12 +; GFX10_1-NEXT: v_readlane_b32 s50, v23, 11 +; GFX10_1-NEXT: v_readlane_b32 s49, v23, 10 +; GFX10_1-NEXT: v_readlane_b32 s48, v23, 9 ; GFX10_1-NEXT: v_readlane_b32 s39, v23, 8 ; GFX10_1-NEXT: v_readlane_b32 s38, v23, 7 ; GFX10_1-NEXT: v_readlane_b32 s37, v23, 6 @@ -505,55 +381,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX10_3-NEXT: v_writelane_b32 v23, s37, 6 ; GFX10_3-NEXT: v_writelane_b32 v23, s38, 7 ; GFX10_3-NEXT: v_writelane_b32 v23, s39, 8 -; GFX10_3-NEXT: v_writelane_b32 v23, s40, 9 -; GFX10_3-NEXT: v_writelane_b32 v23, s41, 10 -; GFX10_3-NEXT: v_writelane_b32 v23, s42, 11 -; GFX10_3-NEXT: v_writelane_b32 v23, s43, 12 -; GFX10_3-NEXT: v_writelane_b32 v23, s44, 13 -; GFX10_3-NEXT: v_writelane_b32 v23, s45, 14 -; GFX10_3-NEXT: v_writelane_b32 v23, s46, 15 -; GFX10_3-NEXT: v_writelane_b32 v23, s47, 16 -; GFX10_3-NEXT: v_writelane_b32 v23, s48, 17 -; GFX10_3-NEXT: v_writelane_b32 v23, s49, 18 -; GFX10_3-NEXT: v_writelane_b32 v23, s50, 19 -; GFX10_3-NEXT: v_writelane_b32 v23, s51, 20 -; GFX10_3-NEXT: v_writelane_b32 v23, s52, 21 -; GFX10_3-NEXT: v_writelane_b32 v23, s53, 22 -; GFX10_3-NEXT: v_writelane_b32 v23, s54, 23 -; GFX10_3-NEXT: v_writelane_b32 v23, s55, 24 -; GFX10_3-NEXT: v_writelane_b32 v23, s56, 25 -; GFX10_3-NEXT: v_writelane_b32 v23, s57, 26 -; GFX10_3-NEXT: v_writelane_b32 v23, s58, 27 +; GFX10_3-NEXT: v_writelane_b32 v23, s48, 9 +; GFX10_3-NEXT: v_writelane_b32 v23, s49, 10 +; GFX10_3-NEXT: v_writelane_b32 v23, s50, 11 +; GFX10_3-NEXT: v_writelane_b32 v23, s51, 12 +; GFX10_3-NEXT: v_writelane_b32 v23, s52, 13 +; GFX10_3-NEXT: v_writelane_b32 v23, s53, 14 +; GFX10_3-NEXT: v_writelane_b32 v23, s54, 15 +; GFX10_3-NEXT: v_writelane_b32 v23, s55, 16 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc ; GFX10_3-NEXT: ;;#ASMEND ; GFX10_3-NEXT: v_lshrrev_b32_e64 v24, 5, s32 -; GFX10_3-NEXT: v_writelane_b32 v23, s59, 28 ; GFX10_3-NEXT: v_add_nc_u32_e32 v24, 0x4040, v24 ; GFX10_3-NEXT: v_readfirstlane_b32 s59, v24 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v23, 28 -; GFX10_3-NEXT: v_readlane_b32 s58, v23, 27 -; GFX10_3-NEXT: v_readlane_b32 s57, v23, 26 -; GFX10_3-NEXT: v_readlane_b32 s56, v23, 25 -; GFX10_3-NEXT: v_readlane_b32 s55, v23, 24 -; GFX10_3-NEXT: v_readlane_b32 s54, v23, 23 -; GFX10_3-NEXT: v_readlane_b32 s53, v23, 22 -; GFX10_3-NEXT: v_readlane_b32 s52, v23, 21 -; GFX10_3-NEXT: v_readlane_b32 s51, v23, 20 -; GFX10_3-NEXT: v_readlane_b32 s50, v23, 19 -; GFX10_3-NEXT: v_readlane_b32 s49, v23, 18 -; GFX10_3-NEXT: v_readlane_b32 s48, v23, 17 -; GFX10_3-NEXT: v_readlane_b32 s47, v23, 16 -; GFX10_3-NEXT: v_readlane_b32 s46, v23, 15 -; GFX10_3-NEXT: v_readlane_b32 s45, v23, 14 -; GFX10_3-NEXT: v_readlane_b32 s44, v23, 13 -; GFX10_3-NEXT: v_readlane_b32 s43, v23, 12 -; GFX10_3-NEXT: v_readlane_b32 s42, v23, 11 -; GFX10_3-NEXT: v_readlane_b32 s41, v23, 10 -; GFX10_3-NEXT: v_readlane_b32 s40, v23, 9 +; GFX10_3-NEXT: v_readlane_b32 s55, v23, 16 +; GFX10_3-NEXT: v_readlane_b32 s54, v23, 15 +; GFX10_3-NEXT: v_readlane_b32 s53, v23, 14 +; GFX10_3-NEXT: v_readlane_b32 s52, v23, 13 +; GFX10_3-NEXT: v_readlane_b32 s51, v23, 12 +; GFX10_3-NEXT: v_readlane_b32 s50, v23, 11 +; GFX10_3-NEXT: v_readlane_b32 s49, v23, 10 +; GFX10_3-NEXT: v_readlane_b32 s48, v23, 9 ; GFX10_3-NEXT: v_readlane_b32 s39, v23, 8 ; GFX10_3-NEXT: v_readlane_b32 s38, v23, 7 ; GFX10_3-NEXT: v_readlane_b32 s37, v23, 6 @@ -593,61 +445,33 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX11-NEXT: v_writelane_b32 v23, s37, 6 ; GFX11-NEXT: v_writelane_b32 v23, s38, 7 ; GFX11-NEXT: v_writelane_b32 v23, s39, 8 -; GFX11-NEXT: v_writelane_b32 v23, s40, 9 -; GFX11-NEXT: v_writelane_b32 v23, s41, 10 -; GFX11-NEXT: v_writelane_b32 v23, s42, 11 -; GFX11-NEXT: v_writelane_b32 v23, s43, 12 -; GFX11-NEXT: v_writelane_b32 v23, s44, 13 -; GFX11-NEXT: v_writelane_b32 v23, s45, 14 -; GFX11-NEXT: v_writelane_b32 v23, s46, 15 -; GFX11-NEXT: v_writelane_b32 v23, s47, 16 -; GFX11-NEXT: v_writelane_b32 v23, s48, 17 -; GFX11-NEXT: v_writelane_b32 v23, s49, 18 -; GFX11-NEXT: v_writelane_b32 v23, s50, 19 -; GFX11-NEXT: v_writelane_b32 v23, s51, 20 -; GFX11-NEXT: v_writelane_b32 v23, s52, 21 -; GFX11-NEXT: v_writelane_b32 v23, s53, 22 -; GFX11-NEXT: v_writelane_b32 v23, s54, 23 -; GFX11-NEXT: v_writelane_b32 v23, s55, 24 -; GFX11-NEXT: v_writelane_b32 v23, s56, 25 -; GFX11-NEXT: v_writelane_b32 v23, s57, 26 -; GFX11-NEXT: v_writelane_b32 v23, s58, 27 +; GFX11-NEXT: v_writelane_b32 v23, s48, 9 +; GFX11-NEXT: v_writelane_b32 v23, s49, 10 +; GFX11-NEXT: v_writelane_b32 v23, s50, 11 +; GFX11-NEXT: v_writelane_b32 v23, s51, 12 +; GFX11-NEXT: v_writelane_b32 v23, s52, 13 +; GFX11-NEXT: v_writelane_b32 v23, s53, 14 +; GFX11-NEXT: v_writelane_b32 v23, s54, 15 +; GFX11-NEXT: v_writelane_b32 v23, s55, 16 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_addc_u32 s32, s32, 0x4040 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_bitcmp1_b32 s32, 0 -; GFX11-NEXT: v_writelane_b32 v23, s59, 28 -; GFX11-NEXT: s_bitset0_b32 s32, 0 -; GFX11-NEXT: s_mov_b32 s59, s32 -; GFX11-NEXT: s_addc_u32 s32, s32, 0xffffbfc0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_bitcmp1_b32 s32, 0 -; GFX11-NEXT: s_bitset0_b32 s32, 0 +; GFX11-NEXT: s_addc_u32 s60, s32, 0x4040 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_bitcmp1_b32 s60, 0 +; GFX11-NEXT: s_bitset0_b32 s60, 0 +; GFX11-NEXT: s_mov_b32 s59, s60 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v23, 28 -; GFX11-NEXT: v_readlane_b32 s58, v23, 27 -; GFX11-NEXT: v_readlane_b32 s57, v23, 26 -; GFX11-NEXT: v_readlane_b32 s56, v23, 25 -; GFX11-NEXT: v_readlane_b32 s55, v23, 24 -; GFX11-NEXT: v_readlane_b32 s54, v23, 23 -; GFX11-NEXT: v_readlane_b32 s53, v23, 22 -; GFX11-NEXT: v_readlane_b32 s52, v23, 21 -; GFX11-NEXT: v_readlane_b32 s51, v23, 20 -; GFX11-NEXT: v_readlane_b32 s50, v23, 19 -; GFX11-NEXT: v_readlane_b32 s49, v23, 18 -; GFX11-NEXT: v_readlane_b32 s48, v23, 17 -; GFX11-NEXT: v_readlane_b32 s47, v23, 16 -; GFX11-NEXT: v_readlane_b32 s46, v23, 15 -; GFX11-NEXT: v_readlane_b32 s45, v23, 14 -; GFX11-NEXT: v_readlane_b32 s44, v23, 13 -; GFX11-NEXT: v_readlane_b32 s43, v23, 12 -; GFX11-NEXT: v_readlane_b32 s42, v23, 11 -; GFX11-NEXT: v_readlane_b32 s41, v23, 10 -; GFX11-NEXT: v_readlane_b32 s40, v23, 9 +; GFX11-NEXT: v_readlane_b32 s55, v23, 16 +; GFX11-NEXT: v_readlane_b32 s54, v23, 15 +; GFX11-NEXT: v_readlane_b32 s53, v23, 14 +; GFX11-NEXT: v_readlane_b32 s52, v23, 13 +; GFX11-NEXT: v_readlane_b32 s51, v23, 12 +; GFX11-NEXT: v_readlane_b32 s50, v23, 11 +; GFX11-NEXT: v_readlane_b32 s49, v23, 10 +; GFX11-NEXT: v_readlane_b32 s48, v23, 9 ; GFX11-NEXT: v_readlane_b32 s39, v23, 8 ; GFX11-NEXT: v_readlane_b32 s38, v23, 7 ; GFX11-NEXT: v_readlane_b32 s37, v23, 6 @@ -689,62 +513,34 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX12-NEXT: v_writelane_b32 v23, s37, 6 ; GFX12-NEXT: v_writelane_b32 v23, s38, 7 ; GFX12-NEXT: v_writelane_b32 v23, s39, 8 -; GFX12-NEXT: v_writelane_b32 v23, s40, 9 -; GFX12-NEXT: v_writelane_b32 v23, s41, 10 -; GFX12-NEXT: v_writelane_b32 v23, s42, 11 -; GFX12-NEXT: v_writelane_b32 v23, s43, 12 -; GFX12-NEXT: v_writelane_b32 v23, s44, 13 -; GFX12-NEXT: v_writelane_b32 v23, s45, 14 -; GFX12-NEXT: v_writelane_b32 v23, s46, 15 -; GFX12-NEXT: v_writelane_b32 v23, s47, 16 -; GFX12-NEXT: v_writelane_b32 v23, s48, 17 -; GFX12-NEXT: v_writelane_b32 v23, s49, 18 -; GFX12-NEXT: v_writelane_b32 v23, s50, 19 -; GFX12-NEXT: v_writelane_b32 v23, s51, 20 -; GFX12-NEXT: v_writelane_b32 v23, s52, 21 -; GFX12-NEXT: v_writelane_b32 v23, s53, 22 -; GFX12-NEXT: v_writelane_b32 v23, s54, 23 -; GFX12-NEXT: v_writelane_b32 v23, s55, 24 -; GFX12-NEXT: v_writelane_b32 v23, s56, 25 -; GFX12-NEXT: v_writelane_b32 v23, s57, 26 -; GFX12-NEXT: v_writelane_b32 v23, s58, 27 +; GFX12-NEXT: v_writelane_b32 v23, s48, 9 +; GFX12-NEXT: v_writelane_b32 v23, s49, 10 +; GFX12-NEXT: v_writelane_b32 v23, s50, 11 +; GFX12-NEXT: v_writelane_b32 v23, s51, 12 +; GFX12-NEXT: v_writelane_b32 v23, s52, 13 +; GFX12-NEXT: v_writelane_b32 v23, s53, 14 +; GFX12-NEXT: v_writelane_b32 v23, s54, 15 +; GFX12-NEXT: v_writelane_b32 v23, s55, 16 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_add_co_ci_u32 s32, s32, 0x4000 +; GFX12-NEXT: s_add_co_ci_u32 s60, s32, 0x4000 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_bitcmp1_b32 s32, 0 -; GFX12-NEXT: v_writelane_b32 v23, s59, 28 -; GFX12-NEXT: s_bitset0_b32 s32, 0 +; GFX12-NEXT: s_bitcmp1_b32 s60, 0 +; GFX12-NEXT: s_bitset0_b32 s60, 0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 s59, s32 -; GFX12-NEXT: s_add_co_ci_u32 s32, s32, 0xffffc000 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_bitcmp1_b32 s32, 0 -; GFX12-NEXT: s_bitset0_b32 s32, 0 +; GFX12-NEXT: s_mov_b32 s59, s60 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s59, v23, 28 -; GFX12-NEXT: v_readlane_b32 s58, v23, 27 -; GFX12-NEXT: v_readlane_b32 s57, v23, 26 -; GFX12-NEXT: v_readlane_b32 s56, v23, 25 -; GFX12-NEXT: v_readlane_b32 s55, v23, 24 -; GFX12-NEXT: v_readlane_b32 s54, v23, 23 -; GFX12-NEXT: v_readlane_b32 s53, v23, 22 -; GFX12-NEXT: v_readlane_b32 s52, v23, 21 -; GFX12-NEXT: v_readlane_b32 s51, v23, 20 -; GFX12-NEXT: v_readlane_b32 s50, v23, 19 -; GFX12-NEXT: v_readlane_b32 s49, v23, 18 -; GFX12-NEXT: v_readlane_b32 s48, v23, 17 -; GFX12-NEXT: v_readlane_b32 s47, v23, 16 -; GFX12-NEXT: v_readlane_b32 s46, v23, 15 -; GFX12-NEXT: v_readlane_b32 s45, v23, 14 -; GFX12-NEXT: v_readlane_b32 s44, v23, 13 -; GFX12-NEXT: v_readlane_b32 s43, v23, 12 -; GFX12-NEXT: v_readlane_b32 s42, v23, 11 -; GFX12-NEXT: v_readlane_b32 s41, v23, 10 -; GFX12-NEXT: v_readlane_b32 s40, v23, 9 +; GFX12-NEXT: v_readlane_b32 s55, v23, 16 +; GFX12-NEXT: v_readlane_b32 s54, v23, 15 +; GFX12-NEXT: v_readlane_b32 s53, v23, 14 +; GFX12-NEXT: v_readlane_b32 s52, v23, 13 +; GFX12-NEXT: v_readlane_b32 s51, v23, 12 +; GFX12-NEXT: v_readlane_b32 s50, v23, 11 +; GFX12-NEXT: v_readlane_b32 s49, v23, 10 +; GFX12-NEXT: v_readlane_b32 s48, v23, 9 ; GFX12-NEXT: v_readlane_b32 s39, v23, 8 ; GFX12-NEXT: v_readlane_b32 s38, v23, 7 ; GFX12-NEXT: v_readlane_b32 s37, v23, 6 @@ -819,56 +615,32 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX7-NEXT: v_writelane_b32 v21, s37, 6 ; GFX7-NEXT: v_writelane_b32 v21, s38, 7 ; GFX7-NEXT: v_writelane_b32 v21, s39, 8 -; GFX7-NEXT: v_writelane_b32 v21, s40, 9 -; GFX7-NEXT: v_writelane_b32 v21, s41, 10 -; GFX7-NEXT: v_writelane_b32 v21, s42, 11 -; GFX7-NEXT: v_writelane_b32 v21, s43, 12 -; GFX7-NEXT: v_writelane_b32 v21, s44, 13 -; GFX7-NEXT: v_writelane_b32 v21, s45, 14 -; GFX7-NEXT: v_writelane_b32 v21, s46, 15 -; GFX7-NEXT: v_writelane_b32 v21, s47, 16 -; GFX7-NEXT: v_writelane_b32 v21, s48, 17 -; GFX7-NEXT: v_writelane_b32 v21, s49, 18 -; GFX7-NEXT: v_writelane_b32 v21, s50, 19 -; GFX7-NEXT: v_writelane_b32 v21, s51, 20 -; GFX7-NEXT: v_writelane_b32 v21, s52, 21 -; GFX7-NEXT: v_writelane_b32 v21, s53, 22 -; GFX7-NEXT: v_writelane_b32 v21, s54, 23 -; GFX7-NEXT: v_writelane_b32 v21, s55, 24 -; GFX7-NEXT: v_writelane_b32 v21, s56, 25 -; GFX7-NEXT: v_writelane_b32 v21, s57, 26 +; GFX7-NEXT: v_writelane_b32 v21, s48, 9 +; GFX7-NEXT: v_writelane_b32 v21, s49, 10 +; GFX7-NEXT: v_writelane_b32 v21, s50, 11 +; GFX7-NEXT: v_writelane_b32 v21, s51, 12 +; GFX7-NEXT: v_writelane_b32 v21, s52, 13 +; GFX7-NEXT: v_writelane_b32 v21, s53, 14 +; GFX7-NEXT: v_writelane_b32 v21, s54, 15 ; GFX7-NEXT: s_and_b64 s[4:5], 0, exec -; GFX7-NEXT: v_writelane_b32 v21, s58, 27 +; GFX7-NEXT: v_writelane_b32 v21, s55, 16 ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX7-NEXT: ;;#ASMEND ; GFX7-NEXT: v_mad_u32_u24 v22, 16, 64, s32 ; GFX7-NEXT: v_lshrrev_b32_e32 v22, 6, v22 -; GFX7-NEXT: v_writelane_b32 v21, s59, 28 ; GFX7-NEXT: v_readfirstlane_b32 s59, v22 ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc ; GFX7-NEXT: ;;#ASMEND -; GFX7-NEXT: v_readlane_b32 s59, v21, 28 -; GFX7-NEXT: v_readlane_b32 s58, v21, 27 -; GFX7-NEXT: v_readlane_b32 s57, v21, 26 -; GFX7-NEXT: v_readlane_b32 s56, v21, 25 -; GFX7-NEXT: v_readlane_b32 s55, v21, 24 -; GFX7-NEXT: v_readlane_b32 s54, v21, 23 -; GFX7-NEXT: v_readlane_b32 s53, v21, 22 -; GFX7-NEXT: v_readlane_b32 s52, v21, 21 -; GFX7-NEXT: v_readlane_b32 s51, v21, 20 -; GFX7-NEXT: v_readlane_b32 s50, v21, 19 -; GFX7-NEXT: v_readlane_b32 s49, v21, 18 -; GFX7-NEXT: v_readlane_b32 s48, v21, 17 -; GFX7-NEXT: v_readlane_b32 s47, v21, 16 -; GFX7-NEXT: v_readlane_b32 s46, v21, 15 -; GFX7-NEXT: v_readlane_b32 s45, v21, 14 -; GFX7-NEXT: v_readlane_b32 s44, v21, 13 -; GFX7-NEXT: v_readlane_b32 s43, v21, 12 -; GFX7-NEXT: v_readlane_b32 s42, v21, 11 -; GFX7-NEXT: v_readlane_b32 s41, v21, 10 -; GFX7-NEXT: v_readlane_b32 s40, v21, 9 +; GFX7-NEXT: v_readlane_b32 s55, v21, 16 +; GFX7-NEXT: v_readlane_b32 s54, v21, 15 +; GFX7-NEXT: v_readlane_b32 s53, v21, 14 +; GFX7-NEXT: v_readlane_b32 s52, v21, 13 +; GFX7-NEXT: v_readlane_b32 s51, v21, 12 +; GFX7-NEXT: v_readlane_b32 s50, v21, 11 +; GFX7-NEXT: v_readlane_b32 s49, v21, 10 +; GFX7-NEXT: v_readlane_b32 s48, v21, 9 ; GFX7-NEXT: v_readlane_b32 s39, v21, 8 ; GFX7-NEXT: v_readlane_b32 s38, v21, 7 ; GFX7-NEXT: v_readlane_b32 s37, v21, 6 @@ -901,56 +673,32 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX8-NEXT: v_writelane_b32 v21, s37, 6 ; GFX8-NEXT: v_writelane_b32 v21, s38, 7 ; GFX8-NEXT: v_writelane_b32 v21, s39, 8 -; GFX8-NEXT: v_writelane_b32 v21, s40, 9 -; GFX8-NEXT: v_writelane_b32 v21, s41, 10 -; GFX8-NEXT: v_writelane_b32 v21, s42, 11 -; GFX8-NEXT: v_writelane_b32 v21, s43, 12 -; GFX8-NEXT: v_writelane_b32 v21, s44, 13 -; GFX8-NEXT: v_writelane_b32 v21, s45, 14 -; GFX8-NEXT: v_writelane_b32 v21, s46, 15 -; GFX8-NEXT: v_writelane_b32 v21, s47, 16 -; GFX8-NEXT: v_writelane_b32 v21, s48, 17 -; GFX8-NEXT: v_writelane_b32 v21, s49, 18 -; GFX8-NEXT: v_writelane_b32 v21, s50, 19 -; GFX8-NEXT: v_writelane_b32 v21, s51, 20 -; GFX8-NEXT: v_writelane_b32 v21, s52, 21 -; GFX8-NEXT: v_writelane_b32 v21, s53, 22 -; GFX8-NEXT: v_writelane_b32 v21, s54, 23 -; GFX8-NEXT: v_writelane_b32 v21, s55, 24 -; GFX8-NEXT: v_writelane_b32 v21, s56, 25 -; GFX8-NEXT: v_writelane_b32 v21, s57, 26 +; GFX8-NEXT: v_writelane_b32 v21, s48, 9 +; GFX8-NEXT: v_writelane_b32 v21, s49, 10 +; GFX8-NEXT: v_writelane_b32 v21, s50, 11 +; GFX8-NEXT: v_writelane_b32 v21, s51, 12 +; GFX8-NEXT: v_writelane_b32 v21, s52, 13 +; GFX8-NEXT: v_writelane_b32 v21, s53, 14 +; GFX8-NEXT: v_writelane_b32 v21, s54, 15 ; GFX8-NEXT: s_and_b64 s[4:5], 0, exec -; GFX8-NEXT: v_writelane_b32 v21, s58, 27 +; GFX8-NEXT: v_writelane_b32 v21, s55, 16 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: v_mad_u32_u24 v22, 16, 64, s32 ; GFX8-NEXT: v_lshrrev_b32_e32 v22, 6, v22 -; GFX8-NEXT: v_writelane_b32 v21, s59, 28 ; GFX8-NEXT: v_readfirstlane_b32 s59, v22 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v21, 28 -; GFX8-NEXT: v_readlane_b32 s58, v21, 27 -; GFX8-NEXT: v_readlane_b32 s57, v21, 26 -; GFX8-NEXT: v_readlane_b32 s56, v21, 25 -; GFX8-NEXT: v_readlane_b32 s55, v21, 24 -; GFX8-NEXT: v_readlane_b32 s54, v21, 23 -; GFX8-NEXT: v_readlane_b32 s53, v21, 22 -; GFX8-NEXT: v_readlane_b32 s52, v21, 21 -; GFX8-NEXT: v_readlane_b32 s51, v21, 20 -; GFX8-NEXT: v_readlane_b32 s50, v21, 19 -; GFX8-NEXT: v_readlane_b32 s49, v21, 18 -; GFX8-NEXT: v_readlane_b32 s48, v21, 17 -; GFX8-NEXT: v_readlane_b32 s47, v21, 16 -; GFX8-NEXT: v_readlane_b32 s46, v21, 15 -; GFX8-NEXT: v_readlane_b32 s45, v21, 14 -; GFX8-NEXT: v_readlane_b32 s44, v21, 13 -; GFX8-NEXT: v_readlane_b32 s43, v21, 12 -; GFX8-NEXT: v_readlane_b32 s42, v21, 11 -; GFX8-NEXT: v_readlane_b32 s41, v21, 10 -; GFX8-NEXT: v_readlane_b32 s40, v21, 9 +; GFX8-NEXT: v_readlane_b32 s55, v21, 16 +; GFX8-NEXT: v_readlane_b32 s54, v21, 15 +; GFX8-NEXT: v_readlane_b32 s53, v21, 14 +; GFX8-NEXT: v_readlane_b32 s52, v21, 13 +; GFX8-NEXT: v_readlane_b32 s51, v21, 12 +; GFX8-NEXT: v_readlane_b32 s50, v21, 11 +; GFX8-NEXT: v_readlane_b32 s49, v21, 10 +; GFX8-NEXT: v_readlane_b32 s48, v21, 9 ; GFX8-NEXT: v_readlane_b32 s39, v21, 8 ; GFX8-NEXT: v_readlane_b32 s38, v21, 7 ; GFX8-NEXT: v_readlane_b32 s37, v21, 6 @@ -983,56 +731,32 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX900-NEXT: v_writelane_b32 v21, s37, 6 ; GFX900-NEXT: v_writelane_b32 v21, s38, 7 ; GFX900-NEXT: v_writelane_b32 v21, s39, 8 -; GFX900-NEXT: v_writelane_b32 v21, s40, 9 -; GFX900-NEXT: v_writelane_b32 v21, s41, 10 -; GFX900-NEXT: v_writelane_b32 v21, s42, 11 -; GFX900-NEXT: v_writelane_b32 v21, s43, 12 -; GFX900-NEXT: v_writelane_b32 v21, s44, 13 -; GFX900-NEXT: v_writelane_b32 v21, s45, 14 -; GFX900-NEXT: v_writelane_b32 v21, s46, 15 -; GFX900-NEXT: v_writelane_b32 v21, s47, 16 -; GFX900-NEXT: v_writelane_b32 v21, s48, 17 -; GFX900-NEXT: v_writelane_b32 v21, s49, 18 -; GFX900-NEXT: v_writelane_b32 v21, s50, 19 -; GFX900-NEXT: v_writelane_b32 v21, s51, 20 -; GFX900-NEXT: v_writelane_b32 v21, s52, 21 -; GFX900-NEXT: v_writelane_b32 v21, s53, 22 -; GFX900-NEXT: v_writelane_b32 v21, s54, 23 -; GFX900-NEXT: v_writelane_b32 v21, s55, 24 -; GFX900-NEXT: v_writelane_b32 v21, s56, 25 -; GFX900-NEXT: v_writelane_b32 v21, s57, 26 +; GFX900-NEXT: v_writelane_b32 v21, s48, 9 +; GFX900-NEXT: v_writelane_b32 v21, s49, 10 +; GFX900-NEXT: v_writelane_b32 v21, s50, 11 +; GFX900-NEXT: v_writelane_b32 v21, s51, 12 +; GFX900-NEXT: v_writelane_b32 v21, s52, 13 +; GFX900-NEXT: v_writelane_b32 v21, s53, 14 +; GFX900-NEXT: v_writelane_b32 v21, s54, 15 ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec -; GFX900-NEXT: v_writelane_b32 v21, s58, 27 +; GFX900-NEXT: v_writelane_b32 v21, s55, 16 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_lshrrev_b32_e64 v22, 6, s32 ; GFX900-NEXT: v_add_u32_e32 v22, 16, v22 -; GFX900-NEXT: v_writelane_b32 v21, s59, 28 ; GFX900-NEXT: v_readfirstlane_b32 s59, v22 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v21, 28 -; GFX900-NEXT: v_readlane_b32 s58, v21, 27 -; GFX900-NEXT: v_readlane_b32 s57, v21, 26 -; GFX900-NEXT: v_readlane_b32 s56, v21, 25 -; GFX900-NEXT: v_readlane_b32 s55, v21, 24 -; GFX900-NEXT: v_readlane_b32 s54, v21, 23 -; GFX900-NEXT: v_readlane_b32 s53, v21, 22 -; GFX900-NEXT: v_readlane_b32 s52, v21, 21 -; GFX900-NEXT: v_readlane_b32 s51, v21, 20 -; GFX900-NEXT: v_readlane_b32 s50, v21, 19 -; GFX900-NEXT: v_readlane_b32 s49, v21, 18 -; GFX900-NEXT: v_readlane_b32 s48, v21, 17 -; GFX900-NEXT: v_readlane_b32 s47, v21, 16 -; GFX900-NEXT: v_readlane_b32 s46, v21, 15 -; GFX900-NEXT: v_readlane_b32 s45, v21, 14 -; GFX900-NEXT: v_readlane_b32 s44, v21, 13 -; GFX900-NEXT: v_readlane_b32 s43, v21, 12 -; GFX900-NEXT: v_readlane_b32 s42, v21, 11 -; GFX900-NEXT: v_readlane_b32 s41, v21, 10 -; GFX900-NEXT: v_readlane_b32 s40, v21, 9 +; GFX900-NEXT: v_readlane_b32 s55, v21, 16 +; GFX900-NEXT: v_readlane_b32 s54, v21, 15 +; GFX900-NEXT: v_readlane_b32 s53, v21, 14 +; GFX900-NEXT: v_readlane_b32 s52, v21, 13 +; GFX900-NEXT: v_readlane_b32 s51, v21, 12 +; GFX900-NEXT: v_readlane_b32 s50, v21, 11 +; GFX900-NEXT: v_readlane_b32 s49, v21, 10 +; GFX900-NEXT: v_readlane_b32 s48, v21, 9 ; GFX900-NEXT: v_readlane_b32 s39, v21, 8 ; GFX900-NEXT: v_readlane_b32 s38, v21, 7 ; GFX900-NEXT: v_readlane_b32 s37, v21, 6 @@ -1065,29 +789,15 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX942-NEXT: v_writelane_b32 v21, s37, 6 ; GFX942-NEXT: v_writelane_b32 v21, s38, 7 ; GFX942-NEXT: v_writelane_b32 v21, s39, 8 -; GFX942-NEXT: v_writelane_b32 v21, s40, 9 -; GFX942-NEXT: v_writelane_b32 v21, s41, 10 -; GFX942-NEXT: v_writelane_b32 v21, s42, 11 -; GFX942-NEXT: v_writelane_b32 v21, s43, 12 -; GFX942-NEXT: v_writelane_b32 v21, s44, 13 -; GFX942-NEXT: v_writelane_b32 v21, s45, 14 -; GFX942-NEXT: v_writelane_b32 v21, s46, 15 -; GFX942-NEXT: v_writelane_b32 v21, s47, 16 -; GFX942-NEXT: v_writelane_b32 v21, s48, 17 -; GFX942-NEXT: v_writelane_b32 v21, s49, 18 -; GFX942-NEXT: v_writelane_b32 v21, s50, 19 -; GFX942-NEXT: v_writelane_b32 v21, s51, 20 -; GFX942-NEXT: v_writelane_b32 v21, s52, 21 -; GFX942-NEXT: v_writelane_b32 v21, s53, 22 -; GFX942-NEXT: v_writelane_b32 v21, s54, 23 -; GFX942-NEXT: v_writelane_b32 v21, s55, 24 -; GFX942-NEXT: v_writelane_b32 v21, s56, 25 -; GFX942-NEXT: v_writelane_b32 v21, s57, 26 -; GFX942-NEXT: v_writelane_b32 v21, s58, 27 -; GFX942-NEXT: v_writelane_b32 v21, s59, 28 -; GFX942-NEXT: v_writelane_b32 v21, s60, 29 -; GFX942-NEXT: v_writelane_b32 v21, s61, 30 +; GFX942-NEXT: v_writelane_b32 v21, s48, 9 +; GFX942-NEXT: v_writelane_b32 v21, s49, 10 +; GFX942-NEXT: v_writelane_b32 v21, s50, 11 +; GFX942-NEXT: v_writelane_b32 v21, s51, 12 +; GFX942-NEXT: v_writelane_b32 v21, s52, 13 +; GFX942-NEXT: v_writelane_b32 v21, s53, 14 +; GFX942-NEXT: v_writelane_b32 v21, s54, 15 ; GFX942-NEXT: s_and_b64 s[60:61], 0, exec +; GFX942-NEXT: v_writelane_b32 v21, s55, 16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX942-NEXT: ;;#ASMEND @@ -1098,28 +808,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_readlane_b32 s61, v21, 30 -; GFX942-NEXT: v_readlane_b32 s60, v21, 29 -; GFX942-NEXT: v_readlane_b32 s59, v21, 28 -; GFX942-NEXT: v_readlane_b32 s58, v21, 27 -; GFX942-NEXT: v_readlane_b32 s57, v21, 26 -; GFX942-NEXT: v_readlane_b32 s56, v21, 25 -; GFX942-NEXT: v_readlane_b32 s55, v21, 24 -; GFX942-NEXT: v_readlane_b32 s54, v21, 23 -; GFX942-NEXT: v_readlane_b32 s53, v21, 22 -; GFX942-NEXT: v_readlane_b32 s52, v21, 21 -; GFX942-NEXT: v_readlane_b32 s51, v21, 20 -; GFX942-NEXT: v_readlane_b32 s50, v21, 19 -; GFX942-NEXT: v_readlane_b32 s49, v21, 18 -; GFX942-NEXT: v_readlane_b32 s48, v21, 17 -; GFX942-NEXT: v_readlane_b32 s47, v21, 16 -; GFX942-NEXT: v_readlane_b32 s46, v21, 15 -; GFX942-NEXT: v_readlane_b32 s45, v21, 14 -; GFX942-NEXT: v_readlane_b32 s44, v21, 13 -; GFX942-NEXT: v_readlane_b32 s43, v21, 12 -; GFX942-NEXT: v_readlane_b32 s42, v21, 11 -; GFX942-NEXT: v_readlane_b32 s41, v21, 10 -; GFX942-NEXT: v_readlane_b32 s40, v21, 9 +; GFX942-NEXT: v_readlane_b32 s55, v21, 16 +; GFX942-NEXT: v_readlane_b32 s54, v21, 15 +; GFX942-NEXT: v_readlane_b32 s53, v21, 14 +; GFX942-NEXT: v_readlane_b32 s52, v21, 13 +; GFX942-NEXT: v_readlane_b32 s51, v21, 12 +; GFX942-NEXT: v_readlane_b32 s50, v21, 11 +; GFX942-NEXT: v_readlane_b32 s49, v21, 10 +; GFX942-NEXT: v_readlane_b32 s48, v21, 9 ; GFX942-NEXT: v_readlane_b32 s39, v21, 8 ; GFX942-NEXT: v_readlane_b32 s38, v21, 7 ; GFX942-NEXT: v_readlane_b32 s37, v21, 6 @@ -1145,6 +841,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_1-NEXT: v_writelane_b32 v21, s30, 0 +; GFX10_1-NEXT: s_and_b32 s59, 0, exec_lo ; GFX10_1-NEXT: v_writelane_b32 v21, s31, 1 ; GFX10_1-NEXT: v_writelane_b32 v21, s33, 2 ; GFX10_1-NEXT: v_writelane_b32 v21, s34, 3 @@ -1153,56 +850,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX10_1-NEXT: v_writelane_b32 v21, s37, 6 ; GFX10_1-NEXT: v_writelane_b32 v21, s38, 7 ; GFX10_1-NEXT: v_writelane_b32 v21, s39, 8 -; GFX10_1-NEXT: v_writelane_b32 v21, s40, 9 -; GFX10_1-NEXT: v_writelane_b32 v21, s41, 10 -; GFX10_1-NEXT: v_writelane_b32 v21, s42, 11 -; GFX10_1-NEXT: v_writelane_b32 v21, s43, 12 -; GFX10_1-NEXT: v_writelane_b32 v21, s44, 13 -; GFX10_1-NEXT: v_writelane_b32 v21, s45, 14 -; GFX10_1-NEXT: v_writelane_b32 v21, s46, 15 -; GFX10_1-NEXT: v_writelane_b32 v21, s47, 16 -; GFX10_1-NEXT: v_writelane_b32 v21, s48, 17 -; GFX10_1-NEXT: v_writelane_b32 v21, s49, 18 -; GFX10_1-NEXT: v_writelane_b32 v21, s50, 19 -; GFX10_1-NEXT: v_writelane_b32 v21, s51, 20 -; GFX10_1-NEXT: v_writelane_b32 v21, s52, 21 -; GFX10_1-NEXT: v_writelane_b32 v21, s53, 22 -; GFX10_1-NEXT: v_writelane_b32 v21, s54, 23 -; GFX10_1-NEXT: v_writelane_b32 v21, s55, 24 -; GFX10_1-NEXT: v_writelane_b32 v21, s56, 25 -; GFX10_1-NEXT: v_writelane_b32 v21, s57, 26 -; GFX10_1-NEXT: v_writelane_b32 v21, s58, 27 +; GFX10_1-NEXT: v_writelane_b32 v21, s48, 9 +; GFX10_1-NEXT: v_writelane_b32 v21, s49, 10 +; GFX10_1-NEXT: v_writelane_b32 v21, s50, 11 +; GFX10_1-NEXT: v_writelane_b32 v21, s51, 12 +; GFX10_1-NEXT: v_writelane_b32 v21, s52, 13 +; GFX10_1-NEXT: v_writelane_b32 v21, s53, 14 +; GFX10_1-NEXT: v_writelane_b32 v21, s54, 15 +; GFX10_1-NEXT: v_writelane_b32 v21, s55, 16 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX10_1-NEXT: ;;#ASMEND ; GFX10_1-NEXT: v_lshrrev_b32_e64 v22, 5, s32 -; GFX10_1-NEXT: v_writelane_b32 v21, s59, 28 -; GFX10_1-NEXT: s_and_b32 s59, 0, exec_lo ; GFX10_1-NEXT: v_add_nc_u32_e32 v22, 16, v22 ; GFX10_1-NEXT: v_readfirstlane_b32 s59, v22 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v21, 28 -; GFX10_1-NEXT: v_readlane_b32 s58, v21, 27 -; GFX10_1-NEXT: v_readlane_b32 s57, v21, 26 -; GFX10_1-NEXT: v_readlane_b32 s56, v21, 25 -; GFX10_1-NEXT: v_readlane_b32 s55, v21, 24 -; GFX10_1-NEXT: v_readlane_b32 s54, v21, 23 -; GFX10_1-NEXT: v_readlane_b32 s53, v21, 22 -; GFX10_1-NEXT: v_readlane_b32 s52, v21, 21 -; GFX10_1-NEXT: v_readlane_b32 s51, v21, 20 -; GFX10_1-NEXT: v_readlane_b32 s50, v21, 19 -; GFX10_1-NEXT: v_readlane_b32 s49, v21, 18 -; GFX10_1-NEXT: v_readlane_b32 s48, v21, 17 -; GFX10_1-NEXT: v_readlane_b32 s47, v21, 16 -; GFX10_1-NEXT: v_readlane_b32 s46, v21, 15 -; GFX10_1-NEXT: v_readlane_b32 s45, v21, 14 -; GFX10_1-NEXT: v_readlane_b32 s44, v21, 13 -; GFX10_1-NEXT: v_readlane_b32 s43, v21, 12 -; GFX10_1-NEXT: v_readlane_b32 s42, v21, 11 -; GFX10_1-NEXT: v_readlane_b32 s41, v21, 10 -; GFX10_1-NEXT: v_readlane_b32 s40, v21, 9 +; GFX10_1-NEXT: v_readlane_b32 s55, v21, 16 +; GFX10_1-NEXT: v_readlane_b32 s54, v21, 15 +; GFX10_1-NEXT: v_readlane_b32 s53, v21, 14 +; GFX10_1-NEXT: v_readlane_b32 s52, v21, 13 +; GFX10_1-NEXT: v_readlane_b32 s51, v21, 12 +; GFX10_1-NEXT: v_readlane_b32 s50, v21, 11 +; GFX10_1-NEXT: v_readlane_b32 s49, v21, 10 +; GFX10_1-NEXT: v_readlane_b32 s48, v21, 9 ; GFX10_1-NEXT: v_readlane_b32 s39, v21, 8 ; GFX10_1-NEXT: v_readlane_b32 s38, v21, 7 ; GFX10_1-NEXT: v_readlane_b32 s37, v21, 6 @@ -1228,6 +900,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX10_3-NEXT: buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_3-NEXT: v_writelane_b32 v21, s30, 0 +; GFX10_3-NEXT: s_and_b32 s59, 0, exec_lo ; GFX10_3-NEXT: v_writelane_b32 v21, s31, 1 ; GFX10_3-NEXT: v_writelane_b32 v21, s33, 2 ; GFX10_3-NEXT: v_writelane_b32 v21, s34, 3 @@ -1236,56 +909,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX10_3-NEXT: v_writelane_b32 v21, s37, 6 ; GFX10_3-NEXT: v_writelane_b32 v21, s38, 7 ; GFX10_3-NEXT: v_writelane_b32 v21, s39, 8 -; GFX10_3-NEXT: v_writelane_b32 v21, s40, 9 -; GFX10_3-NEXT: v_writelane_b32 v21, s41, 10 -; GFX10_3-NEXT: v_writelane_b32 v21, s42, 11 -; GFX10_3-NEXT: v_writelane_b32 v21, s43, 12 -; GFX10_3-NEXT: v_writelane_b32 v21, s44, 13 -; GFX10_3-NEXT: v_writelane_b32 v21, s45, 14 -; GFX10_3-NEXT: v_writelane_b32 v21, s46, 15 -; GFX10_3-NEXT: v_writelane_b32 v21, s47, 16 -; GFX10_3-NEXT: v_writelane_b32 v21, s48, 17 -; GFX10_3-NEXT: v_writelane_b32 v21, s49, 18 -; GFX10_3-NEXT: v_writelane_b32 v21, s50, 19 -; GFX10_3-NEXT: v_writelane_b32 v21, s51, 20 -; GFX10_3-NEXT: v_writelane_b32 v21, s52, 21 -; GFX10_3-NEXT: v_writelane_b32 v21, s53, 22 -; GFX10_3-NEXT: v_writelane_b32 v21, s54, 23 -; GFX10_3-NEXT: v_writelane_b32 v21, s55, 24 -; GFX10_3-NEXT: v_writelane_b32 v21, s56, 25 -; GFX10_3-NEXT: v_writelane_b32 v21, s57, 26 -; GFX10_3-NEXT: v_writelane_b32 v21, s58, 27 +; GFX10_3-NEXT: v_writelane_b32 v21, s48, 9 +; GFX10_3-NEXT: v_writelane_b32 v21, s49, 10 +; GFX10_3-NEXT: v_writelane_b32 v21, s50, 11 +; GFX10_3-NEXT: v_writelane_b32 v21, s51, 12 +; GFX10_3-NEXT: v_writelane_b32 v21, s52, 13 +; GFX10_3-NEXT: v_writelane_b32 v21, s53, 14 +; GFX10_3-NEXT: v_writelane_b32 v21, s54, 15 +; GFX10_3-NEXT: v_writelane_b32 v21, s55, 16 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX10_3-NEXT: ;;#ASMEND ; GFX10_3-NEXT: v_lshrrev_b32_e64 v22, 5, s32 -; GFX10_3-NEXT: v_writelane_b32 v21, s59, 28 -; GFX10_3-NEXT: s_and_b32 s59, 0, exec_lo ; GFX10_3-NEXT: v_add_nc_u32_e32 v22, 16, v22 ; GFX10_3-NEXT: v_readfirstlane_b32 s59, v22 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v21, 28 -; GFX10_3-NEXT: v_readlane_b32 s58, v21, 27 -; GFX10_3-NEXT: v_readlane_b32 s57, v21, 26 -; GFX10_3-NEXT: v_readlane_b32 s56, v21, 25 -; GFX10_3-NEXT: v_readlane_b32 s55, v21, 24 -; GFX10_3-NEXT: v_readlane_b32 s54, v21, 23 -; GFX10_3-NEXT: v_readlane_b32 s53, v21, 22 -; GFX10_3-NEXT: v_readlane_b32 s52, v21, 21 -; GFX10_3-NEXT: v_readlane_b32 s51, v21, 20 -; GFX10_3-NEXT: v_readlane_b32 s50, v21, 19 -; GFX10_3-NEXT: v_readlane_b32 s49, v21, 18 -; GFX10_3-NEXT: v_readlane_b32 s48, v21, 17 -; GFX10_3-NEXT: v_readlane_b32 s47, v21, 16 -; GFX10_3-NEXT: v_readlane_b32 s46, v21, 15 -; GFX10_3-NEXT: v_readlane_b32 s45, v21, 14 -; GFX10_3-NEXT: v_readlane_b32 s44, v21, 13 -; GFX10_3-NEXT: v_readlane_b32 s43, v21, 12 -; GFX10_3-NEXT: v_readlane_b32 s42, v21, 11 -; GFX10_3-NEXT: v_readlane_b32 s41, v21, 10 -; GFX10_3-NEXT: v_readlane_b32 s40, v21, 9 +; GFX10_3-NEXT: v_readlane_b32 s55, v21, 16 +; GFX10_3-NEXT: v_readlane_b32 s54, v21, 15 +; GFX10_3-NEXT: v_readlane_b32 s53, v21, 14 +; GFX10_3-NEXT: v_readlane_b32 s52, v21, 13 +; GFX10_3-NEXT: v_readlane_b32 s51, v21, 12 +; GFX10_3-NEXT: v_readlane_b32 s50, v21, 11 +; GFX10_3-NEXT: v_readlane_b32 s49, v21, 10 +; GFX10_3-NEXT: v_readlane_b32 s48, v21, 9 ; GFX10_3-NEXT: v_readlane_b32 s39, v21, 8 ; GFX10_3-NEXT: v_readlane_b32 s38, v21, 7 ; GFX10_3-NEXT: v_readlane_b32 s37, v21, 6 @@ -1310,6 +958,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX11-NEXT: scratch_store_b32 off, v21, s1 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: v_writelane_b32 v21, s30, 0 +; GFX11-NEXT: s_and_b32 s59, 0, exec_lo ; GFX11-NEXT: v_writelane_b32 v21, s31, 1 ; GFX11-NEXT: v_writelane_b32 v21, s33, 2 ; GFX11-NEXT: v_writelane_b32 v21, s34, 3 @@ -1318,62 +967,33 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX11-NEXT: v_writelane_b32 v21, s37, 6 ; GFX11-NEXT: v_writelane_b32 v21, s38, 7 ; GFX11-NEXT: v_writelane_b32 v21, s39, 8 -; GFX11-NEXT: v_writelane_b32 v21, s40, 9 -; GFX11-NEXT: v_writelane_b32 v21, s41, 10 -; GFX11-NEXT: v_writelane_b32 v21, s42, 11 -; GFX11-NEXT: v_writelane_b32 v21, s43, 12 -; GFX11-NEXT: v_writelane_b32 v21, s44, 13 -; GFX11-NEXT: v_writelane_b32 v21, s45, 14 -; GFX11-NEXT: v_writelane_b32 v21, s46, 15 -; GFX11-NEXT: v_writelane_b32 v21, s47, 16 -; GFX11-NEXT: v_writelane_b32 v21, s48, 17 -; GFX11-NEXT: v_writelane_b32 v21, s49, 18 -; GFX11-NEXT: v_writelane_b32 v21, s50, 19 -; GFX11-NEXT: v_writelane_b32 v21, s51, 20 -; GFX11-NEXT: v_writelane_b32 v21, s52, 21 -; GFX11-NEXT: v_writelane_b32 v21, s53, 22 -; GFX11-NEXT: v_writelane_b32 v21, s54, 23 -; GFX11-NEXT: v_writelane_b32 v21, s55, 24 -; GFX11-NEXT: v_writelane_b32 v21, s56, 25 -; GFX11-NEXT: v_writelane_b32 v21, s57, 26 -; GFX11-NEXT: v_writelane_b32 v21, s58, 27 +; GFX11-NEXT: v_writelane_b32 v21, s48, 9 +; GFX11-NEXT: v_writelane_b32 v21, s49, 10 +; GFX11-NEXT: v_writelane_b32 v21, s50, 11 +; GFX11-NEXT: v_writelane_b32 v21, s51, 12 +; GFX11-NEXT: v_writelane_b32 v21, s52, 13 +; GFX11-NEXT: v_writelane_b32 v21, s53, 14 +; GFX11-NEXT: v_writelane_b32 v21, s54, 15 +; GFX11-NEXT: v_writelane_b32 v21, s55, 16 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v21, s59, 28 -; GFX11-NEXT: s_and_b32 s59, 0, exec_lo -; GFX11-NEXT: s_addc_u32 s32, s32, 16 +; GFX11-NEXT: s_addc_u32 s60, s32, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_bitcmp1_b32 s32, 0 -; GFX11-NEXT: s_bitset0_b32 s32, 0 -; GFX11-NEXT: s_mov_b32 s59, s32 -; GFX11-NEXT: s_addc_u32 s32, s32, -16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_bitcmp1_b32 s32, 0 -; GFX11-NEXT: s_bitset0_b32 s32, 0 +; GFX11-NEXT: s_bitcmp1_b32 s60, 0 +; GFX11-NEXT: s_bitset0_b32 s60, 0 +; GFX11-NEXT: s_mov_b32 s59, s60 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v21, 28 -; GFX11-NEXT: v_readlane_b32 s58, v21, 27 -; GFX11-NEXT: v_readlane_b32 s57, v21, 26 -; GFX11-NEXT: v_readlane_b32 s56, v21, 25 -; GFX11-NEXT: v_readlane_b32 s55, v21, 24 -; GFX11-NEXT: v_readlane_b32 s54, v21, 23 -; GFX11-NEXT: v_readlane_b32 s53, v21, 22 -; GFX11-NEXT: v_readlane_b32 s52, v21, 21 -; GFX11-NEXT: v_readlane_b32 s51, v21, 20 -; GFX11-NEXT: v_readlane_b32 s50, v21, 19 -; GFX11-NEXT: v_readlane_b32 s49, v21, 18 -; GFX11-NEXT: v_readlane_b32 s48, v21, 17 -; GFX11-NEXT: v_readlane_b32 s47, v21, 16 -; GFX11-NEXT: v_readlane_b32 s46, v21, 15 -; GFX11-NEXT: v_readlane_b32 s45, v21, 14 -; GFX11-NEXT: v_readlane_b32 s44, v21, 13 -; GFX11-NEXT: v_readlane_b32 s43, v21, 12 -; GFX11-NEXT: v_readlane_b32 s42, v21, 11 -; GFX11-NEXT: v_readlane_b32 s41, v21, 10 -; GFX11-NEXT: v_readlane_b32 s40, v21, 9 +; GFX11-NEXT: v_readlane_b32 s55, v21, 16 +; GFX11-NEXT: v_readlane_b32 s54, v21, 15 +; GFX11-NEXT: v_readlane_b32 s53, v21, 14 +; GFX11-NEXT: v_readlane_b32 s52, v21, 13 +; GFX11-NEXT: v_readlane_b32 s51, v21, 12 +; GFX11-NEXT: v_readlane_b32 s50, v21, 11 +; GFX11-NEXT: v_readlane_b32 s49, v21, 10 +; GFX11-NEXT: v_readlane_b32 s48, v21, 9 ; GFX11-NEXT: v_readlane_b32 s39, v21, 8 ; GFX11-NEXT: v_readlane_b32 s38, v21, 7 ; GFX11-NEXT: v_readlane_b32 s37, v21, 6 @@ -1402,6 +1022,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v21, s30, 0 +; GFX12-NEXT: s_and_b32 s59, 0, exec_lo ; GFX12-NEXT: v_writelane_b32 v21, s31, 1 ; GFX12-NEXT: v_writelane_b32 v21, s33, 2 ; GFX12-NEXT: v_writelane_b32 v21, s34, 3 @@ -1410,55 +1031,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX12-NEXT: v_writelane_b32 v21, s37, 6 ; GFX12-NEXT: v_writelane_b32 v21, s38, 7 ; GFX12-NEXT: v_writelane_b32 v21, s39, 8 -; GFX12-NEXT: v_writelane_b32 v21, s40, 9 -; GFX12-NEXT: v_writelane_b32 v21, s41, 10 -; GFX12-NEXT: v_writelane_b32 v21, s42, 11 -; GFX12-NEXT: v_writelane_b32 v21, s43, 12 -; GFX12-NEXT: v_writelane_b32 v21, s44, 13 -; GFX12-NEXT: v_writelane_b32 v21, s45, 14 -; GFX12-NEXT: v_writelane_b32 v21, s46, 15 -; GFX12-NEXT: v_writelane_b32 v21, s47, 16 -; GFX12-NEXT: v_writelane_b32 v21, s48, 17 -; GFX12-NEXT: v_writelane_b32 v21, s49, 18 -; GFX12-NEXT: v_writelane_b32 v21, s50, 19 -; GFX12-NEXT: v_writelane_b32 v21, s51, 20 -; GFX12-NEXT: v_writelane_b32 v21, s52, 21 -; GFX12-NEXT: v_writelane_b32 v21, s53, 22 -; GFX12-NEXT: v_writelane_b32 v21, s54, 23 -; GFX12-NEXT: v_writelane_b32 v21, s55, 24 -; GFX12-NEXT: v_writelane_b32 v21, s56, 25 -; GFX12-NEXT: v_writelane_b32 v21, s57, 26 -; GFX12-NEXT: v_writelane_b32 v21, s58, 27 +; GFX12-NEXT: v_writelane_b32 v21, s48, 9 +; GFX12-NEXT: v_writelane_b32 v21, s49, 10 +; GFX12-NEXT: v_writelane_b32 v21, s50, 11 +; GFX12-NEXT: v_writelane_b32 v21, s51, 12 +; GFX12-NEXT: v_writelane_b32 v21, s52, 13 +; GFX12-NEXT: v_writelane_b32 v21, s53, 14 +; GFX12-NEXT: v_writelane_b32 v21, s54, 15 +; GFX12-NEXT: v_writelane_b32 v21, s55, 16 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_writelane_b32 v21, s59, 28 -; GFX12-NEXT: s_and_b32 s59, 0, exec_lo ; GFX12-NEXT: s_mov_b32 s59, s32 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_readlane_b32 s59, v21, 28 -; GFX12-NEXT: v_readlane_b32 s58, v21, 27 -; GFX12-NEXT: v_readlane_b32 s57, v21, 26 -; GFX12-NEXT: v_readlane_b32 s56, v21, 25 -; GFX12-NEXT: v_readlane_b32 s55, v21, 24 -; GFX12-NEXT: v_readlane_b32 s54, v21, 23 -; GFX12-NEXT: v_readlane_b32 s53, v21, 22 -; GFX12-NEXT: v_readlane_b32 s52, v21, 21 -; GFX12-NEXT: v_readlane_b32 s51, v21, 20 -; GFX12-NEXT: v_readlane_b32 s50, v21, 19 -; GFX12-NEXT: v_readlane_b32 s49, v21, 18 -; GFX12-NEXT: v_readlane_b32 s48, v21, 17 -; GFX12-NEXT: v_readlane_b32 s47, v21, 16 -; GFX12-NEXT: v_readlane_b32 s46, v21, 15 -; GFX12-NEXT: v_readlane_b32 s45, v21, 14 -; GFX12-NEXT: v_readlane_b32 s44, v21, 13 -; GFX12-NEXT: v_readlane_b32 s43, v21, 12 -; GFX12-NEXT: v_readlane_b32 s42, v21, 11 -; GFX12-NEXT: v_readlane_b32 s41, v21, 10 -; GFX12-NEXT: v_readlane_b32 s40, v21, 9 +; GFX12-NEXT: v_readlane_b32 s55, v21, 16 +; GFX12-NEXT: v_readlane_b32 s54, v21, 15 +; GFX12-NEXT: v_readlane_b32 s53, v21, 14 +; GFX12-NEXT: v_readlane_b32 s52, v21, 13 +; GFX12-NEXT: v_readlane_b32 s51, v21, 12 +; GFX12-NEXT: v_readlane_b32 s50, v21, 11 +; GFX12-NEXT: v_readlane_b32 s49, v21, 10 +; GFX12-NEXT: v_readlane_b32 s48, v21, 9 ; GFX12-NEXT: v_readlane_b32 s39, v21, 8 ; GFX12-NEXT: v_readlane_b32 s38, v21, 7 ; GFX12-NEXT: v_readlane_b32 s37, v21, 6 @@ -1523,8 +1119,8 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7-NEXT: s_add_i32 s6, s32, 0x201100 ; GFX7-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: v_writelane_b32 v23, s28, 28 -; GFX7-NEXT: v_writelane_b32 v23, s29, 29 +; GFX7-NEXT: v_writelane_b32 v23, s28, 17 +; GFX7-NEXT: v_writelane_b32 v23, s29, 18 ; GFX7-NEXT: v_writelane_b32 v23, s30, 0 ; GFX7-NEXT: v_writelane_b32 v23, s31, 1 ; GFX7-NEXT: v_writelane_b32 v23, s33, 2 @@ -1534,32 +1130,21 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7-NEXT: v_writelane_b32 v23, s37, 6 ; GFX7-NEXT: v_writelane_b32 v23, s38, 7 ; GFX7-NEXT: v_writelane_b32 v23, s39, 8 -; GFX7-NEXT: v_writelane_b32 v23, s40, 9 -; GFX7-NEXT: v_writelane_b32 v23, s41, 10 -; GFX7-NEXT: v_writelane_b32 v23, s42, 11 -; GFX7-NEXT: v_writelane_b32 v23, s43, 12 -; GFX7-NEXT: v_writelane_b32 v23, s44, 13 -; GFX7-NEXT: v_writelane_b32 v23, s45, 14 -; GFX7-NEXT: v_writelane_b32 v23, s46, 15 -; GFX7-NEXT: v_writelane_b32 v23, s47, 16 -; GFX7-NEXT: v_writelane_b32 v23, s48, 17 -; GFX7-NEXT: v_writelane_b32 v23, s49, 18 -; GFX7-NEXT: v_writelane_b32 v23, s50, 19 -; GFX7-NEXT: v_writelane_b32 v23, s51, 20 -; GFX7-NEXT: v_writelane_b32 v23, s52, 21 -; GFX7-NEXT: v_writelane_b32 v23, s53, 22 -; GFX7-NEXT: v_writelane_b32 v23, s54, 23 -; GFX7-NEXT: v_writelane_b32 v23, s55, 24 +; GFX7-NEXT: v_writelane_b32 v23, s48, 9 +; GFX7-NEXT: v_writelane_b32 v23, s49, 10 +; GFX7-NEXT: v_writelane_b32 v23, s50, 11 +; GFX7-NEXT: v_writelane_b32 v23, s51, 12 +; GFX7-NEXT: v_writelane_b32 v23, s52, 13 ; GFX7-NEXT: s_lshr_b32 s5, s32, 6 -; GFX7-NEXT: v_writelane_b32 v23, s56, 25 +; GFX7-NEXT: v_writelane_b32 v23, s53, 14 ; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6 ; GFX7-NEXT: s_add_i32 s4, s5, 0x4240 ; GFX7-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; GFX7-NEXT: v_writelane_b32 v23, s57, 26 +; GFX7-NEXT: v_writelane_b32 v23, s54, 15 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 64, v0 ; GFX7-NEXT: v_writelane_b32 v22, s4, 0 ; GFX7-NEXT: s_and_b64 s[4:5], 0, exec -; GFX7-NEXT: v_writelane_b32 v23, s59, 27 +; GFX7-NEXT: v_writelane_b32 v23, s55, 16 ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use alloca0 v0 ; GFX7-NEXT: ;;#ASMEND @@ -1570,25 +1155,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX7-NEXT: ;;#ASMEND -; GFX7-NEXT: v_readlane_b32 s59, v23, 27 -; GFX7-NEXT: v_readlane_b32 s57, v23, 26 -; GFX7-NEXT: v_readlane_b32 s56, v23, 25 -; GFX7-NEXT: v_readlane_b32 s55, v23, 24 -; GFX7-NEXT: v_readlane_b32 s54, v23, 23 -; GFX7-NEXT: v_readlane_b32 s53, v23, 22 -; GFX7-NEXT: v_readlane_b32 s52, v23, 21 -; GFX7-NEXT: v_readlane_b32 s51, v23, 20 -; GFX7-NEXT: v_readlane_b32 s50, v23, 19 -; GFX7-NEXT: v_readlane_b32 s49, v23, 18 -; GFX7-NEXT: v_readlane_b32 s48, v23, 17 -; GFX7-NEXT: v_readlane_b32 s47, v23, 16 -; GFX7-NEXT: v_readlane_b32 s46, v23, 15 -; GFX7-NEXT: v_readlane_b32 s45, v23, 14 -; GFX7-NEXT: v_readlane_b32 s44, v23, 13 -; GFX7-NEXT: v_readlane_b32 s43, v23, 12 -; GFX7-NEXT: v_readlane_b32 s42, v23, 11 -; GFX7-NEXT: v_readlane_b32 s41, v23, 10 -; GFX7-NEXT: v_readlane_b32 s40, v23, 9 +; GFX7-NEXT: v_readlane_b32 s55, v23, 16 +; GFX7-NEXT: v_readlane_b32 s54, v23, 15 +; GFX7-NEXT: v_readlane_b32 s53, v23, 14 +; GFX7-NEXT: v_readlane_b32 s52, v23, 13 +; GFX7-NEXT: v_readlane_b32 s51, v23, 12 +; GFX7-NEXT: v_readlane_b32 s50, v23, 11 +; GFX7-NEXT: v_readlane_b32 s49, v23, 10 +; GFX7-NEXT: v_readlane_b32 s48, v23, 9 ; GFX7-NEXT: v_readlane_b32 s39, v23, 8 ; GFX7-NEXT: v_readlane_b32 s38, v23, 7 ; GFX7-NEXT: v_readlane_b32 s37, v23, 6 @@ -1598,8 +1172,8 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7-NEXT: v_readlane_b32 s33, v23, 2 ; GFX7-NEXT: v_readlane_b32 s31, v23, 1 ; GFX7-NEXT: v_readlane_b32 s30, v23, 0 -; GFX7-NEXT: v_readlane_b32 s28, v23, 28 -; GFX7-NEXT: v_readlane_b32 s29, v23, 29 +; GFX7-NEXT: v_readlane_b32 s28, v23, 17 +; GFX7-NEXT: v_readlane_b32 s29, v23, 18 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: s_add_i32 s6, s32, 0x201000 ; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload @@ -1625,30 +1199,19 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX8-NEXT: v_writelane_b32 v22, s37, 6 ; GFX8-NEXT: v_writelane_b32 v22, s38, 7 ; GFX8-NEXT: v_writelane_b32 v22, s39, 8 -; GFX8-NEXT: v_writelane_b32 v22, s40, 9 -; GFX8-NEXT: v_writelane_b32 v22, s41, 10 -; GFX8-NEXT: v_writelane_b32 v22, s42, 11 -; GFX8-NEXT: v_writelane_b32 v22, s43, 12 -; GFX8-NEXT: v_writelane_b32 v22, s44, 13 -; GFX8-NEXT: v_writelane_b32 v22, s45, 14 -; GFX8-NEXT: v_writelane_b32 v22, s46, 15 -; GFX8-NEXT: v_writelane_b32 v22, s47, 16 -; GFX8-NEXT: v_writelane_b32 v22, s48, 17 -; GFX8-NEXT: v_writelane_b32 v22, s49, 18 -; GFX8-NEXT: v_writelane_b32 v22, s50, 19 -; GFX8-NEXT: v_writelane_b32 v22, s51, 20 -; GFX8-NEXT: v_writelane_b32 v22, s52, 21 -; GFX8-NEXT: v_writelane_b32 v22, s53, 22 -; GFX8-NEXT: v_writelane_b32 v22, s54, 23 -; GFX8-NEXT: v_writelane_b32 v22, s55, 24 -; GFX8-NEXT: v_writelane_b32 v22, s56, 25 -; GFX8-NEXT: v_writelane_b32 v22, s57, 26 +; GFX8-NEXT: v_writelane_b32 v22, s48, 9 +; GFX8-NEXT: v_writelane_b32 v22, s49, 10 +; GFX8-NEXT: v_writelane_b32 v22, s50, 11 +; GFX8-NEXT: v_writelane_b32 v22, s51, 12 +; GFX8-NEXT: v_writelane_b32 v22, s52, 13 ; GFX8-NEXT: s_lshr_b32 s4, s32, 6 -; GFX8-NEXT: v_writelane_b32 v22, s59, 27 +; GFX8-NEXT: v_writelane_b32 v22, s53, 14 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX8-NEXT: s_add_i32 s59, s4, 0x4240 +; GFX8-NEXT: v_writelane_b32 v22, s54, 15 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0 ; GFX8-NEXT: s_and_b64 s[4:5], 0, exec +; GFX8-NEXT: v_writelane_b32 v22, s55, 16 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use alloca0 v0 ; GFX8-NEXT: ;;#ASMEND @@ -1658,25 +1221,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v22, 27 -; GFX8-NEXT: v_readlane_b32 s57, v22, 26 -; GFX8-NEXT: v_readlane_b32 s56, v22, 25 -; GFX8-NEXT: v_readlane_b32 s55, v22, 24 -; GFX8-NEXT: v_readlane_b32 s54, v22, 23 -; GFX8-NEXT: v_readlane_b32 s53, v22, 22 -; GFX8-NEXT: v_readlane_b32 s52, v22, 21 -; GFX8-NEXT: v_readlane_b32 s51, v22, 20 -; GFX8-NEXT: v_readlane_b32 s50, v22, 19 -; GFX8-NEXT: v_readlane_b32 s49, v22, 18 -; GFX8-NEXT: v_readlane_b32 s48, v22, 17 -; GFX8-NEXT: v_readlane_b32 s47, v22, 16 -; GFX8-NEXT: v_readlane_b32 s46, v22, 15 -; GFX8-NEXT: v_readlane_b32 s45, v22, 14 -; GFX8-NEXT: v_readlane_b32 s44, v22, 13 -; GFX8-NEXT: v_readlane_b32 s43, v22, 12 -; GFX8-NEXT: v_readlane_b32 s42, v22, 11 -; GFX8-NEXT: v_readlane_b32 s41, v22, 10 -; GFX8-NEXT: v_readlane_b32 s40, v22, 9 +; GFX8-NEXT: v_readlane_b32 s55, v22, 16 +; GFX8-NEXT: v_readlane_b32 s54, v22, 15 +; GFX8-NEXT: v_readlane_b32 s53, v22, 14 +; GFX8-NEXT: v_readlane_b32 s52, v22, 13 +; GFX8-NEXT: v_readlane_b32 s51, v22, 12 +; GFX8-NEXT: v_readlane_b32 s50, v22, 11 +; GFX8-NEXT: v_readlane_b32 s49, v22, 10 +; GFX8-NEXT: v_readlane_b32 s48, v22, 9 ; GFX8-NEXT: v_readlane_b32 s39, v22, 8 ; GFX8-NEXT: v_readlane_b32 s38, v22, 7 ; GFX8-NEXT: v_readlane_b32 s37, v22, 6 @@ -1709,30 +1261,19 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX900-NEXT: v_writelane_b32 v22, s37, 6 ; GFX900-NEXT: v_writelane_b32 v22, s38, 7 ; GFX900-NEXT: v_writelane_b32 v22, s39, 8 -; GFX900-NEXT: v_writelane_b32 v22, s40, 9 -; GFX900-NEXT: v_writelane_b32 v22, s41, 10 -; GFX900-NEXT: v_writelane_b32 v22, s42, 11 -; GFX900-NEXT: v_writelane_b32 v22, s43, 12 -; GFX900-NEXT: v_writelane_b32 v22, s44, 13 -; GFX900-NEXT: v_writelane_b32 v22, s45, 14 -; GFX900-NEXT: v_writelane_b32 v22, s46, 15 -; GFX900-NEXT: v_writelane_b32 v22, s47, 16 -; GFX900-NEXT: v_writelane_b32 v22, s48, 17 -; GFX900-NEXT: v_writelane_b32 v22, s49, 18 -; GFX900-NEXT: v_writelane_b32 v22, s50, 19 -; GFX900-NEXT: v_writelane_b32 v22, s51, 20 -; GFX900-NEXT: v_writelane_b32 v22, s52, 21 -; GFX900-NEXT: v_writelane_b32 v22, s53, 22 -; GFX900-NEXT: v_writelane_b32 v22, s54, 23 -; GFX900-NEXT: v_writelane_b32 v22, s55, 24 -; GFX900-NEXT: v_writelane_b32 v22, s56, 25 -; GFX900-NEXT: v_writelane_b32 v22, s57, 26 +; GFX900-NEXT: v_writelane_b32 v22, s48, 9 +; GFX900-NEXT: v_writelane_b32 v22, s49, 10 +; GFX900-NEXT: v_writelane_b32 v22, s50, 11 +; GFX900-NEXT: v_writelane_b32 v22, s51, 12 +; GFX900-NEXT: v_writelane_b32 v22, s52, 13 ; GFX900-NEXT: s_lshr_b32 s4, s32, 6 -; GFX900-NEXT: v_writelane_b32 v22, s59, 27 +; GFX900-NEXT: v_writelane_b32 v22, s53, 14 ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX900-NEXT: s_add_i32 s59, s4, 0x4240 +; GFX900-NEXT: v_writelane_b32 v22, s54, 15 ; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec +; GFX900-NEXT: v_writelane_b32 v22, s55, 16 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use alloca0 v0 ; GFX900-NEXT: ;;#ASMEND @@ -1742,25 +1283,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v22, 27 -; GFX900-NEXT: v_readlane_b32 s57, v22, 26 -; GFX900-NEXT: v_readlane_b32 s56, v22, 25 -; GFX900-NEXT: v_readlane_b32 s55, v22, 24 -; GFX900-NEXT: v_readlane_b32 s54, v22, 23 -; GFX900-NEXT: v_readlane_b32 s53, v22, 22 -; GFX900-NEXT: v_readlane_b32 s52, v22, 21 -; GFX900-NEXT: v_readlane_b32 s51, v22, 20 -; GFX900-NEXT: v_readlane_b32 s50, v22, 19 -; GFX900-NEXT: v_readlane_b32 s49, v22, 18 -; GFX900-NEXT: v_readlane_b32 s48, v22, 17 -; GFX900-NEXT: v_readlane_b32 s47, v22, 16 -; GFX900-NEXT: v_readlane_b32 s46, v22, 15 -; GFX900-NEXT: v_readlane_b32 s45, v22, 14 -; GFX900-NEXT: v_readlane_b32 s44, v22, 13 -; GFX900-NEXT: v_readlane_b32 s43, v22, 12 -; GFX900-NEXT: v_readlane_b32 s42, v22, 11 -; GFX900-NEXT: v_readlane_b32 s41, v22, 10 -; GFX900-NEXT: v_readlane_b32 s40, v22, 9 +; GFX900-NEXT: v_readlane_b32 s55, v22, 16 +; GFX900-NEXT: v_readlane_b32 s54, v22, 15 +; GFX900-NEXT: v_readlane_b32 s53, v22, 14 +; GFX900-NEXT: v_readlane_b32 s52, v22, 13 +; GFX900-NEXT: v_readlane_b32 s51, v22, 12 +; GFX900-NEXT: v_readlane_b32 s50, v22, 11 +; GFX900-NEXT: v_readlane_b32 s49, v22, 10 +; GFX900-NEXT: v_readlane_b32 s48, v22, 9 ; GFX900-NEXT: v_readlane_b32 s39, v22, 8 ; GFX900-NEXT: v_readlane_b32 s38, v22, 7 ; GFX900-NEXT: v_readlane_b32 s37, v22, 6 @@ -1793,28 +1323,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX942-NEXT: v_writelane_b32 v22, s37, 6 ; GFX942-NEXT: v_writelane_b32 v22, s38, 7 ; GFX942-NEXT: v_writelane_b32 v22, s39, 8 -; GFX942-NEXT: v_writelane_b32 v22, s40, 9 -; GFX942-NEXT: v_writelane_b32 v22, s41, 10 -; GFX942-NEXT: v_writelane_b32 v22, s42, 11 -; GFX942-NEXT: v_writelane_b32 v22, s43, 12 -; GFX942-NEXT: v_writelane_b32 v22, s44, 13 -; GFX942-NEXT: v_writelane_b32 v22, s45, 14 -; GFX942-NEXT: v_writelane_b32 v22, s46, 15 -; GFX942-NEXT: v_writelane_b32 v22, s47, 16 -; GFX942-NEXT: v_writelane_b32 v22, s48, 17 -; GFX942-NEXT: v_writelane_b32 v22, s49, 18 -; GFX942-NEXT: v_writelane_b32 v22, s50, 19 -; GFX942-NEXT: v_writelane_b32 v22, s51, 20 -; GFX942-NEXT: v_writelane_b32 v22, s52, 21 -; GFX942-NEXT: v_writelane_b32 v22, s53, 22 -; GFX942-NEXT: v_writelane_b32 v22, s54, 23 -; GFX942-NEXT: v_writelane_b32 v22, s55, 24 -; GFX942-NEXT: v_writelane_b32 v22, s56, 25 -; GFX942-NEXT: v_writelane_b32 v22, s57, 26 +; GFX942-NEXT: v_writelane_b32 v22, s48, 9 +; GFX942-NEXT: v_writelane_b32 v22, s49, 10 +; GFX942-NEXT: v_writelane_b32 v22, s50, 11 +; GFX942-NEXT: v_writelane_b32 v22, s51, 12 +; GFX942-NEXT: v_writelane_b32 v22, s52, 13 +; GFX942-NEXT: v_writelane_b32 v22, s53, 14 ; GFX942-NEXT: s_add_i32 s0, s32, 64 -; GFX942-NEXT: v_writelane_b32 v22, s59, 27 +; GFX942-NEXT: v_writelane_b32 v22, s54, 15 ; GFX942-NEXT: v_mov_b32_e32 v0, s0 -; GFX942-NEXT: v_writelane_b32 v22, s60, 28 +; GFX942-NEXT: v_writelane_b32 v22, s55, 16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use alloca0 v0 ; GFX942-NEXT: ;;#ASMEND @@ -1822,32 +1340,18 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX942-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_add_i32 s59, s32, 0x4240 -; GFX942-NEXT: v_writelane_b32 v22, s61, 29 ; GFX942-NEXT: s_and_b64 s[60:61], 0, exec ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_readlane_b32 s61, v22, 29 -; GFX942-NEXT: v_readlane_b32 s60, v22, 28 -; GFX942-NEXT: v_readlane_b32 s59, v22, 27 -; GFX942-NEXT: v_readlane_b32 s57, v22, 26 -; GFX942-NEXT: v_readlane_b32 s56, v22, 25 -; GFX942-NEXT: v_readlane_b32 s55, v22, 24 -; GFX942-NEXT: v_readlane_b32 s54, v22, 23 -; GFX942-NEXT: v_readlane_b32 s53, v22, 22 -; GFX942-NEXT: v_readlane_b32 s52, v22, 21 -; GFX942-NEXT: v_readlane_b32 s51, v22, 20 -; GFX942-NEXT: v_readlane_b32 s50, v22, 19 -; GFX942-NEXT: v_readlane_b32 s49, v22, 18 -; GFX942-NEXT: v_readlane_b32 s48, v22, 17 -; GFX942-NEXT: v_readlane_b32 s47, v22, 16 -; GFX942-NEXT: v_readlane_b32 s46, v22, 15 -; GFX942-NEXT: v_readlane_b32 s45, v22, 14 -; GFX942-NEXT: v_readlane_b32 s44, v22, 13 -; GFX942-NEXT: v_readlane_b32 s43, v22, 12 -; GFX942-NEXT: v_readlane_b32 s42, v22, 11 -; GFX942-NEXT: v_readlane_b32 s41, v22, 10 -; GFX942-NEXT: v_readlane_b32 s40, v22, 9 +; GFX942-NEXT: v_readlane_b32 s55, v22, 16 +; GFX942-NEXT: v_readlane_b32 s54, v22, 15 +; GFX942-NEXT: v_readlane_b32 s53, v22, 14 +; GFX942-NEXT: v_readlane_b32 s52, v22, 13 +; GFX942-NEXT: v_readlane_b32 s51, v22, 12 +; GFX942-NEXT: v_readlane_b32 s50, v22, 11 +; GFX942-NEXT: v_readlane_b32 s49, v22, 10 +; GFX942-NEXT: v_readlane_b32 s48, v22, 9 ; GFX942-NEXT: v_readlane_b32 s39, v22, 8 ; GFX942-NEXT: v_readlane_b32 s38, v22, 7 ; GFX942-NEXT: v_readlane_b32 s37, v22, 6 @@ -1875,8 +1379,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_1-NEXT: v_writelane_b32 v22, s30, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5 +; GFX10_1-NEXT: s_add_i32 s59, s4, 0x4240 ; GFX10_1-NEXT: v_writelane_b32 v22, s31, 1 ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 +; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use alloca0 v0 ; GFX10_1-NEXT: ;;#ASMEND @@ -1887,52 +1393,28 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_1-NEXT: v_writelane_b32 v22, s37, 6 ; GFX10_1-NEXT: v_writelane_b32 v22, s38, 7 ; GFX10_1-NEXT: v_writelane_b32 v22, s39, 8 -; GFX10_1-NEXT: v_writelane_b32 v22, s40, 9 -; GFX10_1-NEXT: v_writelane_b32 v22, s41, 10 -; GFX10_1-NEXT: v_writelane_b32 v22, s42, 11 -; GFX10_1-NEXT: v_writelane_b32 v22, s43, 12 -; GFX10_1-NEXT: v_writelane_b32 v22, s44, 13 -; GFX10_1-NEXT: v_writelane_b32 v22, s45, 14 -; GFX10_1-NEXT: v_writelane_b32 v22, s46, 15 -; GFX10_1-NEXT: v_writelane_b32 v22, s47, 16 -; GFX10_1-NEXT: v_writelane_b32 v22, s48, 17 -; GFX10_1-NEXT: v_writelane_b32 v22, s49, 18 -; GFX10_1-NEXT: v_writelane_b32 v22, s50, 19 -; GFX10_1-NEXT: v_writelane_b32 v22, s51, 20 -; GFX10_1-NEXT: v_writelane_b32 v22, s52, 21 -; GFX10_1-NEXT: v_writelane_b32 v22, s53, 22 -; GFX10_1-NEXT: v_writelane_b32 v22, s54, 23 -; GFX10_1-NEXT: v_writelane_b32 v22, s55, 24 -; GFX10_1-NEXT: v_writelane_b32 v22, s56, 25 -; GFX10_1-NEXT: v_writelane_b32 v22, s57, 26 -; GFX10_1-NEXT: v_writelane_b32 v22, s59, 27 -; GFX10_1-NEXT: s_add_i32 s59, s4, 0x4240 -; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo +; GFX10_1-NEXT: v_writelane_b32 v22, s48, 9 +; GFX10_1-NEXT: v_writelane_b32 v22, s49, 10 +; GFX10_1-NEXT: v_writelane_b32 v22, s50, 11 +; GFX10_1-NEXT: v_writelane_b32 v22, s51, 12 +; GFX10_1-NEXT: v_writelane_b32 v22, s52, 13 +; GFX10_1-NEXT: v_writelane_b32 v22, s53, 14 +; GFX10_1-NEXT: v_writelane_b32 v22, s54, 15 +; GFX10_1-NEXT: v_writelane_b32 v22, s55, 16 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX10_1-NEXT: ;;#ASMEND ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v22, 27 -; GFX10_1-NEXT: v_readlane_b32 s57, v22, 26 -; GFX10_1-NEXT: v_readlane_b32 s56, v22, 25 -; GFX10_1-NEXT: v_readlane_b32 s55, v22, 24 -; GFX10_1-NEXT: v_readlane_b32 s54, v22, 23 -; GFX10_1-NEXT: v_readlane_b32 s53, v22, 22 -; GFX10_1-NEXT: v_readlane_b32 s52, v22, 21 -; GFX10_1-NEXT: v_readlane_b32 s51, v22, 20 -; GFX10_1-NEXT: v_readlane_b32 s50, v22, 19 -; GFX10_1-NEXT: v_readlane_b32 s49, v22, 18 -; GFX10_1-NEXT: v_readlane_b32 s48, v22, 17 -; GFX10_1-NEXT: v_readlane_b32 s47, v22, 16 -; GFX10_1-NEXT: v_readlane_b32 s46, v22, 15 -; GFX10_1-NEXT: v_readlane_b32 s45, v22, 14 -; GFX10_1-NEXT: v_readlane_b32 s44, v22, 13 -; GFX10_1-NEXT: v_readlane_b32 s43, v22, 12 -; GFX10_1-NEXT: v_readlane_b32 s42, v22, 11 -; GFX10_1-NEXT: v_readlane_b32 s41, v22, 10 -; GFX10_1-NEXT: v_readlane_b32 s40, v22, 9 +; GFX10_1-NEXT: v_readlane_b32 s55, v22, 16 +; GFX10_1-NEXT: v_readlane_b32 s54, v22, 15 +; GFX10_1-NEXT: v_readlane_b32 s53, v22, 14 +; GFX10_1-NEXT: v_readlane_b32 s52, v22, 13 +; GFX10_1-NEXT: v_readlane_b32 s51, v22, 12 +; GFX10_1-NEXT: v_readlane_b32 s50, v22, 11 +; GFX10_1-NEXT: v_readlane_b32 s49, v22, 10 +; GFX10_1-NEXT: v_readlane_b32 s48, v22, 9 ; GFX10_1-NEXT: v_readlane_b32 s39, v22, 8 ; GFX10_1-NEXT: v_readlane_b32 s38, v22, 7 ; GFX10_1-NEXT: v_readlane_b32 s37, v22, 6 @@ -1960,8 +1442,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_3-NEXT: v_writelane_b32 v22, s30, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5 +; GFX10_3-NEXT: s_add_i32 s59, s4, 0x4240 ; GFX10_3-NEXT: v_writelane_b32 v22, s31, 1 ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 +; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use alloca0 v0 ; GFX10_3-NEXT: ;;#ASMEND @@ -1972,52 +1456,28 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_3-NEXT: v_writelane_b32 v22, s37, 6 ; GFX10_3-NEXT: v_writelane_b32 v22, s38, 7 ; GFX10_3-NEXT: v_writelane_b32 v22, s39, 8 -; GFX10_3-NEXT: v_writelane_b32 v22, s40, 9 -; GFX10_3-NEXT: v_writelane_b32 v22, s41, 10 -; GFX10_3-NEXT: v_writelane_b32 v22, s42, 11 -; GFX10_3-NEXT: v_writelane_b32 v22, s43, 12 -; GFX10_3-NEXT: v_writelane_b32 v22, s44, 13 -; GFX10_3-NEXT: v_writelane_b32 v22, s45, 14 -; GFX10_3-NEXT: v_writelane_b32 v22, s46, 15 -; GFX10_3-NEXT: v_writelane_b32 v22, s47, 16 -; GFX10_3-NEXT: v_writelane_b32 v22, s48, 17 -; GFX10_3-NEXT: v_writelane_b32 v22, s49, 18 -; GFX10_3-NEXT: v_writelane_b32 v22, s50, 19 -; GFX10_3-NEXT: v_writelane_b32 v22, s51, 20 -; GFX10_3-NEXT: v_writelane_b32 v22, s52, 21 -; GFX10_3-NEXT: v_writelane_b32 v22, s53, 22 -; GFX10_3-NEXT: v_writelane_b32 v22, s54, 23 -; GFX10_3-NEXT: v_writelane_b32 v22, s55, 24 -; GFX10_3-NEXT: v_writelane_b32 v22, s56, 25 -; GFX10_3-NEXT: v_writelane_b32 v22, s57, 26 -; GFX10_3-NEXT: v_writelane_b32 v22, s59, 27 -; GFX10_3-NEXT: s_add_i32 s59, s4, 0x4240 -; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo +; GFX10_3-NEXT: v_writelane_b32 v22, s48, 9 +; GFX10_3-NEXT: v_writelane_b32 v22, s49, 10 +; GFX10_3-NEXT: v_writelane_b32 v22, s50, 11 +; GFX10_3-NEXT: v_writelane_b32 v22, s51, 12 +; GFX10_3-NEXT: v_writelane_b32 v22, s52, 13 +; GFX10_3-NEXT: v_writelane_b32 v22, s53, 14 +; GFX10_3-NEXT: v_writelane_b32 v22, s54, 15 +; GFX10_3-NEXT: v_writelane_b32 v22, s55, 16 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX10_3-NEXT: ;;#ASMEND ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v22, 27 -; GFX10_3-NEXT: v_readlane_b32 s57, v22, 26 -; GFX10_3-NEXT: v_readlane_b32 s56, v22, 25 -; GFX10_3-NEXT: v_readlane_b32 s55, v22, 24 -; GFX10_3-NEXT: v_readlane_b32 s54, v22, 23 -; GFX10_3-NEXT: v_readlane_b32 s53, v22, 22 -; GFX10_3-NEXT: v_readlane_b32 s52, v22, 21 -; GFX10_3-NEXT: v_readlane_b32 s51, v22, 20 -; GFX10_3-NEXT: v_readlane_b32 s50, v22, 19 -; GFX10_3-NEXT: v_readlane_b32 s49, v22, 18 -; GFX10_3-NEXT: v_readlane_b32 s48, v22, 17 -; GFX10_3-NEXT: v_readlane_b32 s47, v22, 16 -; GFX10_3-NEXT: v_readlane_b32 s46, v22, 15 -; GFX10_3-NEXT: v_readlane_b32 s45, v22, 14 -; GFX10_3-NEXT: v_readlane_b32 s44, v22, 13 -; GFX10_3-NEXT: v_readlane_b32 s43, v22, 12 -; GFX10_3-NEXT: v_readlane_b32 s42, v22, 11 -; GFX10_3-NEXT: v_readlane_b32 s41, v22, 10 -; GFX10_3-NEXT: v_readlane_b32 s40, v22, 9 +; GFX10_3-NEXT: v_readlane_b32 s55, v22, 16 +; GFX10_3-NEXT: v_readlane_b32 s54, v22, 15 +; GFX10_3-NEXT: v_readlane_b32 s53, v22, 14 +; GFX10_3-NEXT: v_readlane_b32 s52, v22, 13 +; GFX10_3-NEXT: v_readlane_b32 s51, v22, 12 +; GFX10_3-NEXT: v_readlane_b32 s50, v22, 11 +; GFX10_3-NEXT: v_readlane_b32 s49, v22, 10 +; GFX10_3-NEXT: v_readlane_b32 s48, v22, 9 ; GFX10_3-NEXT: v_readlane_b32 s39, v22, 8 ; GFX10_3-NEXT: v_readlane_b32 s38, v22, 7 ; GFX10_3-NEXT: v_readlane_b32 s37, v22, 6 @@ -2043,12 +1503,13 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: v_writelane_b32 v22, s30, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s59, s32, 0x4240 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_and_b32 s0, 0, exec_lo +; GFX11-NEXT: v_writelane_b32 v22, s31, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use alloca0 v0 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v22, s31, 1 ; GFX11-NEXT: v_writelane_b32 v22, s33, 2 ; GFX11-NEXT: v_writelane_b32 v22, s34, 3 ; GFX11-NEXT: v_writelane_b32 v22, s35, 4 @@ -2056,52 +1517,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX11-NEXT: v_writelane_b32 v22, s37, 6 ; GFX11-NEXT: v_writelane_b32 v22, s38, 7 ; GFX11-NEXT: v_writelane_b32 v22, s39, 8 -; GFX11-NEXT: v_writelane_b32 v22, s40, 9 -; GFX11-NEXT: v_writelane_b32 v22, s41, 10 -; GFX11-NEXT: v_writelane_b32 v22, s42, 11 -; GFX11-NEXT: v_writelane_b32 v22, s43, 12 -; GFX11-NEXT: v_writelane_b32 v22, s44, 13 -; GFX11-NEXT: v_writelane_b32 v22, s45, 14 -; GFX11-NEXT: v_writelane_b32 v22, s46, 15 -; GFX11-NEXT: v_writelane_b32 v22, s47, 16 -; GFX11-NEXT: v_writelane_b32 v22, s48, 17 -; GFX11-NEXT: v_writelane_b32 v22, s49, 18 -; GFX11-NEXT: v_writelane_b32 v22, s50, 19 -; GFX11-NEXT: v_writelane_b32 v22, s51, 20 -; GFX11-NEXT: v_writelane_b32 v22, s52, 21 -; GFX11-NEXT: v_writelane_b32 v22, s53, 22 -; GFX11-NEXT: v_writelane_b32 v22, s54, 23 -; GFX11-NEXT: v_writelane_b32 v22, s55, 24 -; GFX11-NEXT: v_writelane_b32 v22, s56, 25 -; GFX11-NEXT: v_writelane_b32 v22, s57, 26 -; GFX11-NEXT: v_writelane_b32 v22, s59, 27 -; GFX11-NEXT: s_add_i32 s59, s32, 0x4240 -; GFX11-NEXT: s_and_b32 s0, 0, exec_lo +; GFX11-NEXT: v_writelane_b32 v22, s48, 9 +; GFX11-NEXT: v_writelane_b32 v22, s49, 10 +; GFX11-NEXT: v_writelane_b32 v22, s50, 11 +; GFX11-NEXT: v_writelane_b32 v22, s51, 12 +; GFX11-NEXT: v_writelane_b32 v22, s52, 13 +; GFX11-NEXT: v_writelane_b32 v22, s53, 14 +; GFX11-NEXT: v_writelane_b32 v22, s54, 15 +; GFX11-NEXT: v_writelane_b32 v22, s55, 16 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v22, 27 -; GFX11-NEXT: v_readlane_b32 s57, v22, 26 -; GFX11-NEXT: v_readlane_b32 s56, v22, 25 -; GFX11-NEXT: v_readlane_b32 s55, v22, 24 -; GFX11-NEXT: v_readlane_b32 s54, v22, 23 -; GFX11-NEXT: v_readlane_b32 s53, v22, 22 -; GFX11-NEXT: v_readlane_b32 s52, v22, 21 -; GFX11-NEXT: v_readlane_b32 s51, v22, 20 -; GFX11-NEXT: v_readlane_b32 s50, v22, 19 -; GFX11-NEXT: v_readlane_b32 s49, v22, 18 -; GFX11-NEXT: v_readlane_b32 s48, v22, 17 -; GFX11-NEXT: v_readlane_b32 s47, v22, 16 -; GFX11-NEXT: v_readlane_b32 s46, v22, 15 -; GFX11-NEXT: v_readlane_b32 s45, v22, 14 -; GFX11-NEXT: v_readlane_b32 s44, v22, 13 -; GFX11-NEXT: v_readlane_b32 s43, v22, 12 -; GFX11-NEXT: v_readlane_b32 s42, v22, 11 -; GFX11-NEXT: v_readlane_b32 s41, v22, 10 -; GFX11-NEXT: v_readlane_b32 s40, v22, 9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s55, v22, 16 +; GFX11-NEXT: v_readlane_b32 s54, v22, 15 +; GFX11-NEXT: v_readlane_b32 s53, v22, 14 +; GFX11-NEXT: v_readlane_b32 s52, v22, 13 +; GFX11-NEXT: v_readlane_b32 s51, v22, 12 +; GFX11-NEXT: v_readlane_b32 s50, v22, 11 +; GFX11-NEXT: v_readlane_b32 s49, v22, 10 +; GFX11-NEXT: v_readlane_b32 s48, v22, 9 ; GFX11-NEXT: v_readlane_b32 s39, v22, 8 ; GFX11-NEXT: v_readlane_b32 s38, v22, 7 ; GFX11-NEXT: v_readlane_b32 s37, v22, 6 @@ -2130,7 +1568,9 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v22, s30, 0 +; GFX12-NEXT: s_add_co_i32 s59, s32, 0x4200 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 +; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: ;;#ASMEND @@ -2142,52 +1582,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX12-NEXT: v_writelane_b32 v22, s37, 6 ; GFX12-NEXT: v_writelane_b32 v22, s38, 7 ; GFX12-NEXT: v_writelane_b32 v22, s39, 8 -; GFX12-NEXT: v_writelane_b32 v22, s40, 9 -; GFX12-NEXT: v_writelane_b32 v22, s41, 10 -; GFX12-NEXT: v_writelane_b32 v22, s42, 11 -; GFX12-NEXT: v_writelane_b32 v22, s43, 12 -; GFX12-NEXT: v_writelane_b32 v22, s44, 13 -; GFX12-NEXT: v_writelane_b32 v22, s45, 14 -; GFX12-NEXT: v_writelane_b32 v22, s46, 15 -; GFX12-NEXT: v_writelane_b32 v22, s47, 16 -; GFX12-NEXT: v_writelane_b32 v22, s48, 17 -; GFX12-NEXT: v_writelane_b32 v22, s49, 18 -; GFX12-NEXT: v_writelane_b32 v22, s50, 19 -; GFX12-NEXT: v_writelane_b32 v22, s51, 20 -; GFX12-NEXT: v_writelane_b32 v22, s52, 21 -; GFX12-NEXT: v_writelane_b32 v22, s53, 22 -; GFX12-NEXT: v_writelane_b32 v22, s54, 23 -; GFX12-NEXT: v_writelane_b32 v22, s55, 24 -; GFX12-NEXT: v_writelane_b32 v22, s56, 25 -; GFX12-NEXT: v_writelane_b32 v22, s57, 26 -; GFX12-NEXT: v_writelane_b32 v22, s59, 27 -; GFX12-NEXT: s_add_co_i32 s59, s32, 0x4200 -; GFX12-NEXT: s_and_b32 s0, 0, exec_lo +; GFX12-NEXT: v_writelane_b32 v22, s48, 9 +; GFX12-NEXT: v_writelane_b32 v22, s49, 10 +; GFX12-NEXT: v_writelane_b32 v22, s50, 11 +; GFX12-NEXT: v_writelane_b32 v22, s51, 12 +; GFX12-NEXT: v_writelane_b32 v22, s52, 13 +; GFX12-NEXT: v_writelane_b32 v22, s53, 14 +; GFX12-NEXT: v_writelane_b32 v22, s54, 15 +; GFX12-NEXT: v_writelane_b32 v22, s55, 16 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s59, v22, 27 -; GFX12-NEXT: v_readlane_b32 s57, v22, 26 -; GFX12-NEXT: v_readlane_b32 s56, v22, 25 -; GFX12-NEXT: v_readlane_b32 s55, v22, 24 -; GFX12-NEXT: v_readlane_b32 s54, v22, 23 -; GFX12-NEXT: v_readlane_b32 s53, v22, 22 -; GFX12-NEXT: v_readlane_b32 s52, v22, 21 -; GFX12-NEXT: v_readlane_b32 s51, v22, 20 -; GFX12-NEXT: v_readlane_b32 s50, v22, 19 -; GFX12-NEXT: v_readlane_b32 s49, v22, 18 -; GFX12-NEXT: v_readlane_b32 s48, v22, 17 -; GFX12-NEXT: v_readlane_b32 s47, v22, 16 -; GFX12-NEXT: v_readlane_b32 s46, v22, 15 -; GFX12-NEXT: v_readlane_b32 s45, v22, 14 -; GFX12-NEXT: v_readlane_b32 s44, v22, 13 -; GFX12-NEXT: v_readlane_b32 s43, v22, 12 -; GFX12-NEXT: v_readlane_b32 s42, v22, 11 -; GFX12-NEXT: v_readlane_b32 s41, v22, 10 -; GFX12-NEXT: v_readlane_b32 s40, v22, 9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s55, v22, 16 +; GFX12-NEXT: v_readlane_b32 s54, v22, 15 +; GFX12-NEXT: v_readlane_b32 s53, v22, 14 +; GFX12-NEXT: v_readlane_b32 s52, v22, 13 +; GFX12-NEXT: v_readlane_b32 s51, v22, 12 +; GFX12-NEXT: v_readlane_b32 s50, v22, 11 +; GFX12-NEXT: v_readlane_b32 s49, v22, 10 +; GFX12-NEXT: v_readlane_b32 s48, v22, 9 ; GFX12-NEXT: v_readlane_b32 s39, v22, 8 ; GFX12-NEXT: v_readlane_b32 s38, v22, 7 ; GFX12-NEXT: v_readlane_b32 s37, v22, 6 diff --git a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll index 0112453e32bfc..52f380b7f80a3 100644 --- a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll +++ b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll @@ -8,7 +8,7 @@ ; CHECK-LABEL: I_Quit: ; CHECK: .set I_Quit.num_vgpr, max(41, amdgpu.max_num_vgpr) ; CHECK: .set I_Quit.num_agpr, max(0, amdgpu.max_num_agpr) -; CHECK: .set I_Quit.numbered_sgpr, max(48, amdgpu.max_num_sgpr) +; CHECK: .set I_Quit.numbered_sgpr, max(56, amdgpu.max_num_sgpr) ; CHECK: .set I_Quit.private_seg_size, 16 ; CHECK: .set I_Quit.uses_vcc, 1 ; CHECK: .set I_Quit.uses_flat_scratch, 1 @@ -80,7 +80,7 @@ define void @P_SetThingPosition() { ; CHECK-LABEL: P_SetupPsprites: ; CHECK: .set P_SetupPsprites.num_vgpr, max(41, amdgpu.max_num_vgpr) ; CHECK: .set P_SetupPsprites.num_agpr, max(0, amdgpu.max_num_agpr) -; CHECK: .set P_SetupPsprites.numbered_sgpr, max(48, amdgpu.max_num_sgpr) +; CHECK: .set P_SetupPsprites.numbered_sgpr, max(56, amdgpu.max_num_sgpr) ; CHECK: .set P_SetupPsprites.private_seg_size, 16 ; CHECK: .set P_SetupPsprites.uses_vcc, 1 ; CHECK: .set P_SetupPsprites.uses_flat_scratch, 1 @@ -110,7 +110,7 @@ define void @HU_Start() { ; CHECK-LABEL: P_SpawnPlayer: ; CHECK: .set P_SpawnPlayer.num_vgpr, max(43, G_PlayerReborn.num_vgpr, P_SetThingPosition.num_vgpr, P_SetupPsprites.num_vgpr, HU_Start.num_vgpr) ; CHECK: .set P_SpawnPlayer.num_agpr, max(0, G_PlayerReborn.num_agpr, P_SetThingPosition.num_agpr, P_SetupPsprites.num_agpr, HU_Start.num_agpr) -; CHECK: .set P_SpawnPlayer.numbered_sgpr, max(60, G_PlayerReborn.numbered_sgpr, P_SetThingPosition.numbered_sgpr, P_SetupPsprites.numbered_sgpr, HU_Start.numbered_sgpr) +; CHECK: .set P_SpawnPlayer.numbered_sgpr, max(84, G_PlayerReborn.numbered_sgpr, P_SetThingPosition.numbered_sgpr, P_SetupPsprites.numbered_sgpr, HU_Start.numbered_sgpr) ; CHECK: .set P_SpawnPlayer.private_seg_size, 16+(max(G_PlayerReborn.private_seg_size, P_SetThingPosition.private_seg_size, P_SetupPsprites.private_seg_size, HU_Start.private_seg_size)) ; CHECK: .set P_SpawnPlayer.uses_vcc, or(1, G_PlayerReborn.uses_vcc, P_SetThingPosition.uses_vcc, P_SetupPsprites.uses_vcc, HU_Start.uses_vcc) ; CHECK: .set P_SpawnPlayer.uses_flat_scratch, or(0, G_PlayerReborn.uses_flat_scratch, P_SetThingPosition.uses_flat_scratch, P_SetupPsprites.uses_flat_scratch, HU_Start.uses_flat_scratch) @@ -128,7 +128,7 @@ define void @P_SpawnPlayer() { ; CHECK-LABEL: I_Error: ; CHECK: .set I_Error.num_vgpr, max(41, amdgpu.max_num_vgpr) ; CHECK: .set I_Error.num_agpr, max(0, amdgpu.max_num_agpr) -; CHECK: .set I_Error.numbered_sgpr, max(48, amdgpu.max_num_sgpr) +; CHECK: .set I_Error.numbered_sgpr, max(56, amdgpu.max_num_sgpr) ; CHECK: .set I_Error.private_seg_size, 16 ; CHECK: .set I_Error.uses_vcc, 1 ; CHECK: .set I_Error.uses_flat_scratch, 1 @@ -144,7 +144,7 @@ define void @I_Error(...) { ; CHECK-LABEL: G_DoReborn: ; CHECK: .set G_DoReborn.num_vgpr, max(44, P_RemoveMobj.num_vgpr, P_SpawnMobj.num_vgpr, P_SpawnPlayer.num_vgpr, I_Error.num_vgpr) ; CHECK: .set G_DoReborn.num_agpr, max(0, P_RemoveMobj.num_agpr, P_SpawnMobj.num_agpr, P_SpawnPlayer.num_agpr, I_Error.num_agpr) -; CHECK: .set G_DoReborn.numbered_sgpr, max(72, P_RemoveMobj.numbered_sgpr, P_SpawnMobj.numbered_sgpr, P_SpawnPlayer.numbered_sgpr, I_Error.numbered_sgpr) +; CHECK: .set G_DoReborn.numbered_sgpr, max(104, P_RemoveMobj.numbered_sgpr, P_SpawnMobj.numbered_sgpr, P_SpawnPlayer.numbered_sgpr, I_Error.numbered_sgpr) ; CHECK: .set G_DoReborn.private_seg_size, 32+(max(P_RemoveMobj.private_seg_size, P_SpawnMobj.private_seg_size, P_SpawnPlayer.private_seg_size, I_Error.private_seg_size)) ; CHECK: .set G_DoReborn.uses_vcc, or(1, P_RemoveMobj.uses_vcc, P_SpawnMobj.uses_vcc, P_SpawnPlayer.uses_vcc, I_Error.uses_vcc) ; CHECK: .set G_DoReborn.uses_flat_scratch, or(0, P_RemoveMobj.uses_flat_scratch, P_SpawnMobj.uses_flat_scratch, P_SpawnPlayer.uses_flat_scratch, I_Error.uses_flat_scratch) @@ -218,7 +218,7 @@ define void @F_Ticker() { ; CHECK-LABEL: G_CheckDemoStatus: ; CHECK: .set G_CheckDemoStatus.num_vgpr, max(43, I_Quit.num_vgpr, D_AdvanceDemo.num_vgpr, I_Error.num_vgpr) ; CHECK: .set G_CheckDemoStatus.num_agpr, max(0, I_Quit.num_agpr, D_AdvanceDemo.num_agpr, I_Error.num_agpr) -; CHECK: .set G_CheckDemoStatus.numbered_sgpr, max(60, I_Quit.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, I_Error.numbered_sgpr) +; CHECK: .set G_CheckDemoStatus.numbered_sgpr, max(84, I_Quit.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, I_Error.numbered_sgpr) ; CHECK: .set G_CheckDemoStatus.private_seg_size, 32+(max(I_Quit.private_seg_size, D_AdvanceDemo.private_seg_size, I_Error.private_seg_size)) ; CHECK: .set G_CheckDemoStatus.uses_vcc, or(1, I_Quit.uses_vcc, D_AdvanceDemo.uses_vcc, I_Error.uses_vcc) ; CHECK: .set G_CheckDemoStatus.uses_flat_scratch, or(0, I_Quit.uses_flat_scratch, D_AdvanceDemo.uses_flat_scratch, I_Error.uses_flat_scratch) @@ -264,7 +264,7 @@ define ptr @P_SaveGameFile() { ; CHECK-LABEL: R_FlatNumForName: ; CHECK: .set R_FlatNumForName.num_vgpr, max(42, I_Error.num_vgpr) ; CHECK: .set R_FlatNumForName.num_agpr, max(0, I_Error.num_agpr) -; CHECK: .set R_FlatNumForName.numbered_sgpr, max(48, I_Error.numbered_sgpr) +; CHECK: .set R_FlatNumForName.numbered_sgpr, max(56, I_Error.numbered_sgpr) ; CHECK: .set R_FlatNumForName.private_seg_size, 16+(max(I_Error.private_seg_size)) ; CHECK: .set R_FlatNumForName.uses_vcc, or(1, I_Error.uses_vcc) ; CHECK: .set R_FlatNumForName.uses_flat_scratch, or(0, I_Error.uses_flat_scratch) @@ -279,7 +279,7 @@ define i32 @R_FlatNumForName() { ; CHECK-LABEL: R_TextureNumForName: ; CHECK: .set R_TextureNumForName.num_vgpr, max(42, R_FlatNumForName.num_vgpr) ; CHECK: .set R_TextureNumForName.num_agpr, max(0, R_FlatNumForName.num_agpr) -; CHECK: .set R_TextureNumForName.numbered_sgpr, max(48, R_FlatNumForName.numbered_sgpr) +; CHECK: .set R_TextureNumForName.numbered_sgpr, max(56, R_FlatNumForName.numbered_sgpr) ; CHECK: .set R_TextureNumForName.private_seg_size, 16+(max(R_FlatNumForName.private_seg_size)) ; CHECK: .set R_TextureNumForName.uses_vcc, or(1, R_FlatNumForName.uses_vcc) ; CHECK: .set R_TextureNumForName.uses_flat_scratch, or(0, R_FlatNumForName.uses_flat_scratch) @@ -292,10 +292,10 @@ define i32 @R_TextureNumForName() { } ; CHECK-LABEL: G_Ticker: -; CHECK: .set G_Ticker.num_vgpr, max(46, G_DoReborn.num_vgpr, F_Ticker.num_vgpr, AM_Stop.num_vgpr, F_StartFinale.num_vgpr, D_AdvanceDemo.num_vgpr, R_FlatNumForName.num_vgpr, R_TextureNumForName.num_vgpr, P_TempSaveGameFile.num_vgpr, P_SaveGameFile.num_vgpr, I_Error.num_vgpr) +; CHECK: .set G_Ticker.num_vgpr, max(47, G_DoReborn.num_vgpr, F_Ticker.num_vgpr, AM_Stop.num_vgpr, F_StartFinale.num_vgpr, D_AdvanceDemo.num_vgpr, R_FlatNumForName.num_vgpr, R_TextureNumForName.num_vgpr, P_TempSaveGameFile.num_vgpr, P_SaveGameFile.num_vgpr, I_Error.num_vgpr) ; CHECK: .set G_Ticker.num_agpr, max(0, G_DoReborn.num_agpr, F_Ticker.num_agpr, AM_Stop.num_agpr, F_StartFinale.num_agpr, D_AdvanceDemo.num_agpr, R_FlatNumForName.num_agpr, R_TextureNumForName.num_agpr, P_TempSaveGameFile.num_agpr, P_SaveGameFile.num_agpr, I_Error.num_agpr) -; CHECK: .set G_Ticker.numbered_sgpr, max(84, G_DoReborn.numbered_sgpr, F_Ticker.numbered_sgpr, AM_Stop.numbered_sgpr, F_StartFinale.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, R_FlatNumForName.numbered_sgpr, R_TextureNumForName.numbered_sgpr, P_TempSaveGameFile.numbered_sgpr, P_SaveGameFile.numbered_sgpr, I_Error.numbered_sgpr) -; CHECK: .set G_Ticker.private_seg_size, 32+(max(G_DoReborn.private_seg_size, F_Ticker.private_seg_size, AM_Stop.private_seg_size, F_StartFinale.private_seg_size, D_AdvanceDemo.private_seg_size, R_FlatNumForName.private_seg_size, R_TextureNumForName.private_seg_size, P_TempSaveGameFile.private_seg_size, P_SaveGameFile.private_seg_size, I_Error.private_seg_size)) +; CHECK: .set G_Ticker.numbered_sgpr, max(105, G_DoReborn.numbered_sgpr, F_Ticker.numbered_sgpr, AM_Stop.numbered_sgpr, F_StartFinale.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, R_FlatNumForName.numbered_sgpr, R_TextureNumForName.numbered_sgpr, P_TempSaveGameFile.numbered_sgpr, P_SaveGameFile.numbered_sgpr, I_Error.numbered_sgpr) +; CHECK: .set G_Ticker.private_seg_size, 48+(max(G_DoReborn.private_seg_size, F_Ticker.private_seg_size, AM_Stop.private_seg_size, F_StartFinale.private_seg_size, D_AdvanceDemo.private_seg_size, R_FlatNumForName.private_seg_size, R_TextureNumForName.private_seg_size, P_TempSaveGameFile.private_seg_size, P_SaveGameFile.private_seg_size, I_Error.private_seg_size)) ; CHECK: .set G_Ticker.uses_vcc, or(1, G_DoReborn.uses_vcc, F_Ticker.uses_vcc, AM_Stop.uses_vcc, F_StartFinale.uses_vcc, D_AdvanceDemo.uses_vcc, R_FlatNumForName.uses_vcc, R_TextureNumForName.uses_vcc, P_TempSaveGameFile.uses_vcc, P_SaveGameFile.uses_vcc, I_Error.uses_vcc) ; CHECK: .set G_Ticker.uses_flat_scratch, or(0, G_DoReborn.uses_flat_scratch, F_Ticker.uses_flat_scratch, AM_Stop.uses_flat_scratch, F_StartFinale.uses_flat_scratch, D_AdvanceDemo.uses_flat_scratch, R_FlatNumForName.uses_flat_scratch, R_TextureNumForName.uses_flat_scratch, P_TempSaveGameFile.uses_flat_scratch, P_SaveGameFile.uses_flat_scratch, I_Error.uses_flat_scratch) ; CHECK: .set G_Ticker.has_dyn_sized_stack, or(0, G_DoReborn.has_dyn_sized_stack, F_Ticker.has_dyn_sized_stack, AM_Stop.has_dyn_sized_stack, F_StartFinale.has_dyn_sized_stack, D_AdvanceDemo.has_dyn_sized_stack, R_FlatNumForName.has_dyn_sized_stack, R_TextureNumForName.has_dyn_sized_stack, P_TempSaveGameFile.has_dyn_sized_stack, P_SaveGameFile.has_dyn_sized_stack, I_Error.has_dyn_sized_stack) @@ -316,9 +316,9 @@ define void @G_Ticker() { } ; CHECK-LABEL: RunTic: -; CHECK: .set RunTic.num_vgpr, max(46, G_CheckDemoStatus.num_vgpr, D_AdvanceDemo.num_vgpr, G_Ticker.num_vgpr) +; CHECK: .set RunTic.num_vgpr, max(47, G_CheckDemoStatus.num_vgpr, D_AdvanceDemo.num_vgpr, G_Ticker.num_vgpr) ; CHECK: .set RunTic.num_agpr, max(0, G_CheckDemoStatus.num_agpr, D_AdvanceDemo.num_agpr, G_Ticker.num_agpr) -; CHECK: .set RunTic.numbered_sgpr, max(84, G_CheckDemoStatus.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, G_Ticker.numbered_sgpr) +; CHECK: .set RunTic.numbered_sgpr, max(105, G_CheckDemoStatus.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, G_Ticker.numbered_sgpr) ; CHECK: .set RunTic.private_seg_size, 32+(max(G_CheckDemoStatus.private_seg_size, D_AdvanceDemo.private_seg_size, G_Ticker.private_seg_size)) ; CHECK: .set RunTic.uses_vcc, or(1, G_CheckDemoStatus.uses_vcc, D_AdvanceDemo.uses_vcc, G_Ticker.uses_vcc) ; CHECK: .set RunTic.uses_flat_scratch, or(0, G_CheckDemoStatus.uses_flat_scratch, D_AdvanceDemo.uses_flat_scratch, G_Ticker.uses_flat_scratch) diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir index ba6524caf668d..05cbd4c2a010d 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir @@ -27,39 +27,25 @@ body: | liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs - ; CHECK: liveins: $vgpr1, $vgpr2 + ; CHECK: liveins: $sgpr40, $sgpr41, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $sgpr4 = COPY $sgpr33 + ; CHECK-NEXT: $sgpr40 = frame-setup COPY $sgpr33 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc - ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 - ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; CHECK-NEXT: $sgpr41 = frame-setup COPY $sgpr34 ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 8192, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr33 - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -8192, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc - ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33 - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK-NEXT: $sgpr42 = S_MOV_B32 8192 + ; CHECK-NEXT: $vgpr0, dead $sgpr42_sgpr43 = V_ADD_CO_U32_e64 killed $sgpr42, killed $vgpr0, 0, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK-NEXT: $sgpr42 = S_MOV_B32 16384 + ; CHECK-NEXT: $vgpr2, dead $sgpr42_sgpr43 = V_ADD_CO_U32_e64 killed $sgpr42, killed $vgpr2, 0, implicit $exec + ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; CHECK-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 - ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 - ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc - ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; CHECK-NEXT: $sgpr33 = COPY $sgpr4 + ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr41 + ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr40 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -89,36 +75,24 @@ body: | liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr - ; CHECK: liveins: $sgpr29, $vgpr1, $vgpr2 + ; CHECK: liveins: $sgpr29, $sgpr40, $vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr29 = frame-setup COPY $sgpr33 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc - ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 0, undef $vgpr2 + ; CHECK-NEXT: $sgpr40 = frame-setup COPY $sgpr34 ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 8192, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr33 - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -8192, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc - ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33 - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr31 + ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK-NEXT: $sgpr42 = S_MOV_B32 8192 + ; CHECK-NEXT: $vgpr0, dead $sgpr42_sgpr43 = V_ADD_CO_U32_e64 killed $sgpr42, killed $vgpr0, 0, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK-NEXT: $sgpr42 = S_MOV_B32 16384 + ; CHECK-NEXT: $vgpr2, dead $sgpr42_sgpr43 = V_ADD_CO_U32_e64 killed $sgpr42, killed $vgpr2, 0, implicit $exec + ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr31 ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 - ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc - ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr40 ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr29 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -158,16 +132,12 @@ body: | ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 8192, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr33 - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -8192, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc - ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr33 - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK-NEXT: $sgpr40 = S_MOV_B32 8192 + ; CHECK-NEXT: $vgpr0, dead $sgpr40_sgpr41 = V_ADD_CO_U32_e64 killed $sgpr40, killed $vgpr0, 0, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK-NEXT: $sgpr40 = S_MOV_B32 16384 + ; CHECK-NEXT: $vgpr2, dead $sgpr40_sgpr41 = V_ADD_CO_U32_e64 killed $sgpr40, killed $vgpr2, 0, implicit $exec ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr31 ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr29 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir index 162d12f651d4a..4f1c9a20fddc3 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir @@ -23,64 +23,42 @@ body: | liveins: $vgpr1 ; MUBUF-LABEL: name: scavenge_sgpr_pei_no_sgprs - ; MUBUF: liveins: $vgpr1, $vgpr2 + ; MUBUF: liveins: $sgpr40, $sgpr41, $vgpr1 ; MUBUF-NEXT: {{ $}} - ; MUBUF-NEXT: $sgpr4 = COPY $sgpr33 + ; MUBUF-NEXT: $sgpr40 = frame-setup COPY $sgpr33 ; MUBUF-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; MUBUF-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc - ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc - ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) - ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 - ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; MUBUF-NEXT: $sgpr41 = frame-setup COPY $sgpr34 ; MUBUF-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; MUBUF-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; MUBUF-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; MUBUF-NEXT: $vgpr0 = V_ADD_U32_e32 8192, killed $vgpr0, implicit $exec - ; MUBUF-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; MUBUF-NEXT: $vgpr3 = V_ADD_U32_e32 16384, killed $vgpr3, implicit $exec - ; MUBUF-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; MUBUF-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; MUBUF-NEXT: $vgpr2 = V_ADD_U32_e32 16384, killed $vgpr2, implicit $exec + ; MUBUF-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; MUBUF-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; MUBUF-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 - ; MUBUF-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 - ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc - ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) - ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; MUBUF-NEXT: $sgpr33 = COPY $sgpr4 + ; MUBUF-NEXT: $sgpr34 = frame-destroy COPY $sgpr41 + ; MUBUF-NEXT: $sgpr33 = frame-destroy COPY $sgpr40 ; MUBUF-NEXT: S_ENDPGM 0, implicit $vcc ; ; FLATSCR-LABEL: name: scavenge_sgpr_pei_no_sgprs - ; FLATSCR: liveins: $vgpr1, $vgpr2 + ; FLATSCR: liveins: $sgpr40, $sgpr41, $vgpr1 ; FLATSCR-NEXT: {{ $}} - ; FLATSCR-NEXT: $sgpr4 = COPY $sgpr33 + ; FLATSCR-NEXT: $sgpr40 = frame-setup COPY $sgpr33 ; FLATSCR-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; FLATSCR-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc - ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc - ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) - ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 - ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; FLATSCR-NEXT: $sgpr41 = frame-setup COPY $sgpr34 ; FLATSCR-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; FLATSCR-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 8192, implicit-def $scc - ; FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, -8192, implicit-def $scc - ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 16384, implicit-def $scc - ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 - ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, -16384, implicit-def $scc + ; FLATSCR-NEXT: $sgpr42 = S_ADD_I32 $sgpr33, 8192, implicit-def $scc + ; FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr42, implicit $exec + ; FLATSCR-NEXT: $sgpr42 = S_ADD_I32 $sgpr33, 16384, implicit-def $scc + ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 killed $sgpr42, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; FLATSCR-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; FLATSCR-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 - ; FLATSCR-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 - ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc - ; FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) - ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; FLATSCR-NEXT: $sgpr33 = COPY $sgpr4 + ; FLATSCR-NEXT: $sgpr34 = frame-destroy COPY $sgpr41 + ; FLATSCR-NEXT: $sgpr33 = frame-destroy COPY $sgpr40 ; FLATSCR-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir index a4f936a4d705c..480859a09a347 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir @@ -22,34 +22,22 @@ body: | liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei - ; CHECK: liveins: $vgpr1, $vgpr2 + ; CHECK: liveins: $sgpr40, $sgpr41, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $sgpr4 = COPY $sgpr33 + ; CHECK-NEXT: $sgpr40 = frame-setup COPY $sgpr33 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 262080, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294705152, implicit-def dead $scc - ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 262400, implicit-def dead $scc - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 - ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; CHECK-NEXT: $sgpr41 = frame-setup COPY $sgpr34 ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 786432, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 4096, implicit-def $scc - ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33 - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -4096, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; CHECK-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK-NEXT: $sgpr42 = S_MOV_B32 4096 + ; CHECK-NEXT: $vgpr2, dead $sgpr42_sgpr43 = V_ADD_CO_U32_e64 killed $sgpr42, killed $vgpr2, 0, implicit $exec + ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; CHECK-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 - ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 - ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 262400, implicit-def dead $scc - ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; CHECK-NEXT: $sgpr33 = COPY $sgpr4 + ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr41 + ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr40 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_OR_B32_e32 %stack.0, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 33720ea9b28e6..ee89bf406c2a3 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -1776,20 +1776,20 @@ entry: define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX8-LABEL: DiffBase: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s42, -1 -; GFX8-NEXT: s_mov_b32 s43, 0xe80000 -; GFX8-NEXT: s_add_u32 s40, s40, s11 -; GFX8-NEXT: s_addc_u32 s41, s41, 0 +; GFX8-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s50, -1 +; GFX8-NEXT: s_mov_b32 s51, 0xe80000 +; GFX8-NEXT: s_add_u32 s48, s48, s11 +; GFX8-NEXT: s_addc_u32 s49, s49, 0 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX8-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 -; GFX8-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX8-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1839,20 +1839,20 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; ; GFX9-LABEL: DiffBase: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s42, -1 -; GFX9-NEXT: s_mov_b32 s43, 0xe00000 -; GFX9-NEXT: s_add_u32 s40, s40, s11 -; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1898,12 +1898,12 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; ; GFX10-LABEL: DiffBase: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s42, -1 -; GFX10-NEXT: s_mov_b32 s43, 0x31c16000 -; GFX10-NEXT: s_add_u32 s40, s40, s11 -; GFX10-NEXT: s_addc_u32 s41, s41, 0 +; GFX10-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s50, -1 +; GFX10-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX10-NEXT: s_add_u32 s48, s48, s11 +; GFX10-NEXT: s_addc_u32 s49, s49, 0 ; GFX10-NEXT: s_getpc_b64 s[0:1] ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 @@ -1911,8 +1911,8 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX10-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX10-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir index c6ee557d970cd..4a0bb6ceccd3f 100644 --- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir +++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir @@ -41,63 +41,102 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr34_sgpr35 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: renamable $sgpr36_sgpr37 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: renamable $sgpr38_sgpr39 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: renamable $sgpr40_sgpr41 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: renamable $sgpr60 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr42_sgpr43 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, implicit $exec - ; CHECK-NEXT: renamable $sgpr44_sgpr45 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: renamable $sgpr61 = S_MOV_B32 1083786240 + ; CHECK-NEXT: renamable $sgpr18_sgpr19 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr34_sgpr35 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: renamable $sgpr56 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, implicit $exec + ; CHECK-NEXT: renamable $sgpr100_sgpr101 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: renamable $sgpr57 = S_MOV_B32 1083786240 + ; CHECK-NEXT: SI_SPILL_S1024_SAVE renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5) ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr44_sgpr45, implicit-def dead $scc + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr100_sgpr101, implicit-def dead $scc ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_1024_align2 = COPY [[COPY]] ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.17 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.5(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr64 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr65 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr66 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr67 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr68 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr69 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr70 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr71 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr72 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr73 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr74 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr75 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr76 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr77 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr78 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr79 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr80 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr81 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr82 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr83 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr84 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr85 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr86 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr87 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr88 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr89 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr90 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr91 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr92 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr93 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr94 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr95 = COPY renamable $sgpr60 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $exec + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr40 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr41 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr42 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr43 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr44 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr45 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr46 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr47 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr48 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr49 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr50 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr51 = COPY killed renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 + ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr56 = COPY killed renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 + ; CHECK-NEXT: renamable $sgpr52 = COPY renamable $sgpr56 + ; CHECK-NEXT: renamable $sgpr53 = COPY killed renamable $sgpr76 + ; CHECK-NEXT: renamable $sgpr56_sgpr57 = COPY renamable $sgpr52_sgpr53 + ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 + ; CHECK-NEXT: renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 + ; CHECK-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr56_sgpr57 + ; CHECK-NEXT: renamable $sgpr54 = COPY killed renamable $sgpr76 + ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 + ; CHECK-NEXT: renamable $sgpr48_sgpr49_sgpr50 = COPY renamable $sgpr52_sgpr53_sgpr54 + ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54 = COPY renamable $sgpr48_sgpr49_sgpr50 + ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 + ; CHECK-NEXT: renamable $sgpr55 = COPY killed renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr56 = COPY killed renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr57 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr58 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr59 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr60 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr61 = COPY killed renamable $sgpr80 + ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr62 = COPY killed renamable $sgpr80 + ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr63 = COPY killed renamable $sgpr80 + ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr64 = COPY killed renamable $sgpr80 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr65 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr66 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr67 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr68 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 + ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 = COPY renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 + ; CHECK-NEXT: renamable $sgpr64 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr65 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr66 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr67 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.11, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} @@ -126,111 +165,117 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: successors: %bb.12(0x40000000), %bb.6(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = S_AND_B64 renamable $sgpr38_sgpr39, undef renamable $sgpr46_sgpr47, implicit-def dead $scc - ; CHECK-NEXT: renamable $sgpr46_sgpr47 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = S_AND_B64 killed renamable $sgpr12_sgpr13, undef renamable $sgpr54_sgpr55, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr54_sgpr55 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr12_sgpr13 ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.12, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.6: ; CHECK-NEXT: successors: %bb.7(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr40_sgpr41, implicit $exec + ; CHECK-NEXT: dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr34_sgpr35, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.7: ; CHECK-NEXT: successors: %bb.8(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr48_sgpr49 = nofpexcept V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $sgpr50_sgpr51 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $sgpr64_sgpr65 = nofpexcept V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $sgpr66_sgpr67 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec ; CHECK-NEXT: dead [[V_INDIRECT_REG_READ_GPR_IDX_B32_V32_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V32 [[COPY1]], undef $sgpr14, 11, implicit-def $m0, implicit $m0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.8: ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.9(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr48_sgpr49, implicit-def dead $scc + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr64_sgpr65, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.10, implicit $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.9: ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY renamable $sgpr60_sgpr61, implicit $exec + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr84_sgpr85, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR undef %18:vgpr_32, [[COPY2]], undef renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s64), addrspace 1) - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr34_sgpr35, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr18_sgpr19, implicit $exec ; CHECK-NEXT: dead renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec - ; CHECK-NEXT: renamable $sgpr58 = S_ADD_U32 renamable $sgpr8, 32, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr82 = S_ADD_U32 renamable $sgpr8, 32, implicit-def dead $scc ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: renamable $sgpr52_sgpr53 = COPY killed renamable $sgpr4_sgpr5 - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY renamable $sgpr52_sgpr53 - ; CHECK-NEXT: renamable $sgpr54_sgpr55 = COPY killed renamable $sgpr6_sgpr7 - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY renamable $sgpr54_sgpr55 - ; CHECK-NEXT: renamable $sgpr56_sgpr57 = COPY killed renamable $sgpr10_sgpr11 - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY renamable $sgpr56_sgpr57 + ; CHECK-NEXT: renamable $sgpr68_sgpr69 = COPY killed renamable $sgpr4_sgpr5 + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY renamable $sgpr68_sgpr69 + ; CHECK-NEXT: renamable $sgpr70_sgpr71 = COPY killed renamable $sgpr6_sgpr7 + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY renamable $sgpr70_sgpr71 + ; CHECK-NEXT: renamable $sgpr80_sgpr81 = COPY killed renamable $sgpr10_sgpr11 + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY renamable $sgpr80_sgpr81 ; CHECK-NEXT: $sgpr12 = COPY renamable $sgpr14 ; CHECK-NEXT: $sgpr13 = COPY renamable $sgpr15 - ; CHECK-NEXT: renamable $sgpr62 = COPY killed renamable $sgpr8 + ; CHECK-NEXT: renamable $sgpr84 = COPY killed renamable $sgpr8 ; CHECK-NEXT: renamable $sgpr33 = COPY killed renamable $sgpr16 - ; CHECK-NEXT: renamable $sgpr59 = COPY killed renamable $sgpr15 - ; CHECK-NEXT: renamable $sgpr63 = COPY killed renamable $sgpr14 + ; CHECK-NEXT: renamable $sgpr83 = COPY killed renamable $sgpr15 + ; CHECK-NEXT: renamable $sgpr85 = COPY killed renamable $sgpr14 + ; CHECK-NEXT: renamable $sgpr48_sgpr49 = COPY killed renamable $sgpr18_sgpr19 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: $sgpr8_sgpr9 = COPY renamable $sgpr58_sgpr59 + ; CHECK-NEXT: $sgpr8_sgpr9 = COPY renamable $sgpr82_sgpr83 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9 - ; CHECK-NEXT: renamable $sgpr14 = COPY killed renamable $sgpr63 - ; CHECK-NEXT: renamable $sgpr15 = COPY killed renamable $sgpr59 + ; CHECK-NEXT: renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr48_sgpr49 + ; CHECK-NEXT: renamable $sgpr14 = COPY killed renamable $sgpr85 + ; CHECK-NEXT: renamable $sgpr15 = COPY killed renamable $sgpr83 ; CHECK-NEXT: renamable $sgpr16 = COPY killed renamable $sgpr33 - ; CHECK-NEXT: renamable $sgpr4_sgpr5 = COPY killed renamable $sgpr52_sgpr53 - ; CHECK-NEXT: renamable $sgpr6_sgpr7 = COPY killed renamable $sgpr54_sgpr55 - ; CHECK-NEXT: renamable $sgpr8 = COPY killed renamable $sgpr62 - ; CHECK-NEXT: renamable $sgpr10_sgpr11 = COPY killed renamable $sgpr56_sgpr57 + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = COPY killed renamable $sgpr68_sgpr69 + ; CHECK-NEXT: renamable $sgpr6_sgpr7 = COPY killed renamable $sgpr70_sgpr71 + ; CHECK-NEXT: renamable $sgpr8 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr10_sgpr11 = COPY killed renamable $sgpr80_sgpr81 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: $exec = S_MOV_B64_term renamable $sgpr50_sgpr51 + ; CHECK-NEXT: $exec = S_MOV_B64_term renamable $sgpr66_sgpr67 ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.10, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.17 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.10: ; CHECK-NEXT: successors: %bb.8(0x40000000), %bb.12(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.12 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.11: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.17 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.12: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.13(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr46_sgpr47 + ; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr54_sgpr55 ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.11, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.13: ; CHECK-NEXT: successors: %bb.15(0x40000000), %bb.14(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr42_sgpr43, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.15, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.14 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.14: ; CHECK-NEXT: successors: %bb.15(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.15: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.16(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr36_sgpr37, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.11, implicit $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.16: diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll index 570ea4b7132aa..0d25bc97ff775 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll @@ -246,10 +246,10 @@ bb3: ; CHECK-LABEL: {{^}}spill_func: ; GCN: NumSgprs: 104 ; GCN-GCNTRACKERS: NumSgprs: 104 -; GCN: NumVgprs: 3 -; GCN-GCNTRACKERS: NumVgprs: 4 -; GCN: ScratchSize: 12 -; GCN-GCNTRACKERS: ScratchSize: 16 +; GCN: NumVgprs: 2 +; GCN-GCNTRACKERS: NumVgprs: 3 +; GCN: ScratchSize: 8 +; GCN-GCNTRACKERS: ScratchSize: 12 define void @spill_func(ptr addrspace(1) %arg) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index d345b57d3d08b..e920fdee51815 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -1903,19 +1903,14 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; VI-LABEL: v_vselect_v16f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v31, s30, 0 -; VI-NEXT: v_writelane_b32 v31, s31, 1 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 ; VI-NEXT: v_cmp_eq_u32_e64 s[18:19], 0, v17 -; VI-NEXT: v_cmp_eq_u32_e64 s[30:31], 0, v29 +; VI-NEXT: v_cmp_eq_u32_e64 s[40:41], 0, v29 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 ; VI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 ; VI-NEXT: v_cmp_eq_u32_e64 s[28:29], 0, v27 -; VI-NEXT: v_cndmask_b32_e64 v16, v17, v16, s[30:31] +; VI-NEXT: v_cndmask_b32_e64 v16, v17, v16, s[40:41] ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 ; VI-NEXT: v_cmp_eq_u32_e64 s[20:21], 0, v19 @@ -1957,8 +1952,6 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; VI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v28 ; VI-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[14:15] ; VI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[16:17] -; VI-NEXT: v_readlane_b32 s31, v31, 1 -; VI-NEXT: v_readlane_b32 s30, v31, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc @@ -1976,10 +1969,6 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_sdwa v6, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_vselect_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll index dc227f745aa9a..7ee7c83e0122d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll @@ -15008,18 +15008,10 @@ define void @s_shuffle_v2i64_v8i64__15_6() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -15033,18 +15025,10 @@ define void @s_shuffle_v2i64_v8i64__15_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -15065,18 +15049,10 @@ define void @s_shuffle_v2i64_v8i64__15_6() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -15090,18 +15066,10 @@ define void @s_shuffle_v2i64_v8i64__15_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -15158,21 +15126,13 @@ define void @s_shuffle_v2i64_v8i64__15_7() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -15183,18 +15143,10 @@ define void @s_shuffle_v2i64_v8i64__15_7() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -15215,21 +15167,13 @@ define void @s_shuffle_v2i64_v8i64__15_7() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -15240,18 +15184,10 @@ define void @s_shuffle_v2i64_v8i64__15_7() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -16122,18 +16058,10 @@ define void @s_shuffle_v2i64_v8i64__10_0() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -16145,18 +16073,10 @@ define void @s_shuffle_v2i64_v8i64__10_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -16177,18 +16097,10 @@ define void @s_shuffle_v2i64_v8i64__10_0() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -16200,18 +16112,10 @@ define void @s_shuffle_v2i64_v8i64__10_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -16883,18 +16787,10 @@ define void @s_shuffle_v2i64_v8i64__10_1() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -16906,18 +16802,10 @@ define void @s_shuffle_v2i64_v8i64__10_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -16938,18 +16826,10 @@ define void @s_shuffle_v2i64_v8i64__10_1() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -16961,18 +16841,10 @@ define void @s_shuffle_v2i64_v8i64__10_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -20983,18 +20855,10 @@ define void @s_shuffle_v2i64_v8i64__9_6() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -21008,18 +20872,10 @@ define void @s_shuffle_v2i64_v8i64__9_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -21040,18 +20896,10 @@ define void @s_shuffle_v2i64_v8i64__9_6() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -21065,18 +20913,10 @@ define void @s_shuffle_v2i64_v8i64__9_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -21189,18 +21029,10 @@ define void @s_shuffle_v2i64_v8i64__11_6() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -21214,18 +21046,10 @@ define void @s_shuffle_v2i64_v8i64__11_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -21246,18 +21070,10 @@ define void @s_shuffle_v2i64_v8i64__11_6() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -21271,18 +21087,10 @@ define void @s_shuffle_v2i64_v8i64__11_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -21339,18 +21147,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -21364,18 +21164,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -21396,18 +21188,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -21421,18 +21205,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -21477,18 +21253,10 @@ define void @s_shuffle_v2i64_v8i64__13_6() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -21502,18 +21270,10 @@ define void @s_shuffle_v2i64_v8i64__13_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -21534,18 +21294,10 @@ define void @s_shuffle_v2i64_v8i64__13_6() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -21559,18 +21311,10 @@ define void @s_shuffle_v2i64_v8i64__13_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -21627,18 +21371,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -21652,18 +21388,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -21684,18 +21412,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -21709,18 +21429,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -22203,21 +21915,13 @@ define void @s_shuffle_v2i64_v8i64__9_7() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -22228,18 +21932,10 @@ define void @s_shuffle_v2i64_v8i64__9_7() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -22260,21 +21956,13 @@ define void @s_shuffle_v2i64_v8i64__9_7() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -22285,18 +21973,10 @@ define void @s_shuffle_v2i64_v8i64__9_7() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -22409,21 +22089,13 @@ define void @s_shuffle_v2i64_v8i64__11_7() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -22434,18 +22106,10 @@ define void @s_shuffle_v2i64_v8i64__11_7() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -22466,21 +22130,13 @@ define void @s_shuffle_v2i64_v8i64__11_7() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -22491,18 +22147,10 @@ define void @s_shuffle_v2i64_v8i64__11_7() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -22559,18 +22207,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -22584,18 +22224,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -22616,18 +22248,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -22641,18 +22265,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -22697,21 +22313,13 @@ define void @s_shuffle_v2i64_v8i64__13_7() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -22722,18 +22330,10 @@ define void @s_shuffle_v2i64_v8i64__13_7() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -22754,21 +22354,13 @@ define void @s_shuffle_v2i64_v8i64__13_7() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -22779,18 +22371,10 @@ define void @s_shuffle_v2i64_v8i64__13_7() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -22847,18 +22431,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -22872,18 +22448,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -22904,18 +22472,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -22929,18 +22489,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -23861,18 +23413,10 @@ define void @s_shuffle_v2i64_v8i64__3_9() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -23884,18 +23428,10 @@ define void @s_shuffle_v2i64_v8i64__3_9() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -23916,18 +23452,10 @@ define void @s_shuffle_v2i64_v8i64__3_9() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -23939,18 +23467,10 @@ define void @s_shuffle_v2i64_v8i64__3_9() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -24166,21 +23686,13 @@ define void @s_shuffle_v2i64_v8i64__6_9() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -24191,18 +23703,10 @@ define void @s_shuffle_v2i64_v8i64__6_9() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -24223,21 +23727,13 @@ define void @s_shuffle_v2i64_v8i64__6_9() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -24248,18 +23744,10 @@ define void @s_shuffle_v2i64_v8i64__6_9() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -25042,21 +24530,13 @@ define void @s_shuffle_v2i64_v8i64__6_10() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -25067,18 +24547,10 @@ define void @s_shuffle_v2i64_v8i64__6_10() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -25099,21 +24571,13 @@ define void @s_shuffle_v2i64_v8i64__6_10() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -25124,18 +24588,10 @@ define void @s_shuffle_v2i64_v8i64__6_10() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -25192,18 +24648,10 @@ define void @s_shuffle_v2i64_v8i64__7_10() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -25217,18 +24665,10 @@ define void @s_shuffle_v2i64_v8i64__7_10() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -25249,18 +24689,10 @@ define void @s_shuffle_v2i64_v8i64__7_10() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -25274,18 +24706,10 @@ define void @s_shuffle_v2i64_v8i64__7_10() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -25737,18 +25161,10 @@ define void @s_shuffle_v2i64_v8i64__1_11() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -25760,18 +25176,10 @@ define void @s_shuffle_v2i64_v8i64__1_11() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -25792,18 +25200,10 @@ define void @s_shuffle_v2i64_v8i64__1_11() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -25815,18 +25215,10 @@ define void @s_shuffle_v2i64_v8i64__1_11() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -26154,21 +25546,13 @@ define void @s_shuffle_v2i64_v8i64__6_11() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -26179,18 +25563,10 @@ define void @s_shuffle_v2i64_v8i64__6_11() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -26211,21 +25587,13 @@ define void @s_shuffle_v2i64_v8i64__6_11() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -26236,18 +25604,10 @@ define void @s_shuffle_v2i64_v8i64__6_11() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -27030,21 +26390,13 @@ define void @s_shuffle_v2i64_v8i64__6_12() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -27055,18 +26407,10 @@ define void @s_shuffle_v2i64_v8i64__6_12() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -27087,21 +26431,13 @@ define void @s_shuffle_v2i64_v8i64__6_12() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -27112,18 +26448,10 @@ define void @s_shuffle_v2i64_v8i64__6_12() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -27180,18 +26508,10 @@ define void @s_shuffle_v2i64_v8i64__7_12() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -27205,18 +26525,10 @@ define void @s_shuffle_v2i64_v8i64__7_12() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -27237,18 +26549,10 @@ define void @s_shuffle_v2i64_v8i64__7_12() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -27262,18 +26566,10 @@ define void @s_shuffle_v2i64_v8i64__7_12() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -27989,21 +27285,13 @@ define void @s_shuffle_v2i64_v8i64__6_13() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -28014,18 +27302,10 @@ define void @s_shuffle_v2i64_v8i64__6_13() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -28046,21 +27326,13 @@ define void @s_shuffle_v2i64_v8i64__6_13() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -28071,18 +27343,10 @@ define void @s_shuffle_v2i64_v8i64__6_13() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -28139,18 +27403,10 @@ define void @s_shuffle_v2i64_v8i64__7_13() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -28164,18 +27420,10 @@ define void @s_shuffle_v2i64_v8i64__7_13() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -28196,18 +27444,10 @@ define void @s_shuffle_v2i64_v8i64__7_13() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -28221,18 +27461,10 @@ define void @s_shuffle_v2i64_v8i64__7_13() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -29011,21 +28243,13 @@ define void @s_shuffle_v2i64_v8i64__6_14() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -29036,18 +28260,10 @@ define void @s_shuffle_v2i64_v8i64__6_14() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -29068,21 +28284,13 @@ define void @s_shuffle_v2i64_v8i64__6_14() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -29093,18 +28301,10 @@ define void @s_shuffle_v2i64_v8i64__6_14() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -29161,18 +28361,10 @@ define void @s_shuffle_v2i64_v8i64__7_14() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -29186,18 +28378,10 @@ define void @s_shuffle_v2i64_v8i64__7_14() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -29218,18 +28402,10 @@ define void @s_shuffle_v2i64_v8i64__7_14() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -29243,18 +28419,10 @@ define void @s_shuffle_v2i64_v8i64__7_14() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -30057,21 +29225,13 @@ define void @s_shuffle_v2i64_v8i64__6_15() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -30082,18 +29242,10 @@ define void @s_shuffle_v2i64_v8i64__6_15() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -30114,21 +29266,13 @@ define void @s_shuffle_v2i64_v8i64__6_15() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -30139,18 +29283,10 @@ define void @s_shuffle_v2i64_v8i64__6_15() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -30207,18 +29343,10 @@ define void @s_shuffle_v2i64_v8i64__7_15() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -30232,18 +29360,10 @@ define void @s_shuffle_v2i64_v8i64__7_15() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -30264,18 +29384,10 @@ define void @s_shuffle_v2i64_v8i64__7_15() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -30289,18 +29401,10 @@ define void @s_shuffle_v2i64_v8i64__7_15() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index b57adfe7d9306..0221bb0cf4f35 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -612,40 +612,40 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; FIJI-NEXT: v_writelane_b32 v40, s37, 5 ; FIJI-NEXT: v_writelane_b32 v40, s38, 6 ; FIJI-NEXT: v_writelane_b32 v40, s39, 7 -; FIJI-NEXT: v_writelane_b32 v40, s40, 8 -; FIJI-NEXT: v_writelane_b32 v40, s41, 9 -; FIJI-NEXT: v_writelane_b32 v40, s42, 10 -; FIJI-NEXT: v_writelane_b32 v40, s43, 11 -; FIJI-NEXT: v_writelane_b32 v40, s44, 12 -; FIJI-NEXT: v_writelane_b32 v40, s45, 13 -; FIJI-NEXT: v_writelane_b32 v40, s46, 14 -; FIJI-NEXT: v_writelane_b32 v40, s47, 15 -; FIJI-NEXT: v_writelane_b32 v40, s48, 16 -; FIJI-NEXT: s_mov_b32 s42, s15 -; FIJI-NEXT: s_mov_b32 s43, s14 -; FIJI-NEXT: s_mov_b32 s44, s13 -; FIJI-NEXT: s_mov_b32 s45, s12 +; FIJI-NEXT: v_writelane_b32 v40, s48, 8 +; FIJI-NEXT: v_writelane_b32 v40, s49, 9 +; FIJI-NEXT: v_writelane_b32 v40, s50, 10 +; FIJI-NEXT: v_writelane_b32 v40, s51, 11 +; FIJI-NEXT: v_writelane_b32 v40, s52, 12 +; FIJI-NEXT: v_writelane_b32 v40, s53, 13 +; FIJI-NEXT: v_writelane_b32 v40, s54, 14 +; FIJI-NEXT: v_writelane_b32 v40, s55, 15 +; FIJI-NEXT: v_writelane_b32 v40, s64, 16 +; FIJI-NEXT: s_mov_b32 s50, s15 +; FIJI-NEXT: s_mov_b32 s51, s14 +; FIJI-NEXT: s_mov_b32 s52, s13 +; FIJI-NEXT: s_mov_b32 s53, s12 ; FIJI-NEXT: s_mov_b64 s[34:35], s[10:11] ; FIJI-NEXT: s_mov_b64 s[36:37], s[8:9] ; FIJI-NEXT: s_mov_b64 s[38:39], s[6:7] -; FIJI-NEXT: s_mov_b64 s[40:41], s[4:5] +; FIJI-NEXT: s_mov_b64 s[48:49], s[4:5] ; FIJI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 -; FIJI-NEXT: s_mov_b64 s[46:47], exec +; FIJI-NEXT: s_mov_b64 s[54:55], exec ; FIJI-NEXT: s_addk_i32 s32, 0x400 -; FIJI-NEXT: v_writelane_b32 v40, s49, 17 +; FIJI-NEXT: v_writelane_b32 v40, s65, 17 ; FIJI-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; FIJI-NEXT: v_readfirstlane_b32 s16, v0 ; FIJI-NEXT: v_readfirstlane_b32 s17, v1 ; FIJI-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; FIJI-NEXT: s_and_saveexec_b64 s[48:49], vcc -; FIJI-NEXT: s_mov_b64 s[4:5], s[40:41] +; FIJI-NEXT: s_and_saveexec_b64 s[64:65], vcc +; FIJI-NEXT: s_mov_b64 s[4:5], s[48:49] ; FIJI-NEXT: s_mov_b64 s[6:7], s[38:39] ; FIJI-NEXT: s_mov_b64 s[8:9], s[36:37] ; FIJI-NEXT: s_mov_b64 s[10:11], s[34:35] -; FIJI-NEXT: s_mov_b32 s12, s45 -; FIJI-NEXT: s_mov_b32 s13, s44 -; FIJI-NEXT: s_mov_b32 s14, s43 -; FIJI-NEXT: s_mov_b32 s15, s42 +; FIJI-NEXT: s_mov_b32 s12, s53 +; FIJI-NEXT: s_mov_b32 s13, s52 +; FIJI-NEXT: s_mov_b32 s14, s51 +; FIJI-NEXT: s_mov_b32 s15, s50 ; FIJI-NEXT: v_mov_b32_e32 v0, v2 ; FIJI-NEXT: v_mov_b32_e32 v1, v3 ; FIJI-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -654,21 +654,21 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; FIJI-NEXT: ; implicit-def: $vgpr31 ; FIJI-NEXT: ; implicit-def: $vgpr2 ; FIJI-NEXT: ; implicit-def: $vgpr3 -; FIJI-NEXT: s_xor_b64 exec, exec, s[48:49] +; FIJI-NEXT: s_xor_b64 exec, exec, s[64:65] ; FIJI-NEXT: s_cbranch_execnz .LBB18_1 ; FIJI-NEXT: ; %bb.2: -; FIJI-NEXT: s_mov_b64 exec, s[46:47] +; FIJI-NEXT: s_mov_b64 exec, s[54:55] ; FIJI-NEXT: v_mov_b32_e32 v0, v4 -; FIJI-NEXT: v_readlane_b32 s49, v40, 17 -; FIJI-NEXT: v_readlane_b32 s48, v40, 16 -; FIJI-NEXT: v_readlane_b32 s47, v40, 15 -; FIJI-NEXT: v_readlane_b32 s46, v40, 14 -; FIJI-NEXT: v_readlane_b32 s45, v40, 13 -; FIJI-NEXT: v_readlane_b32 s44, v40, 12 -; FIJI-NEXT: v_readlane_b32 s43, v40, 11 -; FIJI-NEXT: v_readlane_b32 s42, v40, 10 -; FIJI-NEXT: v_readlane_b32 s41, v40, 9 -; FIJI-NEXT: v_readlane_b32 s40, v40, 8 +; FIJI-NEXT: v_readlane_b32 s65, v40, 17 +; FIJI-NEXT: v_readlane_b32 s64, v40, 16 +; FIJI-NEXT: v_readlane_b32 s55, v40, 15 +; FIJI-NEXT: v_readlane_b32 s54, v40, 14 +; FIJI-NEXT: v_readlane_b32 s53, v40, 13 +; FIJI-NEXT: v_readlane_b32 s52, v40, 12 +; FIJI-NEXT: v_readlane_b32 s51, v40, 11 +; FIJI-NEXT: v_readlane_b32 s50, v40, 10 +; FIJI-NEXT: v_readlane_b32 s49, v40, 9 +; FIJI-NEXT: v_readlane_b32 s48, v40, 8 ; FIJI-NEXT: v_readlane_b32 s39, v40, 7 ; FIJI-NEXT: v_readlane_b32 s38, v40, 6 ; FIJI-NEXT: v_readlane_b32 s37, v40, 5 @@ -703,40 +703,40 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; HAWAII-NEXT: v_writelane_b32 v40, s37, 5 ; HAWAII-NEXT: v_writelane_b32 v40, s38, 6 ; HAWAII-NEXT: v_writelane_b32 v40, s39, 7 -; HAWAII-NEXT: v_writelane_b32 v40, s40, 8 -; HAWAII-NEXT: v_writelane_b32 v40, s41, 9 -; HAWAII-NEXT: v_writelane_b32 v40, s42, 10 -; HAWAII-NEXT: v_writelane_b32 v40, s43, 11 -; HAWAII-NEXT: v_writelane_b32 v40, s44, 12 -; HAWAII-NEXT: v_writelane_b32 v40, s45, 13 -; HAWAII-NEXT: v_writelane_b32 v40, s46, 14 -; HAWAII-NEXT: v_writelane_b32 v40, s47, 15 -; HAWAII-NEXT: v_writelane_b32 v40, s48, 16 -; HAWAII-NEXT: s_mov_b32 s42, s15 -; HAWAII-NEXT: s_mov_b32 s43, s14 -; HAWAII-NEXT: s_mov_b32 s44, s13 -; HAWAII-NEXT: s_mov_b32 s45, s12 +; HAWAII-NEXT: v_writelane_b32 v40, s48, 8 +; HAWAII-NEXT: v_writelane_b32 v40, s49, 9 +; HAWAII-NEXT: v_writelane_b32 v40, s50, 10 +; HAWAII-NEXT: v_writelane_b32 v40, s51, 11 +; HAWAII-NEXT: v_writelane_b32 v40, s52, 12 +; HAWAII-NEXT: v_writelane_b32 v40, s53, 13 +; HAWAII-NEXT: v_writelane_b32 v40, s54, 14 +; HAWAII-NEXT: v_writelane_b32 v40, s55, 15 +; HAWAII-NEXT: v_writelane_b32 v40, s64, 16 +; HAWAII-NEXT: s_mov_b32 s50, s15 +; HAWAII-NEXT: s_mov_b32 s51, s14 +; HAWAII-NEXT: s_mov_b32 s52, s13 +; HAWAII-NEXT: s_mov_b32 s53, s12 ; HAWAII-NEXT: s_mov_b64 s[34:35], s[10:11] ; HAWAII-NEXT: s_mov_b64 s[36:37], s[8:9] ; HAWAII-NEXT: s_mov_b64 s[38:39], s[6:7] -; HAWAII-NEXT: s_mov_b64 s[40:41], s[4:5] +; HAWAII-NEXT: s_mov_b64 s[48:49], s[4:5] ; HAWAII-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; HAWAII-NEXT: s_mov_b64 s[46:47], exec +; HAWAII-NEXT: s_mov_b64 s[54:55], exec ; HAWAII-NEXT: s_addk_i32 s32, 0x400 -; HAWAII-NEXT: v_writelane_b32 v40, s49, 17 +; HAWAII-NEXT: v_writelane_b32 v40, s65, 17 ; HAWAII-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; HAWAII-NEXT: v_readfirstlane_b32 s16, v0 ; HAWAII-NEXT: v_readfirstlane_b32 s17, v1 ; HAWAII-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; HAWAII-NEXT: s_and_saveexec_b64 s[48:49], vcc -; HAWAII-NEXT: s_mov_b64 s[4:5], s[40:41] +; HAWAII-NEXT: s_and_saveexec_b64 s[64:65], vcc +; HAWAII-NEXT: s_mov_b64 s[4:5], s[48:49] ; HAWAII-NEXT: s_mov_b64 s[6:7], s[38:39] ; HAWAII-NEXT: s_mov_b64 s[8:9], s[36:37] ; HAWAII-NEXT: s_mov_b64 s[10:11], s[34:35] -; HAWAII-NEXT: s_mov_b32 s12, s45 -; HAWAII-NEXT: s_mov_b32 s13, s44 -; HAWAII-NEXT: s_mov_b32 s14, s43 -; HAWAII-NEXT: s_mov_b32 s15, s42 +; HAWAII-NEXT: s_mov_b32 s12, s53 +; HAWAII-NEXT: s_mov_b32 s13, s52 +; HAWAII-NEXT: s_mov_b32 s14, s51 +; HAWAII-NEXT: s_mov_b32 s15, s50 ; HAWAII-NEXT: v_mov_b32_e32 v0, v2 ; HAWAII-NEXT: v_mov_b32_e32 v1, v3 ; HAWAII-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -745,21 +745,21 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; HAWAII-NEXT: ; implicit-def: $vgpr31 ; HAWAII-NEXT: ; implicit-def: $vgpr2 ; HAWAII-NEXT: ; implicit-def: $vgpr3 -; HAWAII-NEXT: s_xor_b64 exec, exec, s[48:49] +; HAWAII-NEXT: s_xor_b64 exec, exec, s[64:65] ; HAWAII-NEXT: s_cbranch_execnz .LBB18_1 ; HAWAII-NEXT: ; %bb.2: -; HAWAII-NEXT: s_mov_b64 exec, s[46:47] +; HAWAII-NEXT: s_mov_b64 exec, s[54:55] ; HAWAII-NEXT: v_mov_b32_e32 v0, v4 -; HAWAII-NEXT: v_readlane_b32 s49, v40, 17 -; HAWAII-NEXT: v_readlane_b32 s48, v40, 16 -; HAWAII-NEXT: v_readlane_b32 s47, v40, 15 -; HAWAII-NEXT: v_readlane_b32 s46, v40, 14 -; HAWAII-NEXT: v_readlane_b32 s45, v40, 13 -; HAWAII-NEXT: v_readlane_b32 s44, v40, 12 -; HAWAII-NEXT: v_readlane_b32 s43, v40, 11 -; HAWAII-NEXT: v_readlane_b32 s42, v40, 10 -; HAWAII-NEXT: v_readlane_b32 s41, v40, 9 -; HAWAII-NEXT: v_readlane_b32 s40, v40, 8 +; HAWAII-NEXT: v_readlane_b32 s65, v40, 17 +; HAWAII-NEXT: v_readlane_b32 s64, v40, 16 +; HAWAII-NEXT: v_readlane_b32 s55, v40, 15 +; HAWAII-NEXT: v_readlane_b32 s54, v40, 14 +; HAWAII-NEXT: v_readlane_b32 s53, v40, 13 +; HAWAII-NEXT: v_readlane_b32 s52, v40, 12 +; HAWAII-NEXT: v_readlane_b32 s51, v40, 11 +; HAWAII-NEXT: v_readlane_b32 s50, v40, 10 +; HAWAII-NEXT: v_readlane_b32 s49, v40, 9 +; HAWAII-NEXT: v_readlane_b32 s48, v40, 8 ; HAWAII-NEXT: v_readlane_b32 s39, v40, 7 ; HAWAII-NEXT: v_readlane_b32 s38, v40, 6 ; HAWAII-NEXT: v_readlane_b32 s37, v40, 5 @@ -794,40 +794,40 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; GFX9-NEXT: v_writelane_b32 v40, s37, 5 ; GFX9-NEXT: v_writelane_b32 v40, s38, 6 ; GFX9-NEXT: v_writelane_b32 v40, s39, 7 -; GFX9-NEXT: v_writelane_b32 v40, s40, 8 -; GFX9-NEXT: v_writelane_b32 v40, s41, 9 -; GFX9-NEXT: v_writelane_b32 v40, s42, 10 -; GFX9-NEXT: v_writelane_b32 v40, s43, 11 -; GFX9-NEXT: v_writelane_b32 v40, s44, 12 -; GFX9-NEXT: v_writelane_b32 v40, s45, 13 -; GFX9-NEXT: v_writelane_b32 v40, s46, 14 -; GFX9-NEXT: v_writelane_b32 v40, s47, 15 -; GFX9-NEXT: v_writelane_b32 v40, s48, 16 -; GFX9-NEXT: s_mov_b32 s42, s15 -; GFX9-NEXT: s_mov_b32 s43, s14 -; GFX9-NEXT: s_mov_b32 s44, s13 -; GFX9-NEXT: s_mov_b32 s45, s12 +; GFX9-NEXT: v_writelane_b32 v40, s48, 8 +; GFX9-NEXT: v_writelane_b32 v40, s49, 9 +; GFX9-NEXT: v_writelane_b32 v40, s50, 10 +; GFX9-NEXT: v_writelane_b32 v40, s51, 11 +; GFX9-NEXT: v_writelane_b32 v40, s52, 12 +; GFX9-NEXT: v_writelane_b32 v40, s53, 13 +; GFX9-NEXT: v_writelane_b32 v40, s54, 14 +; GFX9-NEXT: v_writelane_b32 v40, s55, 15 +; GFX9-NEXT: v_writelane_b32 v40, s64, 16 +; GFX9-NEXT: s_mov_b32 s50, s15 +; GFX9-NEXT: s_mov_b32 s51, s14 +; GFX9-NEXT: s_mov_b32 s52, s13 +; GFX9-NEXT: s_mov_b32 s53, s12 ; GFX9-NEXT: s_mov_b64 s[34:35], s[10:11] ; GFX9-NEXT: s_mov_b64 s[36:37], s[8:9] ; GFX9-NEXT: s_mov_b64 s[38:39], s[6:7] -; GFX9-NEXT: s_mov_b64 s[40:41], s[4:5] +; GFX9-NEXT: s_mov_b64 s[48:49], s[4:5] ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 -; GFX9-NEXT: s_mov_b64 s[46:47], exec +; GFX9-NEXT: s_mov_b64 s[54:55], exec ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s49, 17 +; GFX9-NEXT: v_writelane_b32 v40, s65, 17 ; GFX9-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_readfirstlane_b32 s16, v0 ; GFX9-NEXT: v_readfirstlane_b32 s17, v1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GFX9-NEXT: s_and_saveexec_b64 s[48:49], vcc -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_and_saveexec_b64 s[64:65], vcc +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s45 -; GFX9-NEXT: s_mov_b32 s13, s44 -; GFX9-NEXT: s_mov_b32 s14, s43 -; GFX9-NEXT: s_mov_b32 s15, s42 +; GFX9-NEXT: s_mov_b32 s12, s53 +; GFX9-NEXT: s_mov_b32 s13, s52 +; GFX9-NEXT: s_mov_b32 s14, s51 +; GFX9-NEXT: s_mov_b32 s15, s50 ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -836,21 +836,21 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: ; implicit-def: $vgpr3 -; GFX9-NEXT: s_xor_b64 exec, exec, s[48:49] +; GFX9-NEXT: s_xor_b64 exec, exec, s[64:65] ; GFX9-NEXT: s_cbranch_execnz .LBB18_1 ; GFX9-NEXT: ; %bb.2: -; GFX9-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-NEXT: s_mov_b64 exec, s[54:55] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_readlane_b32 s49, v40, 17 -; GFX9-NEXT: v_readlane_b32 s48, v40, 16 -; GFX9-NEXT: v_readlane_b32 s47, v40, 15 -; GFX9-NEXT: v_readlane_b32 s46, v40, 14 -; GFX9-NEXT: v_readlane_b32 s45, v40, 13 -; GFX9-NEXT: v_readlane_b32 s44, v40, 12 -; GFX9-NEXT: v_readlane_b32 s43, v40, 11 -; GFX9-NEXT: v_readlane_b32 s42, v40, 10 -; GFX9-NEXT: v_readlane_b32 s41, v40, 9 -; GFX9-NEXT: v_readlane_b32 s40, v40, 8 +; GFX9-NEXT: v_readlane_b32 s65, v40, 17 +; GFX9-NEXT: v_readlane_b32 s64, v40, 16 +; GFX9-NEXT: v_readlane_b32 s55, v40, 15 +; GFX9-NEXT: v_readlane_b32 s54, v40, 14 +; GFX9-NEXT: v_readlane_b32 s53, v40, 13 +; GFX9-NEXT: v_readlane_b32 s52, v40, 12 +; GFX9-NEXT: v_readlane_b32 s51, v40, 11 +; GFX9-NEXT: v_readlane_b32 s50, v40, 10 +; GFX9-NEXT: v_readlane_b32 s49, v40, 9 +; GFX9-NEXT: v_readlane_b32 s48, v40, 8 ; GFX9-NEXT: v_readlane_b32 s39, v40, 7 ; GFX9-NEXT: v_readlane_b32 s38, v40, 6 ; GFX9-NEXT: v_readlane_b32 s37, v40, 5 diff --git a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir index 080bd052a7391..cf23a9d1e8a57 100644 --- a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir +++ b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir @@ -34,57 +34,56 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr34_sgpr35 = IMPLICIT_DEF ; CHECK-NEXT: dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $sgpr41 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $sgpr35 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr38_sgpr39 = COPY undef $sgpr8_sgpr9 ; CHECK-NEXT: renamable $sgpr36_sgpr37 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 0, 0 :: (dereferenceable invariant load (s256), align 16, addrspace 4) + ; CHECK-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 0, 0 :: (dereferenceable invariant load (s256), align 16, addrspace 4) ; CHECK-NEXT: dead renamable $sgpr4 = S_LOAD_DWORD_IMM renamable $sgpr38_sgpr39, 48, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) - ; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM renamable $sgpr44_sgpr45, 0, 0 :: (invariant load (s64), align 16, addrspace 4) + ; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM renamable $sgpr48_sgpr49, 0, 0 :: (invariant load (s64), align 16, addrspace 4) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: $vgpr1 = COPY renamable $sgpr51 + ; CHECK-NEXT: $vgpr1 = COPY renamable $sgpr55 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: $vcc = COPY renamable $sgpr40_sgpr41 + ; CHECK-NEXT: $vcc = COPY renamable $sgpr34_sgpr35 ; CHECK-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.3(0x80000000) - ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4) ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) - ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4) ; CHECK-NEXT: S_CMP_LG_U64 renamable $sgpr4_sgpr5, 0, implicit-def $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.4(0x40000000) - ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_VCCZ %bb.5, implicit undef $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) - ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CMP_EQ_U32 renamable $sgpr8, 0, implicit-def $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr38_sgpr39, 40, 0 :: (dereferenceable invariant load (s64), addrspace 4) ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s32), addrspace 1) - ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], renamable $sgpr50_sgpr51, 0, 0, implicit $exec :: (store (s32), addrspace 1) - ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr49 + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], renamable $sgpr54_sgpr55, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr53 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: $sgpr6_sgpr7 = COPY killed renamable $sgpr36_sgpr37 - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY killed renamable $sgpr34_sgpr35 + ; CHECK-NEXT: renamable $sgpr10_sgpr11 = IMPLICIT_DEF ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: S_ENDPGM 0 bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir index dff2bd7f7aef9..7f4f9489ea4b7 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir @@ -53,7 +53,7 @@ body: | bb.0: liveins: $sgpr30_sgpr31, $sgpr10, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN-LABEL: name: sgpr_spill_lane_crossover - ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $vgpr63, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $vgpr63, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr64, 0, $vgpr63 ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr65, 1, $vgpr63 @@ -63,30 +63,14 @@ body: | ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr69, 5, $vgpr63 ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr70, 6, $vgpr63 ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr71, 7, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr72, 8, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr73, 9, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr74, 10, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr75, 11, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr76, 12, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr77, 13, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr78, 14, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr79, 15, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr80, 16, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr81, 17, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr82, 18, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr83, 19, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr84, 20, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr85, 21, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr86, 22, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr87, 23, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr88, 24, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr89, 25, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr90, 26, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr91, 27, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr92, 28, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr93, 29, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr94, 30, $vgpr63 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr95, 31, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr80, 8, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr81, 9, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr82, 10, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr83, 11, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr84, 12, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr85, 13, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr86, 14, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr87, 15, $vgpr63 ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir index 9b0f52cb39b01..6e8a5126ca823 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir @@ -56,20 +56,15 @@ body: | bb.0: liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr30_sgpr31, $vcc, $vgpr0 ; GCN-LABEL: name: spill_exec_copy_reserved_reg - ; GCN: liveins: $vcc, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $vgpr0, $vgpr2, $sgpr30_sgpr31 + ; GCN: liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr30_sgpr31, $vcc, $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr28_sgpr29 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr28_sgpr29 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 0, undef $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr35, 1, undef $vgpr2 ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, killed $vgpr0 ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr31, 1, killed $vgpr0 - ; GCN-NEXT: $sgpr34_sgpr35 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $sgpr40_sgpr41 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $sgpr28_sgpr29 = IMPLICIT_DEF ; GCN-NEXT: $vgpr1 = COPY $vgpr0 ; GCN-NEXT: S_NOP 0, implicit $sgpr28_sgpr29 - ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr34_sgpr35 + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr40_sgpr41 ; GCN-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0 ; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1 ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8_sgpr9_sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr15, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $vcc diff --git a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll index 0d6bccad89d82..fba85455ef693 100644 --- a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll @@ -7,141 +7,76 @@ define void @spill_more_than_wavesize_csr_sgprs() { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: v_writelane_b32 v0, s35, 0 ; CHECK-NEXT: v_writelane_b32 v0, s36, 1 ; CHECK-NEXT: v_writelane_b32 v0, s37, 2 ; CHECK-NEXT: v_writelane_b32 v0, s38, 3 ; CHECK-NEXT: v_writelane_b32 v0, s39, 4 -; CHECK-NEXT: v_writelane_b32 v0, s40, 5 -; CHECK-NEXT: v_writelane_b32 v0, s41, 6 -; CHECK-NEXT: v_writelane_b32 v0, s42, 7 -; CHECK-NEXT: v_writelane_b32 v0, s43, 8 -; CHECK-NEXT: v_writelane_b32 v0, s44, 9 -; CHECK-NEXT: v_writelane_b32 v0, s45, 10 -; CHECK-NEXT: v_writelane_b32 v0, s46, 11 -; CHECK-NEXT: v_writelane_b32 v0, s47, 12 -; CHECK-NEXT: v_writelane_b32 v0, s48, 13 -; CHECK-NEXT: v_writelane_b32 v0, s49, 14 -; CHECK-NEXT: v_writelane_b32 v0, s50, 15 -; CHECK-NEXT: v_writelane_b32 v0, s51, 16 -; CHECK-NEXT: v_writelane_b32 v0, s52, 17 -; CHECK-NEXT: v_writelane_b32 v0, s53, 18 -; CHECK-NEXT: v_writelane_b32 v0, s54, 19 -; CHECK-NEXT: v_writelane_b32 v0, s55, 20 -; CHECK-NEXT: v_writelane_b32 v0, s56, 21 -; CHECK-NEXT: v_writelane_b32 v0, s57, 22 -; CHECK-NEXT: v_writelane_b32 v0, s58, 23 -; CHECK-NEXT: v_writelane_b32 v0, s59, 24 -; CHECK-NEXT: v_writelane_b32 v0, s60, 25 -; CHECK-NEXT: v_writelane_b32 v0, s61, 26 -; CHECK-NEXT: v_writelane_b32 v0, s62, 27 -; CHECK-NEXT: v_writelane_b32 v0, s63, 28 -; CHECK-NEXT: v_writelane_b32 v0, s64, 29 -; CHECK-NEXT: v_writelane_b32 v0, s65, 30 -; CHECK-NEXT: v_writelane_b32 v0, s66, 31 -; CHECK-NEXT: v_writelane_b32 v0, s67, 32 -; CHECK-NEXT: v_writelane_b32 v0, s68, 33 -; CHECK-NEXT: v_writelane_b32 v0, s69, 34 -; CHECK-NEXT: v_writelane_b32 v0, s70, 35 -; CHECK-NEXT: v_writelane_b32 v0, s71, 36 -; CHECK-NEXT: v_writelane_b32 v0, s72, 37 -; CHECK-NEXT: v_writelane_b32 v0, s73, 38 -; CHECK-NEXT: v_writelane_b32 v0, s74, 39 -; CHECK-NEXT: v_writelane_b32 v0, s75, 40 -; CHECK-NEXT: v_writelane_b32 v0, s76, 41 -; CHECK-NEXT: v_writelane_b32 v0, s77, 42 -; CHECK-NEXT: v_writelane_b32 v0, s78, 43 -; CHECK-NEXT: v_writelane_b32 v0, s79, 44 -; CHECK-NEXT: v_writelane_b32 v0, s80, 45 -; CHECK-NEXT: v_writelane_b32 v0, s81, 46 -; CHECK-NEXT: v_writelane_b32 v0, s82, 47 -; CHECK-NEXT: v_writelane_b32 v0, s83, 48 -; CHECK-NEXT: v_writelane_b32 v0, s84, 49 -; CHECK-NEXT: v_writelane_b32 v0, s85, 50 -; CHECK-NEXT: v_writelane_b32 v0, s86, 51 -; CHECK-NEXT: v_writelane_b32 v0, s87, 52 -; CHECK-NEXT: v_writelane_b32 v0, s88, 53 -; CHECK-NEXT: v_writelane_b32 v0, s89, 54 -; CHECK-NEXT: v_writelane_b32 v0, s90, 55 -; CHECK-NEXT: v_writelane_b32 v0, s91, 56 -; CHECK-NEXT: v_writelane_b32 v0, s92, 57 -; CHECK-NEXT: v_writelane_b32 v0, s93, 58 -; CHECK-NEXT: v_writelane_b32 v0, s94, 59 -; CHECK-NEXT: v_writelane_b32 v0, s95, 60 -; CHECK-NEXT: v_writelane_b32 v1, s99, 0 -; CHECK-NEXT: v_writelane_b32 v0, s96, 61 -; CHECK-NEXT: v_writelane_b32 v1, s100, 1 -; CHECK-NEXT: v_writelane_b32 v0, s97, 62 -; CHECK-NEXT: v_writelane_b32 v1, s101, 2 -; CHECK-NEXT: v_writelane_b32 v0, s98, 63 -; CHECK-NEXT: v_writelane_b32 v1, s102, 3 +; CHECK-NEXT: v_writelane_b32 v0, s48, 5 +; CHECK-NEXT: v_writelane_b32 v0, s49, 6 +; CHECK-NEXT: v_writelane_b32 v0, s50, 7 +; CHECK-NEXT: v_writelane_b32 v0, s51, 8 +; CHECK-NEXT: v_writelane_b32 v0, s52, 9 +; CHECK-NEXT: v_writelane_b32 v0, s53, 10 +; CHECK-NEXT: v_writelane_b32 v0, s54, 11 +; CHECK-NEXT: v_writelane_b32 v0, s55, 12 +; CHECK-NEXT: v_writelane_b32 v0, s64, 13 +; CHECK-NEXT: v_writelane_b32 v0, s65, 14 +; CHECK-NEXT: v_writelane_b32 v0, s66, 15 +; CHECK-NEXT: v_writelane_b32 v0, s67, 16 +; CHECK-NEXT: v_writelane_b32 v0, s68, 17 +; CHECK-NEXT: v_writelane_b32 v0, s69, 18 +; CHECK-NEXT: v_writelane_b32 v0, s70, 19 +; CHECK-NEXT: v_writelane_b32 v0, s71, 20 +; CHECK-NEXT: v_writelane_b32 v0, s80, 21 +; CHECK-NEXT: v_writelane_b32 v0, s81, 22 +; CHECK-NEXT: v_writelane_b32 v0, s82, 23 +; CHECK-NEXT: v_writelane_b32 v0, s83, 24 +; CHECK-NEXT: v_writelane_b32 v0, s84, 25 +; CHECK-NEXT: v_writelane_b32 v0, s85, 26 +; CHECK-NEXT: v_writelane_b32 v0, s86, 27 +; CHECK-NEXT: v_writelane_b32 v0, s87, 28 +; CHECK-NEXT: v_writelane_b32 v0, s96, 29 +; CHECK-NEXT: v_writelane_b32 v0, s97, 30 +; CHECK-NEXT: v_writelane_b32 v0, s98, 31 +; CHECK-NEXT: v_writelane_b32 v0, s99, 32 +; CHECK-NEXT: v_writelane_b32 v0, s100, 33 +; CHECK-NEXT: v_writelane_b32 v0, s101, 34 +; CHECK-NEXT: v_writelane_b32 v0, s102, 35 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s102, v1, 3 -; CHECK-NEXT: v_readlane_b32 s101, v1, 2 -; CHECK-NEXT: v_readlane_b32 s100, v1, 1 -; CHECK-NEXT: v_readlane_b32 s99, v1, 0 -; CHECK-NEXT: v_readlane_b32 s98, v0, 63 -; CHECK-NEXT: v_readlane_b32 s97, v0, 62 -; CHECK-NEXT: v_readlane_b32 s96, v0, 61 -; CHECK-NEXT: v_readlane_b32 s95, v0, 60 -; CHECK-NEXT: v_readlane_b32 s94, v0, 59 -; CHECK-NEXT: v_readlane_b32 s93, v0, 58 -; CHECK-NEXT: v_readlane_b32 s92, v0, 57 -; CHECK-NEXT: v_readlane_b32 s91, v0, 56 -; CHECK-NEXT: v_readlane_b32 s90, v0, 55 -; CHECK-NEXT: v_readlane_b32 s89, v0, 54 -; CHECK-NEXT: v_readlane_b32 s88, v0, 53 -; CHECK-NEXT: v_readlane_b32 s87, v0, 52 -; CHECK-NEXT: v_readlane_b32 s86, v0, 51 -; CHECK-NEXT: v_readlane_b32 s85, v0, 50 -; CHECK-NEXT: v_readlane_b32 s84, v0, 49 -; CHECK-NEXT: v_readlane_b32 s83, v0, 48 -; CHECK-NEXT: v_readlane_b32 s82, v0, 47 -; CHECK-NEXT: v_readlane_b32 s81, v0, 46 -; CHECK-NEXT: v_readlane_b32 s80, v0, 45 -; CHECK-NEXT: v_readlane_b32 s79, v0, 44 -; CHECK-NEXT: v_readlane_b32 s78, v0, 43 -; CHECK-NEXT: v_readlane_b32 s77, v0, 42 -; CHECK-NEXT: v_readlane_b32 s76, v0, 41 -; CHECK-NEXT: v_readlane_b32 s75, v0, 40 -; CHECK-NEXT: v_readlane_b32 s74, v0, 39 -; CHECK-NEXT: v_readlane_b32 s73, v0, 38 -; CHECK-NEXT: v_readlane_b32 s72, v0, 37 -; CHECK-NEXT: v_readlane_b32 s71, v0, 36 -; CHECK-NEXT: v_readlane_b32 s70, v0, 35 -; CHECK-NEXT: v_readlane_b32 s69, v0, 34 -; CHECK-NEXT: v_readlane_b32 s68, v0, 33 -; CHECK-NEXT: v_readlane_b32 s67, v0, 32 -; CHECK-NEXT: v_readlane_b32 s66, v0, 31 -; CHECK-NEXT: v_readlane_b32 s65, v0, 30 -; CHECK-NEXT: v_readlane_b32 s64, v0, 29 -; CHECK-NEXT: v_readlane_b32 s63, v0, 28 -; CHECK-NEXT: v_readlane_b32 s62, v0, 27 -; CHECK-NEXT: v_readlane_b32 s61, v0, 26 -; CHECK-NEXT: v_readlane_b32 s60, v0, 25 -; CHECK-NEXT: v_readlane_b32 s59, v0, 24 -; CHECK-NEXT: v_readlane_b32 s58, v0, 23 -; CHECK-NEXT: v_readlane_b32 s57, v0, 22 -; CHECK-NEXT: v_readlane_b32 s56, v0, 21 -; CHECK-NEXT: v_readlane_b32 s55, v0, 20 -; CHECK-NEXT: v_readlane_b32 s54, v0, 19 -; CHECK-NEXT: v_readlane_b32 s53, v0, 18 -; CHECK-NEXT: v_readlane_b32 s52, v0, 17 -; CHECK-NEXT: v_readlane_b32 s51, v0, 16 -; CHECK-NEXT: v_readlane_b32 s50, v0, 15 -; CHECK-NEXT: v_readlane_b32 s49, v0, 14 -; CHECK-NEXT: v_readlane_b32 s48, v0, 13 -; CHECK-NEXT: v_readlane_b32 s47, v0, 12 -; CHECK-NEXT: v_readlane_b32 s46, v0, 11 -; CHECK-NEXT: v_readlane_b32 s45, v0, 10 -; CHECK-NEXT: v_readlane_b32 s44, v0, 9 -; CHECK-NEXT: v_readlane_b32 s43, v0, 8 -; CHECK-NEXT: v_readlane_b32 s42, v0, 7 -; CHECK-NEXT: v_readlane_b32 s41, v0, 6 -; CHECK-NEXT: v_readlane_b32 s40, v0, 5 +; CHECK-NEXT: v_readlane_b32 s102, v0, 35 +; CHECK-NEXT: v_readlane_b32 s101, v0, 34 +; CHECK-NEXT: v_readlane_b32 s100, v0, 33 +; CHECK-NEXT: v_readlane_b32 s99, v0, 32 +; CHECK-NEXT: v_readlane_b32 s98, v0, 31 +; CHECK-NEXT: v_readlane_b32 s97, v0, 30 +; CHECK-NEXT: v_readlane_b32 s96, v0, 29 +; CHECK-NEXT: v_readlane_b32 s87, v0, 28 +; CHECK-NEXT: v_readlane_b32 s86, v0, 27 +; CHECK-NEXT: v_readlane_b32 s85, v0, 26 +; CHECK-NEXT: v_readlane_b32 s84, v0, 25 +; CHECK-NEXT: v_readlane_b32 s83, v0, 24 +; CHECK-NEXT: v_readlane_b32 s82, v0, 23 +; CHECK-NEXT: v_readlane_b32 s81, v0, 22 +; CHECK-NEXT: v_readlane_b32 s80, v0, 21 +; CHECK-NEXT: v_readlane_b32 s71, v0, 20 +; CHECK-NEXT: v_readlane_b32 s70, v0, 19 +; CHECK-NEXT: v_readlane_b32 s69, v0, 18 +; CHECK-NEXT: v_readlane_b32 s68, v0, 17 +; CHECK-NEXT: v_readlane_b32 s67, v0, 16 +; CHECK-NEXT: v_readlane_b32 s66, v0, 15 +; CHECK-NEXT: v_readlane_b32 s65, v0, 14 +; CHECK-NEXT: v_readlane_b32 s64, v0, 13 +; CHECK-NEXT: v_readlane_b32 s55, v0, 12 +; CHECK-NEXT: v_readlane_b32 s54, v0, 11 +; CHECK-NEXT: v_readlane_b32 s53, v0, 10 +; CHECK-NEXT: v_readlane_b32 s52, v0, 9 +; CHECK-NEXT: v_readlane_b32 s51, v0, 8 +; CHECK-NEXT: v_readlane_b32 s50, v0, 7 +; CHECK-NEXT: v_readlane_b32 s49, v0, 6 +; CHECK-NEXT: v_readlane_b32 s48, v0, 5 ; CHECK-NEXT: v_readlane_b32 s39, v0, 4 ; CHECK-NEXT: v_readlane_b32 s38, v0, 3 ; CHECK-NEXT: v_readlane_b32 s37, v0, 2 @@ -149,7 +84,6 @@ define void @spill_more_than_wavesize_csr_sgprs() { ; CHECK-NEXT: v_readlane_b32 s35, v0, 0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -172,144 +106,79 @@ define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: v_writelane_b32 v1, s35, 0 ; CHECK-NEXT: v_writelane_b32 v1, s36, 1 ; CHECK-NEXT: v_writelane_b32 v1, s37, 2 ; CHECK-NEXT: v_writelane_b32 v1, s38, 3 ; CHECK-NEXT: v_writelane_b32 v1, s39, 4 -; CHECK-NEXT: v_writelane_b32 v1, s40, 5 -; CHECK-NEXT: v_writelane_b32 v1, s41, 6 -; CHECK-NEXT: v_writelane_b32 v1, s42, 7 -; CHECK-NEXT: v_writelane_b32 v1, s43, 8 -; CHECK-NEXT: v_writelane_b32 v1, s44, 9 -; CHECK-NEXT: v_writelane_b32 v1, s45, 10 -; CHECK-NEXT: v_writelane_b32 v1, s46, 11 -; CHECK-NEXT: v_writelane_b32 v1, s47, 12 -; CHECK-NEXT: v_writelane_b32 v1, s48, 13 -; CHECK-NEXT: v_writelane_b32 v1, s49, 14 -; CHECK-NEXT: v_writelane_b32 v1, s50, 15 -; CHECK-NEXT: v_writelane_b32 v1, s51, 16 -; CHECK-NEXT: v_writelane_b32 v1, s52, 17 -; CHECK-NEXT: v_writelane_b32 v1, s53, 18 -; CHECK-NEXT: v_writelane_b32 v1, s54, 19 -; CHECK-NEXT: v_writelane_b32 v1, s55, 20 -; CHECK-NEXT: v_writelane_b32 v1, s56, 21 -; CHECK-NEXT: v_writelane_b32 v1, s57, 22 -; CHECK-NEXT: v_writelane_b32 v1, s58, 23 -; CHECK-NEXT: v_writelane_b32 v1, s59, 24 -; CHECK-NEXT: v_writelane_b32 v1, s60, 25 -; CHECK-NEXT: v_writelane_b32 v1, s61, 26 -; CHECK-NEXT: v_writelane_b32 v1, s62, 27 -; CHECK-NEXT: v_writelane_b32 v1, s63, 28 -; CHECK-NEXT: v_writelane_b32 v1, s64, 29 -; CHECK-NEXT: v_writelane_b32 v1, s65, 30 -; CHECK-NEXT: v_writelane_b32 v1, s66, 31 -; CHECK-NEXT: v_writelane_b32 v1, s67, 32 -; CHECK-NEXT: v_writelane_b32 v1, s68, 33 -; CHECK-NEXT: v_writelane_b32 v1, s69, 34 -; CHECK-NEXT: v_writelane_b32 v1, s70, 35 -; CHECK-NEXT: v_writelane_b32 v1, s71, 36 -; CHECK-NEXT: v_writelane_b32 v1, s72, 37 -; CHECK-NEXT: v_writelane_b32 v1, s73, 38 -; CHECK-NEXT: v_writelane_b32 v1, s74, 39 -; CHECK-NEXT: v_writelane_b32 v1, s75, 40 -; CHECK-NEXT: v_writelane_b32 v1, s76, 41 -; CHECK-NEXT: v_writelane_b32 v1, s77, 42 -; CHECK-NEXT: v_writelane_b32 v1, s78, 43 -; CHECK-NEXT: v_writelane_b32 v1, s79, 44 -; CHECK-NEXT: v_writelane_b32 v1, s80, 45 -; CHECK-NEXT: v_writelane_b32 v1, s81, 46 -; CHECK-NEXT: v_writelane_b32 v1, s82, 47 -; CHECK-NEXT: v_writelane_b32 v1, s83, 48 -; CHECK-NEXT: v_writelane_b32 v1, s84, 49 -; CHECK-NEXT: v_writelane_b32 v1, s85, 50 -; CHECK-NEXT: v_writelane_b32 v1, s86, 51 -; CHECK-NEXT: v_writelane_b32 v1, s87, 52 -; CHECK-NEXT: v_writelane_b32 v1, s88, 53 -; CHECK-NEXT: v_writelane_b32 v1, s89, 54 -; CHECK-NEXT: v_writelane_b32 v1, s90, 55 -; CHECK-NEXT: v_writelane_b32 v1, s91, 56 -; CHECK-NEXT: v_writelane_b32 v1, s92, 57 -; CHECK-NEXT: v_writelane_b32 v1, s93, 58 -; CHECK-NEXT: v_writelane_b32 v1, s94, 59 -; CHECK-NEXT: v_writelane_b32 v1, s95, 60 -; CHECK-NEXT: v_writelane_b32 v2, s99, 0 -; CHECK-NEXT: v_writelane_b32 v1, s96, 61 -; CHECK-NEXT: v_writelane_b32 v2, s100, 1 -; CHECK-NEXT: v_writelane_b32 v1, s97, 62 -; CHECK-NEXT: v_writelane_b32 v2, s101, 2 +; CHECK-NEXT: v_writelane_b32 v1, s48, 5 +; CHECK-NEXT: v_writelane_b32 v1, s49, 6 +; CHECK-NEXT: v_writelane_b32 v1, s50, 7 +; CHECK-NEXT: v_writelane_b32 v1, s51, 8 +; CHECK-NEXT: v_writelane_b32 v1, s52, 9 +; CHECK-NEXT: v_writelane_b32 v1, s53, 10 +; CHECK-NEXT: v_writelane_b32 v1, s54, 11 +; CHECK-NEXT: v_writelane_b32 v1, s55, 12 +; CHECK-NEXT: v_writelane_b32 v1, s64, 13 +; CHECK-NEXT: v_writelane_b32 v1, s65, 14 +; CHECK-NEXT: v_writelane_b32 v1, s66, 15 +; CHECK-NEXT: v_writelane_b32 v1, s67, 16 +; CHECK-NEXT: v_writelane_b32 v1, s68, 17 +; CHECK-NEXT: v_writelane_b32 v1, s69, 18 +; CHECK-NEXT: v_writelane_b32 v1, s70, 19 +; CHECK-NEXT: v_writelane_b32 v1, s71, 20 +; CHECK-NEXT: v_writelane_b32 v1, s80, 21 +; CHECK-NEXT: v_writelane_b32 v1, s81, 22 +; CHECK-NEXT: v_writelane_b32 v1, s82, 23 +; CHECK-NEXT: v_writelane_b32 v1, s83, 24 +; CHECK-NEXT: v_writelane_b32 v1, s84, 25 +; CHECK-NEXT: v_writelane_b32 v1, s85, 26 +; CHECK-NEXT: v_writelane_b32 v1, s86, 27 +; CHECK-NEXT: v_writelane_b32 v1, s87, 28 +; CHECK-NEXT: v_writelane_b32 v1, s96, 29 +; CHECK-NEXT: v_writelane_b32 v1, s97, 30 +; CHECK-NEXT: v_writelane_b32 v1, s98, 31 +; CHECK-NEXT: v_writelane_b32 v1, s99, 32 +; CHECK-NEXT: v_writelane_b32 v1, s100, 33 +; CHECK-NEXT: v_writelane_b32 v1, s101, 34 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_writelane_b32 v1, s98, 63 -; CHECK-NEXT: v_writelane_b32 v2, s102, 3 +; CHECK-NEXT: v_writelane_b32 v1, s102, 35 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s102, v2, 3 -; CHECK-NEXT: v_readlane_b32 s101, v2, 2 -; CHECK-NEXT: v_readlane_b32 s100, v2, 1 -; CHECK-NEXT: v_readlane_b32 s99, v2, 0 -; CHECK-NEXT: v_readlane_b32 s98, v1, 63 -; CHECK-NEXT: v_readlane_b32 s97, v1, 62 -; CHECK-NEXT: v_readlane_b32 s96, v1, 61 -; CHECK-NEXT: v_readlane_b32 s95, v1, 60 -; CHECK-NEXT: v_readlane_b32 s94, v1, 59 -; CHECK-NEXT: v_readlane_b32 s93, v1, 58 -; CHECK-NEXT: v_readlane_b32 s92, v1, 57 -; CHECK-NEXT: v_readlane_b32 s91, v1, 56 -; CHECK-NEXT: v_readlane_b32 s90, v1, 55 -; CHECK-NEXT: v_readlane_b32 s89, v1, 54 -; CHECK-NEXT: v_readlane_b32 s88, v1, 53 -; CHECK-NEXT: v_readlane_b32 s87, v1, 52 -; CHECK-NEXT: v_readlane_b32 s86, v1, 51 -; CHECK-NEXT: v_readlane_b32 s85, v1, 50 -; CHECK-NEXT: v_readlane_b32 s84, v1, 49 -; CHECK-NEXT: v_readlane_b32 s83, v1, 48 -; CHECK-NEXT: v_readlane_b32 s82, v1, 47 -; CHECK-NEXT: v_readlane_b32 s81, v1, 46 -; CHECK-NEXT: v_readlane_b32 s80, v1, 45 -; CHECK-NEXT: v_readlane_b32 s79, v1, 44 -; CHECK-NEXT: v_readlane_b32 s78, v1, 43 -; CHECK-NEXT: v_readlane_b32 s77, v1, 42 -; CHECK-NEXT: v_readlane_b32 s76, v1, 41 -; CHECK-NEXT: v_readlane_b32 s75, v1, 40 -; CHECK-NEXT: v_readlane_b32 s74, v1, 39 -; CHECK-NEXT: v_readlane_b32 s73, v1, 38 -; CHECK-NEXT: v_readlane_b32 s72, v1, 37 -; CHECK-NEXT: v_readlane_b32 s71, v1, 36 -; CHECK-NEXT: v_readlane_b32 s70, v1, 35 -; CHECK-NEXT: v_readlane_b32 s69, v1, 34 -; CHECK-NEXT: v_readlane_b32 s68, v1, 33 -; CHECK-NEXT: v_readlane_b32 s67, v1, 32 -; CHECK-NEXT: v_readlane_b32 s66, v1, 31 -; CHECK-NEXT: v_readlane_b32 s65, v1, 30 -; CHECK-NEXT: v_readlane_b32 s64, v1, 29 -; CHECK-NEXT: v_readlane_b32 s63, v1, 28 -; CHECK-NEXT: v_readlane_b32 s62, v1, 27 -; CHECK-NEXT: v_readlane_b32 s61, v1, 26 -; CHECK-NEXT: v_readlane_b32 s60, v1, 25 -; CHECK-NEXT: v_readlane_b32 s59, v1, 24 -; CHECK-NEXT: v_readlane_b32 s58, v1, 23 -; CHECK-NEXT: v_readlane_b32 s57, v1, 22 -; CHECK-NEXT: v_readlane_b32 s56, v1, 21 -; CHECK-NEXT: v_readlane_b32 s55, v1, 20 -; CHECK-NEXT: v_readlane_b32 s54, v1, 19 -; CHECK-NEXT: v_readlane_b32 s53, v1, 18 -; CHECK-NEXT: v_readlane_b32 s52, v1, 17 -; CHECK-NEXT: v_readlane_b32 s51, v1, 16 -; CHECK-NEXT: v_readlane_b32 s50, v1, 15 -; CHECK-NEXT: v_readlane_b32 s49, v1, 14 -; CHECK-NEXT: v_readlane_b32 s48, v1, 13 -; CHECK-NEXT: v_readlane_b32 s47, v1, 12 -; CHECK-NEXT: v_readlane_b32 s46, v1, 11 -; CHECK-NEXT: v_readlane_b32 s45, v1, 10 -; CHECK-NEXT: v_readlane_b32 s44, v1, 9 -; CHECK-NEXT: v_readlane_b32 s43, v1, 8 -; CHECK-NEXT: v_readlane_b32 s42, v1, 7 -; CHECK-NEXT: v_readlane_b32 s41, v1, 6 -; CHECK-NEXT: v_readlane_b32 s40, v1, 5 +; CHECK-NEXT: v_readlane_b32 s102, v1, 35 +; CHECK-NEXT: v_readlane_b32 s101, v1, 34 +; CHECK-NEXT: v_readlane_b32 s100, v1, 33 +; CHECK-NEXT: v_readlane_b32 s99, v1, 32 +; CHECK-NEXT: v_readlane_b32 s98, v1, 31 +; CHECK-NEXT: v_readlane_b32 s97, v1, 30 +; CHECK-NEXT: v_readlane_b32 s96, v1, 29 +; CHECK-NEXT: v_readlane_b32 s87, v1, 28 +; CHECK-NEXT: v_readlane_b32 s86, v1, 27 +; CHECK-NEXT: v_readlane_b32 s85, v1, 26 +; CHECK-NEXT: v_readlane_b32 s84, v1, 25 +; CHECK-NEXT: v_readlane_b32 s83, v1, 24 +; CHECK-NEXT: v_readlane_b32 s82, v1, 23 +; CHECK-NEXT: v_readlane_b32 s81, v1, 22 +; CHECK-NEXT: v_readlane_b32 s80, v1, 21 +; CHECK-NEXT: v_readlane_b32 s71, v1, 20 +; CHECK-NEXT: v_readlane_b32 s70, v1, 19 +; CHECK-NEXT: v_readlane_b32 s69, v1, 18 +; CHECK-NEXT: v_readlane_b32 s68, v1, 17 +; CHECK-NEXT: v_readlane_b32 s67, v1, 16 +; CHECK-NEXT: v_readlane_b32 s66, v1, 15 +; CHECK-NEXT: v_readlane_b32 s65, v1, 14 +; CHECK-NEXT: v_readlane_b32 s64, v1, 13 +; CHECK-NEXT: v_readlane_b32 s55, v1, 12 +; CHECK-NEXT: v_readlane_b32 s54, v1, 11 +; CHECK-NEXT: v_readlane_b32 s53, v1, 10 +; CHECK-NEXT: v_readlane_b32 s52, v1, 9 +; CHECK-NEXT: v_readlane_b32 s51, v1, 8 +; CHECK-NEXT: v_readlane_b32 s50, v1, 7 +; CHECK-NEXT: v_readlane_b32 s49, v1, 6 +; CHECK-NEXT: v_readlane_b32 s48, v1, 5 ; CHECK-NEXT: v_readlane_b32 s39, v1, 4 ; CHECK-NEXT: v_readlane_b32 s38, v1, 3 ; CHECK-NEXT: v_readlane_b32 s37, v1, 2 @@ -317,7 +186,6 @@ define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() { ; CHECK-NEXT: v_readlane_b32 s35, v1, 0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir index 8f53ec2f992da..359152e9d2b45 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir @@ -17,70 +17,78 @@ body: | ; RA-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; RA-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_1024 = S_MOV_B32 -1 ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_1024 = S_MOV_B32 -1 + ; RA-NEXT: SI_SPILL_S1024_SAVE [[S_MOV_B32_]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.0, align 4, addrspace 5) ; RA-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_1024 = S_MOV_B32 0 + ; RA-NEXT: SI_SPILL_S1024_SAVE [[S_MOV_B32_1]], %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5) ; RA-NEXT: {{ $}} ; RA-NEXT: bb.1: ; RA-NEXT: successors: %bb.2(0x80000000) ; RA-NEXT: {{ $}} - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub4:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub5:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub6:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub7:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub8:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub9:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub10:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub11:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub12:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub13:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub14:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub15:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub16:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub17:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub18:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub19:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub20:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub21:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub22:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub23:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub24:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub25:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub26:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub27:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub28:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub29:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub1:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub2:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub3:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub4:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub5:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub6:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub7:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub8:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub9:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub10:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub11:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub12:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub13:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub14:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub15:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub16:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub17:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub18:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub19:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub20:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub21:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub22:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub23:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub24:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub25:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub26:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub27:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub28:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub29:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub30:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub31:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[SI_SPILL_S1024_RESTORE:%[0-9]+]]:sgpr_1024 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; RA-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:sgpr_1024 = COPY [[SI_SPILL_S1024_RESTORE]].sub0_sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub2:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub3:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub4:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub5:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub6:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub7:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub8:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub9:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub10:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub11:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub12:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub13:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub14:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub15:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub16:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub17:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub18:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub19:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub20:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub21:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub22:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub23:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub24:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub25:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub26:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub27:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub28:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub29:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: SI_SPILL_S1024_SAVE [[COPY]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; RA-NEXT: [[SI_SPILL_S1024_RESTORE1:%[0-9]+]]:sgpr_1024 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; RA-NEXT: undef [[COPY1:%[0-9]+]].sub0:sgpr_1024 = COPY [[SI_SPILL_S1024_RESTORE1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub1:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub2:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub3:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub4:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub5:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub6:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub7:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub8:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub9:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub10:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub11:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub12:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub13:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub14:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub15:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub16:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub17:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub18:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub19:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub20:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub21:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub22:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub23:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub24:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub25:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub26:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub27:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub28:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub29:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub30:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub31:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: SI_SPILL_S1024_SAVE [[COPY1]], %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5) ; RA-NEXT: {{ $}} ; RA-NEXT: bb.2: ; RA-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) @@ -95,14 +103,17 @@ body: | ; VR-NEXT: {{ $}} ; VR-NEXT: renamable $sgpr37 = S_MOV_B32 -1 ; VR-NEXT: renamable $sgpr36 = S_MOV_B32 -1 - ; VR-NEXT: renamable $sgpr68 = S_MOV_B32 0 + ; VR-NEXT: SI_SPILL_S1024_SAVE renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; VR-NEXT: renamable $sgpr36 = S_MOV_B32 0 ; VR-NEXT: renamable $sgpr30_sgpr31 = IMPLICIT_DEF ; VR-NEXT: renamable $sgpr34_sgpr35 = IMPLICIT_DEF + ; VR-NEXT: SI_SPILL_S1024_SAVE killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5) ; VR-NEXT: {{ $}} ; VR-NEXT: bb.1: ; VR-NEXT: successors: %bb.2(0x80000000) - ; VR-NEXT: liveins: $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x000000000000000F, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0000000000000003 + ; VR-NEXT: liveins: $sgpr30_sgpr31, $sgpr34_sgpr35 ; VR-NEXT: {{ $}} + ; VR-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) ; VR-NEXT: renamable $sgpr38 = COPY renamable $sgpr36 ; VR-NEXT: renamable $sgpr39 = COPY renamable $sgpr37 ; VR-NEXT: renamable $sgpr40 = COPY renamable $sgpr36 @@ -131,41 +142,44 @@ body: | ; VR-NEXT: renamable $sgpr63 = COPY renamable $sgpr37 ; VR-NEXT: renamable $sgpr64 = COPY renamable $sgpr36 ; VR-NEXT: renamable $sgpr65 = COPY renamable $sgpr37 - ; VR-NEXT: renamable $sgpr69 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr70 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr71 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr72 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr73 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr74 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr75 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr76 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr77 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr78 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr79 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr80 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr81 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr82 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr83 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr84 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr85 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr86 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr87 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr88 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr89 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr90 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr91 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr92 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr93 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr94 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr95 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr96 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr97 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr98 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr99 = COPY renamable $sgpr68 + ; VR-NEXT: SI_SPILL_S1024_SAVE killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; VR-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; VR-NEXT: renamable $sgpr37 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr38 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr39 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr40 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr41 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr42 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr43 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr44 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr45 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr46 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr47 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr48 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr49 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr50 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr51 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr52 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr53 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr54 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr55 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr56 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr57 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr58 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr59 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr60 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr61 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr62 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr63 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr64 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr65 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr66 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr67 = COPY renamable $sgpr36 + ; VR-NEXT: SI_SPILL_S1024_SAVE killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5) ; VR-NEXT: {{ $}} ; VR-NEXT: bb.2: ; VR-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; VR-NEXT: liveins: $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x000000000000000F, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0000000000000003 + ; VR-NEXT: liveins: $sgpr30_sgpr31, $sgpr34_sgpr35 ; VR-NEXT: {{ $}} ; VR-NEXT: S_NOP 0, csr_amdgpu, implicit renamable $sgpr30_sgpr31, implicit renamable $sgpr34_sgpr35 ; VR-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index 0e568e3071e99..4ddde7f297172 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; Check that we properly realign the stack. While 4-byte access is all @@ -415,28 +416,21 @@ define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 { ; GCN-LABEL: no_free_scratch_sgpr_for_bp_copy: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 vcc_lo, s33 -; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 -; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v1, s34, 0 +; GCN-NEXT: s_mov_b32 s41, s34 ; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4 +; GCN-NEXT: s_mov_b32 s40, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 ; GCN-NEXT: s_addk_i32 s32, 0x6000 ; GCN-NEXT: s_mov_b32 s32, s34 -; GCN-NEXT: v_readlane_b32 s34, v1, 0 +; GCN-NEXT: s_mov_b32 s34, s41 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_mov_b32 s33, vcc_lo -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b32 s33, s40 ; GCN-NEXT: s_setpc_b64 s[30:31] %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, ptr addrspace(5) %local_val, align 128 @@ -460,78 +454,45 @@ define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 { ; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NEXT: v_writelane_b32 v39, s4, 32 +; GCN-NEXT: v_writelane_b32 v39, s34, 33 ; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4 ; GCN-NEXT: v_writelane_b32 v39, s39, 0 -; GCN-NEXT: v_writelane_b32 v39, s40, 1 -; GCN-NEXT: v_writelane_b32 v39, s41, 2 -; GCN-NEXT: v_writelane_b32 v39, s42, 3 -; GCN-NEXT: v_writelane_b32 v39, s43, 4 -; GCN-NEXT: v_writelane_b32 v39, s44, 5 -; GCN-NEXT: v_writelane_b32 v39, s45, 6 -; GCN-NEXT: v_writelane_b32 v39, s46, 7 -; GCN-NEXT: v_writelane_b32 v39, s47, 8 -; GCN-NEXT: v_writelane_b32 v39, s48, 9 -; GCN-NEXT: v_writelane_b32 v39, s49, 10 -; GCN-NEXT: v_writelane_b32 v39, s50, 11 -; GCN-NEXT: v_writelane_b32 v39, s51, 12 -; GCN-NEXT: v_writelane_b32 v39, s52, 13 -; GCN-NEXT: v_writelane_b32 v39, s53, 14 -; GCN-NEXT: v_writelane_b32 v39, s54, 15 -; GCN-NEXT: v_writelane_b32 v39, s55, 16 -; GCN-NEXT: v_writelane_b32 v39, s56, 17 -; GCN-NEXT: v_writelane_b32 v39, s57, 18 -; GCN-NEXT: v_writelane_b32 v39, s58, 19 -; GCN-NEXT: v_writelane_b32 v39, s59, 20 -; GCN-NEXT: v_writelane_b32 v39, s60, 21 -; GCN-NEXT: v_writelane_b32 v39, s61, 22 -; GCN-NEXT: v_writelane_b32 v39, s62, 23 -; GCN-NEXT: v_writelane_b32 v39, s63, 24 -; GCN-NEXT: v_writelane_b32 v39, s64, 25 -; GCN-NEXT: v_writelane_b32 v39, s65, 26 -; GCN-NEXT: v_writelane_b32 v39, s66, 27 -; GCN-NEXT: v_writelane_b32 v39, s67, 28 -; GCN-NEXT: v_writelane_b32 v39, s68, 29 -; GCN-NEXT: v_writelane_b32 v39, s69, 30 -; GCN-NEXT: v_writelane_b32 v39, s70, 31 -; GCN-NEXT: v_writelane_b32 v39, s71, 32 -; GCN-NEXT: v_writelane_b32 v39, s72, 33 -; GCN-NEXT: v_writelane_b32 v39, s73, 34 -; GCN-NEXT: v_writelane_b32 v39, s74, 35 -; GCN-NEXT: v_writelane_b32 v39, s75, 36 -; GCN-NEXT: v_writelane_b32 v39, s76, 37 -; GCN-NEXT: v_writelane_b32 v39, s77, 38 -; GCN-NEXT: v_writelane_b32 v39, s78, 39 -; GCN-NEXT: v_writelane_b32 v39, s79, 40 -; GCN-NEXT: v_writelane_b32 v39, s80, 41 -; GCN-NEXT: v_writelane_b32 v39, s81, 42 -; GCN-NEXT: v_writelane_b32 v39, s82, 43 -; GCN-NEXT: v_writelane_b32 v39, s83, 44 -; GCN-NEXT: v_writelane_b32 v39, s84, 45 -; GCN-NEXT: v_writelane_b32 v39, s85, 46 -; GCN-NEXT: v_writelane_b32 v39, s86, 47 -; GCN-NEXT: v_writelane_b32 v39, s87, 48 -; GCN-NEXT: v_writelane_b32 v39, s88, 49 -; GCN-NEXT: v_writelane_b32 v39, s89, 50 -; GCN-NEXT: v_writelane_b32 v39, s90, 51 -; GCN-NEXT: v_writelane_b32 v39, s91, 52 -; GCN-NEXT: v_writelane_b32 v39, s92, 53 -; GCN-NEXT: v_writelane_b32 v39, s93, 54 -; GCN-NEXT: v_writelane_b32 v39, s94, 55 -; GCN-NEXT: v_writelane_b32 v39, s95, 56 -; GCN-NEXT: v_writelane_b32 v39, s96, 57 -; GCN-NEXT: v_writelane_b32 v39, s97, 58 -; GCN-NEXT: v_writelane_b32 v39, s98, 59 -; GCN-NEXT: v_writelane_b32 v39, s99, 60 -; GCN-NEXT: v_writelane_b32 v39, s100, 61 -; GCN-NEXT: v_writelane_b32 v39, s101, 62 -; GCN-NEXT: v_writelane_b32 v39, s102, 63 +; GCN-NEXT: v_writelane_b32 v39, s48, 1 +; GCN-NEXT: v_writelane_b32 v39, s49, 2 +; GCN-NEXT: v_writelane_b32 v39, s50, 3 +; GCN-NEXT: v_writelane_b32 v39, s51, 4 +; GCN-NEXT: v_writelane_b32 v39, s52, 5 +; GCN-NEXT: v_writelane_b32 v39, s53, 6 +; GCN-NEXT: v_writelane_b32 v39, s54, 7 +; GCN-NEXT: v_writelane_b32 v39, s55, 8 +; GCN-NEXT: v_writelane_b32 v39, s64, 9 +; GCN-NEXT: v_writelane_b32 v39, s65, 10 +; GCN-NEXT: v_writelane_b32 v39, s66, 11 +; GCN-NEXT: v_writelane_b32 v39, s67, 12 +; GCN-NEXT: v_writelane_b32 v39, s68, 13 +; GCN-NEXT: v_writelane_b32 v39, s69, 14 +; GCN-NEXT: v_writelane_b32 v39, s70, 15 +; GCN-NEXT: v_writelane_b32 v39, s71, 16 +; GCN-NEXT: v_writelane_b32 v39, s80, 17 +; GCN-NEXT: v_writelane_b32 v39, s81, 18 +; GCN-NEXT: v_writelane_b32 v39, s82, 19 +; GCN-NEXT: v_writelane_b32 v39, s83, 20 +; GCN-NEXT: v_writelane_b32 v39, s84, 21 +; GCN-NEXT: v_writelane_b32 v39, s85, 22 +; GCN-NEXT: v_writelane_b32 v39, s86, 23 +; GCN-NEXT: v_writelane_b32 v39, s87, 24 +; GCN-NEXT: v_writelane_b32 v39, s96, 25 +; GCN-NEXT: v_writelane_b32 v39, s97, 26 +; GCN-NEXT: v_writelane_b32 v39, s98, 27 +; GCN-NEXT: v_writelane_b32 v39, s99, 28 +; GCN-NEXT: v_writelane_b32 v39, s100, 29 +; GCN-NEXT: v_writelane_b32 v39, s101, 30 ; GCN-NEXT: s_addk_i32 s32, 0x6000 +; GCN-NEXT: v_writelane_b32 v39, s102, 31 ; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: v_readlane_b32 s34, v39, 33 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:128 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -541,76 +502,39 @@ define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 { ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; clobber all VGPRs ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: v_readlane_b32 s102, v39, 63 -; GCN-NEXT: v_readlane_b32 s101, v39, 62 -; GCN-NEXT: v_readlane_b32 s100, v39, 61 -; GCN-NEXT: v_readlane_b32 s99, v39, 60 -; GCN-NEXT: v_readlane_b32 s98, v39, 59 -; GCN-NEXT: v_readlane_b32 s97, v39, 58 -; GCN-NEXT: v_readlane_b32 s96, v39, 57 -; GCN-NEXT: v_readlane_b32 s95, v39, 56 -; GCN-NEXT: v_readlane_b32 s94, v39, 55 -; GCN-NEXT: v_readlane_b32 s93, v39, 54 -; GCN-NEXT: v_readlane_b32 s92, v39, 53 -; GCN-NEXT: v_readlane_b32 s91, v39, 52 -; GCN-NEXT: v_readlane_b32 s90, v39, 51 -; GCN-NEXT: v_readlane_b32 s89, v39, 50 -; GCN-NEXT: v_readlane_b32 s88, v39, 49 -; GCN-NEXT: v_readlane_b32 s87, v39, 48 -; GCN-NEXT: v_readlane_b32 s86, v39, 47 -; GCN-NEXT: v_readlane_b32 s85, v39, 46 -; GCN-NEXT: v_readlane_b32 s84, v39, 45 -; GCN-NEXT: v_readlane_b32 s83, v39, 44 -; GCN-NEXT: v_readlane_b32 s82, v39, 43 -; GCN-NEXT: v_readlane_b32 s81, v39, 42 -; GCN-NEXT: v_readlane_b32 s80, v39, 41 -; GCN-NEXT: v_readlane_b32 s79, v39, 40 -; GCN-NEXT: v_readlane_b32 s78, v39, 39 -; GCN-NEXT: v_readlane_b32 s77, v39, 38 -; GCN-NEXT: v_readlane_b32 s76, v39, 37 -; GCN-NEXT: v_readlane_b32 s75, v39, 36 -; GCN-NEXT: v_readlane_b32 s74, v39, 35 -; GCN-NEXT: v_readlane_b32 s73, v39, 34 -; GCN-NEXT: v_readlane_b32 s72, v39, 33 -; GCN-NEXT: v_readlane_b32 s71, v39, 32 -; GCN-NEXT: v_readlane_b32 s70, v39, 31 -; GCN-NEXT: v_readlane_b32 s69, v39, 30 -; GCN-NEXT: v_readlane_b32 s68, v39, 29 -; GCN-NEXT: v_readlane_b32 s67, v39, 28 -; GCN-NEXT: v_readlane_b32 s66, v39, 27 -; GCN-NEXT: v_readlane_b32 s65, v39, 26 -; GCN-NEXT: v_readlane_b32 s64, v39, 25 -; GCN-NEXT: v_readlane_b32 s63, v39, 24 -; GCN-NEXT: v_readlane_b32 s62, v39, 23 -; GCN-NEXT: v_readlane_b32 s61, v39, 22 -; GCN-NEXT: v_readlane_b32 s60, v39, 21 -; GCN-NEXT: v_readlane_b32 s59, v39, 20 -; GCN-NEXT: v_readlane_b32 s58, v39, 19 -; GCN-NEXT: v_readlane_b32 s57, v39, 18 -; GCN-NEXT: v_readlane_b32 s56, v39, 17 -; GCN-NEXT: v_readlane_b32 s55, v39, 16 -; GCN-NEXT: v_readlane_b32 s54, v39, 15 -; GCN-NEXT: v_readlane_b32 s53, v39, 14 -; GCN-NEXT: v_readlane_b32 s52, v39, 13 -; GCN-NEXT: v_readlane_b32 s51, v39, 12 -; GCN-NEXT: v_readlane_b32 s50, v39, 11 -; GCN-NEXT: v_readlane_b32 s49, v39, 10 -; GCN-NEXT: v_readlane_b32 s48, v39, 9 -; GCN-NEXT: v_readlane_b32 s47, v39, 8 -; GCN-NEXT: v_readlane_b32 s46, v39, 7 -; GCN-NEXT: v_readlane_b32 s45, v39, 6 -; GCN-NEXT: v_readlane_b32 s44, v39, 5 -; GCN-NEXT: v_readlane_b32 s43, v39, 4 -; GCN-NEXT: v_readlane_b32 s42, v39, 3 -; GCN-NEXT: v_readlane_b32 s41, v39, 2 -; GCN-NEXT: v_readlane_b32 s40, v39, 1 +; GCN-NEXT: v_readlane_b32 s102, v39, 31 +; GCN-NEXT: v_readlane_b32 s101, v39, 30 +; GCN-NEXT: v_readlane_b32 s100, v39, 29 +; GCN-NEXT: v_readlane_b32 s99, v39, 28 +; GCN-NEXT: v_readlane_b32 s98, v39, 27 +; GCN-NEXT: v_readlane_b32 s97, v39, 26 +; GCN-NEXT: v_readlane_b32 s96, v39, 25 +; GCN-NEXT: v_readlane_b32 s87, v39, 24 +; GCN-NEXT: v_readlane_b32 s86, v39, 23 +; GCN-NEXT: v_readlane_b32 s85, v39, 22 +; GCN-NEXT: v_readlane_b32 s84, v39, 21 +; GCN-NEXT: v_readlane_b32 s83, v39, 20 +; GCN-NEXT: v_readlane_b32 s82, v39, 19 +; GCN-NEXT: v_readlane_b32 s81, v39, 18 +; GCN-NEXT: v_readlane_b32 s80, v39, 17 +; GCN-NEXT: v_readlane_b32 s71, v39, 16 +; GCN-NEXT: v_readlane_b32 s70, v39, 15 +; GCN-NEXT: v_readlane_b32 s69, v39, 14 +; GCN-NEXT: v_readlane_b32 s68, v39, 13 +; GCN-NEXT: v_readlane_b32 s67, v39, 12 +; GCN-NEXT: v_readlane_b32 s66, v39, 11 +; GCN-NEXT: v_readlane_b32 s65, v39, 10 +; GCN-NEXT: v_readlane_b32 s64, v39, 9 +; GCN-NEXT: v_readlane_b32 s55, v39, 8 +; GCN-NEXT: v_readlane_b32 s54, v39, 7 +; GCN-NEXT: v_readlane_b32 s53, v39, 6 +; GCN-NEXT: v_readlane_b32 s52, v39, 5 +; GCN-NEXT: v_readlane_b32 s51, v39, 4 +; GCN-NEXT: v_readlane_b32 s50, v39, 3 +; GCN-NEXT: v_readlane_b32 s49, v39, 2 +; GCN-NEXT: v_readlane_b32 s48, v39, 1 ; GCN-NEXT: v_readlane_b32 s39, v39, 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s34, v0 +; GCN-NEXT: v_readlane_b32 s4, v39, 32 ; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] @@ -653,81 +577,46 @@ define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i ; GCN-NEXT: s_add_i32 s5, s33, 0x42100 ; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_add_i32 s5, s33, 0x42200 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NEXT: s_add_i32 s5, s33, 0x42300 +; GCN-NEXT: v_writelane_b32 v39, s4, 32 +; GCN-NEXT: v_writelane_b32 v39, s34, 33 ; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4 ; GCN-NEXT: v_writelane_b32 v39, s39, 0 -; GCN-NEXT: v_writelane_b32 v39, s40, 1 -; GCN-NEXT: v_writelane_b32 v39, s41, 2 -; GCN-NEXT: v_writelane_b32 v39, s42, 3 -; GCN-NEXT: v_writelane_b32 v39, s43, 4 -; GCN-NEXT: v_writelane_b32 v39, s44, 5 -; GCN-NEXT: v_writelane_b32 v39, s45, 6 -; GCN-NEXT: v_writelane_b32 v39, s46, 7 -; GCN-NEXT: v_writelane_b32 v39, s47, 8 -; GCN-NEXT: v_writelane_b32 v39, s48, 9 -; GCN-NEXT: v_writelane_b32 v39, s49, 10 -; GCN-NEXT: v_writelane_b32 v39, s50, 11 -; GCN-NEXT: v_writelane_b32 v39, s51, 12 -; GCN-NEXT: v_writelane_b32 v39, s52, 13 -; GCN-NEXT: v_writelane_b32 v39, s53, 14 -; GCN-NEXT: v_writelane_b32 v39, s54, 15 -; GCN-NEXT: v_writelane_b32 v39, s55, 16 -; GCN-NEXT: v_writelane_b32 v39, s56, 17 -; GCN-NEXT: v_writelane_b32 v39, s57, 18 -; GCN-NEXT: v_writelane_b32 v39, s58, 19 -; GCN-NEXT: v_writelane_b32 v39, s59, 20 -; GCN-NEXT: v_writelane_b32 v39, s60, 21 -; GCN-NEXT: v_writelane_b32 v39, s61, 22 -; GCN-NEXT: v_writelane_b32 v39, s62, 23 -; GCN-NEXT: v_writelane_b32 v39, s63, 24 -; GCN-NEXT: v_writelane_b32 v39, s64, 25 -; GCN-NEXT: v_writelane_b32 v39, s65, 26 -; GCN-NEXT: v_writelane_b32 v39, s66, 27 -; GCN-NEXT: v_writelane_b32 v39, s67, 28 -; GCN-NEXT: v_writelane_b32 v39, s68, 29 -; GCN-NEXT: v_writelane_b32 v39, s69, 30 -; GCN-NEXT: v_writelane_b32 v39, s70, 31 -; GCN-NEXT: v_writelane_b32 v39, s71, 32 -; GCN-NEXT: v_writelane_b32 v39, s72, 33 -; GCN-NEXT: v_writelane_b32 v39, s73, 34 -; GCN-NEXT: v_writelane_b32 v39, s74, 35 -; GCN-NEXT: v_writelane_b32 v39, s75, 36 -; GCN-NEXT: v_writelane_b32 v39, s76, 37 -; GCN-NEXT: v_writelane_b32 v39, s77, 38 -; GCN-NEXT: v_writelane_b32 v39, s78, 39 -; GCN-NEXT: v_writelane_b32 v39, s79, 40 -; GCN-NEXT: v_writelane_b32 v39, s80, 41 -; GCN-NEXT: v_writelane_b32 v39, s81, 42 -; GCN-NEXT: v_writelane_b32 v39, s82, 43 -; GCN-NEXT: v_writelane_b32 v39, s83, 44 -; GCN-NEXT: v_writelane_b32 v39, s84, 45 -; GCN-NEXT: v_writelane_b32 v39, s85, 46 -; GCN-NEXT: v_writelane_b32 v39, s86, 47 -; GCN-NEXT: v_writelane_b32 v39, s87, 48 -; GCN-NEXT: v_writelane_b32 v39, s88, 49 -; GCN-NEXT: v_writelane_b32 v39, s89, 50 -; GCN-NEXT: v_writelane_b32 v39, s90, 51 -; GCN-NEXT: v_writelane_b32 v39, s91, 52 -; GCN-NEXT: v_writelane_b32 v39, s92, 53 -; GCN-NEXT: v_writelane_b32 v39, s93, 54 -; GCN-NEXT: v_writelane_b32 v39, s94, 55 -; GCN-NEXT: v_writelane_b32 v39, s95, 56 -; GCN-NEXT: v_writelane_b32 v39, s96, 57 -; GCN-NEXT: v_writelane_b32 v39, s97, 58 -; GCN-NEXT: v_writelane_b32 v39, s98, 59 -; GCN-NEXT: v_writelane_b32 v39, s99, 60 -; GCN-NEXT: v_writelane_b32 v39, s100, 61 -; GCN-NEXT: v_writelane_b32 v39, s101, 62 +; GCN-NEXT: v_writelane_b32 v39, s48, 1 +; GCN-NEXT: v_writelane_b32 v39, s49, 2 +; GCN-NEXT: v_writelane_b32 v39, s50, 3 +; GCN-NEXT: v_writelane_b32 v39, s51, 4 +; GCN-NEXT: v_writelane_b32 v39, s52, 5 +; GCN-NEXT: v_writelane_b32 v39, s53, 6 +; GCN-NEXT: v_writelane_b32 v39, s54, 7 +; GCN-NEXT: v_writelane_b32 v39, s55, 8 +; GCN-NEXT: v_writelane_b32 v39, s64, 9 +; GCN-NEXT: v_writelane_b32 v39, s65, 10 +; GCN-NEXT: v_writelane_b32 v39, s66, 11 +; GCN-NEXT: v_writelane_b32 v39, s67, 12 +; GCN-NEXT: v_writelane_b32 v39, s68, 13 +; GCN-NEXT: v_writelane_b32 v39, s69, 14 +; GCN-NEXT: v_writelane_b32 v39, s70, 15 +; GCN-NEXT: v_writelane_b32 v39, s71, 16 +; GCN-NEXT: v_writelane_b32 v39, s80, 17 +; GCN-NEXT: v_writelane_b32 v39, s81, 18 +; GCN-NEXT: v_writelane_b32 v39, s82, 19 +; GCN-NEXT: v_writelane_b32 v39, s83, 20 +; GCN-NEXT: v_writelane_b32 v39, s84, 21 +; GCN-NEXT: v_writelane_b32 v39, s85, 22 +; GCN-NEXT: v_writelane_b32 v39, s86, 23 +; GCN-NEXT: v_writelane_b32 v39, s87, 24 +; GCN-NEXT: v_writelane_b32 v39, s96, 25 +; GCN-NEXT: v_writelane_b32 v39, s97, 26 +; GCN-NEXT: v_writelane_b32 v39, s98, 27 +; GCN-NEXT: v_writelane_b32 v39, s99, 28 +; GCN-NEXT: v_writelane_b32 v39, s100, 29 +; GCN-NEXT: v_writelane_b32 v39, s101, 30 ; GCN-NEXT: v_mov_b32_e32 v1, 0x1080 -; GCN-NEXT: v_writelane_b32 v39, s102, 63 ; GCN-NEXT: s_add_i32 s32, s32, 0x46000 +; GCN-NEXT: v_writelane_b32 v39, s102, 31 ; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: v_readlane_b32 s34, v39, 33 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -737,78 +626,39 @@ define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; clobber all VGPRs ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_add_i32 s5, s33, 0x42200 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload -; GCN-NEXT: s_add_i32 s5, s33, 0x42300 -; GCN-NEXT: v_readlane_b32 s102, v39, 63 -; GCN-NEXT: v_readlane_b32 s101, v39, 62 -; GCN-NEXT: v_readlane_b32 s100, v39, 61 -; GCN-NEXT: v_readlane_b32 s99, v39, 60 -; GCN-NEXT: v_readlane_b32 s98, v39, 59 -; GCN-NEXT: v_readlane_b32 s97, v39, 58 -; GCN-NEXT: v_readlane_b32 s96, v39, 57 -; GCN-NEXT: v_readlane_b32 s95, v39, 56 -; GCN-NEXT: v_readlane_b32 s94, v39, 55 -; GCN-NEXT: v_readlane_b32 s93, v39, 54 -; GCN-NEXT: v_readlane_b32 s92, v39, 53 -; GCN-NEXT: v_readlane_b32 s91, v39, 52 -; GCN-NEXT: v_readlane_b32 s90, v39, 51 -; GCN-NEXT: v_readlane_b32 s89, v39, 50 -; GCN-NEXT: v_readlane_b32 s88, v39, 49 -; GCN-NEXT: v_readlane_b32 s87, v39, 48 -; GCN-NEXT: v_readlane_b32 s86, v39, 47 -; GCN-NEXT: v_readlane_b32 s85, v39, 46 -; GCN-NEXT: v_readlane_b32 s84, v39, 45 -; GCN-NEXT: v_readlane_b32 s83, v39, 44 -; GCN-NEXT: v_readlane_b32 s82, v39, 43 -; GCN-NEXT: v_readlane_b32 s81, v39, 42 -; GCN-NEXT: v_readlane_b32 s80, v39, 41 -; GCN-NEXT: v_readlane_b32 s79, v39, 40 -; GCN-NEXT: v_readlane_b32 s78, v39, 39 -; GCN-NEXT: v_readlane_b32 s77, v39, 38 -; GCN-NEXT: v_readlane_b32 s76, v39, 37 -; GCN-NEXT: v_readlane_b32 s75, v39, 36 -; GCN-NEXT: v_readlane_b32 s74, v39, 35 -; GCN-NEXT: v_readlane_b32 s73, v39, 34 -; GCN-NEXT: v_readlane_b32 s72, v39, 33 -; GCN-NEXT: v_readlane_b32 s71, v39, 32 -; GCN-NEXT: v_readlane_b32 s70, v39, 31 -; GCN-NEXT: v_readlane_b32 s69, v39, 30 -; GCN-NEXT: v_readlane_b32 s68, v39, 29 -; GCN-NEXT: v_readlane_b32 s67, v39, 28 -; GCN-NEXT: v_readlane_b32 s66, v39, 27 -; GCN-NEXT: v_readlane_b32 s65, v39, 26 -; GCN-NEXT: v_readlane_b32 s64, v39, 25 -; GCN-NEXT: v_readlane_b32 s63, v39, 24 -; GCN-NEXT: v_readlane_b32 s62, v39, 23 -; GCN-NEXT: v_readlane_b32 s61, v39, 22 -; GCN-NEXT: v_readlane_b32 s60, v39, 21 -; GCN-NEXT: v_readlane_b32 s59, v39, 20 -; GCN-NEXT: v_readlane_b32 s58, v39, 19 -; GCN-NEXT: v_readlane_b32 s57, v39, 18 -; GCN-NEXT: v_readlane_b32 s56, v39, 17 -; GCN-NEXT: v_readlane_b32 s55, v39, 16 -; GCN-NEXT: v_readlane_b32 s54, v39, 15 -; GCN-NEXT: v_readlane_b32 s53, v39, 14 -; GCN-NEXT: v_readlane_b32 s52, v39, 13 -; GCN-NEXT: v_readlane_b32 s51, v39, 12 -; GCN-NEXT: v_readlane_b32 s50, v39, 11 -; GCN-NEXT: v_readlane_b32 s49, v39, 10 -; GCN-NEXT: v_readlane_b32 s48, v39, 9 -; GCN-NEXT: v_readlane_b32 s47, v39, 8 -; GCN-NEXT: v_readlane_b32 s46, v39, 7 -; GCN-NEXT: v_readlane_b32 s45, v39, 6 -; GCN-NEXT: v_readlane_b32 s44, v39, 5 -; GCN-NEXT: v_readlane_b32 s43, v39, 4 -; GCN-NEXT: v_readlane_b32 s42, v39, 3 -; GCN-NEXT: v_readlane_b32 s41, v39, 2 -; GCN-NEXT: v_readlane_b32 s40, v39, 1 +; GCN-NEXT: v_readlane_b32 s102, v39, 31 +; GCN-NEXT: v_readlane_b32 s101, v39, 30 +; GCN-NEXT: v_readlane_b32 s100, v39, 29 +; GCN-NEXT: v_readlane_b32 s99, v39, 28 +; GCN-NEXT: v_readlane_b32 s98, v39, 27 +; GCN-NEXT: v_readlane_b32 s97, v39, 26 +; GCN-NEXT: v_readlane_b32 s96, v39, 25 +; GCN-NEXT: v_readlane_b32 s87, v39, 24 +; GCN-NEXT: v_readlane_b32 s86, v39, 23 +; GCN-NEXT: v_readlane_b32 s85, v39, 22 +; GCN-NEXT: v_readlane_b32 s84, v39, 21 +; GCN-NEXT: v_readlane_b32 s83, v39, 20 +; GCN-NEXT: v_readlane_b32 s82, v39, 19 +; GCN-NEXT: v_readlane_b32 s81, v39, 18 +; GCN-NEXT: v_readlane_b32 s80, v39, 17 +; GCN-NEXT: v_readlane_b32 s71, v39, 16 +; GCN-NEXT: v_readlane_b32 s70, v39, 15 +; GCN-NEXT: v_readlane_b32 s69, v39, 14 +; GCN-NEXT: v_readlane_b32 s68, v39, 13 +; GCN-NEXT: v_readlane_b32 s67, v39, 12 +; GCN-NEXT: v_readlane_b32 s66, v39, 11 +; GCN-NEXT: v_readlane_b32 s65, v39, 10 +; GCN-NEXT: v_readlane_b32 s64, v39, 9 +; GCN-NEXT: v_readlane_b32 s55, v39, 8 +; GCN-NEXT: v_readlane_b32 s54, v39, 7 +; GCN-NEXT: v_readlane_b32 s53, v39, 6 +; GCN-NEXT: v_readlane_b32 s52, v39, 5 +; GCN-NEXT: v_readlane_b32 s51, v39, 4 +; GCN-NEXT: v_readlane_b32 s50, v39, 3 +; GCN-NEXT: v_readlane_b32 s49, v39, 2 +; GCN-NEXT: v_readlane_b32 s48, v39, 1 ; GCN-NEXT: v_readlane_b32 s39, v39, 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s34, v0 +; GCN-NEXT: v_readlane_b32 s4, v39, 32 ; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GCN-NEXT: s_add_i32 s5, s33, 0x42100 ; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s5 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index e3a7f5eee74cb..0ad9573ff27cd 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -32,14 +32,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-LABEL: kernel: ; GLOBALNESS1: ; %bb.0: ; %bb ; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7] -; GLOBALNESS1-NEXT: s_load_dwordx4 s[76:79], s[8:9], 0x0 +; GLOBALNESS1-NEXT: s_load_dwordx4 s[52:55], s[8:9], 0x0 ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], 0, 0 ; GLOBALNESS1-NEXT: global_store_dword v[44:45], v42, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[76:77] -; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5] +; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[52:53] +; GLOBALNESS1-NEXT: s_mov_b64 s[48:49], s[4:5] ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS1-NEXT: s_load_dword s7, s[8:9], 0x20 ; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 @@ -49,7 +49,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400 -; GLOBALNESS1-NEXT: s_bitcmp1_b32 s78, 0 +; GLOBALNESS1-NEXT: s_bitcmp1_b32 s54, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1] ; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 @@ -59,25 +59,28 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[64:65], 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 +; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1 +; GLOBALNESS1-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 +; GLOBALNESS1-NEXT: v_writelane_b32 v59, s8, 0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0 +; GLOBALNESS1-NEXT: v_writelane_b32 v59, s9, 1 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[70:71], 1, v3 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v46, 0x80 -; GLOBALNESS1-NEXT: s_mov_b32 s70, s16 -; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] -; GLOBALNESS1-NEXT: s_mov_b32 s71, s15 -; GLOBALNESS1-NEXT: s_mov_b32 s72, s14 +; GLOBALNESS1-NEXT: s_mov_b32 s82, s16 +; GLOBALNESS1-NEXT: s_mov_b32 s83, s15 +; GLOBALNESS1-NEXT: s_mov_b32 s84, s14 ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS1-NEXT: v_mov_b32_e32 v47, 0 ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 @@ -88,17 +91,27 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 2 ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 3 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v2 +; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 4 +; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 5 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2 +; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 6 +; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 7 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1 +; GLOBALNESS1-NEXT: v_writelane_b32 v59, s70, 8 +; GLOBALNESS1-NEXT: v_writelane_b32 v59, s71, 9 ; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[60:61] +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v59, 6 +; GLOBALNESS1-NEXT: v_readlane_b32 s7, v59, 7 +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_28 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -122,26 +135,26 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 ; GLOBALNESS1-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[46:47] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[70:71] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS1-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lt_i32 s79, 1 +; GLOBALNESS1-NEXT: s_cmp_lt_i32 s55, 1 ; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS1-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s55, 1 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_8 @@ -151,7 +164,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS1-NEXT: .LBB1_8: ; %LeafBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 0 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s55, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS1-NEXT: .LBB1_9: ; %Flow25 @@ -164,15 +177,17 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: flat_load_dword v0, v[44:45] ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[86:87], 0, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[74:75], s[62:63] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[52:53], s[86:87] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_25 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[44:45], off -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55] +; GLOBALNESS1-NEXT: v_readlane_b32 s4, v59, 2 +; GLOBALNESS1-NEXT: v_readlane_b32 s5, v59, 3 +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -185,69 +200,71 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[96:97], 0, v2 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[98:99], 1, v0 ; GLOBALNESS1-NEXT: s_branch .LBB1_16 ; GLOBALNESS1-NEXT: .LBB1_14: ; %Flow16 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS1-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[52:53] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[68:69] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS1-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[48:49] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[64:65] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.17: ; %bb46.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[66:67] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[42:43] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[44:45] +; GLOBALNESS1-NEXT: v_readlane_b32 s4, v59, 0 +; GLOBALNESS1-NEXT: v_readlane_b32 s5, v59, 1 +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[66:67] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[98:99] ; GLOBALNESS1-NEXT: .LBB1_21: ; %spam.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[56:57] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[80:81] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_add_u32 s68, s38, 40 -; GLOBALNESS1-NEXT: s_addc_u32 s69, s39, 0 +; GLOBALNESS1-NEXT: s_add_u32 s70, s38, 40 +; GLOBALNESS1-NEXT: s_addc_u32 s71, s39, 0 ; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 -; GLOBALNESS1-NEXT: s_load_dwordx2 s[76:77], s[4:5], 0x0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[70:71] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[54:55] +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[70:71] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[56:57], off -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[54:55] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[96:97] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -256,15 +273,22 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_branch .LBB1_14 ; GLOBALNESS1-NEXT: .LBB1_24: ; %Flow23 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[38:39], 0x0 +; GLOBALNESS1-NEXT: v_readlane_b32 s70, v59, 8 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 +; GLOBALNESS1-NEXT: v_readlane_b32 s71, v59, 9 +; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) +; GLOBALNESS1-NEXT: s_mov_b32 s55, s7 ; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow24 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[74:75] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] +; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[52:53] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[86:87] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS1-NEXT: ; %bb.26: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[58:59] +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v59, 4 +; GLOBALNESS1-NEXT: v_readlane_b32 s7, v59, 5 +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -286,12 +310,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 @@ -304,12 +328,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS1-NEXT: .LBB1_33: ; %UnifiedUnreachableBlock @@ -317,14 +341,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-LABEL: kernel: ; GLOBALNESS0: ; %bb.0: ; %bb ; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7] -; GLOBALNESS0-NEXT: s_load_dwordx4 s[72:75], s[8:9], 0x0 +; GLOBALNESS0-NEXT: s_load_dwordx4 s[52:55], s[8:9], 0x0 ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], 0, 0 ; GLOBALNESS0-NEXT: global_store_dword v[44:45], v42, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[72:73] -; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5] +; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[52:53] +; GLOBALNESS0-NEXT: s_mov_b64 s[48:49], s[4:5] ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS0-NEXT: s_load_dword s7, s[8:9], 0x20 ; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 @@ -334,7 +358,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400 -; GLOBALNESS0-NEXT: s_bitcmp1_b32 s74, 0 +; GLOBALNESS0-NEXT: s_bitcmp1_b32 s54, 0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1] ; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 @@ -344,25 +368,28 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[64:65], 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 +; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1 +; GLOBALNESS0-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 +; GLOBALNESS0-NEXT: v_writelane_b32 v59, s8, 0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0 +; GLOBALNESS0-NEXT: v_writelane_b32 v59, s9, 1 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[84:85], 1, v3 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v46, 0x80 -; GLOBALNESS0-NEXT: s_mov_b32 s68, s16 -; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] -; GLOBALNESS0-NEXT: s_mov_b32 s69, s15 -; GLOBALNESS0-NEXT: s_mov_b32 s70, s14 +; GLOBALNESS0-NEXT: s_mov_b32 s70, s16 +; GLOBALNESS0-NEXT: s_mov_b32 s71, s15 +; GLOBALNESS0-NEXT: s_mov_b32 s82, s14 ; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS0-NEXT: v_mov_b32_e32 v47, 0 ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 @@ -373,17 +400,27 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 2 ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 3 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v2 +; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 4 +; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 5 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2 +; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 6 +; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 7 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1 +; GLOBALNESS0-NEXT: v_writelane_b32 v59, s84, 8 +; GLOBALNESS0-NEXT: v_writelane_b32 v59, s85, 9 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[60:61] +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v59, 6 +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v59, 7 +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_28 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -407,26 +444,26 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 ; GLOBALNESS0-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[46:47] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[84:85] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS0-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lt_i32 s75, 1 +; GLOBALNESS0-NEXT: s_cmp_lt_i32 s55, 1 ; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS0-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lg_u32 s75, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s55, 1 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_8 @@ -436,7 +473,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS0-NEXT: .LBB1_8: ; %LeafBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lg_u32 s75, 0 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s55, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS0-NEXT: .LBB1_9: ; %Flow25 @@ -449,15 +486,18 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: flat_load_dword v0, v[44:45] ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[86:87], 0, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[76:77], s[62:63] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[52:53], s[86:87] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_25 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[44:45], off -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55] +; GLOBALNESS0-NEXT: v_readlane_b32 s4, v59, 2 +; GLOBALNESS0-NEXT: v_readlane_b32 s5, v59, 3 +; GLOBALNESS0-NEXT: s_mov_b32 s83, s55 +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -470,69 +510,71 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[96:97], 0, v2 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[98:99], 1, v0 ; GLOBALNESS0-NEXT: s_branch .LBB1_16 ; GLOBALNESS0-NEXT: .LBB1_14: ; %Flow16 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS0-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[52:53] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[68:69] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS0-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[48:49] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[64:65] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.17: ; %bb46.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[66:67] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[42:43] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[44:45] +; GLOBALNESS0-NEXT: v_readlane_b32 s4, v59, 0 +; GLOBALNESS0-NEXT: v_readlane_b32 s5, v59, 1 +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[66:67] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[98:99] ; GLOBALNESS0-NEXT: .LBB1_21: ; %spam.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[56:57] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[80:81] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_add_u32 s72, s38, 40 -; GLOBALNESS0-NEXT: s_addc_u32 s73, s39, 0 +; GLOBALNESS0-NEXT: s_add_u32 s84, s38, 40 +; GLOBALNESS0-NEXT: s_addc_u32 s85, s39, 0 ; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 -; GLOBALNESS0-NEXT: s_load_dwordx2 s[78:79], s[4:5], 0x0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[84:85] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[54:55] +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[84:85] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[56:57], off -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[54:55] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[96:97] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -541,15 +583,20 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_branch .LBB1_14 ; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow23 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: v_readlane_b32 s84, v59, 8 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 +; GLOBALNESS0-NEXT: s_mov_b32 s55, s83 +; GLOBALNESS0-NEXT: v_readlane_b32 s85, v59, 9 ; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[76:77] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] +; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[52:53] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[86:87] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS0-NEXT: ; %bb.26: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[58:59] +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v59, 4 +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v59, 5 +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -571,12 +618,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 @@ -589,12 +636,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS0-NEXT: .LBB1_33: ; %UnifiedUnreachableBlock @@ -604,7 +651,7 @@ bb: br label %bb5 bb5: ; preds = %bb5.backedge, %bb - %tmp4.i.sroa.0.0 = phi <9 x double> [ poison, %bb ], [ %tmp4.i.sroa.0.1, %bb5.backedge ] + %tmp4.i.sroa.0.0 = phi <9 x double> [ undef, %bb ], [ %tmp4.i.sroa.0.1, %bb5.backedge ] %tmp14.1.i = load i32, ptr inttoptr (i64 128 to ptr), align 128 store i32 0, ptr addrspace(5) null, align 4 %tmp14.2.i = load i32, ptr inttoptr (i64 128 to ptr), align 128 diff --git a/llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir index b80c478c3761f..0df2e651a15e1 100644 --- a/llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir +++ b/llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir @@ -27,11 +27,11 @@ body: | ; CHECK-NEXT: renamable $sgpr4 = COPY $sgpr0 ; CHECK-NEXT: SI_SPILL_S128_SAVE $sgpr0_sgpr1_sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr5 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr76 = COPY renamable $sgpr5 - ; CHECK-NEXT: renamable $sgpr77 = COPY renamable $sgpr5 - ; CHECK-NEXT: renamable $sgpr78 = COPY renamable $sgpr5 + ; CHECK-NEXT: renamable $sgpr36 = COPY renamable $sgpr5 + ; CHECK-NEXT: renamable $sgpr37 = COPY renamable $sgpr5 + ; CHECK-NEXT: renamable $sgpr38 = COPY renamable $sgpr5 ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 1056964608 - ; CHECK-NEXT: renamable $sgpr79 = COPY renamable $sgpr5 + ; CHECK-NEXT: renamable $sgpr39 = COPY renamable $sgpr5 ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0 ; CHECK-NEXT: renamable $sgpr8 = COPY renamable $sgpr5 ; CHECK-NEXT: renamable $sgpr9 = COPY renamable $sgpr5 @@ -43,46 +43,46 @@ body: | ; CHECK-NEXT: renamable $sgpr15 = COPY renamable $sgpr5 ; CHECK-NEXT: renamable $vgpr5_vgpr6 = COPY killed renamable $sgpr0_sgpr1 ; CHECK-NEXT: renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1088, 0 :: (dereferenceable load (s256), addrspace 6) - ; CHECK-NEXT: renamable $sgpr80_sgpr81_sgpr82_sgpr83 = S_LOAD_DWORDX4_IMM renamable $sgpr4_sgpr5, 0, 0 :: (load (s128), addrspace 6) + ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67 = S_LOAD_DWORDX4_IMM renamable $sgpr4_sgpr5, 0, 0 :: (load (s128), addrspace 6) ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 1200 ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr5 - ; CHECK-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1152, 0 :: (dereferenceable load (s256), addrspace 6) - ; CHECK-NEXT: renamable $sgpr84_sgpr85_sgpr86_sgpr87 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6) + ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1152, 0 :: (dereferenceable load (s256), addrspace 6) + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6) ; CHECK-NEXT: KILL killed renamable $sgpr0, renamable $sgpr1 ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 1264 ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr5 - ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1216, 0 :: (dereferenceable load (s256), addrspace 6) - ; CHECK-NEXT: renamable $sgpr88_sgpr89_sgpr90_sgpr91 = S_LOAD_DWORDX4_IMM killed renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6) + ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1216, 0 :: (dereferenceable load (s256), addrspace 6) + ; CHECK-NEXT: renamable $sgpr80_sgpr81_sgpr82_sgpr83 = S_LOAD_DWORDX4_IMM killed renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6) ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 1328 ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr5 - ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1280, 0 :: (dereferenceable load (s256), addrspace 6) - ; CHECK-NEXT: renamable $sgpr92_sgpr93_sgpr94_sgpr95 = S_LOAD_DWORDX4_IMM killed renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6) - ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1344, 0 :: (dereferenceable load (s256), addrspace 6) + ; CHECK-NEXT: renamable $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1280, 0 :: (dereferenceable load (s256), addrspace 6) + ; CHECK-NEXT: renamable $sgpr84_sgpr85_sgpr86_sgpr87 = S_LOAD_DWORDX4_IMM killed renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6) + ; CHECK-NEXT: renamable $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1344, 0 :: (dereferenceable load (s256), addrspace 6) ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 1392 ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr5 - ; CHECK-NEXT: renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 0, 0 :: (load (s256), addrspace 6) + ; CHECK-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 0, 0 :: (load (s256), addrspace 6) ; CHECK-NEXT: renamable $sgpr2 = S_MOV_B32 1456 ; CHECK-NEXT: renamable $sgpr3 = COPY renamable $sgpr5 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1472, 0 :: (dereferenceable load (s256), addrspace 6) + ; CHECK-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1472, 0 :: (dereferenceable load (s256), addrspace 6) ; CHECK-NEXT: renamable $sgpr4 = S_MOV_B32 1520 ; CHECK-NEXT: renamable $sgpr96_sgpr97_sgpr98_sgpr99 = S_LOAD_DWORDX4_IMM killed renamable $sgpr2_sgpr3, 0, 0 :: (load (s128), addrspace 6) ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (load (s128), addrspace 6) ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6) - ; CHECK-NEXT: renamable $vgpr7 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, killed renamable $sgpr76_sgpr77_sgpr78_sgpr79, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) - ; CHECK-NEXT: renamable $vgpr8 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, killed renamable $sgpr80_sgpr81_sgpr82_sgpr83, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) - ; CHECK-NEXT: renamable $vgpr9 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, killed renamable $sgpr84_sgpr85_sgpr86_sgpr87, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) - ; CHECK-NEXT: renamable $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, killed renamable $sgpr88_sgpr89_sgpr90_sgpr91, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) - ; CHECK-NEXT: renamable $vgpr11 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, killed renamable $sgpr92_sgpr93_sgpr94_sgpr95, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) - ; CHECK-NEXT: renamable $vgpr12 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, killed renamable $sgpr96_sgpr97_sgpr98_sgpr99, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) - ; CHECK-NEXT: renamable $vgpr13 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) - ; CHECK-NEXT: renamable $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) + ; CHECK-NEXT: renamable $vgpr7 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, killed renamable $sgpr36_sgpr37_sgpr38_sgpr39, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) + ; CHECK-NEXT: renamable $vgpr8 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, killed renamable $sgpr64_sgpr65_sgpr66_sgpr67, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) + ; CHECK-NEXT: renamable $vgpr9 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47, killed renamable $sgpr68_sgpr69_sgpr70_sgpr71, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) + ; CHECK-NEXT: renamable $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, killed renamable $sgpr80_sgpr81_sgpr82_sgpr83, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) + ; CHECK-NEXT: renamable $vgpr11 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, killed renamable $sgpr84_sgpr85_sgpr86_sgpr87, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) + ; CHECK-NEXT: renamable $vgpr12 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, killed renamable $sgpr96_sgpr97_sgpr98_sgpr99, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) + ; CHECK-NEXT: renamable $vgpr13 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) + ; CHECK-NEXT: renamable $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) ; CHECK-NEXT: renamable $sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr1_vgpr2_vgpr3_vgpr4 = BUFFER_LOAD_FORMAT_XYZW_IDXEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) ; CHECK-NEXT: KILL killed renamable $sgpr4_sgpr5_sgpr6_sgpr7 - ; CHECK-NEXT: KILL killed renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 + ; CHECK-NEXT: KILL killed renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 ; CHECK-NEXT: KILL killed renamable $vgpr5_vgpr6 ; CHECK-NEXT: KILL killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK-NEXT: KILL killed renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59 + ; CHECK-NEXT: KILL killed renamable $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; CHECK-NEXT: KILL killed renamable $sgpr8_sgpr9_sgpr10_sgpr11 ; CHECK-NEXT: KILL killed renamable $vgpr0 ; CHECK-NEXT: renamable $vgpr0 = nofpexcept V_MAX_F32_e32 killed $vgpr7, killed $vgpr8, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll index d9df80ce6c1c0..f08e5be0fd742 100644 --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -22,14 +22,14 @@ define hidden void @widget() { ; GCN-NEXT: v_writelane_b32 v41, s37, 5 ; GCN-NEXT: v_writelane_b32 v41, s38, 6 ; GCN-NEXT: v_writelane_b32 v41, s39, 7 -; GCN-NEXT: v_writelane_b32 v41, s40, 8 -; GCN-NEXT: v_writelane_b32 v41, s41, 9 -; GCN-NEXT: v_writelane_b32 v41, s42, 10 -; GCN-NEXT: v_writelane_b32 v41, s43, 11 -; GCN-NEXT: v_writelane_b32 v41, s44, 12 -; GCN-NEXT: v_writelane_b32 v41, s45, 13 -; GCN-NEXT: v_writelane_b32 v41, s46, 14 -; GCN-NEXT: v_writelane_b32 v41, s47, 15 +; GCN-NEXT: v_writelane_b32 v41, s48, 8 +; GCN-NEXT: v_writelane_b32 v41, s49, 9 +; GCN-NEXT: v_writelane_b32 v41, s50, 10 +; GCN-NEXT: v_writelane_b32 v41, s51, 11 +; GCN-NEXT: v_writelane_b32 v41, s52, 12 +; GCN-NEXT: v_writelane_b32 v41, s53, 13 +; GCN-NEXT: v_writelane_b32 v41, s54, 14 +; GCN-NEXT: v_writelane_b32 v41, s55, 15 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: flat_load_dword v0, v[0:1] @@ -37,7 +37,7 @@ define hidden void @widget() { ; GCN-NEXT: s_mov_b64 s[16:17], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0 -; GCN-NEXT: s_mov_b64 s[46:47], 0 +; GCN-NEXT: s_mov_b64 s[54:55], 0 ; GCN-NEXT: s_mov_b64 s[18:19], 0 ; GCN-NEXT: s_cbranch_vccz .LBB0_9 ; GCN-NEXT: ; %bb.1: ; %Flow @@ -53,29 +53,29 @@ define hidden void @widget() { ; GCN-NEXT: s_mov_b64 s[34:35], s[4:5] ; GCN-NEXT: s_mov_b64 s[36:37], s[6:7] ; GCN-NEXT: s_mov_b64 s[38:39], s[8:9] -; GCN-NEXT: s_mov_b64 s[40:41], s[10:11] -; GCN-NEXT: s_mov_b32 s42, s12 -; GCN-NEXT: s_mov_b32 s43, s13 -; GCN-NEXT: s_mov_b32 s44, s14 -; GCN-NEXT: s_mov_b32 s45, s15 +; GCN-NEXT: s_mov_b64 s[48:49], s[10:11] +; GCN-NEXT: s_mov_b32 s50, s12 +; GCN-NEXT: s_mov_b32 s51, s13 +; GCN-NEXT: s_mov_b32 s52, s14 +; GCN-NEXT: s_mov_b32 s53, s15 ; GCN-NEXT: v_mov_b32_e32 v40, v31 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_mov_b32_e32 v31, v40 -; GCN-NEXT: s_mov_b32 s12, s42 -; GCN-NEXT: s_mov_b32 s13, s43 -; GCN-NEXT: s_mov_b32 s14, s44 -; GCN-NEXT: s_mov_b32 s15, s45 +; GCN-NEXT: s_mov_b32 s12, s50 +; GCN-NEXT: s_mov_b32 s13, s51 +; GCN-NEXT: s_mov_b32 s14, s52 +; GCN-NEXT: s_mov_b32 s15, s53 ; GCN-NEXT: s_mov_b64 s[4:5], s[34:35] ; GCN-NEXT: s_mov_b64 s[6:7], s[36:37] ; GCN-NEXT: s_mov_b64 s[8:9], s[38:39] -; GCN-NEXT: s_mov_b64 s[10:11], s[40:41] +; GCN-NEXT: s_mov_b64 s[10:11], s[48:49] ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 ; GCN-NEXT: s_mov_b64 s[16:17], 0 -; GCN-NEXT: s_andn2_b64 s[18:19], s[46:47], exec +; GCN-NEXT: s_andn2_b64 s[18:19], s[54:55], exec ; GCN-NEXT: s_and_b64 s[20:21], vcc, exec -; GCN-NEXT: s_or_b64 s[46:47], s[18:19], s[20:21] +; GCN-NEXT: s_or_b64 s[54:55], s[18:19], s[20:21] ; GCN-NEXT: .LBB0_4: ; %Flow2 -; GCN-NEXT: s_and_saveexec_b64 s[18:19], s[46:47] +; GCN-NEXT: s_and_saveexec_b64 s[18:19], s[54:55] ; GCN-NEXT: s_xor_b64 s[18:19], exec, s[18:19] ; GCN-NEXT: s_cbranch_execz .LBB0_6 ; GCN-NEXT: ; %bb.5: ; %bb12 @@ -93,14 +93,14 @@ define hidden void @widget() { ; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: .LBB0_8: ; %UnifiedReturnBlock -; GCN-NEXT: v_readlane_b32 s47, v41, 15 -; GCN-NEXT: v_readlane_b32 s46, v41, 14 -; GCN-NEXT: v_readlane_b32 s45, v41, 13 -; GCN-NEXT: v_readlane_b32 s44, v41, 12 -; GCN-NEXT: v_readlane_b32 s43, v41, 11 -; GCN-NEXT: v_readlane_b32 s42, v41, 10 -; GCN-NEXT: v_readlane_b32 s41, v41, 9 -; GCN-NEXT: v_readlane_b32 s40, v41, 8 +; GCN-NEXT: v_readlane_b32 s55, v41, 15 +; GCN-NEXT: v_readlane_b32 s54, v41, 14 +; GCN-NEXT: v_readlane_b32 s53, v41, 13 +; GCN-NEXT: v_readlane_b32 s52, v41, 12 +; GCN-NEXT: v_readlane_b32 s51, v41, 11 +; GCN-NEXT: v_readlane_b32 s50, v41, 10 +; GCN-NEXT: v_readlane_b32 s49, v41, 9 +; GCN-NEXT: v_readlane_b32 s48, v41, 8 ; GCN-NEXT: v_readlane_b32 s39, v41, 7 ; GCN-NEXT: v_readlane_b32 s38, v41, 6 ; GCN-NEXT: v_readlane_b32 s37, v41, 5 @@ -119,7 +119,7 @@ define hidden void @widget() { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; GCN-NEXT: .LBB0_9: ; %bb2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[46:47], 21, v0 +; GCN-NEXT: v_cmp_eq_u32_e64 s[54:55], 21, v0 ; GCN-NEXT: v_cmp_ne_u32_e64 s[18:19], 21, v0 ; GCN-NEXT: s_mov_b64 vcc, exec ; GCN-NEXT: s_cbranch_execnz .LBB0_2 @@ -274,51 +274,51 @@ define hidden void @blam() { ; GCN-NEXT: v_writelane_b32 v45, s37, 5 ; GCN-NEXT: v_writelane_b32 v45, s38, 6 ; GCN-NEXT: v_writelane_b32 v45, s39, 7 -; GCN-NEXT: v_writelane_b32 v45, s40, 8 -; GCN-NEXT: v_writelane_b32 v45, s41, 9 -; GCN-NEXT: v_writelane_b32 v45, s42, 10 -; GCN-NEXT: v_writelane_b32 v45, s43, 11 -; GCN-NEXT: v_writelane_b32 v45, s44, 12 -; GCN-NEXT: v_writelane_b32 v45, s45, 13 -; GCN-NEXT: v_writelane_b32 v45, s46, 14 -; GCN-NEXT: v_writelane_b32 v45, s47, 15 -; GCN-NEXT: v_writelane_b32 v45, s48, 16 -; GCN-NEXT: v_writelane_b32 v45, s49, 17 -; GCN-NEXT: v_writelane_b32 v45, s50, 18 -; GCN-NEXT: v_writelane_b32 v45, s51, 19 -; GCN-NEXT: v_writelane_b32 v45, s52, 20 -; GCN-NEXT: v_writelane_b32 v45, s53, 21 -; GCN-NEXT: v_writelane_b32 v45, s54, 22 -; GCN-NEXT: v_writelane_b32 v45, s55, 23 -; GCN-NEXT: v_writelane_b32 v45, s56, 24 -; GCN-NEXT: v_writelane_b32 v45, s57, 25 +; GCN-NEXT: v_writelane_b32 v45, s48, 8 +; GCN-NEXT: v_writelane_b32 v45, s49, 9 +; GCN-NEXT: v_writelane_b32 v45, s50, 10 +; GCN-NEXT: v_writelane_b32 v45, s51, 11 +; GCN-NEXT: v_writelane_b32 v45, s52, 12 +; GCN-NEXT: v_writelane_b32 v45, s53, 13 +; GCN-NEXT: v_writelane_b32 v45, s54, 14 +; GCN-NEXT: v_writelane_b32 v45, s55, 15 +; GCN-NEXT: v_writelane_b32 v45, s64, 16 +; GCN-NEXT: v_writelane_b32 v45, s65, 17 +; GCN-NEXT: v_writelane_b32 v45, s66, 18 +; GCN-NEXT: v_writelane_b32 v45, s67, 19 +; GCN-NEXT: v_writelane_b32 v45, s68, 20 +; GCN-NEXT: v_writelane_b32 v45, s69, 21 +; GCN-NEXT: v_writelane_b32 v45, s70, 22 +; GCN-NEXT: v_writelane_b32 v45, s71, 23 +; GCN-NEXT: v_writelane_b32 v45, s80, 24 +; GCN-NEXT: v_writelane_b32 v45, s81, 25 ; GCN-NEXT: v_mov_b32_e32 v40, v31 -; GCN-NEXT: s_mov_b32 s46, s15 -; GCN-NEXT: s_mov_b32 s47, s14 -; GCN-NEXT: s_mov_b32 s48, s13 -; GCN-NEXT: s_mov_b32 s49, s12 +; GCN-NEXT: s_mov_b32 s54, s15 +; GCN-NEXT: s_mov_b32 s55, s14 +; GCN-NEXT: s_mov_b32 s64, s13 +; GCN-NEXT: s_mov_b32 s65, s12 ; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] ; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] ; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] -; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] +; GCN-NEXT: s_mov_b64 s[48:49], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v40 ; GCN-NEXT: flat_load_dword v43, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v42, 0 -; GCN-NEXT: s_mov_b64 s[50:51], 0 +; GCN-NEXT: s_mov_b64 s[66:67], 0 ; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[52:53], 0, v43 -; GCN-NEXT: v_cmp_neq_f32_e64 s[42:43], 0, v43 +; GCN-NEXT: v_cmp_eq_f32_e64 s[68:69], 0, v43 +; GCN-NEXT: v_cmp_neq_f32_e64 s[50:51], 0, v43 ; GCN-NEXT: v_mov_b32_e32 v44, 0x7fc00000 ; GCN-NEXT: s_branch .LBB1_2 ; GCN-NEXT: .LBB1_1: ; %Flow7 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: s_and_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_or_b64 s[50:51], s[4:5], s[50:51] -; GCN-NEXT: s_andn2_b64 exec, exec, s[50:51] +; GCN-NEXT: s_or_b64 s[66:67], s[4:5], s[66:67] +; GCN-NEXT: s_andn2_b64 exec, exec, s[66:67] ; GCN-NEXT: s_cbranch_execz .LBB1_18 ; GCN-NEXT: .LBB1_2: ; %bb2 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -329,26 +329,26 @@ define hidden void @blam() { ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 2, v0 ; GCN-NEXT: s_mov_b64 s[4:5], -1 ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: s_xor_b64 s[54:55], exec, s[8:9] +; GCN-NEXT: s_xor_b64 s[70:71], exec, s[8:9] ; GCN-NEXT: s_cbranch_execz .LBB1_12 ; GCN-NEXT: ; %bb.3: ; %bb6 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: v_cmp_eq_u32_e64 s[44:45], 3, v0 -; GCN-NEXT: s_and_saveexec_b64 s[56:57], s[44:45] +; GCN-NEXT: v_cmp_eq_u32_e64 s[52:53], 3, v0 +; GCN-NEXT: s_and_saveexec_b64 s[80:81], s[52:53] ; GCN-NEXT: s_cbranch_execz .LBB1_11 ; GCN-NEXT: ; %bb.4: ; %bb11 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, spam@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, spam@rel32@hi+12 -; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_mov_b64 s[4:5], s[48:49] ; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] ; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] ; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] -; GCN-NEXT: s_mov_b32 s12, s49 -; GCN-NEXT: s_mov_b32 s13, s48 -; GCN-NEXT: s_mov_b32 s14, s47 -; GCN-NEXT: s_mov_b32 s15, s46 +; GCN-NEXT: s_mov_b32 s12, s65 +; GCN-NEXT: s_mov_b32 s13, s64 +; GCN-NEXT: s_mov_b32 s14, s55 +; GCN-NEXT: s_mov_b32 s15, s54 ; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 @@ -357,13 +357,13 @@ define hidden void @blam() { ; GCN-NEXT: s_cbranch_execz .LBB1_10 ; GCN-NEXT: ; %bb.5: ; %bb14 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_mov_b64 s[8:9], s[52:53] -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[42:43] +; GCN-NEXT: s_mov_b64 s[8:9], s[68:69] +; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[50:51] ; GCN-NEXT: s_cbranch_execz .LBB1_7 ; GCN-NEXT: ; %bb.6: ; %bb16 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 -; GCN-NEXT: s_or_b64 s[8:9], s[52:53], exec +; GCN-NEXT: s_or_b64 s[8:9], s[68:69], exec ; GCN-NEXT: .LBB1_7: ; %Flow3 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] @@ -382,19 +382,19 @@ define hidden void @blam() { ; GCN-NEXT: .LBB1_10: ; %Flow2 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_andn2_b64 s[4:5], s[44:45], exec +; GCN-NEXT: s_andn2_b64 s[4:5], s[52:53], exec ; GCN-NEXT: s_and_b64 s[8:9], vcc, exec -; GCN-NEXT: s_or_b64 s[44:45], s[4:5], s[8:9] +; GCN-NEXT: s_or_b64 s[52:53], s[4:5], s[8:9] ; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GCN-NEXT: .LBB1_11: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[56:57] -; GCN-NEXT: s_orn2_b64 s[4:5], s[44:45], exec +; GCN-NEXT: s_or_b64 exec, exec, s[80:81] +; GCN-NEXT: s_orn2_b64 s[4:5], s[52:53], exec ; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB1_12: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_andn2_saveexec_b64 s[8:9], s[54:55] +; GCN-NEXT: s_andn2_saveexec_b64 s[8:9], s[70:71] ; GCN-NEXT: s_cbranch_execz .LBB1_16 ; GCN-NEXT: ; %bb.13: ; %bb8 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 @@ -426,25 +426,25 @@ define hidden void @blam() { ; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; GCN-NEXT: s_branch .LBB1_1 ; GCN-NEXT: .LBB1_18: ; %DummyReturnBlock -; GCN-NEXT: s_or_b64 exec, exec, s[50:51] -; GCN-NEXT: v_readlane_b32 s57, v45, 25 -; GCN-NEXT: v_readlane_b32 s56, v45, 24 -; GCN-NEXT: v_readlane_b32 s55, v45, 23 -; GCN-NEXT: v_readlane_b32 s54, v45, 22 -; GCN-NEXT: v_readlane_b32 s53, v45, 21 -; GCN-NEXT: v_readlane_b32 s52, v45, 20 -; GCN-NEXT: v_readlane_b32 s51, v45, 19 -; GCN-NEXT: v_readlane_b32 s50, v45, 18 -; GCN-NEXT: v_readlane_b32 s49, v45, 17 -; GCN-NEXT: v_readlane_b32 s48, v45, 16 -; GCN-NEXT: v_readlane_b32 s47, v45, 15 -; GCN-NEXT: v_readlane_b32 s46, v45, 14 -; GCN-NEXT: v_readlane_b32 s45, v45, 13 -; GCN-NEXT: v_readlane_b32 s44, v45, 12 -; GCN-NEXT: v_readlane_b32 s43, v45, 11 -; GCN-NEXT: v_readlane_b32 s42, v45, 10 -; GCN-NEXT: v_readlane_b32 s41, v45, 9 -; GCN-NEXT: v_readlane_b32 s40, v45, 8 +; GCN-NEXT: s_or_b64 exec, exec, s[66:67] +; GCN-NEXT: v_readlane_b32 s81, v45, 25 +; GCN-NEXT: v_readlane_b32 s80, v45, 24 +; GCN-NEXT: v_readlane_b32 s71, v45, 23 +; GCN-NEXT: v_readlane_b32 s70, v45, 22 +; GCN-NEXT: v_readlane_b32 s69, v45, 21 +; GCN-NEXT: v_readlane_b32 s68, v45, 20 +; GCN-NEXT: v_readlane_b32 s67, v45, 19 +; GCN-NEXT: v_readlane_b32 s66, v45, 18 +; GCN-NEXT: v_readlane_b32 s65, v45, 17 +; GCN-NEXT: v_readlane_b32 s64, v45, 16 +; GCN-NEXT: v_readlane_b32 s55, v45, 15 +; GCN-NEXT: v_readlane_b32 s54, v45, 14 +; GCN-NEXT: v_readlane_b32 s53, v45, 13 +; GCN-NEXT: v_readlane_b32 s52, v45, 12 +; GCN-NEXT: v_readlane_b32 s51, v45, 11 +; GCN-NEXT: v_readlane_b32 s50, v45, 10 +; GCN-NEXT: v_readlane_b32 s49, v45, 9 +; GCN-NEXT: v_readlane_b32 s48, v45, 8 ; GCN-NEXT: v_readlane_b32 s39, v45, 7 ; GCN-NEXT: v_readlane_b32 s38, v45, 6 ; GCN-NEXT: v_readlane_b32 s37, v45, 5 diff --git a/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir b/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir index 8a0bf26f81d22..1e815f76ee149 100644 --- a/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir +++ b/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir @@ -37,100 +37,74 @@ body: | ; MUBUF-LABEL: name: use_restore_frame_reg ; MUBUF: bb.0: ; MUBUF-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; MUBUF-NEXT: liveins: $vgpr1, $vgpr2 + ; MUBUF-NEXT: liveins: $sgpr40, $sgpr41, $vgpr1 ; MUBUF-NEXT: {{ $}} - ; MUBUF-NEXT: $sgpr4 = COPY $sgpr33 + ; MUBUF-NEXT: $sgpr40 = frame-setup COPY $sgpr33 ; MUBUF-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; MUBUF-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc - ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 9961728, implicit-def dead $scc - ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.20, addrspace 5) - ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 - ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; MUBUF-NEXT: $sgpr41 = frame-setup COPY $sgpr34 ; MUBUF-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 11010048, implicit-def dead $scc ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; MUBUF-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; MUBUF-NEXT: $vgpr0 = V_ADD_U32_e32 8192, killed $vgpr0, implicit $exec - ; MUBUF-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; MUBUF-NEXT: $vgpr3 = V_ADD_U32_e32 155648, killed $vgpr3, implicit $exec - ; MUBUF-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; MUBUF-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; MUBUF-NEXT: $vgpr2 = V_ADD_U32_e32 155648, killed $vgpr2, implicit $exec + ; MUBUF-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; MUBUF-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc ; MUBUF-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.1: ; MUBUF-NEXT: successors: %bb.2(0x80000000) - ; MUBUF-NEXT: liveins: $vgpr2 + ; MUBUF-NEXT: liveins: $sgpr40, $sgpr41 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: S_NOP 0 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.2: - ; MUBUF-NEXT: liveins: $vgpr2 + ; MUBUF-NEXT: liveins: $sgpr40, $sgpr41 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; MUBUF-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 - ; MUBUF-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 - ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 9961728, implicit-def dead $scc - ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.20, addrspace 5) - ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; MUBUF-NEXT: $sgpr33 = COPY $sgpr4 + ; MUBUF-NEXT: $sgpr34 = frame-destroy COPY $sgpr41 + ; MUBUF-NEXT: $sgpr33 = frame-destroy COPY $sgpr40 ; MUBUF-NEXT: S_ENDPGM 0 ; ; FLATSCR-LABEL: name: use_restore_frame_reg ; FLATSCR: bb.0: ; FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; FLATSCR-NEXT: liveins: $vgpr1, $vgpr2 + ; FLATSCR-NEXT: liveins: $sgpr40, $sgpr41, $vgpr1 ; FLATSCR-NEXT: {{ $}} - ; FLATSCR-NEXT: $sgpr4 = COPY $sgpr33 + ; FLATSCR-NEXT: $sgpr40 = frame-setup COPY $sgpr33 ; FLATSCR-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; FLATSCR-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc - ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 155652, implicit-def dead $scc - ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.20, addrspace 5) - ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 - ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; FLATSCR-NEXT: $sgpr41 = frame-setup COPY $sgpr34 ; FLATSCR-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 172032, implicit-def dead $scc ; FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; FLATSCR-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; FLATSCR-NEXT: $sgpr33 = S_ADDC_U32 $sgpr33, 8192, implicit-def $scc, implicit $scc - ; FLATSCR-NEXT: S_BITCMP1_B32 $sgpr33, 0, implicit-def $scc - ; FLATSCR-NEXT: $sgpr33 = S_BITSET0_B32 0, $sgpr33 - ; FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; FLATSCR-NEXT: $sgpr33 = S_ADDC_U32 $sgpr33, -8192, implicit-def $scc, implicit $scc - ; FLATSCR-NEXT: S_BITCMP1_B32 $sgpr33, 0, implicit-def $scc - ; FLATSCR-NEXT: $sgpr33 = S_BITSET0_B32 0, $sgpr33 - ; FLATSCR-NEXT: $sgpr33 = S_ADDC_U32 $sgpr33, 155648, implicit-def $scc, implicit $scc - ; FLATSCR-NEXT: S_BITCMP1_B32 $sgpr33, 0, implicit-def $scc - ; FLATSCR-NEXT: $sgpr33 = S_BITSET0_B32 0, $sgpr33 - ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 - ; FLATSCR-NEXT: $sgpr33 = S_ADDC_U32 $sgpr33, -155648, implicit-def $scc, implicit $scc - ; FLATSCR-NEXT: S_BITCMP1_B32 $sgpr33, 0, implicit-def $scc - ; FLATSCR-NEXT: $sgpr33 = S_BITSET0_B32 0, $sgpr33 + ; FLATSCR-NEXT: $sgpr42 = S_ADDC_U32 $sgpr33, 8192, implicit-def $scc, implicit $scc + ; FLATSCR-NEXT: S_BITCMP1_B32 $sgpr42, 0, implicit-def $scc + ; FLATSCR-NEXT: $sgpr42 = S_BITSET0_B32 0, $sgpr42 + ; FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr42, implicit $exec + ; FLATSCR-NEXT: $sgpr42 = S_ADDC_U32 $sgpr33, 155648, implicit-def $scc, implicit $scc + ; FLATSCR-NEXT: S_BITCMP1_B32 $sgpr42, 0, implicit-def $scc + ; FLATSCR-NEXT: $sgpr42 = S_BITSET0_B32 0, $sgpr42 + ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 killed $sgpr42, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; FLATSCR-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc ; FLATSCR-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc ; FLATSCR-NEXT: {{ $}} ; FLATSCR-NEXT: bb.1: ; FLATSCR-NEXT: successors: %bb.2(0x80000000) - ; FLATSCR-NEXT: liveins: $vgpr2 + ; FLATSCR-NEXT: liveins: $sgpr40, $sgpr41 ; FLATSCR-NEXT: {{ $}} ; FLATSCR-NEXT: S_NOP 0 ; FLATSCR-NEXT: {{ $}} ; FLATSCR-NEXT: bb.2: - ; FLATSCR-NEXT: liveins: $vgpr2 + ; FLATSCR-NEXT: liveins: $sgpr40, $sgpr41 ; FLATSCR-NEXT: {{ $}} ; FLATSCR-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; FLATSCR-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 - ; FLATSCR-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 - ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 155652, implicit-def dead $scc - ; FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.20, addrspace 5) - ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; FLATSCR-NEXT: $sgpr33 = COPY $sgpr4 + ; FLATSCR-NEXT: $sgpr34 = frame-destroy COPY $sgpr41 + ; FLATSCR-NEXT: $sgpr33 = frame-destroy COPY $sgpr40 ; FLATSCR-NEXT: S_ENDPGM 0 bb.0: liveins: $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll index 5ced02f28c977..2ee62d13fcc51 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll @@ -35,34 +35,18 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX900-NEXT: v_writelane_b32 v63, s37, 1 ; GFX900-NEXT: v_writelane_b32 v63, s38, 2 ; GFX900-NEXT: v_writelane_b32 v63, s39, 3 -; GFX900-NEXT: v_writelane_b32 v63, s40, 4 -; GFX900-NEXT: v_writelane_b32 v63, s41, 5 -; GFX900-NEXT: v_writelane_b32 v63, s42, 6 -; GFX900-NEXT: v_writelane_b32 v63, s43, 7 -; GFX900-NEXT: v_writelane_b32 v63, s44, 8 -; GFX900-NEXT: v_writelane_b32 v63, s45, 9 -; GFX900-NEXT: v_writelane_b32 v63, s46, 10 -; GFX900-NEXT: v_writelane_b32 v63, s47, 11 -; GFX900-NEXT: v_writelane_b32 v63, s48, 12 -; GFX900-NEXT: v_writelane_b32 v63, s49, 13 -; GFX900-NEXT: v_writelane_b32 v63, s50, 14 -; GFX900-NEXT: v_writelane_b32 v63, s51, 15 -; GFX900-NEXT: v_writelane_b32 v63, s52, 16 -; GFX900-NEXT: v_writelane_b32 v63, s53, 17 -; GFX900-NEXT: v_writelane_b32 v63, s54, 18 -; GFX900-NEXT: v_writelane_b32 v63, s55, 19 -; GFX900-NEXT: v_writelane_b32 v63, s56, 20 -; GFX900-NEXT: v_writelane_b32 v63, s57, 21 -; GFX900-NEXT: v_writelane_b32 v63, s58, 22 -; GFX900-NEXT: v_writelane_b32 v63, s59, 23 -; GFX900-NEXT: v_writelane_b32 v63, s60, 24 -; GFX900-NEXT: v_writelane_b32 v63, s61, 25 -; GFX900-NEXT: v_writelane_b32 v63, s62, 26 -; GFX900-NEXT: v_writelane_b32 v63, s63, 27 -; GFX900-NEXT: v_writelane_b32 v63, s64, 28 -; GFX900-NEXT: v_writelane_b32 v63, s65, 29 -; GFX900-NEXT: v_writelane_b32 v63, s66, 30 -; GFX900-NEXT: v_writelane_b32 v63, s67, 31 +; GFX900-NEXT: v_writelane_b32 v63, s48, 4 +; GFX900-NEXT: v_writelane_b32 v63, s49, 5 +; GFX900-NEXT: v_writelane_b32 v63, s50, 6 +; GFX900-NEXT: v_writelane_b32 v63, s51, 7 +; GFX900-NEXT: v_writelane_b32 v63, s52, 8 +; GFX900-NEXT: v_writelane_b32 v63, s53, 9 +; GFX900-NEXT: v_writelane_b32 v63, s54, 10 +; GFX900-NEXT: v_writelane_b32 v63, s55, 11 +; GFX900-NEXT: v_writelane_b32 v63, s64, 12 +; GFX900-NEXT: v_writelane_b32 v63, s65, 13 +; GFX900-NEXT: v_writelane_b32 v63, s66, 14 +; GFX900-NEXT: v_writelane_b32 v63, s67, 15 ; GFX900-NEXT: v_mov_b32_e32 v33, v30 ; GFX900-NEXT: v_mov_b32_e32 v34, v29 ; GFX900-NEXT: v_mov_b32_e32 v35, v28 @@ -160,34 +144,18 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX900-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec ; GFX900-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: v_readlane_b32 s67, v63, 31 -; GFX900-NEXT: v_readlane_b32 s66, v63, 30 -; GFX900-NEXT: v_readlane_b32 s65, v63, 29 -; GFX900-NEXT: v_readlane_b32 s64, v63, 28 -; GFX900-NEXT: v_readlane_b32 s63, v63, 27 -; GFX900-NEXT: v_readlane_b32 s62, v63, 26 -; GFX900-NEXT: v_readlane_b32 s61, v63, 25 -; GFX900-NEXT: v_readlane_b32 s60, v63, 24 -; GFX900-NEXT: v_readlane_b32 s59, v63, 23 -; GFX900-NEXT: v_readlane_b32 s58, v63, 22 -; GFX900-NEXT: v_readlane_b32 s57, v63, 21 -; GFX900-NEXT: v_readlane_b32 s56, v63, 20 -; GFX900-NEXT: v_readlane_b32 s55, v63, 19 -; GFX900-NEXT: v_readlane_b32 s54, v63, 18 -; GFX900-NEXT: v_readlane_b32 s53, v63, 17 -; GFX900-NEXT: v_readlane_b32 s52, v63, 16 -; GFX900-NEXT: v_readlane_b32 s51, v63, 15 -; GFX900-NEXT: v_readlane_b32 s50, v63, 14 -; GFX900-NEXT: v_readlane_b32 s49, v63, 13 -; GFX900-NEXT: v_readlane_b32 s48, v63, 12 -; GFX900-NEXT: v_readlane_b32 s47, v63, 11 -; GFX900-NEXT: v_readlane_b32 s46, v63, 10 -; GFX900-NEXT: v_readlane_b32 s45, v63, 9 -; GFX900-NEXT: v_readlane_b32 s44, v63, 8 -; GFX900-NEXT: v_readlane_b32 s43, v63, 7 -; GFX900-NEXT: v_readlane_b32 s42, v63, 6 -; GFX900-NEXT: v_readlane_b32 s41, v63, 5 -; GFX900-NEXT: v_readlane_b32 s40, v63, 4 +; GFX900-NEXT: v_readlane_b32 s67, v63, 15 +; GFX900-NEXT: v_readlane_b32 s66, v63, 14 +; GFX900-NEXT: v_readlane_b32 s65, v63, 13 +; GFX900-NEXT: v_readlane_b32 s64, v63, 12 +; GFX900-NEXT: v_readlane_b32 s55, v63, 11 +; GFX900-NEXT: v_readlane_b32 s54, v63, 10 +; GFX900-NEXT: v_readlane_b32 s53, v63, 9 +; GFX900-NEXT: v_readlane_b32 s52, v63, 8 +; GFX900-NEXT: v_readlane_b32 s51, v63, 7 +; GFX900-NEXT: v_readlane_b32 s50, v63, 6 +; GFX900-NEXT: v_readlane_b32 s49, v63, 5 +; GFX900-NEXT: v_readlane_b32 s48, v63, 4 ; GFX900-NEXT: v_readlane_b32 s39, v63, 3 ; GFX900-NEXT: v_readlane_b32 s38, v63, 2 ; GFX900-NEXT: v_readlane_b32 s37, v63, 1 @@ -238,34 +206,18 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX906-NEXT: v_writelane_b32 v63, s37, 1 ; GFX906-NEXT: v_writelane_b32 v63, s38, 2 ; GFX906-NEXT: v_writelane_b32 v63, s39, 3 -; GFX906-NEXT: v_writelane_b32 v63, s40, 4 -; GFX906-NEXT: v_writelane_b32 v63, s41, 5 -; GFX906-NEXT: v_writelane_b32 v63, s42, 6 -; GFX906-NEXT: v_writelane_b32 v63, s43, 7 -; GFX906-NEXT: v_writelane_b32 v63, s44, 8 -; GFX906-NEXT: v_writelane_b32 v63, s45, 9 -; GFX906-NEXT: v_writelane_b32 v63, s46, 10 -; GFX906-NEXT: v_writelane_b32 v63, s47, 11 -; GFX906-NEXT: v_writelane_b32 v63, s48, 12 -; GFX906-NEXT: v_writelane_b32 v63, s49, 13 -; GFX906-NEXT: v_writelane_b32 v63, s50, 14 -; GFX906-NEXT: v_writelane_b32 v63, s51, 15 -; GFX906-NEXT: v_writelane_b32 v63, s52, 16 -; GFX906-NEXT: v_writelane_b32 v63, s53, 17 -; GFX906-NEXT: v_writelane_b32 v63, s54, 18 -; GFX906-NEXT: v_writelane_b32 v63, s55, 19 -; GFX906-NEXT: v_writelane_b32 v63, s56, 20 -; GFX906-NEXT: v_writelane_b32 v63, s57, 21 -; GFX906-NEXT: v_writelane_b32 v63, s58, 22 -; GFX906-NEXT: v_writelane_b32 v63, s59, 23 -; GFX906-NEXT: v_writelane_b32 v63, s60, 24 -; GFX906-NEXT: v_writelane_b32 v63, s61, 25 -; GFX906-NEXT: v_writelane_b32 v63, s62, 26 -; GFX906-NEXT: v_writelane_b32 v63, s63, 27 -; GFX906-NEXT: v_writelane_b32 v63, s64, 28 -; GFX906-NEXT: v_writelane_b32 v63, s65, 29 -; GFX906-NEXT: v_writelane_b32 v63, s66, 30 -; GFX906-NEXT: v_writelane_b32 v63, s67, 31 +; GFX906-NEXT: v_writelane_b32 v63, s48, 4 +; GFX906-NEXT: v_writelane_b32 v63, s49, 5 +; GFX906-NEXT: v_writelane_b32 v63, s50, 6 +; GFX906-NEXT: v_writelane_b32 v63, s51, 7 +; GFX906-NEXT: v_writelane_b32 v63, s52, 8 +; GFX906-NEXT: v_writelane_b32 v63, s53, 9 +; GFX906-NEXT: v_writelane_b32 v63, s54, 10 +; GFX906-NEXT: v_writelane_b32 v63, s55, 11 +; GFX906-NEXT: v_writelane_b32 v63, s64, 12 +; GFX906-NEXT: v_writelane_b32 v63, s65, 13 +; GFX906-NEXT: v_writelane_b32 v63, s66, 14 +; GFX906-NEXT: v_writelane_b32 v63, s67, 15 ; GFX906-NEXT: v_mov_b32_e32 v33, v30 ; GFX906-NEXT: v_mov_b32_e32 v34, v29 ; GFX906-NEXT: v_mov_b32_e32 v35, v28 @@ -363,34 +315,18 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX906-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec ; GFX906-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: v_readlane_b32 s67, v63, 31 -; GFX906-NEXT: v_readlane_b32 s66, v63, 30 -; GFX906-NEXT: v_readlane_b32 s65, v63, 29 -; GFX906-NEXT: v_readlane_b32 s64, v63, 28 -; GFX906-NEXT: v_readlane_b32 s63, v63, 27 -; GFX906-NEXT: v_readlane_b32 s62, v63, 26 -; GFX906-NEXT: v_readlane_b32 s61, v63, 25 -; GFX906-NEXT: v_readlane_b32 s60, v63, 24 -; GFX906-NEXT: v_readlane_b32 s59, v63, 23 -; GFX906-NEXT: v_readlane_b32 s58, v63, 22 -; GFX906-NEXT: v_readlane_b32 s57, v63, 21 -; GFX906-NEXT: v_readlane_b32 s56, v63, 20 -; GFX906-NEXT: v_readlane_b32 s55, v63, 19 -; GFX906-NEXT: v_readlane_b32 s54, v63, 18 -; GFX906-NEXT: v_readlane_b32 s53, v63, 17 -; GFX906-NEXT: v_readlane_b32 s52, v63, 16 -; GFX906-NEXT: v_readlane_b32 s51, v63, 15 -; GFX906-NEXT: v_readlane_b32 s50, v63, 14 -; GFX906-NEXT: v_readlane_b32 s49, v63, 13 -; GFX906-NEXT: v_readlane_b32 s48, v63, 12 -; GFX906-NEXT: v_readlane_b32 s47, v63, 11 -; GFX906-NEXT: v_readlane_b32 s46, v63, 10 -; GFX906-NEXT: v_readlane_b32 s45, v63, 9 -; GFX906-NEXT: v_readlane_b32 s44, v63, 8 -; GFX906-NEXT: v_readlane_b32 s43, v63, 7 -; GFX906-NEXT: v_readlane_b32 s42, v63, 6 -; GFX906-NEXT: v_readlane_b32 s41, v63, 5 -; GFX906-NEXT: v_readlane_b32 s40, v63, 4 +; GFX906-NEXT: v_readlane_b32 s67, v63, 15 +; GFX906-NEXT: v_readlane_b32 s66, v63, 14 +; GFX906-NEXT: v_readlane_b32 s65, v63, 13 +; GFX906-NEXT: v_readlane_b32 s64, v63, 12 +; GFX906-NEXT: v_readlane_b32 s55, v63, 11 +; GFX906-NEXT: v_readlane_b32 s54, v63, 10 +; GFX906-NEXT: v_readlane_b32 s53, v63, 9 +; GFX906-NEXT: v_readlane_b32 s52, v63, 8 +; GFX906-NEXT: v_readlane_b32 s51, v63, 7 +; GFX906-NEXT: v_readlane_b32 s50, v63, 6 +; GFX906-NEXT: v_readlane_b32 s49, v63, 5 +; GFX906-NEXT: v_readlane_b32 s48, v63, 4 ; GFX906-NEXT: v_readlane_b32 s39, v63, 3 ; GFX906-NEXT: v_readlane_b32 s38, v63, 2 ; GFX906-NEXT: v_readlane_b32 s37, v63, 1 @@ -440,34 +376,18 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX908-NEXT: v_writelane_b32 v62, s37, 1 ; GFX908-NEXT: v_writelane_b32 v62, s38, 2 ; GFX908-NEXT: v_writelane_b32 v62, s39, 3 -; GFX908-NEXT: v_writelane_b32 v62, s40, 4 -; GFX908-NEXT: v_writelane_b32 v62, s41, 5 -; GFX908-NEXT: v_writelane_b32 v62, s42, 6 -; GFX908-NEXT: v_writelane_b32 v62, s43, 7 -; GFX908-NEXT: v_writelane_b32 v62, s44, 8 -; GFX908-NEXT: v_writelane_b32 v62, s45, 9 -; GFX908-NEXT: v_writelane_b32 v62, s46, 10 -; GFX908-NEXT: v_writelane_b32 v62, s47, 11 -; GFX908-NEXT: v_writelane_b32 v62, s48, 12 -; GFX908-NEXT: v_writelane_b32 v62, s49, 13 -; GFX908-NEXT: v_writelane_b32 v62, s50, 14 -; GFX908-NEXT: v_writelane_b32 v62, s51, 15 -; GFX908-NEXT: v_writelane_b32 v62, s52, 16 -; GFX908-NEXT: v_writelane_b32 v62, s53, 17 -; GFX908-NEXT: v_writelane_b32 v62, s54, 18 -; GFX908-NEXT: v_writelane_b32 v62, s55, 19 -; GFX908-NEXT: v_writelane_b32 v62, s56, 20 -; GFX908-NEXT: v_writelane_b32 v62, s57, 21 -; GFX908-NEXT: v_writelane_b32 v62, s58, 22 -; GFX908-NEXT: v_writelane_b32 v62, s59, 23 -; GFX908-NEXT: v_writelane_b32 v62, s60, 24 -; GFX908-NEXT: v_writelane_b32 v62, s61, 25 -; GFX908-NEXT: v_writelane_b32 v62, s62, 26 -; GFX908-NEXT: v_writelane_b32 v62, s63, 27 -; GFX908-NEXT: v_writelane_b32 v62, s64, 28 -; GFX908-NEXT: v_writelane_b32 v62, s65, 29 -; GFX908-NEXT: v_writelane_b32 v62, s66, 30 -; GFX908-NEXT: v_writelane_b32 v62, s67, 31 +; GFX908-NEXT: v_writelane_b32 v62, s48, 4 +; GFX908-NEXT: v_writelane_b32 v62, s49, 5 +; GFX908-NEXT: v_writelane_b32 v62, s50, 6 +; GFX908-NEXT: v_writelane_b32 v62, s51, 7 +; GFX908-NEXT: v_writelane_b32 v62, s52, 8 +; GFX908-NEXT: v_writelane_b32 v62, s53, 9 +; GFX908-NEXT: v_writelane_b32 v62, s54, 10 +; GFX908-NEXT: v_writelane_b32 v62, s55, 11 +; GFX908-NEXT: v_writelane_b32 v62, s64, 12 +; GFX908-NEXT: v_writelane_b32 v62, s65, 13 +; GFX908-NEXT: v_writelane_b32 v62, s66, 14 +; GFX908-NEXT: v_writelane_b32 v62, s67, 15 ; GFX908-NEXT: v_mov_b32_e32 v33, v30 ; GFX908-NEXT: v_mov_b32_e32 v34, v29 ; GFX908-NEXT: v_mov_b32_e32 v35, v28 @@ -569,34 +489,18 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX908-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec ; GFX908-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 -; GFX908-NEXT: v_readlane_b32 s67, v62, 31 -; GFX908-NEXT: v_readlane_b32 s66, v62, 30 -; GFX908-NEXT: v_readlane_b32 s65, v62, 29 -; GFX908-NEXT: v_readlane_b32 s64, v62, 28 -; GFX908-NEXT: v_readlane_b32 s63, v62, 27 -; GFX908-NEXT: v_readlane_b32 s62, v62, 26 -; GFX908-NEXT: v_readlane_b32 s61, v62, 25 -; GFX908-NEXT: v_readlane_b32 s60, v62, 24 -; GFX908-NEXT: v_readlane_b32 s59, v62, 23 -; GFX908-NEXT: v_readlane_b32 s58, v62, 22 -; GFX908-NEXT: v_readlane_b32 s57, v62, 21 -; GFX908-NEXT: v_readlane_b32 s56, v62, 20 -; GFX908-NEXT: v_readlane_b32 s55, v62, 19 -; GFX908-NEXT: v_readlane_b32 s54, v62, 18 -; GFX908-NEXT: v_readlane_b32 s53, v62, 17 -; GFX908-NEXT: v_readlane_b32 s52, v62, 16 -; GFX908-NEXT: v_readlane_b32 s51, v62, 15 -; GFX908-NEXT: v_readlane_b32 s50, v62, 14 -; GFX908-NEXT: v_readlane_b32 s49, v62, 13 -; GFX908-NEXT: v_readlane_b32 s48, v62, 12 -; GFX908-NEXT: v_readlane_b32 s47, v62, 11 -; GFX908-NEXT: v_readlane_b32 s46, v62, 10 -; GFX908-NEXT: v_readlane_b32 s45, v62, 9 -; GFX908-NEXT: v_readlane_b32 s44, v62, 8 -; GFX908-NEXT: v_readlane_b32 s43, v62, 7 -; GFX908-NEXT: v_readlane_b32 s42, v62, 6 -; GFX908-NEXT: v_readlane_b32 s41, v62, 5 -; GFX908-NEXT: v_readlane_b32 s40, v62, 4 +; GFX908-NEXT: v_readlane_b32 s67, v62, 15 +; GFX908-NEXT: v_readlane_b32 s66, v62, 14 +; GFX908-NEXT: v_readlane_b32 s65, v62, 13 +; GFX908-NEXT: v_readlane_b32 s64, v62, 12 +; GFX908-NEXT: v_readlane_b32 s55, v62, 11 +; GFX908-NEXT: v_readlane_b32 s54, v62, 10 +; GFX908-NEXT: v_readlane_b32 s53, v62, 9 +; GFX908-NEXT: v_readlane_b32 s52, v62, 8 +; GFX908-NEXT: v_readlane_b32 s51, v62, 7 +; GFX908-NEXT: v_readlane_b32 s50, v62, 6 +; GFX908-NEXT: v_readlane_b32 s49, v62, 5 +; GFX908-NEXT: v_readlane_b32 s48, v62, 4 ; GFX908-NEXT: v_readlane_b32 s39, v62, 3 ; GFX908-NEXT: v_readlane_b32 s38, v62, 2 ; GFX908-NEXT: v_readlane_b32 s37, v62, 1 @@ -646,34 +550,18 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX90a-NEXT: v_writelane_b32 v63, s37, 1 ; GFX90a-NEXT: v_writelane_b32 v63, s38, 2 ; GFX90a-NEXT: v_writelane_b32 v63, s39, 3 -; GFX90a-NEXT: v_writelane_b32 v63, s40, 4 -; GFX90a-NEXT: v_writelane_b32 v63, s41, 5 -; GFX90a-NEXT: v_writelane_b32 v63, s42, 6 -; GFX90a-NEXT: v_writelane_b32 v63, s43, 7 -; GFX90a-NEXT: v_writelane_b32 v63, s44, 8 -; GFX90a-NEXT: v_writelane_b32 v63, s45, 9 -; GFX90a-NEXT: v_writelane_b32 v63, s46, 10 -; GFX90a-NEXT: v_writelane_b32 v63, s47, 11 -; GFX90a-NEXT: v_writelane_b32 v63, s48, 12 -; GFX90a-NEXT: v_writelane_b32 v63, s49, 13 -; GFX90a-NEXT: v_writelane_b32 v63, s50, 14 -; GFX90a-NEXT: v_writelane_b32 v63, s51, 15 -; GFX90a-NEXT: v_writelane_b32 v63, s52, 16 -; GFX90a-NEXT: v_writelane_b32 v63, s53, 17 -; GFX90a-NEXT: v_writelane_b32 v63, s54, 18 -; GFX90a-NEXT: v_writelane_b32 v63, s55, 19 -; GFX90a-NEXT: v_writelane_b32 v63, s56, 20 -; GFX90a-NEXT: v_writelane_b32 v63, s57, 21 -; GFX90a-NEXT: v_writelane_b32 v63, s58, 22 -; GFX90a-NEXT: v_writelane_b32 v63, s59, 23 -; GFX90a-NEXT: v_writelane_b32 v63, s60, 24 -; GFX90a-NEXT: v_writelane_b32 v63, s61, 25 -; GFX90a-NEXT: v_writelane_b32 v63, s62, 26 -; GFX90a-NEXT: v_writelane_b32 v63, s63, 27 -; GFX90a-NEXT: v_writelane_b32 v63, s64, 28 -; GFX90a-NEXT: v_writelane_b32 v63, s65, 29 -; GFX90a-NEXT: v_writelane_b32 v63, s66, 30 -; GFX90a-NEXT: v_writelane_b32 v63, s67, 31 +; GFX90a-NEXT: v_writelane_b32 v63, s48, 4 +; GFX90a-NEXT: v_writelane_b32 v63, s49, 5 +; GFX90a-NEXT: v_writelane_b32 v63, s50, 6 +; GFX90a-NEXT: v_writelane_b32 v63, s51, 7 +; GFX90a-NEXT: v_writelane_b32 v63, s52, 8 +; GFX90a-NEXT: v_writelane_b32 v63, s53, 9 +; GFX90a-NEXT: v_writelane_b32 v63, s54, 10 +; GFX90a-NEXT: v_writelane_b32 v63, s55, 11 +; GFX90a-NEXT: v_writelane_b32 v63, s64, 12 +; GFX90a-NEXT: v_writelane_b32 v63, s65, 13 +; GFX90a-NEXT: v_writelane_b32 v63, s66, 14 +; GFX90a-NEXT: v_writelane_b32 v63, s67, 15 ; GFX90a-NEXT: v_mov_b32_e32 v33, v30 ; GFX90a-NEXT: v_mov_b32_e32 v34, v29 ; GFX90a-NEXT: v_mov_b32_e32 v35, v28 @@ -771,34 +659,18 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX90a-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec ; GFX90a-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_readlane_b32 s67, v63, 31 -; GFX90a-NEXT: v_readlane_b32 s66, v63, 30 -; GFX90a-NEXT: v_readlane_b32 s65, v63, 29 -; GFX90a-NEXT: v_readlane_b32 s64, v63, 28 -; GFX90a-NEXT: v_readlane_b32 s63, v63, 27 -; GFX90a-NEXT: v_readlane_b32 s62, v63, 26 -; GFX90a-NEXT: v_readlane_b32 s61, v63, 25 -; GFX90a-NEXT: v_readlane_b32 s60, v63, 24 -; GFX90a-NEXT: v_readlane_b32 s59, v63, 23 -; GFX90a-NEXT: v_readlane_b32 s58, v63, 22 -; GFX90a-NEXT: v_readlane_b32 s57, v63, 21 -; GFX90a-NEXT: v_readlane_b32 s56, v63, 20 -; GFX90a-NEXT: v_readlane_b32 s55, v63, 19 -; GFX90a-NEXT: v_readlane_b32 s54, v63, 18 -; GFX90a-NEXT: v_readlane_b32 s53, v63, 17 -; GFX90a-NEXT: v_readlane_b32 s52, v63, 16 -; GFX90a-NEXT: v_readlane_b32 s51, v63, 15 -; GFX90a-NEXT: v_readlane_b32 s50, v63, 14 -; GFX90a-NEXT: v_readlane_b32 s49, v63, 13 -; GFX90a-NEXT: v_readlane_b32 s48, v63, 12 -; GFX90a-NEXT: v_readlane_b32 s47, v63, 11 -; GFX90a-NEXT: v_readlane_b32 s46, v63, 10 -; GFX90a-NEXT: v_readlane_b32 s45, v63, 9 -; GFX90a-NEXT: v_readlane_b32 s44, v63, 8 -; GFX90a-NEXT: v_readlane_b32 s43, v63, 7 -; GFX90a-NEXT: v_readlane_b32 s42, v63, 6 -; GFX90a-NEXT: v_readlane_b32 s41, v63, 5 -; GFX90a-NEXT: v_readlane_b32 s40, v63, 4 +; GFX90a-NEXT: v_readlane_b32 s67, v63, 15 +; GFX90a-NEXT: v_readlane_b32 s66, v63, 14 +; GFX90a-NEXT: v_readlane_b32 s65, v63, 13 +; GFX90a-NEXT: v_readlane_b32 s64, v63, 12 +; GFX90a-NEXT: v_readlane_b32 s55, v63, 11 +; GFX90a-NEXT: v_readlane_b32 s54, v63, 10 +; GFX90a-NEXT: v_readlane_b32 s53, v63, 9 +; GFX90a-NEXT: v_readlane_b32 s52, v63, 8 +; GFX90a-NEXT: v_readlane_b32 s51, v63, 7 +; GFX90a-NEXT: v_readlane_b32 s50, v63, 6 +; GFX90a-NEXT: v_readlane_b32 s49, v63, 5 +; GFX90a-NEXT: v_readlane_b32 s48, v63, 4 ; GFX90a-NEXT: v_readlane_b32 s39, v63, 3 ; GFX90a-NEXT: v_readlane_b32 s38, v63, 2 ; GFX90a-NEXT: v_readlane_b32 s37, v63, 1 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir b/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir index 4d6e33cf0b68a..b427c5bdd7229 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir @@ -1,8 +1,7 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa --start-before=si-lower-sgpr-spills --stop-after=prologepilog -o - %s | FileCheck %s # CHECK: csr_sgpr_spill -# CHECK: spillPhysVGPRs -# CHECK-NEXT: - '$vgpr0' +# CHECK-NOT: spillPhysVGPRs --- name: csr_sgpr_spill tracksRegLiveness: true