Skip to content

Commit

Permalink
[AMDGPU] Change SGPR layout to striped caller/callee saved (llvm#127353)
Browse files Browse the repository at this point in the history
This PR updates the SGPR layout to a striped caller/callee-saved design,
similar to the VGPR layout.

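To ensure that s30-s31 (return address), s32 (stack pointer), s33 (frame
pointer), and s34 (base pointer) remain callee-saved, the striped layout
starts from s40, with a stripe width of 8. The last stripe is 10 wide instead
of 8 to avoid ending with a 2-wide stripe. Concretely, s30-s39 stay
callee-saved, s40-s47 are caller-saved, s48-s55 are callee-saved, and so on,
ending with the callee-saved stripe s96-s105.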

Fixes llvm#113782.
  • Loading branch information
shiltian authored Mar 8, 2025
1 parent e86c5a7 commit a779af3
Show file tree
Hide file tree
Showing 57 changed files with 9,334 additions and 13,577 deletions.
10 changes: 9 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,15 @@ def CSR_AMDGPU_AGPRs : CalleeSavedRegs<
>;

def CSR_AMDGPU_SGPRs : CalleeSavedRegs<
(sequence "SGPR%u", 30, 105)
// Ensure that s30-s31 (return address), s32 (stack pointer), s33 (frame pointer),
// and s34 (base pointer) are callee-saved. The striped layout starts from s40,
// with a stripe width of 8. The last stripe is 10 wide instead of 8, to avoid
// ending with a 2-wide stripe.
(add (sequence "SGPR%u", 30, 39),
(sequence "SGPR%u", 48, 55),
(sequence "SGPR%u", 64, 71),
(sequence "SGPR%u", 80, 87),
(sequence "SGPR%u", 96, 105))
>;

def CSR_AMDGPU_SI_Gfx_SGPRs : CalleeSavedRegs<
Expand Down
230 changes: 115 additions & 115 deletions llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll

Large diffs are not rendered by default.

335 changes: 92 additions & 243 deletions llvm/test/CodeGen/AMDGPU/bf16.ll

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -9,41 +9,41 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_addc_u32 s13, s13, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; CHECK-NEXT: s_load_dwordx8 s[36:43], s[8:9], 0x0
; CHECK-NEXT: s_load_dwordx8 s[48:55], s[8:9], 0x0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b32 s12, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_cmp_lg_u32 s40, 0
; CHECK-NEXT: s_cmp_lg_u32 s52, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB0_8
; CHECK-NEXT: ; %bb.1: ; %if.end13.i.i
; CHECK-NEXT: s_cmp_eq_u32 s42, 0
; CHECK-NEXT: s_cmp_eq_u32 s54, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB0_4
; CHECK-NEXT: ; %bb.2: ; %if.else251.i.i
; CHECK-NEXT: s_cmp_lg_u32 s43, 0
; CHECK-NEXT: s_cmp_lg_u32 s55, 0
; CHECK-NEXT: s_mov_b32 s17, 0
; CHECK-NEXT: s_cselect_b32 s12, -1, 0
; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s12
; CHECK-NEXT: s_cbranch_vccz .LBB0_5
; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: s_mov_b32 s36, 0
; CHECK-NEXT: s_mov_b32 s48, 0
; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12
; CHECK-NEXT: s_cbranch_vccz .LBB0_6
; CHECK-NEXT: s_branch .LBB0_7
; CHECK-NEXT: .LBB0_4:
; CHECK-NEXT: s_mov_b32 s14, s12
; CHECK-NEXT: s_mov_b32 s15, s12
; CHECK-NEXT: s_mov_b32 s13, s12
; CHECK-NEXT: s_mov_b64 s[38:39], s[14:15]
; CHECK-NEXT: s_mov_b64 s[36:37], s[12:13]
; CHECK-NEXT: s_mov_b64 s[50:51], s[14:15]
; CHECK-NEXT: s_mov_b64 s[48:49], s[12:13]
; CHECK-NEXT: s_branch .LBB0_7
; CHECK-NEXT: .LBB0_5: ; %if.then263.i.i
; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s41, 0
; CHECK-NEXT: s_mov_b32 s36, 1.0
; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s53, 0
; CHECK-NEXT: s_mov_b32 s48, 1.0
; CHECK-NEXT: s_mov_b32 s17, 0x7fc00000
; CHECK-NEXT: s_mov_b32 s37, s36
; CHECK-NEXT: s_mov_b32 s38, s36
; CHECK-NEXT: s_mov_b32 s39, s36
; CHECK-NEXT: s_mov_b32 s49, s48
; CHECK-NEXT: s_mov_b32 s50, s48
; CHECK-NEXT: s_mov_b32 s51, s48
; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12
; CHECK-NEXT: s_cbranch_vccnz .LBB0_7
; CHECK-NEXT: .LBB0_6: ; %if.end273.i.i
Expand All @@ -55,7 +55,7 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; CHECK-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0
; CHECK-NEXT: v_lshlrev_b32_e32 v3, 10, v1
; CHECK-NEXT: v_add_f32_e64 v1, s17, s36
; CHECK-NEXT: v_add_f32_e64 v1, s17, s48
; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
; CHECK-NEXT: s_mov_b64 s[8:9], s[12:13]
; CHECK-NEXT: s_mov_b32 s12, s14
Expand All @@ -65,13 +65,13 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s14, s16
; CHECK-NEXT: s_mov_b32 s36, 0
; CHECK-NEXT: s_mov_b32 s48, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19]
; CHECK-NEXT: s_mov_b64 s[8:9], s[34:35]
; CHECK-NEXT: s_mov_b32 s37, s36
; CHECK-NEXT: s_mov_b32 s38, s36
; CHECK-NEXT: s_mov_b32 s39, s36
; CHECK-NEXT: s_mov_b32 s49, s48
; CHECK-NEXT: s_mov_b32 s50, s48
; CHECK-NEXT: s_mov_b32 s51, s48
; CHECK-NEXT: .LBB0_7: ; %if.end294.i.i
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
Expand All @@ -80,11 +80,11 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CHECK-NEXT: .LBB0_8: ; %kernel_direct_lighting.exit
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x20
; CHECK-NEXT: v_mov_b32_e32 v0, s36
; CHECK-NEXT: v_mov_b32_e32 v0, s48
; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: v_mov_b32_e32 v1, s37
; CHECK-NEXT: v_mov_b32_e32 v2, s38
; CHECK-NEXT: v_mov_b32_e32 v3, s39
; CHECK-NEXT: v_mov_b32_e32 v1, s49
; CHECK-NEXT: v_mov_b32_e32 v2, s50
; CHECK-NEXT: v_mov_b32_e32 v3, s51
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; CHECK-NEXT: s_endpgm
Expand Down
Loading

0 comments on commit a779af3

Please sign in to comment.