AMDGPU previously had no target-specific LSR cost model, so the generic
heuristic would often introduce extra induction variables and base-add chains
that hurt VALU throughput on GFX9+ (observed on gfx942). Implement a custom
cost model:

- isLSRCostLess(): prioritize per-iteration instruction count over setup
  costs, penalize IV multiplies, and demote register count. Pre-GFX9 falls
  back to the default comparator.
- getScalingFactorCost(): report that base+scale*index addressing requires
  an extra ADD instruction.
- isNumRegsMajorCostOfLSR(): return false.
- shouldDropLSRSolutionIfLessProfitable(): return true.

Assisted-by: Claude Opus
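As a concrete illustration of the comparator priorities described above, here
is a minimal, self-contained C++ sketch. The struct mirrors the field names of
llvm::TargetTransformInfo::LSRCost, but the exact signature and the tie-break
order of the middle fields are assumptions for illustration, not the patch
itself:

    #include <tuple>

    // Stand-in mirroring llvm::TargetTransformInfo::LSRCost's field names.
    struct LSRCost {
      unsigned Insns, NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ImmCost,
          SetupCost, ScaleCost;
    };

    // Lexicographic comparison encoding the commit's stated priorities:
    // per-iteration instruction count dominates, IV multiplies are penalized
    // next, and register count / one-time setup cost are demoted to the end.
    // The ordering of the middle fields is a guess for illustration.
    bool isLSRCostLess(const LSRCost &C1, const LSRCost &C2) {
      return std::tie(C1.Insns, C1.NumIVMuls, C1.NumBaseAdds, C1.ScaleCost,
                      C1.ImmCost, C1.NumRegs, C1.SetupCost) <
             std::tie(C2.Insns, C2.NumIVMuls, C2.NumBaseAdds, C2.ScaleCost,
                      C2.ImmCost, C2.NumRegs, C2.SetupCost);
    }

With shouldDropLSRSolutionIfLessProfitable() returning true, LSR discards its
solution entirely whenever the comparator ranks it worse than the initial
formulae, rather than committing a regression.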
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -o - %s | FileCheck %s
; Test for DS pure prefetch pattern: DS loads in loop are NOT used in the same
; iteration but in the next iteration (USE before DEF in program order via phi).
; Expected: "s_wait_dscnt 0" in preheader before loop entry.

define amdgpu_kernel void @ds_prefetch_pattern(ptr addrspace(3) %lds, ptr addrspace(1) %out, i32 %n) {
; CHECK-LABEL: ds_prefetch_pattern:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: s_load_b32 s1, s[4:5], 0x0 nv
; CHECK-NEXT: s_load_b32 s0, s[4:5], 0x10 nv
; CHECK-NEXT: v_and_b32_e32 v12, 0x3ff, v0
; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
; CHECK-NEXT: v_mov_b32_e32 v7, v4
; CHECK-NEXT: s_wait_kmcnt 0x0
; CHECK-NEXT: v_lshl_add_u32 v13, v12, 8, s1
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: ds_load_b128 v[8:11], v13
; CHECK-NEXT: ds_load_b128 v[0:3], v13 offset:16
; CHECK-NEXT: s_wait_dscnt 0x0
; CHECK-NEXT: .LBB0_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_barrier_signal -1
; CHECK-NEXT: s_add_co_i32 s1, s1, 1
; CHECK-NEXT: s_wait_dscnt 0x1
; CHECK-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[10:11]
; CHECK-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[8:9]
; CHECK-NEXT: v_lshl_add_u32 v14, s1, 5, v13
; CHECK-NEXT: s_cmp_lt_i32 s1, s0
; CHECK-NEXT: s_wait_dscnt 0x0
; CHECK-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
; CHECK-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[0:1]
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; CHECK-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[6:7]
; CHECK-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[4:5]
; CHECK-NEXT: s_barrier_wait -1
; CHECK-NEXT: ds_load_b128 v[8:11], v14
; CHECK-NEXT: ds_load_b128 v[0:3], v14 offset:16
; CHECK-NEXT: s_cbranch_scc1 .LBB0_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
; CHECK-NEXT: s_wait_kmcnt 0x0
; CHECK-NEXT: global_store_b128 v12, v[4:7], s[0:1] scale_offset
; CHECK-NEXT: s_endpgm
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %base = shl i32 %tid, 4

  ; Initial prefetch loads in preheader
  %ptr.init = getelementptr <4 x float>, ptr addrspace(3) %lds, i32 %base
  %init.v1 = load <4 x float>, ptr addrspace(3) %ptr.init, align 16
  %ptr.init2 = getelementptr <4 x float>, ptr addrspace(3) %lds, i32 %base, i32 4
  %init.v2 = load <4 x float>, ptr addrspace(3) %ptr.init2, align 16

  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %acc = phi <4 x float> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
  ; These hold values loaded in previous iteration (prefetch pattern)
  %prefetch1 = phi <4 x float> [ %init.v1, %entry ], [ %next.v1, %loop ]
  %prefetch2 = phi <4 x float> [ %init.v2, %entry ], [ %next.v2, %loop ]

  ; Use prefetched values from previous iteration
  %use1 = fadd <4 x float> %acc, %prefetch1
  %use2 = fadd <4 x float> %use1, %prefetch2

  ; Barrier
  call void @llvm.amdgcn.s.barrier()

  ; Compute next iteration's address
  %next.i = add i32 %i, 1
  %next.off = mul i32 %next.i, 2
  %next.base = add i32 %base, %next.off

  ; Prefetch loads for NEXT iteration (after barrier)
  ; These are NOT used in the same iteration - pure prefetch pattern
  %ptr.next1 = getelementptr <4 x float>, ptr addrspace(3) %lds, i32 %next.base
  %next.v1 = load <4 x float>, ptr addrspace(3) %ptr.next1, align 16

  %ptr.next2 = getelementptr <4 x float>, ptr addrspace(3) %lds, i32 %next.base, i32 4
  %next.v2 = load <4 x float>, ptr addrspace(3) %ptr.next2, align 16

  %acc.next = fadd <4 x float> %use2, %use2

  %i.next = add i32 %i, 1
  %cond = icmp slt i32 %i.next, %n
  br i1 %cond, label %loop, label %exit

exit:
  %out.ptr = getelementptr <4 x float>, ptr addrspace(1) %out, i32 %tid
  store <4 x float> %acc.next, ptr addrspace(1) %out.ptr, align 16
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare void @llvm.amdgcn.s.barrier()