Files
llvm-project/llvm/test/CodeGen/AMDGPU/function-alignment.ll
michaelselehov cb3fbe921b [AMDGPU] Set preferred function alignment based on icache geometry (#183064)
Non-entry functions were unconditionally aligned to 4 bytes with no
architecture-specific preferred alignment, and setAlignment() was used
instead of ensureAlignment(), overwriting any explicit IR attributes.

Add instruction cache line size and fetch alignment data to GCNSubtarget
for each generation (GFX9: 64B/32B, GFX10: 64B/4B, GFX11+: 128B/4B). Use
this to call setPrefFunctionAlignment() in SITargetLowering, aligning
non-entry functions to the cache line size by default. Change
setAlignment to ensureAlignment in AMDGPUAsmPrinter so explicit IR align
attributes are respected.

Empirical thread trace analysis on gfx942, gfx1030, gfx1100, and gfx1200
showed that only GFX9 exhibits measurable fetch stalls when functions
cross the 32-byte fetch window boundary. GFX10+ showed no alignment
sensitivity. A hidden option -amdgpu-align-functions-for-fetch-only is
provided to use the fetch granularity instead of cache line size.

Assisted-by: Claude Opus
2026-03-11 07:57:37 -04:00

117 lines
4.0 KiB
LLVM

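; Test preferred alignment of non-entry functions on different AMDGPU
; architectures. By default the preferred alignment matches the instruction
; cache line size:
;
; GFX9 - cache line = 64B (.p2align 6)
; GFX10 - cache line = 64B (.p2align 6)
; GFX11 - cache line = 128B (.p2align 7)
; GFX12 - cache line = 128B (.p2align 7)
;
; Also covered below: optsize, explicit IR align, prefalign,
; -align-all-functions, and the unchanged 256B alignment of entry functions.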
; --- Default (cache line alignment) ---
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
; --- Optsize: alignment drops to minimum (Align(4) = .p2align 2) ---
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=OPTSIZE %s
; --- IR align attribute: ensureAlignment must not lower explicit alignment ---
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=EXPLICIT-ALIGN %s
; --- -align-all-functions=1 with optsize: verify floor at Align(4) ---
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -align-all-functions=1 < %s | FileCheck -check-prefix=ALIGN-ALL %s
; --- prefalign attribute: overrides target preferred alignment ---
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=PREFALIGN %s
; --- Entry function: 256B alignment unchanged ---
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=ENTRY %s
; Non-entry function: alignment matches the instruction cache line size
; (64B on GFX9/GFX10, 128B on GFX11/GFX12). The .p2align directive is checked
; on the line right after .globl, the same way the other functions in this
; file are checked, so the alignment is tied to this function's header rather
; than to whichever .p2align happens to precede the label.
define void @non_entry_func() {
; GFX9: .globl non_entry_func
; GFX9-NEXT: .p2align 6{{$}}
; GFX9: non_entry_func:
; GFX10: .globl non_entry_func
; GFX10-NEXT: .p2align 6{{$}}
; GFX10: non_entry_func:
; GFX11: .globl non_entry_func
; GFX11-NEXT: .p2align 7{{$}}
; GFX11: non_entry_func:
; GFX12: .globl non_entry_func
; GFX12-NEXT: .p2align 7{{$}}
; GFX12: non_entry_func:
ret void
}
; Non-entry function with optsize: the alignment drops to the minimum, but the
; function must still be at least Align(4), i.e. .p2align 2.
; The trailing {{$}} anchors the match at end of line so a multi-digit exponent
; starting with 2 cannot satisfy the check.
define void @optsize_func() optsize {
; OPTSIZE: .globl optsize_func
; OPTSIZE-NEXT: .p2align 2{{$}}
ret void
}
; Non-entry function with explicit IR align 128: ensureAlignment must not lower
; it. On gfx900 the default is 64 (cache line), so the 128 from IR must be
; preserved and the function is expected to be emitted with .p2align 7.
define void @explicit_align_func() align 128 {
; EXPLICIT-ALIGN: .globl explicit_align_func
; EXPLICIT-ALIGN-NEXT: .p2align 7{{$}}
ret void
}
; Non-entry function with explicit IR align 32 on gfx900 -- lower than the
; preferred alignment (64), so the preferred alignment wins. Result: .p2align 6.
; These checks reuse the GFX9 prefix, so only the default gfx900 invocation
; verifies them; they are matched after the non_entry_func checks for that
; prefix.
define void @low_align_func() align 32 {
; GFX9: .globl low_align_func
; GFX9-NEXT: .p2align 6{{$}}
ret void
}
; Optsize + -align-all-functions=1: MachineFunction::init sets Align(2), but
; ensureAlignment(4) in AsmPrinter restores the 4-byte floor.
; With optsize, getPreferredAlignment returns max(Align(1), Align(4)) = Align(4),
; so the expected directive is .p2align 2.
define void @align_all_optsize_func() optsize {
; ALIGN-ALL: .globl align_all_optsize_func
; ALIGN-ALL-NEXT: .p2align 2{{$}}
ret void
}
; prefalign(16) on gfx900 overrides the target preferred alignment (64) with 16:
; getPreferredAlignment uses prefalign directly instead of
; getPrefFunctionAlignment. Result: max(16, 4) = 16 -> .p2align 4.
; First of the three functions checked, in order, under the PREFALIGN prefix.
define void @prefalign_low_func() prefalign(16) {
; PREFALIGN: .globl prefalign_low_func
; PREFALIGN-NEXT: .p2align 4{{$}}
ret void
}
; prefalign(256) on gfx900 -- higher than the target preferred alignment (64),
; so the attribute raises the alignment above the default.
; Result: max(256, 4) = 256 -> .p2align 8.
define void @prefalign_high_func() prefalign(256) {
; PREFALIGN: .globl prefalign_high_func
; PREFALIGN-NEXT: .p2align 8{{$}}
ret void
}
; prefalign(2) on gfx900 -- below the 4-byte instruction alignment floor.
; ensureAlignment(4) in AsmPrinter guarantees the minimum, so the requested
; 2-byte preference is raised to 4.
; Result: max(2, 4) = 4 -> .p2align 2.
define void @prefalign_floor_func() prefalign(2) {
; PREFALIGN: .globl prefalign_floor_func
; PREFALIGN-NEXT: .p2align 2{{$}}
ret void
}
; Entry function (amdgpu_kernel calling convention): must stay 256B aligned
; (.p2align 8); the preferred alignment used for non-entry functions must not
; change it.
define amdgpu_kernel void @entry_func() {
; ENTRY: .globl entry_func
; ENTRY-NEXT: .p2align 8{{$}}
ret void
}