Non-entry functions were unconditionally aligned to 4 bytes with no architecture-specific preferred alignment, and setAlignment() was used instead of ensureAlignment(), overwriting any explicit IR attributes. Add instruction cache line size and fetch alignment data to GCNSubtarget for each generation (GFX9: 64B/32B, GFX10: 64B/4B, GFX11+: 128B/4B). Use this to call setPrefFunctionAlignment() in SITargetLowering, aligning non-entry functions to the cache line size by default. Change setAlignment to ensureAlignment in AMDGPUAsmPrinter so explicit IR align attributes are respected. Empirical thread trace analysis on gfx942, gfx1030, gfx1100, and gfx1200 showed that only GFX9 exhibits measurable fetch stalls when functions cross the 32-byte fetch window boundary. GFX10+ showed no alignment sensitivity. A hidden option -amdgpu-align-functions-for-fetch-only is provided to use the fetch granularity instead of cache line size. Assisted-by: Claude Opus
70 lines
3.1 KiB
LLVM
70 lines
3.1 KiB
LLVM
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END-ASM %s
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10END-OBJ %s
|
|
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END-ASM %s
|
|
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10NOEND %s
|
|
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10NOEND,GFX10NOEND-OBJ %s
|
|
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX11END-ASM %s
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1100 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX11END-OBJ %s
|
|
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX90AEND-ASM %s
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx90a --disassemble - | FileCheck -check-prefixes=GCN,GCN-OBJ,GFX90AEND-OBJ %s
|
|
|
|
; Test case: a_kernel1, an amdgpu_kernel whose body is only `ret void`.
; All runs expect the kernel label and, later, s_endpgm.
; Assembly runs additionally expect a .Lfunc_end label after s_endpgm and the
; matching .size directive on the line right after that label.
; Object (disassembly) runs expect s_nop 0 on the line directly after s_endpgm.
; The label pattern accepts an optional trailing '>' - presumably so the same
; check matches both the assembly label and the disassembler's <symbol> form
; (TODO confirm).
; GCN: a_kernel1{{>?}}:
|
|
; GCN: s_endpgm
|
|
; GCN-ASM: [[END_LABEL1:\.Lfunc_end.*]]:
|
|
; GCN-ASM-NEXT: .size a_kernel1, [[END_LABEL1]]-a_kernel1
|
|
|
|
; GCN-OBJ-NEXT: s_nop 0
|
|
|
|
define amdgpu_kernel void @a_kernel1() #0 {
|
|
ret void
|
|
}
|
|
|
|
; Test case: a_kernel2, a second amdgpu_kernel whose body is only `ret void`.
; All runs expect the kernel label and, later, s_endpgm.
; Assembly runs additionally expect a .Lfunc_end label after s_endpgm and the
; matching .size directive on the line right after that label.
; Object (disassembly) runs only expect an empty line somewhere after
; s_endpgm (the {{^$}} pattern matches a line with no characters).
; GCN: a_kernel2{{>?}}:
|
|
; GCN: s_endpgm
|
|
; GCN-ASM: [[END_LABEL2:\.Lfunc_end.*]]:
|
|
; GCN-ASM-NEXT: .size a_kernel2, [[END_LABEL2]]-a_kernel2
|
|
|
|
; GCN-OBJ: {{^$}}
|
|
|
|
define amdgpu_kernel void @a_kernel2() #0 {
|
|
ret void
|
|
}
|
|
|
|
; Test case: a_function, a non-kernel function (plain `define void`) whose
; body is only `ret void`.
;
; Assembly runs expect .globl, then a .p2align of 6 or 7 (2^6 = 64 or
; 2^7 = 128 bytes), then .type, the function label, s_setpc_b64, the
; .Lfunc_end label and the matching .size directive.
;
; After the function, the per-target "END" checks expect a padding block:
;   gfx1010: .p2alignl 6, then .fill of  48 four-byte words of 3214868480 (0xBF9F0000)
;   gfx1100: .p2alignl 7, then .fill of  96 four-byte words of 3214868480 (0xBF9F0000)
;   gfx90a : .p2alignl 6, then .fill of 256 four-byte words of 3212836864 (0xBF800000)
; In the disassembled object the padding is expected to appear as s_code_end
; (gfx1010 / gfx1100) or s_nop 0 (gfx90a).
;
; The object-side COUNT values follow the pattern (fill words - 1):
; 47 for 48, 95 for 96, 255 for 256. The gfx1100 count previously read 47,
; which only verified about half of the 96-word fill.
;
; NOTE(review): no RUN line visible in this file enables the GFX11NOEND or
; GFX11NOEND-OBJ prefixes, so those two -NOT checks are currently inert.
; GCN-ASM: .globl a_function
|
|
; GCN-ASM-NEXT: .p2align {{[67]}}
|
|
; GCN-ASM-NEXT: .type a_function,@function
|
|
|
|
; GCN-NEXT: a_function{{>?}}:
|
|
; GCN: s_setpc_b64
|
|
; GCN-ASM-NEXT: [[END_LABEL3:\.Lfunc_end.*]]:
|
|
; GCN-ASM-NEXT: .size a_function, [[END_LABEL3]]-a_function
|
|
; GFX10END-ASM: .p2alignl 6, 3214868480
|
|
; GFX11END-ASM: .p2alignl 7, 3214868480
|
|
; GFX90AEND-ASM: .p2alignl 6, 3212836864
|
|
; GFX10END-ASM-NEXT: .fill 48, 4, 3214868480
|
|
; GFX11END-ASM-NEXT: .fill 96, 4, 3214868480
|
|
; GFX90AEND-ASM-NEXT: .fill 256, 4, 3212836864
|
|
; GFX10NOEND-NOT: .fill
|
|
; GFX11NOEND-NOT: .fill
|
|
|
|
; GFX10NOEND-OBJ-NOT: s_code_end
|
|
; GFX10END-OBJ-NEXT: s_code_end
|
|
; GFX11NOEND-OBJ-NOT: s_code_end
|
|
; GFX11END-OBJ-NEXT: s_code_end
|
|
; GFX90AEND-OBJ-NEXT: s_nop 0
|
|
|
|
; GFX10END-OBJ: s_code_end // {{[0-9A-F]+}}:
|
|
; GFX10END-OBJ-COUNT-47: s_code_end
|
|
; GFX11END-OBJ: s_code_end // {{[0-9A-F]+}}:
|
|
; GFX11END-OBJ-COUNT-95: s_code_end
|
|
; GFX90AEND-OBJ: s_nop 0 // {{[0-9A-F]+}}:
|
|
; GFX90AEND-OBJ-COUNT-255: s_nop 0
|
|
|
|
define void @a_function() #0 {
|
|
ret void
|
|
}
|
|
|
|
; Attribute group #0, applied to every function defined in this test; it sets
; the "amdgpu-flat-work-group-size" string attribute to "1,512".
attributes #0 = { "amdgpu-flat-work-group-size"="1,512" }
|