ASYNCMARK emits no hardware code; it is used only for tracking purposes. However, it was not marked as a meta-instruction, which caused getNumWaitStates to return 1 and GCNHazardRecognizer to incorrectly count it as a pipeline cycle. This patch marks ASYNCMARK as a meta-instruction so that it correctly reports 0 wait states. Fixes: #186878
116 lines
5.6 KiB
LLVM
116 lines
5.6 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s

; Raw-buffer variant taking the resource as <4 x i32>.
; Issues three async buffer loads to LDS with an asyncmark after each of the
; first two, then wait.asyncmark(1), then an ordinary float load from %lds.
; The expected output below pins that:
;   - each asyncmark shows up only as an assembly comment between the loads;
;   - a single `s_nop 0` separates the m0 write from the first LDS load;
;   - wait_asyncmark(1) is followed by `s_waitcnt vmcnt(2)` before ds_read_b32.
define float @raw.buffer.load(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
; CHECK-LABEL: raw.buffer.load:
; CHECK: ; %bb.0: ; %main_body
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 m0, s20
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:4 glc lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:8 slc lds
; CHECK-NEXT: v_mov_b32_e32 v0, s20
; CHECK-NEXT: ; wait_asyncmark(1)
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: ds_read_b32 v0, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
main_body:
  call void @llvm.amdgcn.raw.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.raw.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 4, i32 1)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.raw.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 8, i32 2)
  call void @llvm.amdgcn.wait.asyncmark(i16 1)
  %res = load float, ptr addrspace(3) %lds
  ret float %res
}

; Raw-buffer variant taking the resource as ptr addrspace(8).
; Issues three async buffer loads to LDS with an asyncmark after each of the
; first two, then wait.asyncmark(1), then an ordinary float load from %lds.
; The expected output below pins that:
;   - each asyncmark shows up only as an assembly comment between the loads;
;   - a single `s_nop 0` separates the m0 write from the first LDS load;
;   - wait_asyncmark(1) is followed by `s_waitcnt vmcnt(2)` before ds_read_b32.
define float @raw.ptr.buffer.load(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
; CHECK-LABEL: raw.ptr.buffer.load:
; CHECK: ; %bb.0: ; %main_body
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 m0, s20
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:4 glc lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:8 slc lds
; CHECK-NEXT: v_mov_b32_e32 v0, s20
; CHECK-NEXT: ; wait_asyncmark(1)
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: ds_read_b32 v0, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
main_body:
  call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 4, i32 1)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 8, i32 2)
  call void @llvm.amdgcn.wait.asyncmark(i16 1)
  %res = load float, ptr addrspace(3) %lds
  ret float %res
}

; Struct-buffer variant taking the resource as <4 x i32>; each load passes an
; extra constant i32 8 operand, and the expected output uses `v0 ... idxen`.
; Issues three async buffer loads to LDS with an asyncmark after each of the
; first two, then wait.asyncmark(1), then an ordinary float load from %lds.
; The expected output below pins that:
;   - each asyncmark shows up only as an assembly comment between the loads;
;   - `v_mov_b32_e32 v0, 8` is the only instruction between the m0 write and
;     the first LDS load (no s_nop is expected here);
;   - wait_asyncmark(1) is followed by `s_waitcnt vmcnt(2)` before ds_read_b32.
define float @struct.buffer.load(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
; CHECK-LABEL: struct.buffer.load:
; CHECK: ; %bb.0: ; %main_body
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 m0, s20
; CHECK-NEXT: v_mov_b32_e32 v0, 8
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen offset:4 glc lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen offset:8 slc lds
; CHECK-NEXT: v_mov_b32_e32 v0, s20
; CHECK-NEXT: ; wait_asyncmark(1)
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: ds_read_b32 v0, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
main_body:
  call void @llvm.amdgcn.struct.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.struct.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.struct.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2)
  call void @llvm.amdgcn.wait.asyncmark(i16 1)
  %res = load float, ptr addrspace(3) %lds
  ret float %res
}

; Struct-buffer variant taking the resource as ptr addrspace(8); each load
; passes an extra constant i32 8 operand, and the expected output uses
; `v0 ... idxen`.
; Issues three async buffer loads to LDS with an asyncmark after each of the
; first two, then wait.asyncmark(1), then an ordinary float load from %lds.
; The expected output below pins that:
;   - each asyncmark shows up only as an assembly comment between the loads;
;   - `v_mov_b32_e32 v0, 8` is the only instruction between the m0 write and
;     the first LDS load (no s_nop is expected here);
;   - wait_asyncmark(1) is followed by `s_waitcnt vmcnt(2)` before ds_read_b32.
define float @struct.ptr.buffer.load(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
; CHECK-LABEL: struct.ptr.buffer.load:
; CHECK: ; %bb.0: ; %main_body
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 m0, s20
; CHECK-NEXT: v_mov_b32_e32 v0, 8
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen offset:4 glc lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen offset:8 slc lds
; CHECK-NEXT: v_mov_b32_e32 v0, s20
; CHECK-NEXT: ; wait_asyncmark(1)
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: ds_read_b32 v0, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
main_body:
  call void @llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2)
  call void @llvm.amdgcn.wait.asyncmark(i16 1)
  %res = load float, ptr addrspace(3) %lds
  ret float %res
}