Keep bf16/f16 values encoded as the low half of a 32-bit register, instead of promoting to float. This avoids unwanted FP effects from the fpext/fptrunc which should not be implied by just passing an argument. This also fixes ABI divergence between SelectionDAG and GlobalISel. I've wanted to make this change for ages, and failed the last few times. The main complication was the hack to return shader integer types in SGPRs, which now needs to inspect the underlying IR type.
202 lines
7.7 KiB
LLVM
202 lines
7.7 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
|
; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX11 %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX12 %s
|
|
|
|
define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) {
|
|
; GFX7-LABEL: raw_ptr_buffer_load_bf16:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_load_ushort v0, off, s[16:19], 0
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: raw_ptr_buffer_load_bf16:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_load_ushort v0, off, s[16:19], 0
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: raw_ptr_buffer_load_bf16:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_load_ushort v0, off, s[16:19], 0
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX10-LABEL: raw_ptr_buffer_load_bf16:
|
|
; GFX10: ; %bb.0:
|
|
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX10-NEXT: buffer_load_ushort v0, off, s[16:19], 0
|
|
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-LABEL: raw_ptr_buffer_load_bf16:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-LABEL: raw_ptr_buffer_load_bf16:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: buffer_load_u16 v0, off, s[0:3], null
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: s_set_pc_i64 s[30:31]
|
|
%val = call bfloat @llvm.amdgcn.raw.ptr.buffer.load.v2bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
|
|
ret bfloat %val
|
|
}
|
|
|
|
define <2 x bfloat> @raw_ptr_buffer_load_v2bf16(ptr addrspace(8) inreg %rsrc) {
|
|
; GFX7-LABEL: raw_ptr_buffer_load_v2bf16:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_load_dword v0, off, s[16:19], 0
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: raw_ptr_buffer_load_v2bf16:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_load_dword v0, off, s[16:19], 0
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: raw_ptr_buffer_load_v2bf16:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_load_dword v0, off, s[16:19], 0
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX10-LABEL: raw_ptr_buffer_load_v2bf16:
|
|
; GFX10: ; %bb.0:
|
|
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX10-NEXT: buffer_load_dword v0, off, s[16:19], 0
|
|
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-LABEL: raw_ptr_buffer_load_v2bf16:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-LABEL: raw_ptr_buffer_load_v2bf16:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: buffer_load_b32 v0, off, s[0:3], null
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: s_set_pc_i64 s[30:31]
|
|
%val = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v2bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
|
|
ret <2 x bfloat> %val
|
|
}
|
|
|
|
define <4 x bfloat> @raw_ptr_buffer_load_v4bf16(ptr addrspace(8) inreg %rsrc) {
|
|
; GFX7-LABEL: raw_ptr_buffer_load_v4bf16:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: raw_ptr_buffer_load_v4bf16:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: raw_ptr_buffer_load_v4bf16:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX10-LABEL: raw_ptr_buffer_load_v4bf16:
|
|
; GFX10: ; %bb.0:
|
|
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
|
|
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-LABEL: raw_ptr_buffer_load_v4bf16:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-LABEL: raw_ptr_buffer_load_v4bf16:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: s_set_pc_i64 s[30:31]
|
|
%val = call <4 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v4bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
|
|
ret <4 x bfloat> %val
|
|
}
|
|
|
|
; FIXME
|
|
; define <6 x bfloat> @raw_ptr_buffer_load_v6bf16(ptr addrspace(8) inreg %rsrc) {
|
|
; %val = call <6 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v6bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
|
|
; ret <6 x bfloat> %val
|
|
; }
|
|
|
|
define <8 x bfloat> @raw_ptr_buffer_load_v8bf16(ptr addrspace(8) inreg %rsrc) {
|
|
; GFX7-LABEL: raw_ptr_buffer_load_v8bf16:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: raw_ptr_buffer_load_v8bf16:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: raw_ptr_buffer_load_v8bf16:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX10-LABEL: raw_ptr_buffer_load_v8bf16:
|
|
; GFX10: ; %bb.0:
|
|
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
|
|
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-LABEL: raw_ptr_buffer_load_v8bf16:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-LABEL: raw_ptr_buffer_load_v8bf16:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: s_set_pc_i64 s[30:31]
|
|
%val = call <8 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v8bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
|
|
ret <8 x bfloat> %val
|
|
}
|