; PR note: `global_load_lds` and `buffer_load to lds` only increment `vmcnt`
; and do not touch `lgkmcnt`. This caused invalid `waitcnt`s for some Triton
; kernels, similar to the added lit tests. Note that the change for buffer ops
; is not necessary, i.e. the lit test passes even before this PR, because it
; seems like `SIInsertWaitcnts` does not use `LGKM_CNT` for buffer ops. But
; this change might prevent a bug in the future.
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 < %s | FileCheck --check-prefixes=GFX942 %s

; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG
; opcode. The RUN line uses -disable-separate-const-offset-from-gep to disable
; similar transformations in that pass.

; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
; GFX942-LABEL: global_load_ZTwoUses:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GFX942-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
  ; %voffset is used both for addressing and in the final add; the constant
  ; 24 is still folded into the load's immediate offset.
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
  %l = load i64, ptr addrspace(1) %gep1, align 8
  %r = add i64 %l, %voffset
  ret i64 %r
}
define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
; GFX942-LABEL: global_load_gep_add_reassoc:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GFX942-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
  ; base + (voffset + 24) is reassociated to (base + voffset) + 24 so the
  ; constant becomes the load's immediate offset.
  %add0 = add nuw nsw i64 %voffset, 24
  %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0
  %l = load i64, ptr addrspace(1) %gep0, align 8
  ret i64 %l
}

; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These
; would be folded away in most cases, but the index computation introduced by
; the legalization of wide vector stores can for example introduce them.
define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
; GFX942-LABEL: store_v16i32:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v2, s20
; GFX942-NEXT: v_mov_b32_e32 v3, s21
; GFX942-NEXT: v_mov_b32_e32 v4, s22
; GFX942-NEXT: v_mov_b32_e32 v5, s23
; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mov_b32_e32 v2, s16
; GFX942-NEXT: v_mov_b32_e32 v3, s17
; GFX942-NEXT: v_mov_b32_e32 v4, s18
; GFX942-NEXT: v_mov_b32_e32 v5, s19
; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mov_b32_e32 v2, s12
; GFX942-NEXT: v_mov_b32_e32 v3, s13
; GFX942-NEXT: v_mov_b32_e32 v4, s14
; GFX942-NEXT: v_mov_b32_e32 v5, s15
; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: v_mov_b32_e32 v3, s9
; GFX942-NEXT: v_mov_b32_e32 v4, s10
; GFX942-NEXT: v_mov_b32_e32 v5, s11
; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
  ; The wide store is legalized into four dwordx4 stores whose address
  ; computations fold into the 0/16/32/48 immediate offsets.
  store <16 x i32> %a, ptr addrspace(1) %out
  ret void
}

; Tests the (ptradd 0, x) -> x DAG combine.
define void @baseptr_null(i64 %offset, i8 %v) {
; GFX942-LABEL: baseptr_null:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: flat_store_byte v[0:1], v2
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
  ; The null base folds away, so %offset (v[0:1]) is used directly as the
  ; flat store address.
  %gep = getelementptr i8, ptr null, i64 %offset
  store i8 %v, ptr %gep, align 1
  ret void
}

; Taken from implicit-kernarg-backend-usage.ll, tests the PTRADD handling in the
; assertalign DAG combine.
define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
; GFX942-LABEL: llvm_amdgcn_queue_ptr:
; GFX942: ; %bb.0:
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: global_load_ubyte v1, v0, s[2:3] sc0 sc1
; GFX942-NEXT: global_load_ubyte v1, v0, s[4:5] offset:8 sc0 sc1
; GFX942-NEXT: global_load_ubyte v1, v0, s[0:1] sc0 sc1
; GFX942-NEXT: ; kill: killed $sgpr0_sgpr1
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-NEXT: ; kill: killed $sgpr2_sgpr3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_endpgm
  ; Volatile loads keep the implicit-kernarg pointers alive; the
  ; implicitarg load uses an immediate offset (offset:8) folded via PTRADD.
  %queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
  %dispatch.id = call i64 @llvm.amdgcn.dispatch.id()
  %queue.load = load volatile i8, ptr addrspace(4) %queue.ptr
  %implicitarg.load = load volatile i8, ptr addrspace(4) %implicitarg.ptr
  %dispatch.load = load volatile i8, ptr addrspace(4) %dispatch.ptr
  store volatile i64 %dispatch.id, ptr addrspace(1) %ptr
  ret void
}

; Taken from memcpy-param-combinations.ll, tests PTRADD handling in
; SelectionDAGAddressAnalysis.
define void @memcpy_p1_p4_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
; GFX942-LABEL: memcpy_p1_p4_sz16_align_1_1:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
entry:
  ; The 16-byte memcpy is expanded to a single dwordx4 load/store pair.
  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; Test skipping the lower-32-bit addition if it is unnecessary.
define ptr @huge_offset_low_32_unused(ptr %p) {
; GFX942-LABEL: huge_offset_low_32_unused:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_add_u32_e32 v1, 1, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
  ; Offset 0x100000000 has zero low 32 bits, so only the high half of the
  ; pointer needs an add.
  %gep = getelementptr inbounds i8, ptr %p, i64 u0x100000000
  ret ptr %gep
}

; Reassociate address computation if it leads to more scalar operations.
define amdgpu_kernel void @reassoc_scalar_r(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) {
; GFX942-LABEL: reassoc_scalar_r:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_add_u32 s2, s2, s6
; GFX942-NEXT: s_addc_u32 s3, s3, s7
; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
  ; The uniform part (p + soffset) is computed with scalar adds; only the
  ; divergent workitem offset uses VALU instructions.
  %voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
  %voffset = zext i32 %voffset32 to i64
  %offset = add nuw nsw i64 %voffset, %soffset
  %gep = getelementptr i8, ptr addrspace(1) %p, i64 %offset
  store ptr addrspace(1) %gep, ptr addrspace(1) %out
  ret void
}
define amdgpu_kernel void @reassoc_scalar_l(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) {
; GFX942-LABEL: reassoc_scalar_l:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_add_u32 s2, s2, s6
; GFX942-NEXT: s_addc_u32 s3, s3, s7
; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
  ; Same as reassoc_scalar_r but with the operands of the add swapped;
  ; the scalar reassociation must apply in both operand orders.
  %voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
  %voffset = zext i32 %voffset32 to i64
  %offset = add nuw nsw i64 %soffset, %voffset
  %gep = getelementptr i8, ptr addrspace(1) %p, i64 %offset
  store ptr addrspace(1) %gep, ptr addrspace(1) %out
  ret void
}

; Tests the target-specific (ptradd x, shl(0 - y, k)) -> sub(x, shl(y, k)) fold
define ptr addrspace(1) @shl_neg_offset(ptr addrspace(1) %p, i64 %noffset, i64 %shift) {
; GFX942-LABEL: shl_neg_offset:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX942-NEXT: s_setpc_b64 s[30:31]
  ; The negation is folded into the pointer arithmetic: shift first, then
  ; a 64-bit subtract instead of negate+shift+add.
  %offset = sub i64 0, %noffset
  %x = shl i64 %offset, %shift
  %gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %x
  ret ptr addrspace(1) %gep
}
; Struct with a byte-array member so GEPs into it produce non-trivial offsets.
%complextype = type { i64, [10 x i8], float }

@v0 = dso_local addrspace(1) global %complextype zeroinitializer

; Check that offsets are folded into global addresses if possible. For example,
; this is relevant when using --amdgpu-lower-module-lds-strategy=table.
define ptr addrspace(1) @complextype_global_gep(i64 %offset) {
; GFX942-LABEL: complextype_global_gep:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_getpc_b64 s[0:1]
; GFX942-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14
; GFX942-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22
; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
  ; The constant parts (field offset 8 plus the trailing +2) are folded
  ; into the relocation addends on the global address.
  %gep0 = getelementptr inbounds %complextype, ptr addrspace(1) @v0, i64 0, i32 1, i64 %offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2
  ret ptr addrspace(1) %gep1
}
%S = type <{ float, double }>

; Tests the tryFoldToMad64_32 PTRADD combine.
define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) {
; GFX942-LABEL: fold_mad64:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-NEXT: v_mov_b32_e32 v2, 1.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; GFX942-NEXT: global_store_dword v[0:1], v2, off
; GFX942-NEXT: s_endpgm
  ; p + voffset*12 is selected as a single v_mad_u64_u32.
  %voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
  %voffset = zext i32 %voffset32 to i64
  %p1 = getelementptr inbounds %S, ptr addrspace(1) %p, i64 %voffset, i32 0
  store float 1.0, ptr addrspace(1) %p1
  ret void
}

; Use non-zero shift amounts in v_lshl_add_u64.
define ptr @select_v_lshl_add_u64(ptr %base, i64 %voffset) {
; GFX942-LABEL: select_v_lshl_add_u64:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
  ; The i64-element GEP's *8 scaling becomes the shift amount (3) of
  ; v_lshl_add_u64.
  %gep = getelementptr inbounds i64, ptr %base, i64 %voffset
  ret ptr %gep
}

; Fold mul and add into v_mad, even if amdgpu-codegenprepare-mul24 turned the
; mul into a mul24.
define ptr @fold_mul24_into_mad(ptr %base, i64 %a, i64 %b) {
; GFX942-LABEL: fold_mul24_into_mad:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v2, 0xfffff, v2
; GFX942-NEXT: v_and_b32_e32 v3, 0xfffff, v4
; GFX942-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
  ; Both factors are masked to 20 bits, so the mul+ptradd still fold into
  ; one v_mad_u64_u32.
  %a_masked = and i64 %a, u0xfffff
  %b_masked = and i64 %b, u0xfffff
  %mul = mul i64 %a_masked, %b_masked
  %gep = getelementptr inbounds i8, ptr %base, i64 %mul
  ret ptr %gep
}

; Test PTRADD handling in AMDGPUDAGToDAGISel::SelectGlobalSAddr.
define amdgpu_kernel void @uniform_base_varying_offset_imm(ptr addrspace(1) %p) {
; GFX942-LABEL: uniform_base_varying_offset_imm:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX942-NEXT: v_mov_b32_e32 v1, 1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: global_store_dword v0, v1, s[0:1] offset:16
; GFX942-NEXT: s_endpgm
entry:
  ; Uniform base in SGPRs, divergent offset in a VGPR, constant 16 folded
  ; into the store's immediate offset.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %shift = shl i32 %tid, 2
  %voffset = zext i32 %shift to i64
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %voffset
  %gep2 = getelementptr inbounds i8, ptr addrspace(1) %gep1, i64 16
  store i32 1, ptr addrspace(1) %gep2
  ret void
}

; Adjusted from global-saddr-load.ll. Tests PTRADD handling in
; AMDGPUDAGToDAGISel::SelectSMRDBaseOffset.
define amdgpu_kernel void @global_load_saddr_i32_uniform_offset(ptr addrspace(1) %sbase, i32 %soffset, ptr addrspace(1) %r) {
; GFX942-LABEL: global_load_saddr_i32_uniform_offset:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NEXT: s_load_dword s6, s[4:5], 0x8
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dword s0, s[0:1], s6 offset:0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s0
; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
; GFX942-NEXT: s_endpgm
  ; The fully uniform address selects an SMRD load with an SGPR offset
  ; (s6) rather than a VALU address computation.
  %zext.offset = zext i32 %soffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %load = load i32, ptr addrspace(1) %gep0
  %to.vgpr = bitcast i32 %load to float
  store float %to.vgpr, ptr addrspace(1) %r
  ret void
}

; Adjusted from llvm.amdgcn.global.load.lds.ll, tests the offset lowering for
; Intrinsic::amdgcn_global_load_lds.
define void @global_load_lds_dword_saddr_and_vaddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr, i32 %voffset) {
; GFX942-LABEL: global_load_lds_dword_saddr_and_vaddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_readfirstlane_b32 s2, v0
; GFX942-NEXT: s_mov_b32 m0, s2
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_load_lds_dword v1, s[0:1] offset:48 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
main_body:
  ; The intrinsic's constant offset (48) must be folded into the
  ; global_load_lds immediate; the LDS destination goes through m0.
  %voffset.64 = zext i32 %voffset to i64
  %gep = getelementptr i8, ptr addrspace(1) %gptr, i64 %voffset.64
  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gep, ptr addrspace(3) %lptr, i32 4, i32 48, i32 16)
  ret void
}

; Taken from shl_add_ptr_global.ll, tests PTRADD handling in
; SITargetLowering::performSHLPtrCombine.
define void @shl_base_global_ptr_global_atomic_fadd(ptr addrspace(1) %out, ptr addrspace(1) %extra.use, ptr addrspace(1) %ptr) {
; GFX942-LABEL: shl_base_global_ptr_global_atomic_fadd:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_lshlrev_b64 v[0:1], 2, v[4:5]
; GFX942-NEXT: v_mov_b32_e32 v6, 0x42c80000
; GFX942-NEXT: global_atomic_add_f32 v[0:1], v6, off offset:512
; GFX942-NEXT: s_mov_b64 s[0:1], 0x80
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, s[0:1]
; GFX942-NEXT: global_store_dwordx2 v[2:3], v[0:1], off sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
  ; (ptr + 128) << 2 is combined so the shifted constant (512) becomes the
  ; atomic's immediate offset, while %cast keeps the unshifted address.
  %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 32
  %cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64
  %shl = shl i64 %cast, 2
  %castback = inttoptr i64 %shl to ptr addrspace(1)
  %unused = atomicrmw fadd ptr addrspace(1) %castback, float 100.0 syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
  store volatile i64 %cast, ptr addrspace(1) %extra.use, align 4
  ret void
}

; Test PTRADD handling in TargetLowering::SimplifyDemandedBits and
; TargetLowering::ShrinkDemandedOp.
define i32 @gep_in_const_as_cast_to_const32_as(ptr addrspace(4) %src, i64 %offset) {
; GFX942-LABEL: gep_in_const_as_cast_to_const32_as:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_add_u32_e32 v0, v0, v2
; GFX942-NEXT: s_mov_b32 s1, 0
; GFX942-NEXT: v_readfirstlane_b32 s0, v0
; GFX942-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, s0
; GFX942-NEXT: s_setpc_b64 s[30:31]
entry:
  ; The cast to the 32-bit const address space means only the low 32 bits
  ; of the ptradd are demanded, so a single 32-bit add suffices.
  %gep = getelementptr i8, ptr addrspace(4) %src, i64 %offset
  %gep.cast = addrspacecast ptr addrspace(4) %gep to ptr addrspace(6)
  %l = load i32, ptr addrspace(6) %gep.cast
  ret i32 %l
}
@CG = addrspace(4) constant [16 x i32] zeroinitializer, align 4

; Test PTRADD handling in isMemSrcFromConstant.
define void @replace_const0_memcpy_by_memset(ptr align 4 %dst) {
; GFX942-LABEL: replace_const0_memcpy_by_memset:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
entry:
  ; The source is a GEP into an all-zero constant, so the 8-byte memcpy is
  ; turned into a zero store with no load.
  %gep = getelementptr i8, ptr addrspace(4) @CG, i64 4
  tail call void @llvm.memcpy.p0.p4.i64(ptr noundef nonnull align 4 %dst, ptr addrspace(4) noundef nonnull align 4 %gep, i64 8, i1 false)
  ret void
}

; Check that ptradds can be lowered to disjoint ORs.
define ptr @gep_disjoint_or(ptr %base) {
; GFX942-LABEL: gep_disjoint_or:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_and_or_b32 v0, v0, -16, 4
; GFX942-NEXT: s_setpc_b64 s[30:31]
  ; s0xf0 sign-extends to -16, so the low 4 bits are cleared by the mask
  ; and adding 4 is a disjoint OR; both fold into one v_and_or_b32.
  %p = call ptr @llvm.ptrmask(ptr %base, i64 s0xf0)
  %gep = getelementptr nuw inbounds i8, ptr %p, i64 4
  ret ptr %gep
}

; Check that AssertAlign nodes between ptradd nodes don't block offset folding,
; taken from preload-implicit-kernargs.ll
define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) {
; GFX942-LABEL: random_incorrect_offset:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB21_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB21_0:
; GFX942-NEXT: s_load_dword s0, s[4:5], 0xa
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s0
; GFX942-NEXT: global_store_dword v0, v1, s[8:9]
; GFX942-NEXT: s_endpgm
  ; The +2 offset past the implicitarg pointer folds into the s_load
  ; immediate (0xa) despite the intervening AssertAlign node.
  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
  %load = load i32, ptr addrspace(4) %gep
  store i32 %load, ptr addrspace(1) %out
  ret void
}
declare void @llvm.memcpy.p0.p4.i64(ptr noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg)

; Empty metadata node referenced by the atomicrmw annotations above.
!0 = !{}
|