; This test reflects the AtomicExpandPass change in which
; expandPartwordCmpXchg and insertRMWCmpXchgLoop:
;  - issue volatile operations in the emulation loops when the original
;    operation is volatile;
;  - make the preheader load (which initializes the "cmp" and "new" values of
;    the cmpxchg loop) atomic, gated by the target hook
;    issueAtomicInitLoadForAtomicEmulation() so backends can migrate
;    independently;
;  - call processAtomicInstr on that load so it can be lowered in
;    SelectionDAG / GlobalISel.
; The checks below were regenerated to match the resulting codegen.
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
; Volatile atomicrmw add i64 through a flat pointer at a constant offset;
; result unused, so targets with a native instruction emit the no-return form.
define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_add_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_add_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}

; Same as atomic_add_i64_offset, but the atomicrmw result is stored to %out2,
; so the returning (glc / TH_ATOMIC_RETURN) form of the instruction is used.
define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_add_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_add_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}

; Volatile atomicrmw add i64 with a dynamic 64-bit index plus a constant
; offset; result unused.
define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_add_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s0, s2
; GFX7-NEXT: s_addc_u32 s1, s1, s3
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_add_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s0, s2
; GFX8-NEXT: s_addc_u32 s1, s1, s3
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}

; Dynamic index plus constant offset, with the atomicrmw result stored out;
; exercises the returning form of the add instruction.
define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_add_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_add_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}

; Volatile atomicrmw add i64 directly on the pointer (no offset, no index);
; result unused.
define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_add_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_add_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}

; Volatile atomicrmw add i64 with the result stored to %out2; uses the
; returning form of the instruction.
define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_add_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_add_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}

; Volatile atomicrmw add i64 at a dynamically indexed element (no constant
; offset); result unused.
define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_add_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s0, s2
; GFX7-NEXT: s_addc_u32 s1, s1, s3
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_add_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s0, s2
; GFX8-NEXT: s_addc_u32 s1, s1, s3
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}

; Dynamically indexed volatile atomicrmw add i64 with the result stored out;
; exercises the returning form of the instruction.
define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_add_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_add_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}

; Volatile atomicrmw and i64: GFX7/GFX8 have no flat i64 and instruction, so
; AtomicExpandPass emits a cmpxchg loop whose preheader load is now issued as
; a (volatile) atomic load ("flat_load_dwordx2 ... glc"); GFX12 lowers
; directly to flat_atomic_and_b64.
define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_and_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, s3, v3
; GFX7-NEXT: v_and_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB8_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, s3, v3
; GFX8-NEXT: v_and_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB8_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}

; Same cmpxchg-loop expansion as atomic_and_i64_offset, but the atomicrmw
; result is stored to %out2 after the loop exits.
define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_and_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v3, s5, v5
; GFX7-NEXT: v_and_b32_e32 v2, s4, v4
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB9_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_and_b32_e32 v3, s5, v5
; GFX8-NEXT: v_and_b32_e32 v2, s4, v4
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB9_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}

; cmpxchg-loop expansion of volatile atomicrmw and i64 at a dynamic index plus
; constant offset; result unused.
define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_and_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, s3, v3
; GFX7-NEXT: v_and_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB10_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, s3, v3
; GFX8-NEXT: v_and_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB10_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}

; 64-bit volatile atomicrmw 'and' with returned value, dynamically indexed
; pointer plus constant offset. GFX7/GFX8 expand to a flat_atomic_cmpswap_x2
; loop seeded by a preheader flat_load (glc); GFX12 emits flat_atomic_and_b64
; with TH_ATOMIC_RETURN directly.
define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_and_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v3, s5, v5
; GFX7-NEXT: v_and_b32_e32 v2, s4, v4
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_and_b32_e32 v3, s5, v5
; GFX8-NEXT: v_and_b32_e32 v2, s4, v4
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; 64-bit volatile atomicrmw 'and', no returned value. GFX7/GFX8 expand to a
; flat_atomic_cmpswap_x2 loop seeded by a preheader flat_load (glc); GFX12
; emits flat_atomic_and_b64 directly.
define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_and_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, s3, v3
; GFX7-NEXT: v_and_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB12_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, s3, v3
; GFX8-NEXT: v_and_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB12_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
; 64-bit volatile atomicrmw 'and' with the old value stored to a second
; pointer. GFX7/GFX8 expand to a cmpxchg loop; GFX12 uses the native
; returning atomic (TH_ATOMIC_RETURN).
define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_and_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: v_and_b32_e32 v5, s5, v7
; GFX7-NEXT: v_and_b32_e32 v4, s4, v6
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_and_b32_e32 v5, s5, v7
; GFX8-NEXT: v_and_b32_e32 v4, s4, v6
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; 64-bit volatile atomicrmw 'and' on a dynamically indexed pointer, result
; unused. GFX7/GFX8 expand to a cmpxchg loop; GFX12 emits the native atomic.
define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_and_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, s3, v3
; GFX7-NEXT: v_and_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB14_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, s3, v3
; GFX8-NEXT: v_and_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
; 64-bit volatile atomicrmw 'and' on an indexed pointer with the old value
; stored out. GFX7/GFX8: cmpxchg loop; GFX12: native returning atomic.
define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_and_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v3, s5, v5
; GFX7-NEXT: v_and_b32_e32 v2, s4, v4
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB15_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_and_b32_e32 v3, s5, v5
; GFX8-NEXT: v_and_b32_e32 v2, s4, v4
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB15_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; 64-bit volatile atomicrmw 'sub' at a constant offset, result unused.
; GFX7/GFX8 expand to a cmpxchg loop (v_subrev/v_subb compute the new value);
; GFX12 emits flat_atomic_sub_u64 directly.
define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_sub_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2
; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB16_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB16_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
; 64-bit volatile atomicrmw 'sub' at a constant offset with the old value
; stored out. GFX7/GFX8: cmpxchg loop; GFX12: native returning atomic.
define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_sub_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v8, v3
; GFX7-NEXT: v_mov_b32_e32 v7, v2
; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7
; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB17_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v8, v3
; GFX8-NEXT: v_mov_b32_e32 v7, v2
; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7
; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB17_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; 64-bit volatile atomicrmw 'sub' on an indexed pointer plus constant offset,
; result unused. GFX7/GFX8: cmpxchg loop; GFX12: native atomic with offset:32.
define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_sub_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2
; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB18_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB18_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
; 64-bit volatile atomicrmw 'sub' on an indexed pointer plus constant offset,
; old value stored out. GFX7/GFX8: cmpxchg loop; GFX12: native returning atomic.
define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_sub_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v8, v3
; GFX7-NEXT: v_mov_b32_e32 v7, v2
; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7
; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB19_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v8, v3
; GFX8-NEXT: v_mov_b32_e32 v7, v2
; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7
; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB19_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; 64-bit volatile atomicrmw sub on a flat pointer, agent scope, no return use.
; GFX7/GFX8 expand to a cmpxchg loop seeded by a preheader flat load; GFX12 uses
; the native flat_atomic_sub_u64 instruction.
define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_sub_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2
; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB20_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB20_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
; Same as atomic_sub_i64 but the atomicrmw result is stored to %out2,
; so the cmpxchg-loop expansion keeps the returned old value live.
define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_sub_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v8, v1
; GFX7-NEXT: v_mov_b32_e32 v7, v0
; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7
; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB21_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v8, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v0
; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7
; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB21_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; atomicrmw sub with a register-indexed pointer (%out + %index), no return use.
define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_sub_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2
; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB22_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
; atomicrmw sub with an indexed pointer; the old value is stored to %out2.
define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_sub_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v8, v3
; GFX7-NEXT: v_mov_b32_e32 v7, v2
; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7
; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB23_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v8, v3
; GFX8-NEXT: v_mov_b32_e32 v7, v2
; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7
; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB23_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; Signed-max atomicrmw at a +32-byte offset, workgroup scope, no return use.
; GFX7/GFX8 emulate via a cndmask + cmpxchg loop; GFX12 uses flat_atomic_max_i64.
define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_max_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB24_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB24_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
; Signed-max atomicrmw at a +32-byte offset; old value is stored to %out2.
define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_max_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB25_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB25_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; Signed-max atomicrmw with indexed pointer plus +32-byte offset, no return use.
define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_max_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB26_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB26_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
; Signed-max atomicrmw with indexed pointer plus +32-byte offset;
; old value stored to %out2.
define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB27_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB27_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; Signed-max atomicrmw directly on %out, workgroup scope, no return use.
define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_max_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB28_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB28_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; Same as @atomic_max_i64 but the atomicrmw result is used (stored to %out2),
; so the expanded GFX7/GFX8 loop keeps the returned old value live and the
; GFX12 native atomic uses th:TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_max_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB29_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB29_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; atomicrmw volatile max on %out + %index (64-bit scaled address computed with
; s_lshl_b64 + s_add/s_addc). GFX7/GFX8 expand to the cmpxchg loop with a glc
; preheader load; GFX12 uses native flat_atomic_max_i64, result unused.
define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_max_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB30_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB30_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; Indexed variant of @atomic_max_i64_ret: 64-bit address computed from %index,
; atomicrmw volatile max, and the old value stored to %out2. GFX7/GFX8 use the
; expanded cmpxchg loop; GFX12 uses the native atomic with TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_max_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB31_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB31_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; atomicrmw volatile umax at %out + 4 (constant 32-byte offset). GFX7/GFX8 add
; the offset scalar-side and expand to the cmpxchg loop (unsigned compare
; v_cmp_lt_u64); GFX12 folds the offset into flat_atomic_max_u64 offset:32.
define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_umax_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB32_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB32_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; Value-returning variant of @atomic_umax_i64_offset: the old value produced by
; the umax atomic at %out + 4 is stored to %out2. GFX12 therefore uses
; th:TH_ATOMIC_RETURN and waits on loadcnt instead of storecnt.
define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_umax_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB33_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB33_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; atomicrmw volatile umax at %out + %index + 4: dynamic index scaled by 8 plus
; a constant 32-byte offset. GFX7/GFX8 do both additions scalar-side and expand
; to the cmpxchg loop; GFX12 keeps the native atomic with offset:32.
define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umax_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB34_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB34_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; Value-returning variant of @atomic_umax_i64_addr64_offset: the old value from
; the umax atomic at %out + %index + 4 is stored to %out2. GFX7/GFX8 use the
; expanded cmpxchg loop; GFX12 uses the native atomic with TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB35_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB35_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; atomicrmw volatile umax directly on %out (no offset, result unused).
; Unsigned twin of @atomic_max_i64: GFX7/GFX8 loop uses v_cmp_lt_u64 and the
; glc preheader load; GFX12 lowers to native flat_atomic_max_u64.
define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_umax_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB36_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB36_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; 64-bit flat atomicrmw umax with used result. GFX7/GFX8 expand to a
; cmpxchg loop (flat_atomic_cmpswap_x2) seeded by a flat_load_dwordx2 glc
; preheader load; GFX12 selects native flat_atomic_max_u64 with
; TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_umax_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB37_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB37_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; 64-bit flat atomicrmw umax on a GEP-indexed address, result unused.
; GFX7/GFX8 expand to a cmpxchg loop; GFX12 selects flat_atomic_max_u64.
define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umax_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB38_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB38_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
; 64-bit flat atomicrmw umax on a GEP-indexed address with used result.
; GFX7/GFX8 expand to a cmpxchg loop; GFX12 selects flat_atomic_max_u64
; with TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umax_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB39_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB39_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; 64-bit flat atomicrmw (signed) min at a constant +32 byte offset,
; result unused. GFX7/GFX8 expand to a cmpxchg loop; GFX12 selects
; flat_atomic_min_i64 with the folded offset:32.
define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_min_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB40_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB40_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
; 64-bit flat atomicrmw (signed) min at +32 bytes with used result.
; GFX7/GFX8 expand to a cmpxchg loop; GFX12 selects flat_atomic_min_i64
; offset:32 with TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_min_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB41_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB41_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; 64-bit flat atomicrmw (signed) min on a GEP-indexed address plus +32
; bytes, result unused. GFX7/GFX8 expand to a cmpxchg loop; GFX12 folds
; the constant into flat_atomic_min_i64 offset:32.
define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_min_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB42_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB42_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
; 64-bit flat atomicrmw (signed) min on a GEP-indexed address plus +32
; bytes with used result. GFX7/GFX8 expand to a cmpxchg loop; GFX12
; selects flat_atomic_min_i64 offset:32 with TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB43_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB43_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; 64-bit flat atomicrmw (signed) min, result unused. GFX7/GFX8 expand to
; a cmpxchg loop seeded by a flat_load_dwordx2 glc preheader load; GFX12
; selects native flat_atomic_min_i64.
define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_min_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB44_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB44_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
; Volatile signed-min atomicrmw that also uses the returned old value.
; GFX7/GFX8 emulate it with a cmpxchg loop; note the preheader
; 'flat_load_dwordx2 ... glc' seeding the loop (glc because the atomicrmw is
; volatile). GFX12 lowers to native flat_atomic_min_i64 with TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_min_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB45_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB45_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; Volatile signed-min atomicrmw at a register-indexed address, result unused.
; GFX7/GFX8: scalar address math (s_lshl_b64 + s_add/s_addc) then a cmpxchg
; emulation loop seeded by a glc preheader load. GFX12: native
; flat_atomic_min_i64 with no return flag.
define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_min_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB46_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB46_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
; Volatile signed-min atomicrmw at an indexed address whose old value is
; stored to %out2. GFX7/GFX8: cmpxchg emulation loop with glc preheader load;
; GFX12: native flat_atomic_min_i64 with TH_ATOMIC_RETURN, then flat_store_b64.
define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_min_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB47_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB47_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; Volatile unsigned-min atomicrmw at a constant +32-byte offset, result
; unused. GFX7/GFX8 add the offset with scalar adds and run a cmpxchg loop
; (note v_cmp_ge_u64 — unsigned compare — vs v_cmp_ge_i64 in the signed min
; tests); GFX12 folds the offset into flat_atomic_min_u64 offset:32.
define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_umin_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB48_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB48_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
; Volatile unsigned-min atomicrmw at +32 bytes whose old value is stored to
; %out2. GFX7/GFX8: cmpxchg emulation loop with glc preheader load; GFX12:
; native flat_atomic_min_u64 offset:32 with TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_umin_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB49_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB49_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; Volatile unsigned-min atomicrmw at index*8 + 32 bytes, result unused.
; GFX7/GFX8 do both additions with scalar ALU then run the cmpxchg emulation
; loop; GFX12 folds the constant part into the instruction's offset:32 field.
define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umin_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB50_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB50_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
; Volatile unsigned-min atomicrmw at index*8 + 32 bytes with the old value
; stored to %out2. GFX7/GFX8: cmpxchg emulation loop seeded by a glc
; preheader load; GFX12: native flat_atomic_min_u64 offset:32 with
; TH_ATOMIC_RETURN, then flat_store_b64.
define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umin_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB51_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB51_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; Baseline volatile unsigned-min atomicrmw on %out directly, result unused.
; GFX7/GFX8: cmpxchg emulation loop with glc preheader load; GFX12: native
; flat_atomic_min_u64 without return.
define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_umin_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB52_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umin_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB52_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
; Volatile unsigned-min atomicrmw on %out whose old value is stored to %out2.
; GFX7/GFX8: cmpxchg emulation loop with glc preheader load; GFX12: native
; flat_atomic_min_u64 with TH_ATOMIC_RETURN, then flat_store_b64.
define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_umin_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB53_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB53_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; 64-bit atomicrmw volatile umin, syncscope("workgroup") seq_cst, on a flat
; pointer indexed by %index; result unused. GFX7/GFX8 expand to a
; flat_atomic_cmpswap_x2 loop whose preheader flat_load_dwordx2 carries glc
; (presumably the atomic/volatile init load from the cmpxchg expansion —
; confirm against AtomicExpandPass); GFX12 selects flat_atomic_min_u64 with
; scope:SCOPE_SE. No buffer_wbinvl1_vol in the loop at workgroup scope.
define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umin_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB54_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB54_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; Same as atomic_umin_i64_addr64 but the atomicrmw result is stored to %out2,
; so the expansion must keep the loaded value live: GFX7/GFX8 use the glc
; preheader load to seed the cmpxchg loop and store v[2:3] after
; %atomicrmw.end; GFX12 uses flat_atomic_min_u64 with th:TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umin_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB55_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB55_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; 64-bit atomicrmw volatile or, syncscope("agent") seq_cst, at a constant
; +32-byte offset; result unused. GFX7/GFX8 expand to a cmpxchg loop with a
; glc preheader load and buffer_wbinvl1_vol inside the loop (agent-scope
; acquire); GFX12 folds the offset into flat_atomic_or_b64 offset:32 with
; scope:SCOPE_DEV plus global_inv.
define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_or_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_or_b32_e32 v1, s3, v3
; GFX7-NEXT: v_or_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB56_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v1, s3, v3
; GFX8-NEXT: v_or_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB56_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; Returning variant of atomic_or_i64_offset: the old value is kept in v[2:3]
; across the GFX7/GFX8 cmpxchg loop and stored to %out2 after %atomicrmw.end;
; GFX12 uses flat_atomic_or_b64 ... th:TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_or_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_or_b32_e32 v3, s5, v5
; GFX7-NEXT: v_or_b32_e32 v2, s4, v4
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB57_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_or_b32_e32 v3, s5, v5
; GFX8-NEXT: v_or_b32_e32 v2, s4, v4
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB57_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; atomicrmw volatile or at agent scope through a register-indexed pointer
; plus a constant +32 offset; result unused. GFX7/GFX8 materialize the full
; address, then run the cmpxchg loop (glc preheader load, buffer_wbinvl1_vol);
; GFX12 keeps the +32 as an instruction offset on flat_atomic_or_b64.
define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_or_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_or_b32_e32 v1, s3, v3
; GFX7-NEXT: v_or_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB58_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v1, s3, v3
; GFX8-NEXT: v_or_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB58_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; Returning variant of atomic_or_i64_addr64_offset: old value is carried in
; v[2:3] through the GFX7/GFX8 cmpxchg loop and stored to %out2 after
; %atomicrmw.end; GFX12 uses flat_atomic_or_b64 ... th:TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_or_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_or_b32_e32 v3, s5, v5
; GFX7-NEXT: v_or_b32_e32 v2, s4, v4
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB59_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_or_b32_e32 v3, s5, v5
; GFX8-NEXT: v_or_b32_e32 v2, s4, v4
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB59_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; Base case: atomicrmw volatile or i64 directly on %out, agent scope seq_cst,
; result unused. GFX7/GFX8 cmpxchg-loop expansion (glc preheader load,
; buffer_wbinvl1_vol in the loop); GFX12 single flat_atomic_or_b64
; scope:SCOPE_DEV plus global_inv.
define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_or_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_or_b32_e32 v1, s3, v3
; GFX7-NEXT: v_or_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB60_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v1, s3, v3
; GFX8-NEXT: v_or_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB60_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; Returning variant of atomic_or_i64: old value survives the GFX7/GFX8
; cmpxchg loop in v[0:1] and is stored to %out2 after %atomicrmw.end;
; GFX12 uses flat_atomic_or_b64 ... th:TH_ATOMIC_RETURN scope:SCOPE_DEV.
define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_or_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: v_or_b32_e32 v5, s5, v7
; GFX7-NEXT: v_or_b32_e32 v4, s4, v6
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB61_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_or_b32_e32 v5, s5, v7
; GFX8-NEXT: v_or_b32_e32 v4, s4, v6
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB61_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; atomicrmw volatile or i64, agent scope seq_cst, through a register-indexed
; flat pointer; result unused. GFX7/GFX8 add the scaled index on the SALU,
; then expand to the cmpxchg loop (glc preheader load, buffer_wbinvl1_vol);
; GFX12 selects flat_atomic_or_b64 scope:SCOPE_DEV plus global_inv.
define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_or_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_or_b32_e32 v1, s3, v3
; GFX7-NEXT: v_or_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB62_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v1, s3, v3
; GFX8-NEXT: v_or_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB62_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_or_i64_ret_addr64:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT:    s_add_u32 s0, s0, s6
; GFX7-NEXT:    s_addc_u32 s1, s1, s7
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:  .LBB63_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v5, v3
; GFX7-NEXT:    v_mov_b32_e32 v4, v2
; GFX7-NEXT:    v_or_b32_e32 v3, s5, v5
; GFX7-NEXT:    v_or_b32_e32 v2, s4, v4
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT:    s_cbranch_execnz .LBB63_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_or_i64_ret_addr64:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT:    s_add_u32 s0, s0, s6
; GFX8-NEXT:    s_addc_u32 s1, s1, s7
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_mov_b64 s[0:1], 0
; GFX8-NEXT:  .LBB63_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v5, v3
; GFX8-NEXT:    v_mov_b32_e32 v4, v2
; GFX8-NEXT:    v_or_b32_e32 v3, s5, v5
; GFX8-NEXT:    v_or_b32_e32 v2, s4, v4
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT:    s_cbranch_execnz .LBB63_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT:    s_endpgm
entry:
  ; Index into the i64 array; the scaled add shows up as s_lshl_b64/s_add in the checks.
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  ; Volatile 64-bit 'or' whose result is used. Per the checks, GFX7/GFX8 lower
  ; this as a flat cmpxchg loop; the preheader flat_load carries 'glc' because
  ; the original atomicrmw is volatile (NOTE(review): per the emulation-loop
  ; volatility change — confirm against AtomicExpandPass). GFX12 selects
  ; flat_atomic_or_b64 with TH_ATOMIC_RETURN directly.
  %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ; The loop/instruction result is stored so the returned value is not dead.
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_xchg_i64_offset:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 32
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_xchg_i64_offset:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_add_u32 s0, s0, 32
; GFX8-NEXT:    s_addc_u32 s1, s1, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_offset:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    s_endpgm
entry:
  ; Fifth i64 element => constant byte offset of 32 from %out.
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  ; Volatile 64-bit swap, result unused. All targets select a flat atomic swap;
  ; GFX7/GFX8 materialize the +32 with scalar adds, GFX12 folds it into the
  ; instruction's offset:32 field (see checks above).
  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
; GFX7-LABEL: atomic_xchg_f64_offset:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 32
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_xchg_f64_offset:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_add_u32 s0, s0, 32
; GFX8-NEXT:    s_addc_u32 s1, s1, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_xchg_f64_offset:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    s_endpgm
entry:
  ; double variant of the offset swap test: identical codegen to the i64 case
  ; because xchg is type-agnostic at 64 bits (checks match atomic_xchg_i64_offset).
  %gep = getelementptr inbounds double, ptr %out, i64 4
  %tmp0 = atomicrmw volatile xchg ptr %gep, double %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
; GFX7-LABEL: atomic_xchg_pointer_offset:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 32
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_xchg_pointer_offset:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_add_u32 s0, s0, 32
; GFX8-NEXT:    s_addc_u32 s1, s1, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_xchg_pointer_offset:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    s_endpgm
entry:
  ; Pointer-typed variant: a flat ptr is 64 bits here, so codegen is the same
  ; 64-bit swap as the i64/f64 offset tests (checks are identical).
  %gep = getelementptr inbounds ptr, ptr %out, i32 4
  %val = atomicrmw volatile xchg ptr %gep, ptr %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_xchg_i64_ret_offset:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    s_add_u32 s0, s0, 32
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_mov_b32_e32 v1, s7
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_xchg_i64_ret_offset:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    s_add_u32 s0, s0, 32
; GFX8-NEXT:    s_addc_u32 s1, s1, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_mov_b32_e32 v3, s3
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_offset:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT:    s_endpgm
entry:
  ; Same +32 offset as atomic_xchg_i64_offset, but the old value is used.
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  ; Returned form: GFX7/GFX8 use 'glc' on the swap to get the old value back,
  ; GFX12 uses th:TH_ATOMIC_RETURN (see checks).
  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ; Store keeps the atomic's result live.
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xchg_i64_addr64_offset:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_lshl_b64 s[2:3], s[4:5], 3
; GFX7-NEXT:    s_add_u32 s0, s0, s2
; GFX7-NEXT:    s_addc_u32 s1, s1, s3
; GFX7-NEXT:    s_add_u32 s0, s0, 32
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_xchg_i64_addr64_offset:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    s_lshl_b64 s[2:3], s[4:5], 3
; GFX8-NEXT:    s_add_u32 s0, s0, s2
; GFX8-NEXT:    s_addc_u32 s1, s1, s3
; GFX8-NEXT:    s_add_u32 s0, s0, 32
; GFX8-NEXT:    s_addc_u32 s1, s1, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_addr64_offset:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    flat_atomic_swap_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    s_endpgm
entry:
  ; Dynamic index plus constant offset: GFX7/GFX8 add both with scalar ALU ops;
  ; GFX12 folds only the constant 32 into the instruction offset field.
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  ; Volatile 64-bit swap, result unused (no glc / TH_ATOMIC_RETURN in checks).
  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT:    s_add_u32 s0, s0, s4
; GFX7-NEXT:    s_addc_u32 s1, s1, s5
; GFX7-NEXT:    s_add_u32 s0, s0, 32
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT:    s_add_u32 s0, s0, s4
; GFX8-NEXT:    s_addc_u32 s1, s1, s5
; GFX8-NEXT:    s_add_u32 s0, s0, 32
; GFX8-NEXT:    s_addc_u32 s1, s1, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_mov_b32_e32 v3, s3
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT:    s_endpgm
entry:
  ; Dynamic index plus constant +32 offset, returned form.
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  ; Old value is needed: GFX7/GFX8 use 'glc', GFX12 uses TH_ATOMIC_RETURN.
  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_xchg_i64:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_xchg_i64:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_mov_b32_e32 v3, s3
; GFX8-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    s_endpgm
entry:
  ; Baseline case: volatile 64-bit swap directly on %out, result unused.
  ; seq_cst at agent scope produces the cache writeback/invalidate sequences
  ; (buffer_wbinvl1_vol on GFX7/8, global_inv on GFX12) seen in the checks.
  %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_xchg_i64_ret:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s4
; GFX7-NEXT:    v_mov_b32_e32 v3, s5
; GFX7-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_xchg_i64_ret:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s4
; GFX8-NEXT:    v_mov_b32_e32 v3, s5
; GFX8-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_mov_b32_e32 v3, s3
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT:    s_endpgm
entry:
  ; Returned form of the baseline swap: 'glc' (GFX7/8) / TH_ATOMIC_RETURN
  ; (GFX12) make the old value available, which is then stored to %out2.
  %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xchg_i64_addr64:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_lshl_b64 s[2:3], s[4:5], 3
; GFX7-NEXT:    s_add_u32 s0, s0, s2
; GFX7-NEXT:    s_addc_u32 s1, s1, s3
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_xchg_i64_addr64:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    s_lshl_b64 s[2:3], s[4:5], 3
; GFX8-NEXT:    s_add_u32 s0, s0, s2
; GFX8-NEXT:    s_addc_u32 s1, s1, s3
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_addr64:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    s_endpgm
entry:
  ; Dynamic i64 index; the <<3 element scaling shows as s_lshl_b64 in the checks.
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  ; Volatile swap with unused result (no glc / TH_ATOMIC_RETURN).
  %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xchg_i64_ret_addr64:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT:    s_add_u32 s0, s0, s4
; GFX7-NEXT:    s_addc_u32 s1, s1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_xchg_i64_ret_addr64:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT:    s_add_u32 s0, s0, s4
; GFX8-NEXT:    s_addc_u32 s1, s1, s5
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_mov_b32_e32 v3, s3
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_addr64:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT:    s_endpgm
entry:
  ; Dynamic index, returned form: old value comes back via 'glc' (GFX7/8)
  ; or TH_ATOMIC_RETURN (GFX12) and is stored to %out2.
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_xor_i64_offset:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 32
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v5, s1
; GFX7-NEXT:    v_mov_b32_e32 v4, s0
; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:  .LBB74_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_xor_b32_e32 v1, s3, v3
; GFX7-NEXT:    v_xor_b32_e32 v0, s2, v2
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT:    v_mov_b32_e32 v3, v1
; GFX7-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v2, v0
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT:    s_cbranch_execnz .LBB74_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_offset:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_add_u32 s0, s0, 32
; GFX8-NEXT:    s_addc_u32 s1, s1, 0
; GFX8-NEXT:    v_mov_b32_e32 v5, s1
; GFX8-NEXT:    v_mov_b32_e32 v4, s0
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_mov_b64 s[0:1], 0
; GFX8-NEXT:  .LBB74_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_xor_b32_e32 v1, s3, v3
; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v2
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v3, v1
; GFX8-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT:    s_cbranch_execnz .LBB74_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_offset:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    s_endpgm
entry:
  ; +32-byte offset (element 4 of an i64 array).
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  ; Volatile 64-bit xor: per the checks GFX7/GFX8 lower it as a cmpxchg loop
  ; whose preheader flat_load carries 'glc' (volatile init load — NOTE(review):
  ; matches the emulation-loop volatility change; confirm against
  ; AtomicExpandPass). GFX12 selects flat_atomic_xor_b64 directly.
  %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
|
|
; GFX7-LABEL: atomic_xor_i64_ret_offset:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 32
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX7-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, v3
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, v2
|
|
; GFX7-NEXT: v_xor_b32_e32 v3, s5, v5
|
|
; GFX7-NEXT: v_xor_b32_e32 v2, s4, v4
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB75_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX8-LABEL: atomic_xor_i64_ret_offset:
|
|
; GFX8: ; %bb.0: ; %entry
|
|
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
|
|
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX8-NEXT: s_add_u32 s0, s0, 32
|
|
; GFX8-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX8-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, v3
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, v2
|
|
; GFX8-NEXT: v_xor_b32_e32 v3, s5, v5
|
|
; GFX8-NEXT: v_xor_b32_e32 v2, s4, v4
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB75_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GFX8-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: atomic_xor_i64_ret_offset:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_clause 0x1
|
|
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
|
|
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
|
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
|
|
; GFX12-NEXT: s_endpgm
|
|
entry:
|
|
%gep = getelementptr inbounds i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
; i64 atomicrmw xor (volatile, agent scope, seq_cst) at ptr[%index]+32, result unused.
; GFX7/GFX8 expand to a cmpxchg loop seeded by an atomic (glc) flat load; GFX12
; lowers to flat_atomic_xor_b64 without TH_ATOMIC_RETURN since the value is dead.
define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xor_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3
; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB76_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3
; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB76_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
|
|
|
|
; i64 atomicrmw xor (volatile, agent scope, seq_cst) at ptr[%index]+32, result
; stored to %out2. GFX7/GFX8 expand to a cmpxchg loop seeded by an atomic (glc)
; flat load; GFX12 lowers to flat_atomic_xor_b64 with TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xor_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_xor_b32_e32 v3, s5, v5
; GFX7-NEXT: v_xor_b32_e32 v2, s4, v4
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB77_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_xor_b32_e32 v3, s5, v5
; GFX8-NEXT: v_xor_b32_e32 v2, s4, v4
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB77_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
|
|
|
|
; i64 atomicrmw xor (volatile, agent scope, seq_cst) with no offset, result unused.
; GFX7/GFX8 expand to a cmpxchg loop seeded by an atomic (glc) flat load; GFX12
; lowers to flat_atomic_xor_b64.
define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_xor_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3
; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB78_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3
; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB78_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
|
|
|
|
; i64 atomicrmw xor (volatile, agent scope, seq_cst), result stored to %out2.
; GFX7/GFX8 expand to a cmpxchg loop seeded by an atomic (glc) flat load; GFX12
; lowers to flat_atomic_xor_b64 with TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_xor_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: v_xor_b32_e32 v5, s5, v7
; GFX7-NEXT: v_xor_b32_e32 v4, s4, v6
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB79_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_xor_b32_e32 v5, s5, v7
; GFX8-NEXT: v_xor_b32_e32 v4, s4, v6
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB79_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
|
|
|
|
; i64 atomicrmw xor (volatile, agent scope, seq_cst) at ptr[%index], result unused.
; GFX7/GFX8 expand to a cmpxchg loop seeded by an atomic (glc) flat load; GFX12
; lowers to flat_atomic_xor_b64.
define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xor_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3
; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB80_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3
; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB80_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
|
|
|
|
; i64 atomicrmw xor (volatile, agent scope, seq_cst) at ptr[%index], result
; stored to %out2. GFX7/GFX8 expand to a cmpxchg loop seeded by an atomic (glc)
; flat load; GFX12 lowers to flat_atomic_xor_b64 with TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xor_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_xor_b32_e32 v3, s5, v5
; GFX7-NEXT: v_xor_b32_e32 v2, s4, v4
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB81_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_xor_b32_e32 v3, s5, v5
; GFX8-NEXT: v_xor_b32_e32 v2, s4, v4
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB81_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
|
|
|
|
; seq_cst (system scope) atomic i64 load at ptr+32, value stored to %out.
; GFX7/GFX8 use flat_load_dwordx2 glc plus buffer_wbinvl1_vol; GFX12 uses
; SCOPE_SYS load plus global_inv.
define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) {
; GFX7-LABEL: atomic_load_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_load_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr inbounds i64, ptr %in, i64 4
%val = load atomic i64, ptr %gep seq_cst, align 8
store i64 %val, ptr %out
ret void
}
|
|
|
|
; seq_cst agent-scope atomic i64 load, value stored to %out. GFX7/GFX8 use
; flat_load_dwordx2 glc plus buffer_wbinvl1_vol; GFX12 uses a SCOPE_DEV load
; plus global_inv (device scope, unlike the system-scope tests around it).
define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) {
; GFX7-LABEL: atomic_load_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_load_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%val = load atomic i64, ptr %in syncscope("agent") seq_cst, align 8
store i64 %val, ptr %out
ret void
}
|
|
|
|
; seq_cst (system scope) atomic i64 load at ptr[%index]+32, value stored to
; %out. GFX7/GFX8 use flat_load_dwordx2 glc plus buffer_wbinvl1_vol; GFX12
; uses a SCOPE_SYS load plus global_inv.
define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 %index) {
; GFX7-LABEL: atomic_load_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_load_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %in, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%val = load atomic i64, ptr %gep seq_cst, align 8
store i64 %val, ptr %out
ret void
}
|
|
|
|
; seq_cst (system scope) atomic i64 load at ptr[%index], value stored to %out.
; GFX7/GFX8 use flat_load_dwordx2 glc plus buffer_wbinvl1_vol; GFX12 uses a
; SCOPE_SYS load plus global_inv.
define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) {
; GFX7-LABEL: atomic_load_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_load_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %in, i64 %index
%val = load atomic i64, ptr %ptr seq_cst, align 8
store i64 %val, ptr %out
ret void
}
|
|
|
|
; seq_cst atomic i64 store to %out + 4 (constant 32-byte offset, flat address space).
define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) {
; GFX7-LABEL: atomic_store_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_add_u32 s0, s2, 32
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_addc_u32 s1, s3, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_store_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_add_u32 s0, s2, 32
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_addc_u32 s1, s3, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  store atomic i64 %in, ptr %gep seq_cst, align 8
  ret void
}
|
|
|
|
; seq_cst atomic i64 store directly to %out (no offset, flat address space).
define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) {
; GFX7-LABEL: atomic_store_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_store_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
  store atomic i64 %in, ptr %out seq_cst, align 8
  ret void
}
|
|
|
|
; seq_cst atomic i64 store to %out[%index] + 4 (dynamic index plus constant offset).
define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 %index) {
; GFX7-LABEL: atomic_store_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s2, s0
; GFX7-NEXT: s_addc_u32 s1, s3, s1
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_store_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s2, s0
; GFX8-NEXT: s_addc_u32 s1, s3, s1
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  store atomic i64 %in, ptr %gep seq_cst, align 8
  ret void
}
|
|
|
|
; seq_cst atomic i64 store to %out[%index] (dynamic index only).
define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index) {
; GFX7-LABEL: atomic_store_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s2, s0
; GFX7-NEXT: s_addc_u32 s1, s3, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_store_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s2, s0
; GFX8-NEXT: s_addc_u32 s1, s3, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  store atomic i64 %in, ptr %ptr seq_cst, align 8
  ret void
}
|
|
|
|
; volatile agent-scope seq_cst cmpxchg i64 at %out + 4, result discarded.
define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; volatile agent-scope seq_cst cmpxchg i64 with a large constant offset (%out + 9000, i.e. 72000 bytes).
define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64_soffset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 0x11940
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64_soffset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 0x11940
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_soffset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 9000
  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; volatile agent-scope seq_cst cmpxchg i64 at %out + 4; the loaded value is stored to %out2.
define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s7
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
  %extract0 = extractvalue { i64, i1 } %val, 0
  store i64 %extract0, ptr %out2
  ret void
}
|
|
|
|
; volatile agent-scope seq_cst cmpxchg i64 at %out[%index] + 4, result discarded.
define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s7
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; volatile agent-scope seq_cst cmpxchg i64 at %out[%index] + 4; loaded value stored to %out2.
define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[2:3], s[14:15], 3
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: s_add_u32 s0, s8, s2
; GFX7-NEXT: s_addc_u32 s3, s9, s3
; GFX7-NEXT: s_add_u32 s2, s0, 32
; GFX7-NEXT: s_addc_u32 s3, s3, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s3
; GFX7-NEXT: v_mov_b32_e32 v0, s12
; GFX7-NEXT: v_mov_b32_e32 v1, s13
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s10
; GFX7-NEXT: v_mov_b32_e32 v3, s11
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[2:3], s[14:15], 3
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_add_u32 s0, s8, s2
; GFX8-NEXT: s_addc_u32 s3, s9, s3
; GFX8-NEXT: s_add_u32 s2, s0, 32
; GFX8-NEXT: s_addc_u32 s3, s3, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NEXT: v_mov_b32_e32 v3, s11
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[2:3], s[14:15], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[8:9], s[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
  %extract0 = extractvalue { i64, i1 } %val, 0
  store i64 %extract0, ptr %out2
  ret void
}
|
|
|
|
; volatile agent-scope seq_cst cmpxchg i64 directly on %out, result discarded.
define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; volatile agent-scope seq_cst cmpxchg i64 on %out; loaded value stored to %out2.
define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s7
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
  %extract0 = extractvalue { i64, i1 } %val, 0
  store i64 %extract0, ptr %out2
  ret void
}
|
|
|
|
; volatile agent-scope seq_cst cmpxchg i64 at %out[%index], result discarded.
define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %index, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s7
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; volatile agent-scope seq_cst cmpxchg i64 at %out[%index]; loaded value stored to %out2.
define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[2:3], s[14:15], 3
; GFX7-NEXT: s_add_u32 s2, s8, s2
; GFX7-NEXT: s_addc_u32 s3, s9, s3
; GFX7-NEXT: v_mov_b32_e32 v5, s3
; GFX7-NEXT: v_mov_b32_e32 v0, s12
; GFX7-NEXT: v_mov_b32_e32 v1, s13
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s10
; GFX7-NEXT: v_mov_b32_e32 v3, s11
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[2:3], s[14:15], 3
; GFX8-NEXT: s_add_u32 s2, s8, s2
; GFX8-NEXT: s_addc_u32 s3, s9, s3
; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NEXT: v_mov_b32_e32 v3, s11
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[2:3], s[14:15], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[8:9], s[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
  %extract0 = extractvalue { i64, i1 } %val, 0
  store i64 %extract0, ptr %out2
  ret void
}
|
|
|
|
; seq_cst atomic f64 load from %in + 4 (with !noalias.addrspace), result stored to %out.
define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) {
; GFX7-LABEL: atomic_load_f64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_load_f64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds double, ptr %in, i64 4
  %val = load atomic double, ptr %gep seq_cst, align 8, !noalias.addrspace !0
  store double %val, ptr %out
  ret void
}
|
|
|
|
; agent-scope seq_cst f64 atomic load with no offset; result stored
; non-atomically to %out.
define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) {
; GFX7-LABEL: atomic_load_f64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_load_f64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %val = load atomic double, ptr %in syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
  store double %val, ptr %out
  ret void
}
|
|
|
|
; seq_cst f64 atomic load addressed by a runtime 64-bit index plus a
; constant 32-byte offset; result stored non-atomically to %out.
define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 %index) {
; GFX7-LABEL: atomic_load_f64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_load_f64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds double, ptr %in, i64 %index
  %gep = getelementptr inbounds double, ptr %ptr, i64 4
  %val = load atomic double, ptr %gep seq_cst, align 8, !noalias.addrspace !0
  store double %val, ptr %out
  ret void
}
|
|
|
|
; seq_cst f64 atomic load addressed by a runtime 64-bit index (no constant
; offset); result stored non-atomically to %out.
define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) {
; GFX7-LABEL: atomic_load_f64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_load_f64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds double, ptr %in, i64 %index
  %val = load atomic double, ptr %ptr seq_cst, align 8, !noalias.addrspace !0
  store double %val, ptr %out
  ret void
}
|
|
|
|
; seq_cst f64 atomic store at a constant 32-byte offset from %out.
define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) {
; GFX7-LABEL: atomic_store_f64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_add_u32 s0, s2, 32
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_addc_u32 s1, s3, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_store_f64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_add_u32 s0, s2, 32
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_addc_u32 s1, s3, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds double, ptr %out, i64 4
  store atomic double %in, ptr %gep seq_cst, align 8, !noalias.addrspace !0
  ret void
}
|
|
|
|
; seq_cst f64 atomic store directly to %out (no offset).
define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) {
; GFX7-LABEL: atomic_store_f64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_store_f64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
  store atomic double %in, ptr %out seq_cst, align 8, !noalias.addrspace !0
  ret void
}
|
|
|
|
; seq_cst f64 atomic store addressed by a runtime 64-bit index plus a
; constant 32-byte offset.
define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, i64 %index) {
; GFX7-LABEL: atomic_store_f64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s2, s0
; GFX7-NEXT: s_addc_u32 s1, s3, s1
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_store_f64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s2, s0
; GFX8-NEXT: s_addc_u32 s1, s3, s1
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds double, ptr %out, i64 %index
  %gep = getelementptr inbounds double, ptr %ptr, i64 4
  store atomic double %in, ptr %gep seq_cst, align 8, !noalias.addrspace !0
  ret void
}
|
|
|
|
; seq_cst f64 atomic store addressed by a runtime 64-bit index
; (no constant offset).
define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %index) {
; GFX7-LABEL: atomic_store_f64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s2, s0
; GFX7-NEXT: s_addc_u32 s1, s3, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_store_f64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s2, s0
; GFX8-NEXT: s_addc_u32 s1, s3, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds double, ptr %out, i64 %index
  store atomic double %in, ptr %ptr seq_cst, align 8, !noalias.addrspace !0
  ret void
}
|
|
|
|
; volatile uinc_wrap i64 atomicrmw at a constant 32-byte offset, result
; unused. GFX7/GFX8 lack a 64-bit inc instruction here, so AtomicExpand
; emits a preheader atomic load feeding a cmpxchg loop; GFX12 selects the
; native flat_atomic_inc_u64.
define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_inc_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB107_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB107_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB107_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB107_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; Same as atomic_inc_i64_offset but the atomicrmw result is used and
; stored to %out2, so the GFX7/GFX8 cmpxchg loop keeps the old value live
; across the loop exit.
define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_inc_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB108_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB108_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB108_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB108_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; volatile uinc_wrap i64 atomicrmw at a runtime index plus a constant
; 32-byte offset, result unused; GFX7/GFX8 expand to a cmpxchg loop
; seeded by a preheader atomic load.
define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_inc_i64_incr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB109_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB109_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_incr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB109_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB109_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_incr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; volatile uinc_wrap i64 atomicrmw at a runtime index plus a constant
; 32-byte offset, with the returned (old) value stored to %out2.
define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_inc_i64_ret_incr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB110_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB110_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_ret_incr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB110_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB110_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; Non-returning volatile i64 uinc_wrap on a flat pointer. GFX7/GFX8 have no
; native 64-bit inc, so it expands to a flat_atomic_cmpswap_x2 loop seeded by a
; preheader flat_load_dwordx2 marked glc (the loop-init load of the old value);
; GFX12 selects flat_atomic_inc_u64 directly. Checks are autogenerated by
; update_llc_test_checks.py - regenerate rather than hand-edit.
define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_inc_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB111_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB111_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  ; Result unused: non-returning form of the atomic.
  %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; Returning volatile i64 uinc_wrap: same CAS-loop expansion as the non-ret
; variant on GFX7/GFX8 (glc preheader load seeds the loop), but the loop result
; is stored to %out2 after the loop. GFX12 uses flat_atomic_inc_u64 with
; TH_ATOMIC_RETURN. Checks autogenerated by update_llc_test_checks.py.
define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_inc_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB112_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB112_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ; Store the pre-op value so the returning lowering path is exercised.
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; Volatile i64 uinc_wrap through a register-indexed GEP (no immediate offset).
; The scaled index is folded into the scalar address before the GFX7/GFX8 CAS
; loop (seeded by a glc flat_load_dwordx2) and before GFX12's native
; flat_atomic_inc_u64. Checks autogenerated by update_llc_test_checks.py.
define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_inc_i64_incr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB113_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_incr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB113_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_incr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  ; Dynamic element index; lowered as a scaled (<<3) scalar address add.
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; Returning volatile i64 uinc_wrap through a register-indexed GEP. GFX7/GFX8
; expand to the CAS loop (glc preheader load), then store the old value to
; %out2; GFX12 uses flat_atomic_inc_u64 with TH_ATOMIC_RETURN. Checks are
; autogenerated by update_llc_test_checks.py.
define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_inc_i64_ret_incr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB114_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_ret_incr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB114_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret_incr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; Non-returning volatile i64 udec_wrap with a constant +32B offset. GFX7/GFX8
; expand to a flat_atomic_cmpswap_x2 loop (glc preheader load; dec-wrap select
; between old-1 and the reload value %in); GFX12 selects flat_atomic_dec_u64
; with the offset folded into the instruction. Checks autogenerated by
; update_llc_test_checks.py.
define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_dec_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s4, 32
; GFX7-NEXT: s_addc_u32 s1, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: .LBB115_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2
; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB115_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s4, 32
; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: .LBB115_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2
; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB115_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  ; Constant element offset: 4 x i64 = 32 bytes.
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; Returning volatile i64 udec_wrap with a constant +32B offset: CAS-loop
; expansion on GFX7/GFX8 (glc preheader load seeds the loop) followed by a
; store of the old value to %out2; GFX12 selects flat_atomic_dec_u64 with
; TH_ATOMIC_RETURN. Checks autogenerated by update_llc_test_checks.py.
define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_dec_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s8, 32
; GFX7-NEXT: s_addc_u32 s1, s9, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB116_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9]
; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8
; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB116_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v0, s10
; GFX7-NEXT: v_mov_b32_e32 v1, s11
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s8, 32
; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB116_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9]
; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8
; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB116_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; Non-returning volatile i64 udec_wrap with both a register index and a +32B
; constant offset. GFX7/GFX8 compute the full scalar address, then run the
; flat_atomic_cmpswap_x2 loop seeded by a glc flat_load_dwordx2; GFX12 folds
; only the constant part into flat_atomic_dec_u64 offset:32. Checks
; autogenerated by update_llc_test_checks.py.
define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_dec_i64_decr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX7-NEXT: s_add_u32 s0, s4, s0
; GFX7-NEXT: s_addc_u32 s1, s5, s1
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: .LBB117_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2
; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB117_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_decr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX8-NEXT: s_add_u32 s0, s4, s0
; GFX8-NEXT: s_addc_u32 s1, s5, s1
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: .LBB117_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2
; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB117_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_decr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  ; Dynamic index followed by a constant 4-element (32-byte) offset.
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; Returning volatile i64 udec_wrap with register index + constant +32B offset:
; CAS-loop expansion on GFX7/GFX8 (glc preheader load) with the old value
; stored to %out2 afterwards; GFX12 selects flat_atomic_dec_u64 offset:32 with
; TH_ATOMIC_RETURN. Checks autogenerated by update_llc_test_checks.py.
define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_dec_i64_ret_decr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX7-NEXT: s_add_u32 s0, s4, s0
; GFX7-NEXT: s_addc_u32 s1, s5, s1
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s9
; GFX7-NEXT: v_mov_b32_e32 v5, s8
; GFX7-NEXT: .LBB118_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9]
; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8
; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB118_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_ret_decr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX8-NEXT: s_add_u32 s0, s4, s0
; GFX8-NEXT: s_addc_u32 s1, s5, s1
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s9
; GFX8-NEXT: v_mov_b32_e32 v5, s8
; GFX8-NEXT: .LBB118_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9]
; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8
; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB118_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; Kernel: 64-bit volatile atomicrmw udec_wrap on a flat pointer; result unused.
; On GFX7 and GFX8 the operation is expanded into a compare-exchange retry loop
; (%atomicrmw.start below): a pre-loop flat_load_dwordx2 with the glc bit set
; seeds the expected value, the wrapping decrement is computed with
; cmp/add/cndmask, and flat_atomic_cmpswap_x2 retries until the value observed
; in memory matches. GFX12 selects the native flat_atomic_dec_u64 instead.
define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_dec_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB119_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2
; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB119_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB119_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2
; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_cbranch_execnz .LBB119_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  ; Wrapping decrement per LangRef: new = (old == 0 || old > %in) ? %in : old - 1.
  ; NOTE(review): !0 (!noalias.addrspace) appears to promise the pointer is not
  ; in addrspace(5) — confirm against the metadata definition at end of file.
  %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; Kernel: same 64-bit volatile udec_wrap as atomic_dec_i64, but the old value
; returned by the atomicrmw is stored to a second pointer (%out2).
; GFX7/GFX8 expansion: preheader flat_load_dwordx2 (glc) initializes the CAS
; loop; the loop result lands in v[2:3] and is written out with
; flat_store_dwordx2 after exec is restored. GFX12 uses flat_atomic_dec_u64
; with th:TH_ATOMIC_RETURN so the old value comes back directly.
define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_dec_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: v_mov_b32_e32 v1, s9
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB120_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9]
; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8
; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB120_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v0, s10
; GFX7-NEXT: v_mov_b32_e32 v1, s11
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB120_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9]
; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8
; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB120_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  ; %tmp0 receives the pre-decrement value loaded from %out.
  %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ; Store the old value so the atomic's result is observable (forces the
  ; "ret" lowering path, unlike the no-return variant above).
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; Kernel: volatile udec_wrap through a register-indexed GEP (%out + %index*8),
; result unused. Exercises address computation (s_lshl_b64 + 64-bit add)
; feeding the atomic. GFX7/GFX8 again expand to the CAS retry loop seeded by a
; glc flat load; GFX12 selects flat_atomic_dec_u64 on the computed address.
define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_dec_i64_decr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX7-NEXT: s_add_u32 s0, s4, s0
; GFX7-NEXT: s_addc_u32 s1, s5, s1
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2
; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB121_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_decr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX8-NEXT: s_add_u32 s0, s4, s0
; GFX8-NEXT: s_addc_u32 s1, s5, s1
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2
; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB121_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_decr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
  ; Dynamic element index: address = %out + %index * sizeof(i64).
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
|
|
|
|
; Kernel: combines the previous two variants — register-indexed address
; (%out + %index*8), volatile udec_wrap, and the old value stored to %out2.
; GFX7/GFX8: CAS retry loop seeded by a glc flat load on the computed address,
; followed by flat_store_dwordx2 of the loop result. GFX12: native
; flat_atomic_dec_u64 with th:TH_ATOMIC_RETURN, then flat_store_b64.
define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_dec_i64_ret_decr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX7-NEXT: s_add_u32 s0, s4, s0
; GFX7-NEXT: s_addc_u32 s1, s5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s9
; GFX7-NEXT: v_mov_b32_e32 v5, s8
; GFX7-NEXT: .LBB122_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9]
; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8
; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB122_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_ret_decr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX8-NEXT: s_add_u32 s0, s4, s0
; GFX8-NEXT: s_addc_u32 s1, s5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s9
; GFX8-NEXT: v_mov_b32_e32 v5, s8
; GFX8-NEXT: .LBB122_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9]
; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8
; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB122_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret_decr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
  ; Dynamic element index: address = %out + %index * sizeof(i64).
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ; Make the returned (pre-decrement) value observable.
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; Attached to every atomicrmw above as !noalias.addrspace: the operand pair
; (5, 6) encodes the half-open range [5, 6), promising the pointer does not
; address that space. NOTE(review): on AMDGPU addrspace(5) is private/scratch —
; confirm against the LangRef and AMDGPUUsage docs.
!0 = !{i32 5, i32 6}
|