Files
llvm-project/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll
Akshay Deodhar 184f236a18 [AtomicExpandPass] Preserve atomic and volatile nature of emulated operations (#188361)
The fix does the following in expandPartwordCmpXchg and
insertRMWCmpXchgLoop.

- Issues volatile operations in the emulation loops if the original
operation is volatile.
- A preheader load is used for initializing the "cmp" and "new" values
of the cmpxchg in the loop. This load is now made atomic. This is done
under a target hook (`issueAtomicInitLoadForAtomicEmulation()`), to allow
backends to migrate independently.
- `processAtomicInstr` is called on this load, to massage it into
something that can be lowered in SelectionDAG / GISel.
- This caused 3 kinds of failures.

1. Caused by change to codegen: updated these either using the scripts,
or mechanically (using claude) to match the new codegen.
2. Crashes caused by newly created atomic loads not being processed by
AtomicExpandPass. (The atomic load, if tested in an independent test,
does not cause a crash.) To fix these, added recursive calls to
processAtomicInstr on the newly created atomic loads. These calls
convert the loads to libcalls, or cast them to integer types.
3. Crashes in X86, AMDGPU, and AArch64 caused by unhandled vector types.
These loads crash even with upstream LLVM, due to the lack of support in
these targets for vector atomic loads (the corresponding vector
atomicrmw instructions are supported). Disabled issuing atomic loads for
these backends. Will follow up with individual PRs to revert to default
behavior.
2026-04-30 09:31:39 -07:00

9381 lines
379 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
; Volatile, seq_cst, agent-scope 64-bit atomic add through a flat pointer that
; is 4 i64 elements (32 bytes) past %out. The old value (%tmp0) is unused.
; Every target is expected to emit a single flat atomic add without the
; returning modifier. GFX7 and GFX8 compute the +32 address with scalar
; add/addc, while GFX12 folds it into the instruction as offset:32.
define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_add_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_add_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Returning variant of the offset add test: the volatile, seq_cst, agent-scope
; atomicrmw add targets %out + 4 i64 elements (32 bytes) and its old value is
; stored to %out2. The checks expect the returning form of the flat atomic add
; (glc on GFX7 and GFX8, TH_ATOMIC_RETURN on GFX12) followed by a 64-bit flat
; store of the returned registers.
define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_add_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_add_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, seq_cst, agent-scope 64-bit atomic add at a dynamically indexed
; address: %out + %index i64 elements, plus a further 4 elements (32 bytes).
; The old value is unused. The checks expect the index to be scaled with a
; 64-bit shift left by 3 and added to the base. GFX7 and GFX8 then add the
; constant 32 separately, while GFX12 carries it as offset:32 on the atomic.
define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_add_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s0, s2
; GFX7-NEXT: s_addc_u32 s1, s1, s3
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_add_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s0, s2
; GFX8-NEXT: s_addc_u32 s1, s1, s3
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Returning variant of the indexed-plus-offset add test: the atomicrmw add
; targets %out + %index i64 elements + 4 elements (32 bytes), and the old
; value is stored to %out2. The checks expect all four kernel arguments to be
; fetched with one 8-dword scalar load, the returning form of the flat atomic
; add (glc on GFX7 and GFX8, TH_ATOMIC_RETURN on GFX12), then a 64-bit flat
; store of the result.
define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_add_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_add_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Simplest add case: volatile, seq_cst, agent-scope 64-bit atomicrmw add
; directly on %out, with no offset or index and with the old value unused.
; The checks expect the kernel arguments to be copied into VGPRs and one flat
; atomic add without the returning modifier on every target.
define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_add_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_add_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Returning variant of the plain add test: the volatile, seq_cst, agent-scope
; atomicrmw add operates directly on %out and its old value is stored to %out2.
; The checks expect the returning flat atomic add (glc on GFX7 and GFX8,
; TH_ATOMIC_RETURN on GFX12) followed by a 64-bit flat store to the second
; pointer argument.
define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_add_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_add_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, seq_cst, agent-scope 64-bit atomic add at %out + %index i64
; elements, with no constant offset and with the old value unused. The checks
; expect the index to be shifted left by 3 and added to the base pointer with
; scalar instructions, followed by one flat atomic add without the returning
; modifier.
define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_add_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s0, s2
; GFX7-NEXT: s_addc_u32 s1, s1, s3
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_add_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s0, s2
; GFX8-NEXT: s_addc_u32 s1, s1, s3
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Returning variant of the indexed add test: the atomicrmw add targets
; %out + %index i64 elements and its old value is stored to %out2. The checks
; expect a single 8-dword scalar load of the kernel arguments, the index scaled
; by a shift of 3 and added to the base, the returning flat atomic add (glc on
; GFX7 and GFX8, TH_ATOMIC_RETURN on GFX12), then a 64-bit flat store.
define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_add_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_add_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, seq_cst, agent-scope 64-bit atomic AND at %out + 4 i64 elements
; (32 bytes), with the old value unused. Unlike the add tests, the GFX7 and
; GFX8 checks expect an emulation loop: an initial 64-bit flat load with glc,
; then an %atomicrmw.start loop that ANDs the loaded value with %in and retries
; flat_atomic_cmpswap_x2 until the returned value equals the expected one
; (lanes are retired by clearing them from exec). GFX12 expects a single
; flat_atomic_and_b64 with offset:32.
; NOTE(review): the glc on the initial load presumably reflects the atomic
; preheader load this test was regenerated for -- confirm against the pass.
define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_and_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, s3, v3
; GFX7-NEXT: v_and_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB8_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, s3, v3
; GFX8-NEXT: v_and_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB8_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Returning variant of the offset AND test: the atomicrmw and targets
; %out + 4 i64 elements (32 bytes) and its old value is stored to %out2.
; The GFX7 and GFX8 checks expect an initial 64-bit flat load with glc and an
; %atomicrmw.start loop around flat_atomic_cmpswap_x2; the previous value is
; copied to v[4:5] as the compare operand on each iteration. After the loop the
; accumulated mask is ORed back into exec before the flat store of the result.
; GFX12 expects a single flat_atomic_and_b64 with TH_ATOMIC_RETURN and
; offset:32, followed by the store.
define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_and_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v3, s5, v5
; GFX7-NEXT: v_and_b32_e32 v2, s4, v4
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB9_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_and_b32_e32 v3, s5, v5
; GFX8-NEXT: v_and_b32_e32 v2, s4, v4
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB9_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, seq_cst, agent-scope 64-bit atomic AND at %out + %index i64
; elements + 4 elements (32 bytes), with the old value unused. The GFX7 and
; GFX8 checks expect the address to be formed with scalar shift/add
; instructions, an initial 64-bit flat load with glc, and an %atomicrmw.start
; loop that retries flat_atomic_cmpswap_x2 until it succeeds. GFX12 expects a
; single flat_atomic_and_b64 with offset:32.
define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_and_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, s3, v3
; GFX7-NEXT: v_and_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB10_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, s3, v3
; GFX8-NEXT: v_and_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB10_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Returning variant of the indexed-plus-offset AND test: the atomicrmw and
; targets %out + %index i64 elements + 4 elements (32 bytes), and its old value
; is stored to %out2. The GFX7 and GFX8 checks expect a single 8-dword scalar
; argument load, scalar address arithmetic, an initial 64-bit flat load with
; glc, and an %atomicrmw.start loop around flat_atomic_cmpswap_x2. After the
; loop the accumulated mask is ORed back into exec before the flat store.
; GFX12 expects a single flat_atomic_and_b64 with TH_ATOMIC_RETURN and
; offset:32, followed by the store.
define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_and_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v3, s5, v5
; GFX7-NEXT: v_and_b32_e32 v2, s4, v4
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_and_b32_e32 v3, s5, v5
; GFX8-NEXT: v_and_b32_e32 v2, s4, v4
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Volatile seq_cst `atomicrmw and` at syncscope("agent") applied directly to
; the flat pointer %out (no constant offset, no index). The old value %tmp0 is
; never used.
; The GFX7 and GFX8 checks expect a flat_atomic_cmpswap_x2 retry loop whose
; first expected value is read by a `glc` flat_load_dwordx2 before the loop.
; The GFX12 checks expect a single flat_atomic_and_b64 without
; th:TH_ATOMIC_RETURN.
; Assumption to confirm: !0 is defined outside this function; going by the
; file name it presumably rules out the private address space.
define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_and_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, s3, v3
; GFX7-NEXT: v_and_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB12_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, s3, v3
; GFX8-NEXT: v_and_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB12_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Volatile seq_cst `atomicrmw and` at syncscope("agent") applied directly to
; the flat pointer %out; the old value it returns is stored to %out2.
; The GFX7 and GFX8 checks expect a flat_atomic_cmpswap_x2 retry loop seeded by
; a `glc` flat_load_dwordx2, followed by a flat_store_dwordx2 of the value the
; final cmpswap returned.
; The GFX12 checks expect flat_atomic_and_b64 with th:TH_ATOMIC_RETURN followed
; by a flat_store_b64.
; Assumption to confirm: !0 is defined outside this function; going by the
; file name it presumably rules out the private address space.
define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_and_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: v_and_b32_e32 v5, s5, v7
; GFX7-NEXT: v_and_b32_e32 v4, s4, v6
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_and_b32_e32 v5, s5, v7
; GFX8-NEXT: v_and_b32_e32 v4, s4, v6
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
; Storing the old value keeps the atomic's result live.
store i64 %tmp0, ptr %out2
ret void
}
; Volatile seq_cst `atomicrmw and` at syncscope("agent") on the i64 element
; %out[%index] of a flat pointer; the old value %tmp0 is never used.
; All three check blocks expect the index to be scaled with s_lshl_b64 by 3
; and added to the base address.
; The GFX7 and GFX8 checks then expect a flat_atomic_cmpswap_x2 retry loop
; seeded by a `glc` flat_load_dwordx2; the GFX12 checks expect a single
; flat_atomic_and_b64 without th:TH_ATOMIC_RETURN.
; Assumption to confirm: !0 is defined outside this function; going by the
; file name it presumably rules out the private address space.
define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_and_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, s3, v3
; GFX7-NEXT: v_and_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB14_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, s3, v3
; GFX8-NEXT: v_and_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
; Runtime index in i64 elements, i.e. %index * 8 bytes.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Volatile seq_cst `atomicrmw and` at syncscope("agent") on the i64 element
; %out[%index] of a flat pointer; the old value it returns is stored to %out2.
; All three check blocks expect the index to be scaled with s_lshl_b64 by 3
; and added to the base address.
; The GFX7 and GFX8 checks then expect a flat_atomic_cmpswap_x2 retry loop
; seeded by a `glc` flat_load_dwordx2, followed by a flat_store_dwordx2.
; The GFX12 checks expect flat_atomic_and_b64 with th:TH_ATOMIC_RETURN followed
; by a flat_store_b64.
; Assumption to confirm: !0 is defined outside this function; going by the
; file name it presumably rules out the private address space.
define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_and_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v3, s5, v5
; GFX7-NEXT: v_and_b32_e32 v2, s4, v4
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB15_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_and_b32_e32 v3, s5, v5
; GFX8-NEXT: v_and_b32_e32 v2, s4, v4
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB15_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Runtime index in i64 elements, i.e. %index * 8 bytes.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
; Storing the old value keeps the atomic's result live.
store i64 %tmp0, ptr %out2
ret void
}
; Volatile seq_cst `atomicrmw sub` at syncscope("agent") on a flat pointer at
; a constant offset of four i64 elements (32 bytes) from %out; the old value
; %tmp0 is never used.
; The GFX7 and GFX8 checks expect the 32 to be added to the scalar address
; with s_add_u32/s_addc_u32, then a flat_atomic_cmpswap_x2 retry loop seeded by
; a `glc` flat_load_dwordx2, with the new value computed inside the loop by a
; v_subrev/v_subb pair.
; The GFX12 checks expect a single flat_atomic_sub_u64 with offset:32 and
; without th:TH_ATOMIC_RETURN.
; Assumption to confirm: !0 is defined outside this function; going by the
; file name it presumably rules out the private address space.
define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_sub_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2
; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB16_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB16_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
; Constant index of 4 i64 elements, i.e. a 32-byte offset.
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Volatile seq_cst `atomicrmw sub` at syncscope("agent") on a flat pointer at
; a constant offset of four i64 elements (32 bytes) from %out; the old value
; it returns is stored to %out2.
; The GFX7 and GFX8 checks expect the 32 to be added to the scalar address
; with s_add_u32/s_addc_u32, then a flat_atomic_cmpswap_x2 retry loop seeded by
; a `glc` flat_load_dwordx2 (new value from a v_subrev/v_subb pair), followed
; by a flat_store_dwordx2.
; The GFX12 checks expect flat_atomic_sub_u64 with offset:32 and
; th:TH_ATOMIC_RETURN followed by a flat_store_b64.
; Assumption to confirm: !0 is defined outside this function; going by the
; file name it presumably rules out the private address space.
define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_sub_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v8, v3
; GFX7-NEXT: v_mov_b32_e32 v7, v2
; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7
; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB17_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v8, v3
; GFX8-NEXT: v_mov_b32_e32 v7, v2
; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7
; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB17_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Constant index of 4 i64 elements, i.e. a 32-byte offset.
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
; Storing the old value keeps the atomic's result live.
store i64 %tmp0, ptr %out2
ret void
}
; Volatile seq_cst `atomicrmw sub` at syncscope("agent") on the flat address
; %out + %index * 8 + 32 bytes (runtime i64 index plus a constant index of 4);
; the old value %tmp0 is never used.
; All three check blocks expect the index to be scaled with s_lshl_b64 by 3.
; The GFX7 and GFX8 checks expect the constant 32 to be added with a second
; s_add_u32/s_addc_u32 pair, then a flat_atomic_cmpswap_x2 retry loop seeded by
; a `glc` flat_load_dwordx2 (new value from a v_subrev/v_subb pair).
; The GFX12 checks expect a single flat_atomic_sub_u64 with offset:32 and
; without th:TH_ATOMIC_RETURN.
; Assumption to confirm: !0 is defined outside this function; going by the
; file name it presumably rules out the private address space.
define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_sub_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2
; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB18_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB18_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
; Runtime index in i64 elements, i.e. %index * 8 bytes.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
; Plus a constant index of 4 i64 elements, i.e. a further 32 bytes.
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Volatile seq_cst `atomicrmw sub` at syncscope("agent") on the flat address
; %out + %index * 8 + 32 bytes (runtime i64 index plus a constant index of 4);
; the old value it returns is stored to %out2.
; All three check blocks expect the index to be scaled with s_lshl_b64 by 3.
; The GFX7 and GFX8 checks expect the constant 32 to be added with a second
; s_add_u32/s_addc_u32 pair, then a flat_atomic_cmpswap_x2 retry loop seeded by
; a `glc` flat_load_dwordx2 (new value from a v_subrev/v_subb pair), followed
; by a flat_store_dwordx2.
; The GFX12 checks expect flat_atomic_sub_u64 with offset:32 and
; th:TH_ATOMIC_RETURN followed by a flat_store_b64.
; Assumption to confirm: !0 is defined outside this function; going by the
; file name it presumably rules out the private address space.
define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_sub_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v8, v3
; GFX7-NEXT: v_mov_b32_e32 v7, v2
; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7
; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB19_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v8, v3
; GFX8-NEXT: v_mov_b32_e32 v7, v2
; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7
; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB19_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Runtime index in i64 elements, i.e. %index * 8 bytes.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
; Plus a constant index of 4 i64 elements, i.e. a further 32 bytes.
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
; Storing the old value keeps the atomic's result live.
store i64 %tmp0, ptr %out2
ret void
}
; Volatile seq_cst `atomicrmw sub` at syncscope("agent") applied directly to
; the flat pointer %out (no constant offset, no index). The old value %tmp0 is
; never used.
; The GFX7 and GFX8 checks expect a flat_atomic_cmpswap_x2 retry loop seeded by
; a `glc` flat_load_dwordx2, with the new value computed inside the loop by a
; v_subrev/v_subb pair.
; The GFX12 checks expect a single flat_atomic_sub_u64 without
; th:TH_ATOMIC_RETURN.
; Assumption to confirm: !0 is defined outside this function; going by the
; file name it presumably rules out the private address space.
define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_sub_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2
; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB20_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB20_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Volatile seq_cst `atomicrmw sub` at syncscope("agent") applied directly to
; the flat pointer %out; the old value it returns is stored to %out2.
; The GFX7 and GFX8 checks expect a flat_atomic_cmpswap_x2 retry loop seeded by
; a `glc` flat_load_dwordx2 (new value from a v_subrev/v_subb pair), followed
; by a flat_store_dwordx2 of the value the final cmpswap returned.
; The GFX12 checks expect flat_atomic_sub_u64 with th:TH_ATOMIC_RETURN followed
; by a flat_store_b64.
; Assumption to confirm: !0 is defined outside this function; going by the
; file name it presumably rules out the private address space.
define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_sub_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v8, v1
; GFX7-NEXT: v_mov_b32_e32 v7, v0
; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7
; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB21_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v8, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v0
; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7
; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB21_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
; Storing the old value keeps the atomic's result live.
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, seq_cst, agent-scope 64-bit `atomicrmw sub` on the flat pointer
; %out + %index * 8; the returned old value is unused.
; Expected codegen per the checks below: the gfx7/gfx8 runs expand it to a
; flat_atomic_cmpswap_x2 retry loop seeded by a glc flat_load_dwordx2, while
; the gfx12 run selects a single no-return flat_atomic_sub_u64.
define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_sub_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2
; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB22_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Returning variant of the indexed sub test: volatile, seq_cst, agent-scope
; 64-bit `atomicrmw sub` on %out + %index * 8, with the old value stored to
; %out2.
; Expected codegen per the checks below: the gfx7/gfx8 runs use a
; flat_atomic_cmpswap_x2 retry loop (seeded by a glc load) and store the
; final loaded value after the loop; the gfx12 run selects
; flat_atomic_sub_u64 with TH_ATOMIC_RETURN followed by flat_store_b64.
define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_sub_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v8, v3
; GFX7-NEXT: v_mov_b32_e32 v7, v2
; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7
; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB23_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v8, v3
; GFX8-NEXT: v_mov_b32_e32 v7, v2
; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7
; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB23_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, seq_cst, workgroup-scope signed 64-bit `atomicrmw max` on the
; flat pointer %out + 4 elements (32 bytes); the old value is unused.
; Expected codegen per the checks below: the gfx7/gfx8 runs add 32 to the
; scalar base address, then run a flat_atomic_cmpswap_x2 retry loop (seeded
; by a glc load) whose new value is chosen with v_cmp_lt_i64 + v_cndmask;
; the gfx12 run selects flat_atomic_max_i64 with offset:32 at SCOPE_SE.
define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_max_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB24_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB24_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
ret void
}
; Returning variant of the constant-offset max test: volatile, seq_cst,
; workgroup-scope signed 64-bit `atomicrmw max` on %out + 32 bytes, with the
; old value stored to %out2.
; Expected codegen per the checks below: the gfx7/gfx8 runs use a
; flat_atomic_cmpswap_x2 retry loop (v_cmp_lt_i64 + v_cndmask pick the new
; value) and store the result after the loop; the gfx12 run selects
; flat_atomic_max_i64 with offset:32 and TH_ATOMIC_RETURN, then
; flat_store_b64.
define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_max_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB25_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB25_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, seq_cst, workgroup-scope signed 64-bit `atomicrmw max` on the
; flat pointer %out + %index * 8 + 32 bytes (variable index plus a constant
; 4-element offset); the old value is unused.
; Expected codegen per the checks below: the gfx7/gfx8 runs fold both address
; terms into scalar adds before a flat_atomic_cmpswap_x2 retry loop; the
; gfx12 run adds only the scaled index and keeps the constant as offset:32
; on a single flat_atomic_max_i64.
define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_max_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB26_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB26_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
ret void
}
; Returning variant of the indexed-plus-offset max test: volatile, seq_cst,
; workgroup-scope signed 64-bit `atomicrmw max` on %out + %index * 8 + 32
; bytes, with the old value stored to %out2.
; Expected codegen per the checks below: the gfx7/gfx8 runs use a
; flat_atomic_cmpswap_x2 retry loop followed by flat_store_dwordx2 of the
; result; the gfx12 run selects flat_atomic_max_i64 with offset:32 and
; TH_ATOMIC_RETURN, then flat_store_b64.
define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB27_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB27_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Base case for the max tests: volatile, seq_cst, workgroup-scope signed
; 64-bit `atomicrmw max` directly on the flat pointer %out (no index, no
; offset); the old value is unused.
; Expected codegen per the checks below: the gfx7/gfx8 runs use a
; flat_atomic_cmpswap_x2 retry loop seeded by a glc load, with v_cmp_lt_i64 +
; v_cndmask picking the new value; the gfx12 run selects a single no-return
; flat_atomic_max_i64 at SCOPE_SE.
define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_max_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB28_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB28_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
ret void
}
; Returning variant of the base max test: volatile, seq_cst, workgroup-scope
; signed 64-bit `atomicrmw max` directly on %out, with the old value stored
; to %out2.
; Expected codegen per the checks below: the gfx7/gfx8 runs use a
; flat_atomic_cmpswap_x2 retry loop followed by flat_store_dwordx2 of the
; result; the gfx12 run selects flat_atomic_max_i64 with TH_ATOMIC_RETURN,
; then flat_store_b64.
define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_max_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB29_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB29_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, seq_cst, workgroup-scope signed 64-bit `atomicrmw max` on the
; flat pointer %out + %index * 8 (variable index, no constant offset); the
; old value is unused.
; Expected codegen per the checks below: the gfx7/gfx8 runs compute the
; address with s_lshl_b64 + scalar add, then run a flat_atomic_cmpswap_x2
; retry loop seeded by a glc load; the gfx12 run selects a single no-return
; flat_atomic_max_i64 at SCOPE_SE.
define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_max_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB30_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB30_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
ret void
}
; Signed max on the i64 element at %out[%index] (volatile, workgroup scope,
; seq_cst); the value returned by the atomicrmw is stored to %out2.
; The GFX7 and GFX8 assertions expect a glc load feeding a
; flat_atomic_cmpswap_x2 retry loop that selects with v_cmp_lt_i64, while the
; GFX12 assertions expect a single returning flat_atomic_max_i64.
define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_max_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB31_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB31_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Unsigned max on the i64 element at %out[4] (byte offset 32), volatile,
; workgroup scope, seq_cst; the atomicrmw result is unused.
; The GFX7 and GFX8 assertions expect the offset added to the scalar address
; and a flat_atomic_cmpswap_x2 retry loop that selects with v_cmp_lt_u64, while
; the GFX12 assertions expect a non-returning flat_atomic_max_u64 carrying the
; offset as an immediate.
define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_umax_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB32_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB32_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
ret void
}
; Unsigned max on the i64 element at %out[4] (byte offset 32), volatile,
; workgroup scope, seq_cst; the value returned by the atomicrmw is stored to
; %out2.
; The GFX7 and GFX8 assertions expect a flat_atomic_cmpswap_x2 retry loop
; (v_cmp_lt_u64 select) followed by a flat store of the result, while the GFX12
; assertions expect a returning flat_atomic_max_u64 with an immediate offset.
define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_umax_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB33_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB33_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Unsigned max on the i64 element at %out[%index + 4], volatile, workgroup
; scope, seq_cst; the atomicrmw result is unused.
; The GFX7 and GFX8 assertions expect the index scaled by a shift of 3, the
; 32-byte offset added to the scalar address, and a flat_atomic_cmpswap_x2
; retry loop using v_cmp_lt_u64, while the GFX12 assertions expect a
; non-returning flat_atomic_max_u64 with the offset kept as an immediate.
define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umax_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB34_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB34_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
ret void
}
; Unsigned max on the i64 element at %out[%index + 4], volatile, workgroup
; scope, seq_cst; the value returned by the atomicrmw is stored to %out2.
; The GFX7 and GFX8 assertions expect the scaled index and the 32-byte offset
; added to the scalar address, a flat_atomic_cmpswap_x2 retry loop using
; v_cmp_lt_u64, and a flat store of the result; the GFX12 assertions expect a
; returning flat_atomic_max_u64 with the offset kept as an immediate.
define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB35_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB35_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Unsigned max directly on the i64 at %out (no offset, no index), volatile,
; workgroup scope, seq_cst; the atomicrmw result is unused.
; The GFX7 and GFX8 assertions expect a glc load feeding a
; flat_atomic_cmpswap_x2 retry loop that selects with v_cmp_lt_u64, while the
; GFX12 assertions expect a single non-returning flat_atomic_max_u64.
define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_umax_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB36_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB36_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
ret void
}
; Unsigned max directly on the i64 at %out (no offset, no index), volatile,
; workgroup scope, seq_cst; the value returned by the atomicrmw is stored to
; %out2.
; The GFX7 and GFX8 assertions expect a flat_atomic_cmpswap_x2 retry loop
; (v_cmp_lt_u64 select) followed by a flat store of the result, while the GFX12
; assertions expect a single returning flat_atomic_max_u64.
define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_umax_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB37_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB37_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Unsigned max on the i64 element at %out[%index], volatile, workgroup scope,
; seq_cst; the atomicrmw result is unused.
; The GFX7 and GFX8 assertions expect the index scaled by a shift of 3 and
; added to the base, then a flat_atomic_cmpswap_x2 retry loop using
; v_cmp_lt_u64, while the GFX12 assertions expect a single non-returning
; flat_atomic_max_u64.
define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umax_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB38_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB38_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
ret void
}
; Unsigned max on the i64 element at %out[%index], volatile, workgroup scope,
; seq_cst; the value returned by the atomicrmw is stored to %out2.
; The GFX7 and GFX8 assertions expect the scaled index added to the base, a
; flat_atomic_cmpswap_x2 retry loop using v_cmp_lt_u64, and a flat store of the
; result, while the GFX12 assertions expect a single returning
; flat_atomic_max_u64.
define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umax_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB39_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB39_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Signed min on the i64 element at %out[4] (byte offset 32), volatile,
; workgroup scope, seq_cst; the atomicrmw result is unused.
; The GFX7 and GFX8 assertions expect the offset added to the scalar address
; and a flat_atomic_cmpswap_x2 retry loop that selects with v_cmp_ge_i64, while
; the GFX12 assertions expect a non-returning flat_atomic_min_i64 carrying the
; offset as an immediate.
define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_min_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB40_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB40_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
ret void
}
; Signed 64-bit atomic min through a flat pointer at a constant offset of
; 4 i64 elements (32 bytes); the old value returned by the atomicrmw is stored
; to %out2.
; The atomicrmw is volatile, syncscope("workgroup") and seq_cst.
; NOTE(review): !noalias.addrspace !0 is defined outside this part of the file;
; presumably it excludes the private address space (per the file name) - confirm.
; Expected codegen per the checks below: the bonaire and tonga runs expand the
; operation into a flat_atomic_cmpswap_x2 loop (%atomicrmw.start) seeded by a
; glc flat_load_dwordx2, then restore exec and store the result; the gfx1200
; run emits a single flat_atomic_min_i64 with offset:32 and
; th:TH_ATOMIC_RETURN followed by flat_store_b64.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_min_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB41_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB41_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Signed 64-bit atomic min through a flat pointer formed by indexing %out with
; the runtime %index (i64 elements) and then adding a constant 4 i64 elements
; (32 bytes); the value returned by the atomicrmw is unused.
; The atomicrmw is volatile, syncscope("workgroup") and seq_cst.
; NOTE(review): !noalias.addrspace !0 is defined outside this part of the file;
; presumably it excludes the private address space (per the file name) - confirm.
; Expected codegen per the checks below: every run scales %index with
; s_lshl_b64 by 3 before adding it to the base. The bonaire and tonga runs then
; expand the operation into a flat_atomic_cmpswap_x2 loop (%atomicrmw.start)
; seeded by a glc flat_load_dwordx2, while the gfx1200 run emits a single
; non-returning flat_atomic_min_i64 with offset:32.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_min_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB42_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB42_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
ret void
}
; Signed 64-bit atomic min through a flat pointer formed by indexing %out with
; the runtime %index (i64 elements) and then adding a constant 4 i64 elements
; (32 bytes); the old value returned by the atomicrmw is stored to %out2.
; The atomicrmw is volatile, syncscope("workgroup") and seq_cst.
; NOTE(review): !noalias.addrspace !0 is defined outside this part of the file;
; presumably it excludes the private address space (per the file name) - confirm.
; Expected codegen per the checks below: every run scales %index with
; s_lshl_b64 by 3 before adding it to the base. The bonaire and tonga runs then
; expand the operation into a flat_atomic_cmpswap_x2 loop (%atomicrmw.start)
; seeded by a glc flat_load_dwordx2, restore exec and store the result; the
; gfx1200 run emits a single flat_atomic_min_i64 with offset:32 and
; th:TH_ATOMIC_RETURN followed by flat_store_b64.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB43_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB43_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Signed 64-bit atomic min directly on the flat pointer %out (no offset, no
; index); the value returned by the atomicrmw is unused.
; The atomicrmw is volatile, syncscope("workgroup") and seq_cst.
; NOTE(review): !noalias.addrspace !0 is defined outside this part of the file;
; presumably it excludes the private address space (per the file name) - confirm.
; Expected codegen per the checks below: the bonaire and tonga runs expand the
; operation into a flat_atomic_cmpswap_x2 loop (%atomicrmw.start) seeded by a
; glc flat_load_dwordx2, while the gfx1200 run emits a single non-returning
; flat_atomic_min_i64.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_min_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB44_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB44_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
ret void
}
; Signed 64-bit atomic min directly on the flat pointer %out (no offset, no
; index); the old value returned by the atomicrmw is stored to %out2.
; The atomicrmw is volatile, syncscope("workgroup") and seq_cst.
; NOTE(review): !noalias.addrspace !0 is defined outside this part of the file;
; presumably it excludes the private address space (per the file name) - confirm.
; Expected codegen per the checks below: the bonaire and tonga runs expand the
; operation into a flat_atomic_cmpswap_x2 loop (%atomicrmw.start) seeded by a
; glc flat_load_dwordx2, then restore exec and store the result; the gfx1200
; run emits a single flat_atomic_min_i64 with th:TH_ATOMIC_RETURN followed by
; flat_store_b64.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_min_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB45_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB45_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Signed 64-bit atomic min through a flat pointer formed by indexing %out with
; the runtime %index (i64 elements), with no additional constant offset; the
; value returned by the atomicrmw is unused.
; The atomicrmw is volatile, syncscope("workgroup") and seq_cst.
; NOTE(review): !noalias.addrspace !0 is defined outside this part of the file;
; presumably it excludes the private address space (per the file name) - confirm.
; Expected codegen per the checks below: every run scales %index with
; s_lshl_b64 by 3 before adding it to the base. The bonaire and tonga runs then
; expand the operation into a flat_atomic_cmpswap_x2 loop (%atomicrmw.start)
; seeded by a glc flat_load_dwordx2, while the gfx1200 run emits a single
; non-returning flat_atomic_min_i64.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_min_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB46_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB46_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
ret void
}
; Signed 64-bit atomic min through a flat pointer formed by indexing %out with
; the runtime %index (i64 elements), with no additional constant offset; the
; old value returned by the atomicrmw is stored to %out2.
; The atomicrmw is volatile, syncscope("workgroup") and seq_cst.
; NOTE(review): !noalias.addrspace !0 is defined outside this part of the file;
; presumably it excludes the private address space (per the file name) - confirm.
; Expected codegen per the checks below: every run scales %index with
; s_lshl_b64 by 3 before adding it to the base. The bonaire and tonga runs then
; expand the operation into a flat_atomic_cmpswap_x2 loop (%atomicrmw.start)
; seeded by a glc flat_load_dwordx2, restore exec and store the result; the
; gfx1200 run emits a single flat_atomic_min_i64 with th:TH_ATOMIC_RETURN
; followed by flat_store_b64.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_min_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB47_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB47_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Unsigned 64-bit atomic min (umin) through a flat pointer at a constant offset
; of 4 i64 elements (32 bytes); the value returned by the atomicrmw is unused.
; The atomicrmw is volatile, syncscope("workgroup") and seq_cst.
; NOTE(review): !noalias.addrspace !0 is defined outside this part of the file;
; presumably it excludes the private address space (per the file name) - confirm.
; Expected codegen per the checks below: the bonaire and tonga runs expand the
; operation into a flat_atomic_cmpswap_x2 loop (%atomicrmw.start) seeded by a
; glc flat_load_dwordx2 and using the unsigned compare v_cmp_ge_u64, while the
; gfx1200 run emits a single non-returning flat_atomic_min_u64 with offset:32.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_umin_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB48_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB48_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
ret void
}
; Unsigned 64-bit atomic min (umin) through a flat pointer at a constant offset
; of 4 i64 elements (32 bytes); the old value returned by the atomicrmw is
; stored to %out2.
; The atomicrmw is volatile, syncscope("workgroup") and seq_cst.
; NOTE(review): !noalias.addrspace !0 is defined outside this part of the file;
; presumably it excludes the private address space (per the file name) - confirm.
; Expected codegen per the checks below: the bonaire and tonga runs expand the
; operation into a flat_atomic_cmpswap_x2 loop (%atomicrmw.start) seeded by a
; glc flat_load_dwordx2 and using the unsigned compare v_cmp_ge_u64, then
; restore exec and store the result; the gfx1200 run emits a single
; flat_atomic_min_u64 with offset:32 and th:TH_ATOMIC_RETURN followed by
; flat_store_b64.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_umin_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB49_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB49_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, seq_cst, workgroup-scope `atomicrmw umin` of %in on the flat
; address &out[index + 4]; the returned old value is unused.
; The constant element index 4 (i64 elements) is a 32-byte displacement: the
; GFX7 and GFX8 checks add it with `s_add_u32 s0, s0, 32`, while the GFX12
; checks fold it into `offset:32`.
; Expected codegen: GFX7 and GFX8 seed a flat_atomic_cmpswap_x2 retry loop
; with a glc load; GFX12 emits one flat_atomic_min_u64 with scope:SCOPE_SE.
define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umin_i64_addr64_offset:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT:    s_add_u32 s0, s0, s4
; GFX7-NEXT:    s_addc_u32 s1, s1, s5
; GFX7-NEXT:    s_add_u32 s0, s0, 32
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v5, s1
; GFX7-NEXT:    v_mov_b32_e32 v4, s0
; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:    v_mov_b32_e32 v6, s3
; GFX7-NEXT:    v_mov_b32_e32 v7, s2
; GFX7-NEXT:  .LBB50_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT:    v_mov_b32_e32 v3, v1
; GFX7-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v2, v0
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT:    s_cbranch_execnz .LBB50_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_addr64_offset:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT:    s_add_u32 s0, s0, s4
; GFX8-NEXT:    s_addc_u32 s1, s1, s5
; GFX8-NEXT:    s_add_u32 s0, s0, 32
; GFX8-NEXT:    s_addc_u32 s1, s1, 0
; GFX8-NEXT:    v_mov_b32_e32 v5, s1
; GFX8-NEXT:    v_mov_b32_e32 v4, s0
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_mov_b64 s[0:1], 0
; GFX8-NEXT:    v_mov_b32_e32 v6, s3
; GFX8-NEXT:    v_mov_b32_e32 v7, s2
; GFX8-NEXT:  .LBB50_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v3, v1
; GFX8-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT:    s_cbranch_execnz .LBB50_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_addr64_offset:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    flat_atomic_min_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_SE
; GFX12-NEXT:    s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
; Returning variant: volatile, seq_cst, workgroup-scope `atomicrmw umin` of
; %in on the flat address &out[index + 4], with the old value stored to %out2.
; The constant element index 4 (i64 elements) is a 32-byte displacement: the
; GFX7 and GFX8 checks add it with `s_add_u32 s0, s0, 32`, while the GFX12
; checks fold it into `offset:32`.
; Expected codegen: GFX7 and GFX8 run a flat_atomic_cmpswap_x2 retry loop
; seeded by a glc load, restore exec after the loop and then store the result;
; GFX12 emits one flat_atomic_min_u64 with th:TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umin_i64_ret_addr64_offset:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT:    s_add_u32 s0, s0, s6
; GFX7-NEXT:    s_addc_u32 s1, s1, s7
; GFX7-NEXT:    s_add_u32 s0, s0, 32
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:    v_mov_b32_e32 v4, s5
; GFX7-NEXT:    v_mov_b32_e32 v5, s4
; GFX7-NEXT:  .LBB51_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v9, v3
; GFX7-NEXT:    v_mov_b32_e32 v8, v2
; GFX7-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT:    s_cbranch_execnz .LBB51_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_ret_addr64_offset:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT:    s_add_u32 s0, s0, s6
; GFX8-NEXT:    s_addc_u32 s1, s1, s7
; GFX8-NEXT:    s_add_u32 s0, s0, 32
; GFX8-NEXT:    s_addc_u32 s1, s1, 0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_mov_b64 s[0:1], 0
; GFX8-NEXT:    v_mov_b32_e32 v4, s5
; GFX8-NEXT:    v_mov_b32_e32 v5, s4
; GFX8-NEXT:  .LBB51_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v9, v3
; GFX8-NEXT:    v_mov_b32_e32 v8, v2
; GFX8-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT:    s_cbranch_execnz .LBB51_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_SE
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT:    s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; Simplest umin case: volatile, seq_cst, workgroup-scope `atomicrmw umin` of
; %in directly on the flat pointer %out (no index, no constant offset); the
; returned old value is unused.
; Expected codegen: GFX7 and GFX8 seed a flat_atomic_cmpswap_x2 retry loop
; with a glc load; GFX12 emits one flat_atomic_min_u64 with scope:SCOPE_SE.
define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_umin_i64:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v5, s1
; GFX7-NEXT:    v_mov_b32_e32 v6, s3
; GFX7-NEXT:    v_mov_b32_e32 v7, s2
; GFX7-NEXT:    v_mov_b32_e32 v4, s0
; GFX7-NEXT:  .LBB52_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT:    v_mov_b32_e32 v3, v1
; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT:    v_mov_b32_e32 v2, v0
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_execnz .LBB52_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_umin_i64:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_mov_b64 s[4:5], 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v5, s1
; GFX8-NEXT:    v_mov_b32_e32 v6, s3
; GFX8-NEXT:    v_mov_b32_e32 v7, s2
; GFX8-NEXT:    v_mov_b32_e32 v4, s0
; GFX8-NEXT:  .LBB52_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v3, v1
; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_execnz .LBB52_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_umin_i64:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_SE
; GFX12-NEXT:    s_endpgm
entry:
  %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
; Returning variant with no addressing arithmetic: volatile, seq_cst,
; workgroup-scope `atomicrmw umin` of %in directly on the flat pointer %out,
; with the old value stored to %out2.
; Expected codegen: GFX7 and GFX8 run a flat_atomic_cmpswap_x2 retry loop
; seeded by a glc load, restore exec after the loop and then store the result;
; GFX12 emits one flat_atomic_min_u64 with th:TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_umin_i64_ret:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT:    s_mov_b64 s[6:7], 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_mov_b32_e32 v4, s5
; GFX7-NEXT:    v_mov_b32_e32 v5, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:  .LBB53_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v9, v1
; GFX7-NEXT:    v_mov_b32_e32 v8, v0
; GFX7-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT:    s_cbranch_execnz .LBB53_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_ret:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT:    s_mov_b64 s[6:7], 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v4, s5
; GFX8-NEXT:    v_mov_b32_e32 v5, s4
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:  .LBB53_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v9, v1
; GFX8-NEXT:    v_mov_b32_e32 v8, v0
; GFX8-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT:    s_cbranch_execnz .LBB53_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_mov_b32_e32 v3, s3
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_SE
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT:    s_endpgm
entry:
  %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; Indexed variant without a constant offset: volatile, seq_cst,
; workgroup-scope `atomicrmw umin` of %in on the flat address &out[index];
; the returned old value is unused.
; All three check sets scale %index by 8 bytes with `s_lshl_b64 ..., 3`
; before adding it to %out.
; Expected codegen: GFX7 and GFX8 seed a flat_atomic_cmpswap_x2 retry loop
; with a glc load; GFX12 emits one flat_atomic_min_u64 with scope:SCOPE_SE.
define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umin_i64_addr64:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT:    s_add_u32 s0, s0, s4
; GFX7-NEXT:    s_addc_u32 s1, s1, s5
; GFX7-NEXT:    v_mov_b32_e32 v5, s1
; GFX7-NEXT:    v_mov_b32_e32 v4, s0
; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:    v_mov_b32_e32 v6, s3
; GFX7-NEXT:    v_mov_b32_e32 v7, s2
; GFX7-NEXT:  .LBB54_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT:    v_mov_b32_e32 v3, v1
; GFX7-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v2, v0
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT:    s_cbranch_execnz .LBB54_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_addr64:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT:    s_add_u32 s0, s0, s4
; GFX8-NEXT:    s_addc_u32 s1, s1, s5
; GFX8-NEXT:    v_mov_b32_e32 v5, s1
; GFX8-NEXT:    v_mov_b32_e32 v4, s0
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_mov_b64 s[0:1], 0
; GFX8-NEXT:    v_mov_b32_e32 v6, s3
; GFX8-NEXT:    v_mov_b32_e32 v7, s2
; GFX8-NEXT:  .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v3, v1
; GFX8-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT:    s_cbranch_execnz .LBB54_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_addr64:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    flat_atomic_min_u64 v[2:3], v[0:1] scope:SCOPE_SE
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_SE
; GFX12-NEXT:    s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
; Returning, indexed variant without a constant offset: volatile, seq_cst,
; workgroup-scope `atomicrmw umin` of %in on the flat address &out[index],
; with the old value stored to %out2.
; All three check sets scale %index by 8 bytes with `s_lshl_b64 ..., 3`
; before adding it to %out.
; Expected codegen: GFX7 and GFX8 run a flat_atomic_cmpswap_x2 retry loop
; seeded by a glc load, restore exec after the loop and then store the result;
; GFX12 emits one flat_atomic_min_u64 with th:TH_ATOMIC_RETURN.
define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umin_i64_ret_addr64:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT:    s_add_u32 s0, s0, s6
; GFX7-NEXT:    s_addc_u32 s1, s1, s7
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:    v_mov_b32_e32 v4, s5
; GFX7-NEXT:    v_mov_b32_e32 v5, s4
; GFX7-NEXT:  .LBB55_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v9, v3
; GFX7-NEXT:    v_mov_b32_e32 v8, v2
; GFX7-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT:    s_cbranch_execnz .LBB55_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_ret_addr64:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT:    s_add_u32 s0, s0, s6
; GFX8-NEXT:    s_addc_u32 s1, s1, s7
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_mov_b64 s[0:1], 0
; GFX8-NEXT:    v_mov_b32_e32 v4, s5
; GFX8-NEXT:    v_mov_b32_e32 v5, s4
; GFX8-NEXT:  .LBB55_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v9, v3
; GFX8-NEXT:    v_mov_b32_e32 v8, v2
; GFX8-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT:    s_cbranch_execnz .LBB55_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_SE
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT:    s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; Volatile, seq_cst, agent-scope `atomicrmw or` of %in on the flat address
; &out[4]; the returned old value is unused.
; The constant element index 4 (i64 elements) is a 32-byte displacement: the
; GFX7 and GFX8 checks add it with `s_add_u32 s0, s0, 32`, while the GFX12
; checks fold it into `offset:32`.
; Expected codegen: GFX7 and GFX8 seed a flat_atomic_cmpswap_x2 retry loop
; with a glc load and issue buffer_wbinvl1_vol after every cmpswap; GFX12
; emits one flat_atomic_or_b64 with scope:SCOPE_DEV.
define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_or_i64_offset:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 32
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v5, s1
; GFX7-NEXT:    v_mov_b32_e32 v4, s0
; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:  .LBB56_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_or_b32_e32 v1, s3, v3
; GFX7-NEXT:    v_or_b32_e32 v0, s2, v2
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT:    v_mov_b32_e32 v3, v1
; GFX7-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v2, v0
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT:    s_cbranch_execnz .LBB56_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_or_i64_offset:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_add_u32 s0, s0, 32
; GFX8-NEXT:    s_addc_u32 s1, s1, 0
; GFX8-NEXT:    v_mov_b32_e32 v5, s1
; GFX8-NEXT:    v_mov_b32_e32 v4, s0
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_mov_b64 s[0:1], 0
; GFX8-NEXT:  .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_or_b32_e32 v1, s3, v3
; GFX8-NEXT:    v_or_b32_e32 v0, s2, v2
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v3, v1
; GFX8-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT:    s_cbranch_execnz .LBB56_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_or_i64_offset:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
; Returning variant: volatile, seq_cst, agent-scope `atomicrmw or` of %in on
; the flat address &out[4], with the old value stored to %out2.
; The constant element index 4 (i64 elements) is a 32-byte displacement: the
; GFX7 and GFX8 checks add it with `s_add_u32 s0, s0, 32`, while the GFX12
; checks fold it into `offset:32`.
; Expected codegen: GFX7 and GFX8 run a flat_atomic_cmpswap_x2 retry loop
; seeded by a glc load, issue buffer_wbinvl1_vol after every cmpswap, restore
; exec after the loop and then store the result; GFX12 emits one
; flat_atomic_or_b64 with th:TH_ATOMIC_RETURN and scope:SCOPE_DEV.
define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_or_i64_ret_offset:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s0, s0, 32
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:  .LBB57_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v5, v3
; GFX7-NEXT:    v_mov_b32_e32 v4, v2
; GFX7-NEXT:    v_or_b32_e32 v3, s5, v5
; GFX7-NEXT:    v_or_b32_e32 v2, s4, v4
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT:    s_cbranch_execnz .LBB57_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_or_i64_ret_offset:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_add_u32 s0, s0, 32
; GFX8-NEXT:    s_addc_u32 s1, s1, 0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_mov_b64 s[0:1], 0
; GFX8-NEXT:  .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v5, v3
; GFX8-NEXT:    v_mov_b32_e32 v4, v2
; GFX8-NEXT:    v_or_b32_e32 v3, s5, v5
; GFX8-NEXT:    v_or_b32_e32 v2, s4, v4
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT:    s_cbranch_execnz .LBB57_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_offset:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT:    s_endpgm
entry:
  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
}
; Indexed variant: volatile, seq_cst, agent-scope `atomicrmw or` of %in on
; the flat address &out[index + 4]; the returned old value is unused.
; All three check sets scale %index by 8 bytes with `s_lshl_b64 ..., 3`. The
; constant element index 4 is a 32-byte displacement: the GFX7 and GFX8 checks
; add it with `s_add_u32 s0, s0, 32`, while the GFX12 checks fold it into
; `offset:32`.
; Expected codegen: GFX7 and GFX8 seed a flat_atomic_cmpswap_x2 retry loop
; with a glc load and issue buffer_wbinvl1_vol after every cmpswap; GFX12
; emits one flat_atomic_or_b64 with scope:SCOPE_DEV.
define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_or_i64_addr64_offset:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT:    s_add_u32 s0, s0, s4
; GFX7-NEXT:    s_addc_u32 s1, s1, s5
; GFX7-NEXT:    s_add_u32 s0, s0, 32
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v5, s1
; GFX7-NEXT:    v_mov_b32_e32 v4, s0
; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:  .LBB58_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_or_b32_e32 v1, s3, v3
; GFX7-NEXT:    v_or_b32_e32 v0, s2, v2
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT:    v_mov_b32_e32 v3, v1
; GFX7-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v2, v0
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT:    s_cbranch_execnz .LBB58_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: atomic_or_i64_addr64_offset:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT:    s_add_u32 s0, s0, s4
; GFX8-NEXT:    s_addc_u32 s1, s1, s5
; GFX8-NEXT:    s_add_u32 s0, s0, 32
; GFX8-NEXT:    s_addc_u32 s1, s1, 0
; GFX8-NEXT:    v_mov_b32_e32 v5, s1
; GFX8-NEXT:    v_mov_b32_e32 v4, s0
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_mov_b64 s[0:1], 0
; GFX8-NEXT:  .LBB58_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_or_b32_e32 v1, s3, v3
; GFX8-NEXT:    v_or_b32_e32 v0, s2, v2
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v3, v1
; GFX8-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT:    s_cbranch_execnz .LBB58_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_or_i64_addr64_offset:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    flat_atomic_or_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    s_endpgm
entry:
  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
; Volatile, agent-scope, seq_cst 'atomicrmw or' on the i64 element at
; %out[%index + 4] (byte offset 32 past the indexed element); the value returned
; by the atomic is stored to %out2.
; On GFX7 and GFX8 the checks expect a glc load of the current value followed by
; a flat_atomic_cmpswap_x2 retry loop; on GFX12 they expect a single returning
; flat_atomic_or_b64 that folds the constant into offset:32.
define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_or_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_or_b32_e32 v3, s5, v5
; GFX7-NEXT: v_or_b32_e32 v2, s4, v4
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB59_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_or_b32_e32 v3, s5, v5
; GFX8-NEXT: v_or_b32_e32 v2, s4, v4
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB59_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, agent-scope, seq_cst 'atomicrmw or' of %in directly on the i64 at
; %out (no index, no constant offset); the result of the atomic is unused.
; On GFX7 and GFX8 the checks expect a glc load of the current value followed by
; a flat_atomic_cmpswap_x2 retry loop; on GFX12 they expect a single
; flat_atomic_or_b64 without a return value.
define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_or_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_or_b32_e32 v1, s3, v3
; GFX7-NEXT: v_or_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB60_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v1, s3, v3
; GFX8-NEXT: v_or_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB60_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Volatile, agent-scope, seq_cst 'atomicrmw or' of %in directly on the i64 at
; %out; the value returned by the atomic is stored to %out2.
; On GFX7 and GFX8 the checks expect a glc load of the current value followed by
; a flat_atomic_cmpswap_x2 retry loop and then the store; on GFX12 they expect a
; single returning flat_atomic_or_b64 followed by the store.
define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_or_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: v_or_b32_e32 v5, s5, v7
; GFX7-NEXT: v_or_b32_e32 v4, s4, v6
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB61_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_or_b32_e32 v5, s5, v7
; GFX8-NEXT: v_or_b32_e32 v4, s4, v6
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB61_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, agent-scope, seq_cst 'atomicrmw or' of %in on the i64 element
; %out[%index] (no constant offset); the result of the atomic is unused.
; On GFX7 and GFX8 the checks expect a glc load of the current value followed by
; a flat_atomic_cmpswap_x2 retry loop; on GFX12 they expect a single
; flat_atomic_or_b64 without a return value.
define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_or_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_or_b32_e32 v1, s3, v3
; GFX7-NEXT: v_or_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB62_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v1, s3, v3
; GFX8-NEXT: v_or_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB62_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Volatile, agent-scope, seq_cst 'atomicrmw or' of %in on the i64 element
; %out[%index] (no constant offset); the value returned by the atomic is stored
; to %out2.
; On GFX7 and GFX8 the checks expect a glc load of the current value followed by
; a flat_atomic_cmpswap_x2 retry loop and then the store; on GFX12 they expect a
; single returning flat_atomic_or_b64 followed by the store.
define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_or_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_or_b32_e32 v3, s5, v5
; GFX7-NEXT: v_or_b32_e32 v2, s4, v4
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB63_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_or_b32_e32 v3, s5, v5
; GFX8-NEXT: v_or_b32_e32 v2, s4, v4
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB63_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, agent-scope, seq_cst 'atomicrmw xchg' of %in on the i64 at %out + 4
; elements (32 bytes); the result of the atomic is unused.
; All three targets are checked for a single native swap instruction with no
; compare-and-swap loop. GFX7 and GFX8 add 32 to the address with scalar adds
; before flat_atomic_swap_x2, while GFX12 folds it into offset:32 on
; flat_atomic_swap_b64.
define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_xchg_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xchg_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Volatile, agent-scope, seq_cst 'atomicrmw xchg' with a double operand on the
; element at %out + 4 doubles (32 bytes); the result of the atomic is unused.
; The checks expect the 64-bit integer swap instructions for the floating-point
; operand, namely flat_atomic_swap_x2 on GFX7 and GFX8 (after scalar adds of 32)
; and flat_atomic_swap_b64 with offset:32 on GFX12.
define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
; GFX7-LABEL: atomic_xchg_f64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xchg_f64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_f64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr inbounds double, ptr %out, i64 4
%tmp0 = atomicrmw volatile xchg ptr %gep, double %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Volatile, agent-scope, seq_cst 'atomicrmw xchg' with a pointer operand on the
; element at %out + 4 pointers; the checks show this as a 32-byte offset. The
; result of the atomic is unused. Note the GEP index here is an i32, not an i64.
; The checks expect the 64-bit integer swap instructions for the pointer
; operand, namely flat_atomic_swap_x2 on GFX7 and GFX8 (after scalar adds of 32)
; and flat_atomic_swap_b64 with offset:32 on GFX12.
define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
; GFX7-LABEL: atomic_xchg_pointer_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xchg_pointer_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_pointer_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr inbounds ptr, ptr %out, i32 4
%val = atomicrmw volatile xchg ptr %gep, ptr %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Volatile, agent-scope, seq_cst 'atomicrmw xchg' of %in on the i64 at %out + 4
; elements (32 bytes); the value returned by the atomic is stored to %out2.
; The checks expect a single returning swap on every target, namely
; flat_atomic_swap_x2 with glc on GFX7 and GFX8 (after scalar adds of 32) and
; flat_atomic_swap_b64 with offset:32 and TH_ATOMIC_RETURN on GFX12, each
; followed by a flat store of the result.
define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_xchg_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xchg_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, agent-scope, seq_cst 'atomicrmw xchg' of %in on the i64 element at
; %out[%index + 4] (byte offset 32 past the indexed element); the result of the
; atomic is unused.
; The checks expect a single non-returning swap on every target. GFX7 and GFX8
; compute the full address with scalar shifts and adds before
; flat_atomic_swap_x2, while GFX12 adds only the scaled index and folds the
; constant into offset:32 on flat_atomic_swap_b64.
define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xchg_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s0, s2
; GFX7-NEXT: s_addc_u32 s1, s1, s3
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xchg_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s0, s2
; GFX8-NEXT: s_addc_u32 s1, s1, s3
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Volatile, agent-scope, seq_cst 'atomicrmw xchg' of %in on the i64 element at
; %out[%index + 4] (byte offset 32 past the indexed element); the value returned
; by the atomic is stored to %out2.
; The checks expect a single returning swap on every target, namely
; flat_atomic_swap_x2 with glc on GFX7 and GFX8 (address fully computed with
; scalar ops) and flat_atomic_swap_b64 with offset:32 and TH_ATOMIC_RETURN on
; GFX12, each followed by a flat store of the result.
define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, agent-scope, seq_cst 'atomicrmw xchg' of %in directly on the i64 at
; %out (no index, no constant offset); the result of the atomic is unused.
; The checks expect a single non-returning swap on every target, namely
; flat_atomic_swap_x2 on GFX7 and GFX8 and flat_atomic_swap_b64 on GFX12.
define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_xchg_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xchg_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Volatile, agent-scope, seq_cst 'atomicrmw xchg' of %in directly on the i64 at
; %out; the value returned by the atomic is stored to %out2.
; The checks expect a single returning swap on every target, namely
; flat_atomic_swap_x2 with glc on GFX7 and GFX8 and flat_atomic_swap_b64 with
; TH_ATOMIC_RETURN on GFX12, each followed by a flat store of the result.
define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_xchg_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xchg_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, agent-scope, seq_cst 'atomicrmw xchg' of %in on the i64 element
; %out[%index] (no constant offset); the result of the atomic is unused.
; The checks expect the index to be scaled by 8 and added to %out with scalar
; ops, followed by a single non-returning swap, namely flat_atomic_swap_x2 on
; GFX7 and GFX8 and flat_atomic_swap_b64 on GFX12.
define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xchg_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s0, s2
; GFX7-NEXT: s_addc_u32 s1, s1, s3
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xchg_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s0, s2
; GFX8-NEXT: s_addc_u32 s1, s1, s3
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Returning volatile xchg of %in at %out + %index * 8 (i64 GEP), agent scope,
; seq_cst. The value returned by the atomicrmw is stored to %out2, so the
; expected output uses the returning form of the swap (glc on bonaire and
; tonga, th:TH_ATOMIC_RETURN on gfx1200) followed by a flat store.
; !noalias.addrspace !0 refers to metadata defined elsewhere in this file.
define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xchg_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xchg_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Runtime-indexed element address; the checks show this as a shift left by 3.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
; Using the atomicrmw result is what selects the returning instruction form.
store i64 %tmp0, ptr %out2
ret void
}
; Non-returning volatile xor of %in at %out + 32 bytes (element 4 of an i64
; GEP), agent scope, seq_cst. Expected bonaire and tonga output adds 32 to the
; address, does an initial flat_load_dwordx2 with glc, then retries
; flat_atomic_cmpswap_x2 in the %atomicrmw.start loop. Expected gfx1200 output
; is a single flat_atomic_xor_b64 with the constant folded into offset:32.
; !noalias.addrspace !0 refers to metadata defined elsewhere in this file.
define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_xor_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3
; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB74_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3
; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB74_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
; Constant element index 4 of i64, i.e. a 32-byte offset from %out.
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Returning volatile xor of %in at %out + 32 bytes (element 4 of an i64 GEP),
; agent scope, seq_cst; the atomicrmw result is stored to %out2. Expected
; bonaire and tonga output is an initial flat_load_dwordx2 with glc followed by
; a flat_atomic_cmpswap_x2 retry loop, then a flat store of the loop result.
; Expected gfx1200 output is a single returning flat_atomic_xor_b64 with
; offset:32 and th:TH_ATOMIC_RETURN.
; !noalias.addrspace !0 refers to metadata defined elsewhere in this file.
define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_xor_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_xor_b32_e32 v3, s5, v5
; GFX7-NEXT: v_xor_b32_e32 v2, s4, v4
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB75_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_xor_b32_e32 v3, s5, v5
; GFX8-NEXT: v_xor_b32_e32 v2, s4, v4
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB75_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Constant element index 4 of i64, i.e. a 32-byte offset from %out.
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
; Using the atomicrmw result is what requires the returning form.
store i64 %tmp0, ptr %out2
ret void
}
; Non-returning volatile xor of %in at %out + %index * 8 + 32 bytes (two
; chained i64 GEPs), agent scope, seq_cst. Expected bonaire and tonga output
; computes the full address with scalar adds, does an initial
; flat_load_dwordx2 with glc, then retries flat_atomic_cmpswap_x2 in a loop.
; Expected gfx1200 output is a single flat_atomic_xor_b64 with offset:32.
; !noalias.addrspace !0 refers to metadata defined elsewhere in this file.
define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xor_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3
; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB76_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3
; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB76_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
; Runtime-indexed element first, then a constant 4-element (32-byte) step.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Returning volatile xor of %in at %out + %index * 8 + 32 bytes (two chained
; i64 GEPs), agent scope, seq_cst; the atomicrmw result is stored to %out2.
; Expected bonaire and tonga output is an initial flat_load_dwordx2 with glc
; followed by a flat_atomic_cmpswap_x2 retry loop and a flat store. Expected
; gfx1200 output is a single returning flat_atomic_xor_b64 with offset:32.
; !noalias.addrspace !0 refers to metadata defined elsewhere in this file.
define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xor_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_xor_b32_e32 v3, s5, v5
; GFX7-NEXT: v_xor_b32_e32 v2, s4, v4
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB77_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_xor_b32_e32 v3, s5, v5
; GFX8-NEXT: v_xor_b32_e32 v2, s4, v4
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB77_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Runtime-indexed element first, then a constant 4-element (32-byte) step.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
; Using the atomicrmw result is what requires the returning form.
store i64 %tmp0, ptr %out2
ret void
}
; Non-returning volatile xor of %in directly at %out (no GEP), agent scope,
; seq_cst. Expected bonaire and tonga output is an initial flat_load_dwordx2
; with glc followed by a flat_atomic_cmpswap_x2 retry loop; expected gfx1200
; output is a single flat_atomic_xor_b64 with no offset.
; !noalias.addrspace !0 refers to metadata defined elsewhere in this file.
define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_xor_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3
; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB78_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3
; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB78_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
; The atomicrmw result is intentionally unused (non-returning variant).
%tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Returning volatile xor of %in directly at %out (no GEP), agent scope,
; seq_cst; the atomicrmw result is stored to %out2. Expected bonaire and tonga
; output is an initial flat_load_dwordx2 with glc followed by a
; flat_atomic_cmpswap_x2 retry loop and a flat store; expected gfx1200 output
; is a single returning flat_atomic_xor_b64 (th:TH_ATOMIC_RETURN).
; !noalias.addrspace !0 refers to metadata defined elsewhere in this file.
define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_xor_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: v_xor_b32_e32 v5, s5, v7
; GFX7-NEXT: v_xor_b32_e32 v4, s4, v6
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB79_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_xor_b32_e32 v5, s5, v7
; GFX8-NEXT: v_xor_b32_e32 v4, s4, v6
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB79_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
; Using the atomicrmw result is what requires the returning form.
store i64 %tmp0, ptr %out2
ret void
}
; Non-returning volatile xor of %in at %out + %index * 8 (i64 GEP), agent
; scope, seq_cst. Expected bonaire and tonga output is an initial
; flat_load_dwordx2 with glc followed by a flat_atomic_cmpswap_x2 retry loop;
; expected gfx1200 output is a single flat_atomic_xor_b64 with no offset.
; !noalias.addrspace !0 refers to metadata defined elsewhere in this file.
define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xor_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3
; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB80_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3
; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB80_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
; Runtime-indexed element address; the checks show this as a shift left by 3.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Returning volatile xor of %in at %out + %index * 8 (i64 GEP), agent scope,
; seq_cst; the atomicrmw result is stored to %out2. Expected bonaire and tonga
; output is an initial flat_load_dwordx2 with glc followed by a
; flat_atomic_cmpswap_x2 retry loop and a flat store; expected gfx1200 output
; is a single returning flat_atomic_xor_b64 (th:TH_ATOMIC_RETURN).
; !noalias.addrspace !0 refers to metadata defined elsewhere in this file.
define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xor_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_xor_b32_e32 v3, s5, v5
; GFX7-NEXT: v_xor_b32_e32 v2, s4, v4
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB81_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_xor_b32_e32 v3, s5, v5
; GFX8-NEXT: v_xor_b32_e32 v2, s4, v4
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB81_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Runtime-indexed element address; the checks show this as a shift left by 3.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
; Using the atomicrmw result is what requires the returning form.
store i64 %tmp0, ptr %out2
ret void
}
; seq_cst atomic i64 load from %in + 32 bytes (element 4 of an i64 GEP); the
; loaded value is stored to %out with a plain store. This load carries no
; syncscope, unlike the syncscope("agent") load in atomic_load_i64, and the
; expected gfx1200 output accordingly uses scope:SCOPE_SYS rather than
; SCOPE_DEV. The gfx1200 checks fold the constant into offset:32, while the
; bonaire and tonga checks add 32 to the address with scalar adds.
define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) {
; GFX7-LABEL: atomic_load_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_load_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Constant element index 4 of i64, i.e. a 32-byte offset from %in.
%gep = getelementptr inbounds i64, ptr %in, i64 4
; No syncscope on this load (see the header comment).
%val = load atomic i64, ptr %gep seq_cst, align 8
store i64 %val, ptr %out
ret void
}
; Seq_cst atomic i64 load directly from the flat pointer %in (no offset),
; with the result written to %out by an ordinary store. This load is the one
; in this group that specifies syncscope("agent"); correspondingly the
; gfx1200 checks expect SCOPE_DEV on the load and the following invalidate.
define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) {
; GFX7-LABEL: atomic_load_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_load_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%val = load atomic i64, ptr %in syncscope("agent") seq_cst, align 8
store i64 %val, ptr %out
ret void
}
; Seq_cst atomic i64 load from %in indexed by the runtime value %index and
; then by a further constant 4 elements (32 bytes); the result is written to
; %out by an ordinary store. No syncscope is given and the gfx1200 checks
; expect SCOPE_SYS. In all three check sets the index is scaled with a
; 64-bit left shift by 3; the constant 32 is added with scalar adds for
; bonaire/tonga and folded into the immediate offset for gfx1200.
define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 %index) {
; GFX7-LABEL: atomic_load_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_load_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Variable part of the address first, then the constant 4-element offset.
%ptr = getelementptr inbounds i64, ptr %in, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%val = load atomic i64, ptr %gep seq_cst, align 8
store i64 %val, ptr %out
ret void
}
; Seq_cst atomic i64 load from %in indexed by the runtime value %index, with
; no additional constant offset; the result is written to %out by an
; ordinary store. No syncscope is given and the gfx1200 checks expect
; SCOPE_SYS. The checks scale the index with a 64-bit left shift by 3.
define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) {
; GFX7-LABEL: atomic_load_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_load_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %in, i64 %index
%val = load atomic i64, ptr %ptr seq_cst, align 8
store i64 %val, ptr %out
ret void
}
; Seq_cst atomic i64 store of %in through a flat pointer at a constant
; offset of 4 elements (32 bytes) from %out. Note the argument order: the
; value comes first, the pointer second. No syncscope is given; the gfx1200
; checks expect a SCOPE_SYS write-back and a store-counter wait ahead of a
; SCOPE_SYS store with the 32 folded into the immediate offset. The
; bonaire/tonga checks expect scalar adds followed by a bare flat store.
define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) {
; GFX7-LABEL: atomic_store_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_add_u32 s0, s2, 32
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_addc_u32 s1, s3, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_store_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_add_u32 s0, s2, 32
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_addc_u32 s1, s3, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
; i64 element index 4 is byte offset 32.
%gep = getelementptr inbounds i64, ptr %out, i64 4
store atomic i64 %in, ptr %gep seq_cst, align 8
ret void
}
; Seq_cst atomic i64 store of %in directly to the flat pointer %out (no
; offset, no syncscope). The gfx1200 checks expect a SCOPE_SYS write-back and
; a store-counter wait before the SCOPE_SYS store; the bonaire/tonga checks
; expect only the register moves and a bare flat store.
define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) {
; GFX7-LABEL: atomic_store_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_store_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
store atomic i64 %in, ptr %out seq_cst, align 8
ret void
}
; Seq_cst atomic i64 store of %in to %out indexed by the runtime value
; %index and then by a further constant 4 elements (32 bytes). No syncscope
; is given. The checks scale the index with a 64-bit left shift by 3; the
; constant 32 is added with scalar adds for bonaire/tonga and folded into
; the immediate offset for gfx1200, where a SCOPE_SYS write-back and a
; store-counter wait are expected before the SCOPE_SYS store.
define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 %index) {
; GFX7-LABEL: atomic_store_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s2, s0
; GFX7-NEXT: s_addc_u32 s1, s3, s1
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_store_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s2, s0
; GFX8-NEXT: s_addc_u32 s1, s3, s1
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
; Variable part of the address first, then the constant 4-element offset.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
store atomic i64 %in, ptr %gep seq_cst, align 8
ret void
}
; Seq_cst atomic i64 store of %in to %out indexed by the runtime value
; %index, with no additional constant offset and no syncscope. The checks
; scale the index with a 64-bit left shift by 3; for gfx1200 a SCOPE_SYS
; write-back and a store-counter wait are expected before the SCOPE_SYS
; store.
define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index) {
; GFX7-LABEL: atomic_store_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s2, s0
; GFX7-NEXT: s_addc_u32 s1, s3, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_store_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s2, s0
; GFX8-NEXT: s_addc_u32 s1, s3, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
store atomic i64 %in, ptr %ptr seq_cst, align 8
ret void
}
; Volatile, agent-scope, seq_cst/seq_cst i64 cmpxchg through a flat pointer
; at a constant offset of 4 elements (32 bytes) from %out: the expected
; value is %old and the replacement is %in. The result is unused, and the
; checks expect the compare-swap without a returned value (no "glc" and no
; TH_ATOMIC_RETURN). gfx1200 folds the 32 into the immediate offset and uses
; SCOPE_DEV.
; The !noalias.addrspace operand !0 is defined outside this excerpt;
; presumably it excludes the private address space, per the file name
; (TODO confirm).
define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
; i64 element index 4 is byte offset 32.
%gep = getelementptr inbounds i64, ptr %out, i64 4
%val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
ret void
}
; Same operation as the small-offset cmpxchg case, but with a much larger
; constant offset: 9000 elements, i.e. 72000 bytes (0x11940). The
; bonaire/tonga checks add 0x11940 to the address with scalar adds; the
; gfx1200 checks still fold the value into the immediate offset field.
; The result is unused, so the no-return compare-swap form is expected.
; The !noalias.addrspace operand !0 is defined outside this excerpt;
; presumably it excludes the private address space, per the file name
; (TODO confirm).
define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64_soffset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 0x11940
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64_soffset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 0x11940
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_soffset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
; i64 element index 9000 is byte offset 72000 (0x11940).
%gep = getelementptr inbounds i64, ptr %out, i64 9000
%val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
ret void
}
; Returning variant of the constant-offset cmpxchg: a volatile, agent-scope,
; seq_cst/seq_cst i64 cmpxchg at %out + 4 elements (32 bytes), whose loaded
; value (field 0 of the result pair) is stored to %out2. Because the value
; is used, the checks expect the returning form of the compare-swap: "glc"
; for bonaire/tonga and TH_ATOMIC_RETURN for gfx1200.
; The !noalias.addrspace operand !0 is defined outside this excerpt;
; presumably it excludes the private address space, per the file name
; (TODO confirm).
define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s7
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; i64 element index 4 is byte offset 32.
%gep = getelementptr inbounds i64, ptr %out, i64 4
%val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
; Only the loaded i64 is kept; the i1 success flag (field 1) is not used.
%extract0 = extractvalue { i64, i1 } %val, 0
store i64 %extract0, ptr %out2
ret void
}
; Volatile, agent-scope, seq_cst/seq_cst i64 cmpxchg at %out indexed by the
; runtime value %index and then by a further constant 4 elements
; (32 bytes); expected value %old, replacement %in. The result is unused, so
; the no-return compare-swap form is expected. The checks scale the index
; with a 64-bit left shift by 3; the constant 32 is added with scalar adds
; for bonaire/tonga and folded into the immediate offset for gfx1200.
; The !noalias.addrspace operand !0 is defined outside this excerpt;
; presumably it excludes the private address space, per the file name
; (TODO confirm).
define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s7
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
; Variable part of the address first, then the constant 4-element offset.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
ret void
}
; Returning variant of the indexed-plus-constant-offset cmpxchg: the address
; is %out indexed by %index and then by 4 more elements (32 bytes), and the
; loaded value (field 0 of the result pair) is stored to %out2. The checks
; expect the returning compare-swap form ("glc" for bonaire/tonga,
; TH_ATOMIC_RETURN for gfx1200). With five kernel arguments, the checks load
; %old with a separate scalar load from the first four.
; The !noalias.addrspace operand !0 is defined outside this excerpt;
; presumably it excludes the private address space, per the file name
; (TODO confirm).
define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[2:3], s[14:15], 3
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: s_add_u32 s0, s8, s2
; GFX7-NEXT: s_addc_u32 s3, s9, s3
; GFX7-NEXT: s_add_u32 s2, s0, 32
; GFX7-NEXT: s_addc_u32 s3, s3, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s3
; GFX7-NEXT: v_mov_b32_e32 v0, s12
; GFX7-NEXT: v_mov_b32_e32 v1, s13
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s10
; GFX7-NEXT: v_mov_b32_e32 v3, s11
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[2:3], s[14:15], 3
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_add_u32 s0, s8, s2
; GFX8-NEXT: s_addc_u32 s3, s9, s3
; GFX8-NEXT: s_add_u32 s2, s0, 32
; GFX8-NEXT: s_addc_u32 s3, s3, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NEXT: v_mov_b32_e32 v3, s11
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[2:3], s[14:15], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[8:9], s[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Variable part of the address first, then the constant 4-element offset.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
; Only the loaded i64 is kept; the i1 success flag (field 1) is not used.
%extract0 = extractvalue { i64, i1 } %val, 0
store i64 %extract0, ptr %out2
ret void
}
; Baseline cmpxchg case: a volatile, agent-scope, seq_cst/seq_cst i64
; cmpxchg directly on the flat pointer %out (no offset, no index), with
; expected value %old and replacement %in. The result is unused, so the
; checks expect the no-return compare-swap form; gfx1200 uses SCOPE_DEV.
; The !noalias.addrspace operand !0 is defined outside this excerpt;
; presumably it excludes the private address space, per the file name
; (TODO confirm).
define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
ret void
}
; Returning variant of the baseline cmpxchg: a volatile, agent-scope,
; seq_cst/seq_cst i64 cmpxchg directly on %out, whose loaded value (field 0
; of the result pair) is stored to %out2. The checks expect the returning
; compare-swap form ("glc" for bonaire/tonga, TH_ATOMIC_RETURN for gfx1200)
; followed by a flat store of the returned register pair.
; The !noalias.addrspace operand !0 is defined outside this excerpt;
; presumably it excludes the private address space, per the file name
; (TODO confirm).
define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s7
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
; Only the loaded i64 is kept; the i1 success flag (field 1) is not used.
%extract0 = extractvalue { i64, i1 } %val, 0
store i64 %extract0, ptr %out2
ret void
}
; Volatile, agent-scope, seq_cst/seq_cst i64 cmpxchg at %out indexed by the
; runtime value %index, with no additional constant offset; expected value
; %old, replacement %in. The result is unused, so the no-return
; compare-swap form is expected. The checks scale the index with a 64-bit
; left shift by 3 before adding it to the base.
; The !noalias.addrspace operand !0 is defined outside this excerpt;
; presumably it excludes the private address space, per the file name
; (TODO confirm).
define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %index, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s7
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
ret void
}
; Returning variant of the indexed cmpxchg: the address is %out indexed by
; the runtime value %index (no constant offset), and the loaded value
; (field 0 of the result pair) is stored to %out2. The checks expect the
; returning compare-swap form ("glc" for bonaire/tonga, TH_ATOMIC_RETURN for
; gfx1200). With five kernel arguments, the checks load %old with a separate
; scalar load from the first four.
; The !noalias.addrspace operand !0 is defined outside this excerpt;
; presumably it excludes the private address space, per the file name
; (TODO confirm).
define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) {
; GFX7-LABEL: atomic_cmpxchg_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[2:3], s[14:15], 3
; GFX7-NEXT: s_add_u32 s2, s8, s2
; GFX7-NEXT: s_addc_u32 s3, s9, s3
; GFX7-NEXT: v_mov_b32_e32 v5, s3
; GFX7-NEXT: v_mov_b32_e32 v0, s12
; GFX7-NEXT: v_mov_b32_e32 v1, s13
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s10
; GFX7-NEXT: v_mov_b32_e32 v3, s11
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpxchg_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[2:3], s[14:15], 3
; GFX8-NEXT: s_add_u32 s2, s8, s2
; GFX8-NEXT: s_addc_u32 s3, s9, s3
; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NEXT: v_mov_b32_e32 v3, s11
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[2:3], s[14:15], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[8:9], s[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0
; Only the loaded i64 is kept; the i1 success flag (field 1) is not used.
%extract0 = extractvalue { i64, i1 } %val, 0
store i64 %extract0, ptr %out2
ret void
}
; seq_cst atomic load of a double through a flat pointer at a constant offset
; of 4 elements from %in (no syncscope given); the loaded value is written to
; %out with an ordinary store.
; The checks show the constant displacement as a scalar add of 32 on
; bonaire/tonga and as an offset:32 operand on the gfx1200 flat_load_b64,
; which also carries scope:SCOPE_SYS.
define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) {
; GFX7-LABEL: atomic_load_f64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_load_f64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Element index 4 of double corresponds to the 32 seen in the checks above.
%gep = getelementptr inbounds double, ptr %in, i64 4
%val = load atomic double, ptr %gep seq_cst, align 8, !noalias.addrspace !0
store double %val, ptr %out
ret void
}
; Agent-scope seq_cst atomic load of a double directly from the flat pointer
; %in (no offset, no index); the value is written to %out with an ordinary
; store.
; Because of syncscope("agent"), the gfx1200 checks expect scope:SCOPE_DEV on
; both the load and the global_inv, whereas the neighbouring load tests that
; omit the syncscope expect scope:SCOPE_SYS.
define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) {
; GFX7-LABEL: atomic_load_f64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_load_f64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%val = load atomic double, ptr %in syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
store double %val, ptr %out
ret void
}
; seq_cst atomic load of a double (no syncscope) from a flat address built
; from a run-time element index plus a constant 4-element offset:
; %in[%index + 4]. The value is written to %out with an ordinary store.
; In the checks the index is scaled with a 64-bit shift-left by 3 and added to
; the base; the constant part appears as an extra scalar add of 32 on
; bonaire/tonga and as offset:32 on the gfx1200 load.
define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 %index) {
; GFX7-LABEL: atomic_load_f64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_load_f64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Two-step address: variable index first, then the constant 4-element offset.
%ptr = getelementptr inbounds double, ptr %in, i64 %index
%gep = getelementptr inbounds double, ptr %ptr, i64 4
%val = load atomic double, ptr %gep seq_cst, align 8, !noalias.addrspace !0
store double %val, ptr %out
ret void
}
; seq_cst atomic load of a double (no syncscope) from the flat address
; %in[%index], i.e. a run-time index with no constant offset. The value is
; written to %out with an ordinary store.
; The checks expect the index to be shifted left by 3 and added to the base
; pointer, with no additional +32 adjustment and no offset operand on the
; gfx1200 load.
define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) {
; GFX7-LABEL: atomic_load_f64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_load_f64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds double, ptr %in, i64 %index
%val = load atomic double, ptr %ptr seq_cst, align 8, !noalias.addrspace !0
store double %val, ptr %out
ret void
}
; seq_cst atomic store (no syncscope) of the double kernel argument %in to the
; flat address 4 elements past %out.
; The bonaire/tonga checks expect the +32 to be applied with scalar adds and
; then a flat_store_dwordx2 with no surrounding cache instructions. The
; gfx1200 checks expect global_wb scope:SCOPE_SYS and s_wait_storecnt 0x0
; ahead of a flat_store_b64 carrying offset:32 and scope:SCOPE_SYS.
define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) {
; GFX7-LABEL: atomic_store_f64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_add_u32 s0, s2, 32
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_addc_u32 s1, s3, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_store_f64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_add_u32 s0, s2, 32
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_addc_u32 s1, s3, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
; Element index 4 of double corresponds to the 32 seen in the checks above.
%gep = getelementptr inbounds double, ptr %out, i64 4
store atomic double %in, ptr %gep seq_cst, align 8, !noalias.addrspace !0
ret void
}
; seq_cst atomic store (no syncscope) of the double kernel argument %in
; directly to the flat pointer %out, with no offset or index.
; The bonaire/tonga checks expect only register moves and a
; flat_store_dwordx2. The gfx1200 checks expect global_wb scope:SCOPE_SYS and
; s_wait_storecnt 0x0 before a flat_store_b64 with scope:SCOPE_SYS.
define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) {
; GFX7-LABEL: atomic_store_f64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_store_f64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
store atomic double %in, ptr %out seq_cst, align 8, !noalias.addrspace !0
ret void
}
; seq_cst atomic store (no syncscope) of the double kernel argument %in to the
; flat address %out[%index + 4]: a run-time element index followed by a
; constant 4-element offset.
; The checks expect the index to be shifted left by 3 and added to the base.
; The constant part shows up as an extra scalar add of 32 on bonaire/tonga and
; as offset:32 on the gfx1200 flat_store_b64, which is preceded by global_wb
; and s_wait_storecnt.
define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, i64 %index) {
; GFX7-LABEL: atomic_store_f64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s2, s0
; GFX7-NEXT: s_addc_u32 s1, s3, s1
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_store_f64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s2, s0
; GFX8-NEXT: s_addc_u32 s1, s3, s1
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
; Two-step address: variable index first, then the constant 4-element offset.
%ptr = getelementptr inbounds double, ptr %out, i64 %index
%gep = getelementptr inbounds double, ptr %ptr, i64 4
store atomic double %in, ptr %gep seq_cst, align 8, !noalias.addrspace !0
ret void
}
; seq_cst atomic store (no syncscope) of the double kernel argument %in to the
; flat address %out[%index], i.e. a run-time index with no constant offset.
; The checks expect the index to be shifted left by 3 and added to the base,
; with no +32 adjustment and no offset operand on the gfx1200 store; gfx1200
; still expects global_wb and s_wait_storecnt ahead of the store.
define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %index) {
; GFX7-LABEL: atomic_store_f64_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s2, s0
; GFX7-NEXT: s_addc_u32 s1, s3, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_store_f64_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s2, s0
; GFX8-NEXT: s_addc_u32 s1, s3, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds double, ptr %out, i64 %index
store atomic double %in, ptr %ptr seq_cst, align 8, !noalias.addrspace !0
ret void
}
; Volatile, agent-scope, seq_cst atomicrmw uinc_wrap on the i64 located 4
; elements past the flat pointer %out; the result (%tmp0) is not used.
; The bonaire/tonga checks expect an emulation sequence: an initial
; flat_load_dwordx2 ... glc of the current value, then an %atomicrmw.start
; loop that selects between old+1 and 0 (v_cmp_gt_u64 / v_cndmask) and retries
; flat_atomic_cmpswap_x2 until the returned value equals the expected one.
; The gfx1200 checks expect a single flat_atomic_inc_u64 with offset:32 and
; scope:SCOPE_DEV.
define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_inc_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB107_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB107_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB107_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB107_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
; Element index 4 of i64 corresponds to the 32 seen in the checks above.
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Returning variant of the constant-offset uinc_wrap test: a volatile,
; agent-scope, seq_cst atomicrmw uinc_wrap on the i64 located 4 elements past
; %out, whose result is stored to %out2.
; The bonaire/tonga checks expect an initial flat_load_dwordx2 ... glc and an
; %atomicrmw.start retry loop around flat_atomic_cmpswap_x2; after the loop
; the exec mask is restored with s_or_b64 and the last value returned by the
; cmpswap is stored. The gfx1200 checks expect one flat_atomic_inc_u64 with
; offset:32, th:TH_ATOMIC_RETURN and scope:SCOPE_DEV, then the store.
define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_inc_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB108_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB108_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB108_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB108_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Element index 4 of i64 corresponds to the 32 seen in the checks above.
%gep = getelementptr inbounds i64, ptr %out, i64 4
%tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, agent-scope, seq_cst atomicrmw uinc_wrap on the i64 at the flat
; address %out[%index + 4] (run-time index plus constant 4-element offset);
; the result (%tmp0) is not used.
; The bonaire/tonga checks expect the address to be formed with a shift-left
; by 3, an add of the base and an add of 32, followed by an initial
; flat_load_dwordx2 ... glc and an %atomicrmw.start retry loop around
; flat_atomic_cmpswap_x2. The gfx1200 checks expect a single
; flat_atomic_inc_u64 with offset:32 and scope:SCOPE_DEV.
define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_inc_i64_incr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB109_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB109_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_incr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB109_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB109_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_incr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
; Two-step address: variable index first, then the constant 4-element offset.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Returning variant of the indexed-plus-offset uinc_wrap test: a volatile,
; agent-scope, seq_cst atomicrmw uinc_wrap on the i64 at %out[%index + 4],
; whose result is stored to %out2.
; The bonaire/tonga checks expect the address arithmetic (shift-left by 3,
; add base, add 32), an initial flat_load_dwordx2 ... glc, an
; %atomicrmw.start retry loop around flat_atomic_cmpswap_x2, and after the
; loop an s_or_b64 on exec followed by the result store. The gfx1200 checks
; expect one flat_atomic_inc_u64 with offset:32, th:TH_ATOMIC_RETURN and
; scope:SCOPE_DEV, then the store.
define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_inc_i64_ret_incr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB110_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB110_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_ret_incr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB110_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB110_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Two-step address: variable index first, then the constant 4-element offset.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
%tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, seq_cst, agent-scope uinc_wrap performed directly on %out (no
; constant offset, no index); the returned value is unused.
; The bonaire and tonga checks expect an initial glc load followed by a
; flat_atomic_cmpswap_x2 retry loop, while the gfx1200 checks expect a single
; non-returning flat_atomic_inc_u64.
define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_inc_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB111_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB111_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
; In the expected cmpswap loops the new value is old + 1, replaced by 0 unless
; %in is unsigned-greater than old. !0 is defined outside this part of the
; file; presumably it excludes the private address space - TODO confirm.
%tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Volatile, seq_cst, agent-scope uinc_wrap performed directly on %out, with the
; returned old value stored to %out2.
; The bonaire and tonga checks expect an initial glc load, a
; flat_atomic_cmpswap_x2 retry loop, and a flat_store_dwordx2 of the loop
; result; the gfx1200 checks expect a returning flat_atomic_inc_u64
; (TH_ATOMIC_RETURN) followed by flat_store_b64.
define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_inc_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB112_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB112_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; In the expected cmpswap loops the new value is old + 1, replaced by 0 unless
; %in is unsigned-greater than old. !0 is defined outside this part of the
; file; presumably it excludes the private address space - TODO confirm.
%tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
; The old value returned by the atomic is what gets written to %out2.
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, seq_cst, agent-scope uinc_wrap on the i64 element of %out selected
; by the runtime %index; the returned value is unused.
; All three check groups expect the index to be scaled with a shift left by 3
; and added to the base pointer. The bonaire and tonga checks then expect a
; glc load plus a flat_atomic_cmpswap_x2 retry loop, and the gfx1200 checks
; expect a single non-returning flat_atomic_inc_u64.
define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_inc_i64_incr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB113_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_incr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB113_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_incr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
; Runtime-indexed i64 element address; this is the shift-by-3 and add in the
; expected output.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
; In the expected cmpswap loops the new value is old + 1, replaced by 0 unless
; %in is unsigned-greater than old. !0 is defined outside this part of the
; file; presumably it excludes the private address space - TODO confirm.
%tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Volatile, seq_cst, agent-scope uinc_wrap on the i64 element of %out selected
; by the runtime %index, with the returned old value stored to %out2.
; All three check groups expect the index to be scaled with a shift left by 3
; and added to the base pointer. The bonaire and tonga checks then expect a
; glc load, a flat_atomic_cmpswap_x2 retry loop and a flat_store_dwordx2; the
; gfx1200 checks expect a returning flat_atomic_inc_u64 (TH_ATOMIC_RETURN)
; followed by flat_store_b64.
define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_inc_i64_ret_incr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB114_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_ret_incr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB114_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret_incr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Runtime-indexed i64 element address; this is the shift-by-3 and add in the
; expected output.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
; In the expected cmpswap loops the new value is old + 1, replaced by 0 unless
; %in is unsigned-greater than old. !0 is defined outside this part of the
; file; presumably it excludes the private address space - TODO confirm.
%tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
; The old value returned by the atomic is what gets written to %out2.
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, seq_cst, agent-scope udec_wrap on the i64 element 4 of %out (a
; constant 32-byte displacement); the returned value is unused.
; The bonaire and tonga checks expect the 32 to be added to the pointer with
; scalar adds, then a glc load and a flat_atomic_cmpswap_x2 retry loop. The
; gfx1200 checks expect a single non-returning flat_atomic_dec_u64 that carries
; the 32 as its immediate offset.
define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_dec_i64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s4, 32
; GFX7-NEXT: s_addc_u32 s1, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: .LBB115_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2
; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB115_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s4, 32
; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: .LBB115_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2
; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB115_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
; Element 4 of an i64 array, which is the 32 seen in the expected output.
%gep = getelementptr inbounds i64, ptr %out, i64 4
; In the expected cmpswap loops the new value is old - 1, replaced by %in when
; old is 0 or %in is unsigned-less than old. !0 is defined outside this part
; of the file; presumably it excludes the private address space - TODO confirm.
%tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Volatile, seq_cst, agent-scope udec_wrap on the i64 element 4 of %out (a
; constant 32-byte displacement), with the returned old value stored to %out2.
; The bonaire and tonga checks expect the 32 to be added to the pointer with
; scalar adds, then a glc load, a flat_atomic_cmpswap_x2 retry loop and a
; flat_store_dwordx2. The gfx1200 checks expect a returning flat_atomic_dec_u64
; (TH_ATOMIC_RETURN) that carries the 32 as its immediate offset, followed by
; flat_store_b64.
define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_dec_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s8, 32
; GFX7-NEXT: s_addc_u32 s1, s9, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB116_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9]
; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8
; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB116_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v0, s10
; GFX7-NEXT: v_mov_b32_e32 v1, s11
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s8, 32
; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB116_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9]
; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8
; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB116_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Element 4 of an i64 array, which is the 32 seen in the expected output.
%gep = getelementptr inbounds i64, ptr %out, i64 4
; In the expected cmpswap loops the new value is old - 1, replaced by %in when
; old is 0 or %in is unsigned-less than old. !0 is defined outside this part
; of the file; presumably it excludes the private address space - TODO confirm.
%tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
; The old value returned by the atomic is what gets written to %out2.
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, seq_cst, agent-scope udec_wrap on an address formed from %out, the
; runtime %index (in i64 elements) and a further constant 4 elements (32
; bytes); the returned value is unused.
; All three check groups expect the index to be scaled with a shift left by 3
; and added to the base. The bonaire and tonga checks add the 32 with a second
; pair of scalar adds and then expect a glc load plus a flat_atomic_cmpswap_x2
; retry loop. The gfx1200 checks expect a single non-returning
; flat_atomic_dec_u64 that carries the 32 as its immediate offset.
define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_dec_i64_decr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX7-NEXT: s_add_u32 s0, s4, s0
; GFX7-NEXT: s_addc_u32 s1, s5, s1
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: .LBB117_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2
; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB117_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_decr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX8-NEXT: s_add_u32 s0, s4, s0
; GFX8-NEXT: s_addc_u32 s1, s5, s1
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: .LBB117_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2
; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB117_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_decr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
; Runtime-indexed i64 element address; this is the shift-by-3 and add in the
; expected output.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
; A further 4 elements, which is the 32 seen in the expected output.
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
; In the expected cmpswap loops the new value is old - 1, replaced by %in when
; old is 0 or %in is unsigned-less than old. !0 is defined outside this part
; of the file; presumably it excludes the private address space - TODO confirm.
%tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Volatile, seq_cst, agent-scope udec_wrap on an address formed from %out, the
; runtime %index (in i64 elements) and a further constant 4 elements (32
; bytes), with the returned old value stored to %out2.
; All three check groups expect the index to be scaled with a shift left by 3
; and added to the base. The bonaire and tonga checks add the 32 with a second
; pair of scalar adds and then expect a glc load, a flat_atomic_cmpswap_x2
; retry loop and a flat_store_dwordx2. The gfx1200 checks expect a returning
; flat_atomic_dec_u64 (TH_ATOMIC_RETURN) that carries the 32 as its immediate
; offset, followed by flat_store_b64.
define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_dec_i64_ret_decr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX7-NEXT: s_add_u32 s0, s4, s0
; GFX7-NEXT: s_addc_u32 s1, s5, s1
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s9
; GFX7-NEXT: v_mov_b32_e32 v5, s8
; GFX7-NEXT: .LBB118_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9]
; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8
; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB118_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_ret_decr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX8-NEXT: s_add_u32 s0, s4, s0
; GFX8-NEXT: s_addc_u32 s1, s5, s1
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s9
; GFX8-NEXT: v_mov_b32_e32 v5, s8
; GFX8-NEXT: .LBB118_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9]
; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8
; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB118_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Runtime-indexed i64 element address; this is the shift-by-3 and add in the
; expected output.
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
; A further 4 elements, which is the 32 seen in the expected output.
%gep = getelementptr inbounds i64, ptr %ptr, i64 4
; In the expected cmpswap loops the new value is old - 1, replaced by %in when
; old is 0 or %in is unsigned-less than old. !0 is defined outside this part
; of the file; presumably it excludes the private address space - TODO confirm.
%tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
; The old value returned by the atomic is what gets written to %out2.
store i64 %tmp0, ptr %out2
ret void
}
; Volatile, seq_cst, agent-scope udec_wrap performed directly on %out (no
; constant offset, no index); the returned value is unused.
; The bonaire and tonga checks expect an initial glc load followed by a
; flat_atomic_cmpswap_x2 retry loop, while the gfx1200 checks expect a single
; non-returning flat_atomic_dec_u64.
define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_dec_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB119_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2
; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB119_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB119_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2
; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_cbranch_execnz .LBB119_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
; In the expected cmpswap loops the new value is old - 1, replaced by %in when
; old is 0 or %in is unsigned-less than old. !0 is defined outside this part
; of the file; presumably it excludes the private address space - TODO confirm.
%tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Test case for a volatile, agent-scope, seq_cst `atomicrmw udec_wrap` of an i64
; at %out, whose returned value is stored to %out2.
; Codegen expected by the checks below:
;   - GFX7 and GFX8 use a compare-and-swap loop (flat_atomic_cmpswap_x2) that is
;     seeded by a `glc` flat_load_dwordx2 issued before the loop.
;   - GFX12 uses a single flat_atomic_dec_u64 with th:TH_ATOMIC_RETURN.
; The check lines are autogenerated (see the note at the top of this file);
; regenerate them with the update script instead of editing them by hand.
define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_dec_i64_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: v_mov_b32_e32 v1, s9
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB120_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9]
; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8
; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB120_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v0, s10
; GFX7-NEXT: v_mov_b32_e32 v1, s11
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB120_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9]
; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8
; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB120_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Test case for a volatile, agent-scope, seq_cst `atomicrmw udec_wrap` of an i64
; at the address %out + %index (indexed in i64 elements via getelementptr); the
; result of the atomic is unused.
; Codegen expected by the checks below:
;   - GFX7 and GFX8 compute the address with a shift-by-3 and add, then use a
;     compare-and-swap loop (flat_atomic_cmpswap_x2) seeded by a `glc`
;     flat_load_dwordx2 issued before the loop.
;   - GFX12 uses a single non-returning flat_atomic_dec_u64.
; The check lines are autogenerated (see the note at the top of this file);
; regenerate them with the update script instead of editing them by hand.
define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_dec_i64_decr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX7-NEXT: s_add_u32 s0, s4, s0
; GFX7-NEXT: s_addc_u32 s1, s5, s1
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2
; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB121_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_decr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX8-NEXT: s_add_u32 s0, s4, s0
; GFX8-NEXT: s_addc_u32 s1, s5, s1
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2
; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB121_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_decr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
ret void
}
; Test case for a volatile, agent-scope, seq_cst `atomicrmw udec_wrap` of an i64
; at the address %out + %index (indexed in i64 elements via getelementptr),
; whose returned value is stored to %out2.
; Codegen expected by the checks below:
;   - GFX7 and GFX8 compute the address with a shift-by-3 and add, then use a
;     compare-and-swap loop (flat_atomic_cmpswap_x2) seeded by a `glc`
;     flat_load_dwordx2 issued before the loop.
;   - GFX12 uses a single flat_atomic_dec_u64 with th:TH_ATOMIC_RETURN.
; The check lines are autogenerated (see the note at the top of this file);
; regenerate them with the update script instead of editing them by hand.
define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_dec_i64_ret_decr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX7-NEXT: s_add_u32 s0, s4, s0
; GFX7-NEXT: s_addc_u32 s1, s5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s9
; GFX7-NEXT: v_mov_b32_e32 v5, s8
; GFX7-NEXT: .LBB122_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9]
; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8
; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB122_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_ret_decr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX8-NEXT: s_add_u32 s0, s4, s0
; GFX8-NEXT: s_addc_u32 s1, s5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s9
; GFX8-NEXT: v_mov_b32_e32 v5, s8
; GFX8-NEXT: .LBB122_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9]
; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8
; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB122_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret_decr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr inbounds i64, ptr %out, i64 %index
%tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
store i64 %tmp0, ptr %out2
ret void
}
; Half-open address-space range [5, 6) attached through !noalias.addrspace to
; the atomicrmw instructions above, i.e. the accessed pointer is asserted not to
; be in address space 5.
; NOTE(review): address space 5 is presumably AMDGPU private memory, matching
; the "noprivate" in the test file name; confirm against the AMDGPU usage docs.
!0 = !{i32 5, i32 6}