Files
llvm-project/llvm/test/CodeGen/NVPTX/atomicrmw-sm70.ll
Akshay Deodhar 184f236a18 [AtomicExpandPass] Preserve atomic and volatile nature of emulated operations (#188361)
The fix does the following in expandPartwordCmpXchg and
insertRMWCmpXchgLoop.

- Issues volatile operations in the emulation loops if the original
operation is volatile.
- A preheader load is used for initializing the "cmp" and "new" values
of the cmpxchg in the loop. This load is now made atomic. This is done under a
target hook (`issueAtomicInitLoadForAtomicEmulation()`), to allow
backends to migrate independently.
- `processAtomicInstr` is called on this load, to massage it into
something that can be lowered in SelectionDAG / GISel.
- This caused 3 kinds of failures.

1. Caused by change to codegen: updated these either using the scripts,
or mechanically (using claude) to match the new codegen.
2. Crashes caused by newly created atomic loads not being processed by
AtomicExpandPass. (The atomic load, when tested in an independent test, does
not cause a crash.) To fix these, added recursive calls to
processAtomicInstr on the newly created atomic loads. These calls
convert the loads to libcalls, or cast them to integer types.
3. Crashes in X86, AMDGPU, and AArch64 caused by unhandled vector types.
These loads crash even with upstream LLVM, due to the lack of support in
these targets for vector atomic loads (the corresponding vector
atomicrmw instructions are supported). Disabled issuing atomic loads for
these backends. Will follow up with individual PRs to revert to default
behavior.
2026-04-30 09:31:39 -07:00

3112 lines
124 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70
; RUN: %if ptxas-sm_70 && ptxas-isa-6.3 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
; i8 xchg, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX emulates the byte exchange with a relaxed 32-bit CAS loop on
; the 4-byte-aligned word containing the byte: the loop is seeded by a
; ld.relaxed.cta load, and the whole sequence is bracketed by fence.acq_rel.cta.
define i8 @xchg_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: xchg_acq_rel_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b32 %r<14>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd2, [xchg_acq_rel_i8_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.param.b8 %r5, [xchg_acq_rel_i8_global_cta_param_1];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r6, %rd2;
; SM70-NEXT: and.b32 %r7, %r6, 3;
; SM70-NEXT: shl.b32 %r1, %r7, 3;
; SM70-NEXT: mov.b32 %r8, 255;
; SM70-NEXT: shl.b32 %r9, %r8, %r1;
; SM70-NEXT: not.b32 %r2, %r9;
; SM70-NEXT: shl.b32 %r3, %r5, %r1;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r13, [%rd1];
; SM70-NEXT: $L__BB0_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: and.b32 %r10, %r13, %r2;
; SM70-NEXT: or.b32 %r11, %r10, %r3;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r4, [%rd1], %r13, %r11;
; SM70-NEXT: setp.ne.b32 %p1, %r4, %r13;
; SM70-NEXT: mov.b32 %r13, %r4;
; SM70-NEXT: @%p1 bra $L__BB0_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r12, %r4, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r12;
; SM70-NEXT: ret;
%retval = atomicrmw xchg ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
ret i8 %retval
}
; i16 xchg, addrspace(1), syncscope("block"), acq_rel.
; Same shape as the i8 case but with a 16-bit lane mask (65535): a relaxed
; 32-bit CAS loop on the aligned containing word, seeded by a ld.relaxed.cta
; load and bracketed by fence.acq_rel.cta.
define i16 @xchg_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
; SM70-LABEL: xchg_acq_rel_i16_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b32 %r<14>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd2, [xchg_acq_rel_i16_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.param.b16 %r5, [xchg_acq_rel_i16_global_cta_param_1];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r6, %rd2;
; SM70-NEXT: and.b32 %r7, %r6, 3;
; SM70-NEXT: shl.b32 %r1, %r7, 3;
; SM70-NEXT: mov.b32 %r8, 65535;
; SM70-NEXT: shl.b32 %r9, %r8, %r1;
; SM70-NEXT: not.b32 %r2, %r9;
; SM70-NEXT: shl.b32 %r3, %r5, %r1;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r13, [%rd1];
; SM70-NEXT: $L__BB1_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: and.b32 %r10, %r13, %r2;
; SM70-NEXT: or.b32 %r11, %r10, %r3;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r4, [%rd1], %r13, %r11;
; SM70-NEXT: setp.ne.b32 %p1, %r4, %r13;
; SM70-NEXT: mov.b32 %r13, %r4;
; SM70-NEXT: @%p1 bra $L__BB1_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r12, %r4, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r12;
; SM70-NEXT: ret;
%retval = atomicrmw xchg ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
ret i16 %retval
}
; i32 xchg, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX is a single atom.acq_rel.cta.global.exch.b32, with no
; separate fences and no loop.
define i32 @xchg_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: xchg_acq_rel_i32_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<3>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [xchg_acq_rel_i32_global_cta_param_0];
; SM70-NEXT: ld.param.b32 %r1, [xchg_acq_rel_i32_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.exch.b32 %r2, [%rd1], %r1;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
%retval = atomicrmw xchg ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
ret i32 %retval
}
; i64 xchg, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX is a single atom.acq_rel.cta.global.exch.b64, with no
; separate fences and no loop.
define i64 @xchg_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
; SM70-LABEL: xchg_acq_rel_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<4>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [xchg_acq_rel_i64_global_cta_param_0];
; SM70-NEXT: ld.param.b64 %rd2, [xchg_acq_rel_i64_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.exch.b64 %rd3, [%rd1], %rd2;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
%retval = atomicrmw xchg ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
ret i64 %retval
}
; i8 add, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX is a relaxed 32-bit CAS loop on the aligned containing word:
; each iteration adds the shifted operand, masks the sum back into the byte
; lane and merges it with the untouched bytes. The loop is seeded by a
; ld.relaxed.cta load and bracketed by fence.acq_rel.cta.
define i8 @add_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: add_acq_rel_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b32 %r<16>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd2, [add_acq_rel_i8_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.param.b8 %r6, [add_acq_rel_i8_global_cta_param_1];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
; SM70-NEXT: and.b32 %r8, %r7, 3;
; SM70-NEXT: shl.b32 %r1, %r8, 3;
; SM70-NEXT: mov.b32 %r9, 255;
; SM70-NEXT: shl.b32 %r2, %r9, %r1;
; SM70-NEXT: not.b32 %r3, %r2;
; SM70-NEXT: shl.b32 %r4, %r6, %r1;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r15, [%rd1];
; SM70-NEXT: $L__BB4_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: add.s32 %r10, %r15, %r4;
; SM70-NEXT: and.b32 %r11, %r10, %r2;
; SM70-NEXT: and.b32 %r12, %r15, %r3;
; SM70-NEXT: or.b32 %r13, %r12, %r11;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
; SM70-NEXT: setp.ne.b32 %p1, %r5, %r15;
; SM70-NEXT: mov.b32 %r15, %r5;
; SM70-NEXT: @%p1 bra $L__BB4_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r14, %r5, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
%retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
ret i8 %retval
}
; i16 add, addrspace(1), syncscope("block"), acq_rel.
; Same shape as the i8 add case with a 16-bit lane mask (65535): a relaxed
; 32-bit CAS loop that adds, re-masks and merges, seeded by a ld.relaxed.cta
; load and bracketed by fence.acq_rel.cta.
define i16 @add_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
; SM70-LABEL: add_acq_rel_i16_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b32 %r<16>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd2, [add_acq_rel_i16_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.param.b16 %r6, [add_acq_rel_i16_global_cta_param_1];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
; SM70-NEXT: and.b32 %r8, %r7, 3;
; SM70-NEXT: shl.b32 %r1, %r8, 3;
; SM70-NEXT: mov.b32 %r9, 65535;
; SM70-NEXT: shl.b32 %r2, %r9, %r1;
; SM70-NEXT: not.b32 %r3, %r2;
; SM70-NEXT: shl.b32 %r4, %r6, %r1;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r15, [%rd1];
; SM70-NEXT: $L__BB5_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: add.s32 %r10, %r15, %r4;
; SM70-NEXT: and.b32 %r11, %r10, %r2;
; SM70-NEXT: and.b32 %r12, %r15, %r3;
; SM70-NEXT: or.b32 %r13, %r12, %r11;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
; SM70-NEXT: setp.ne.b32 %p1, %r5, %r15;
; SM70-NEXT: mov.b32 %r15, %r5;
; SM70-NEXT: @%p1 bra $L__BB5_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r14, %r5, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
%retval = atomicrmw add ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
ret i16 %retval
}
; i32 add, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX is a single atom.acq_rel.cta.global.add.u32, with no
; separate fences and no loop.
define i32 @add_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: add_acq_rel_i32_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<3>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [add_acq_rel_i32_global_cta_param_0];
; SM70-NEXT: ld.param.b32 %r1, [add_acq_rel_i32_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.add.u32 %r2, [%rd1], %r1;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
%retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
ret i32 %retval
}
; i64 add, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX is a single atom.acq_rel.cta.global.add.u64, with no
; separate fences and no loop.
define i64 @add_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
; SM70-LABEL: add_acq_rel_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<4>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [add_acq_rel_i64_global_cta_param_0];
; SM70-NEXT: ld.param.b64 %rd2, [add_acq_rel_i64_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.add.u64 %rd3, [%rd1], %rd2;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
%retval = atomicrmw add ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
ret i64 %retval
}
; i8 sub, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX is a relaxed 32-bit CAS loop on the aligned containing word:
; each iteration subtracts the shifted operand, masks the difference back into
; the byte lane and merges it with the untouched bytes. The loop is seeded by a
; ld.relaxed.cta load and bracketed by fence.acq_rel.cta.
define i8 @sub_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: sub_acq_rel_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b32 %r<16>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd2, [sub_acq_rel_i8_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.param.b8 %r6, [sub_acq_rel_i8_global_cta_param_1];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
; SM70-NEXT: and.b32 %r8, %r7, 3;
; SM70-NEXT: shl.b32 %r1, %r8, 3;
; SM70-NEXT: mov.b32 %r9, 255;
; SM70-NEXT: shl.b32 %r2, %r9, %r1;
; SM70-NEXT: not.b32 %r3, %r2;
; SM70-NEXT: shl.b32 %r4, %r6, %r1;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r15, [%rd1];
; SM70-NEXT: $L__BB8_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: sub.s32 %r10, %r15, %r4;
; SM70-NEXT: and.b32 %r11, %r10, %r2;
; SM70-NEXT: and.b32 %r12, %r15, %r3;
; SM70-NEXT: or.b32 %r13, %r12, %r11;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
; SM70-NEXT: setp.ne.b32 %p1, %r5, %r15;
; SM70-NEXT: mov.b32 %r15, %r5;
; SM70-NEXT: @%p1 bra $L__BB8_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r14, %r5, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
%retval = atomicrmw sub ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
ret i8 %retval
}
; i16 sub, addrspace(1), syncscope("block"), acq_rel.
; Same shape as the i8 sub case with a 16-bit lane mask (65535): a relaxed
; 32-bit CAS loop that subtracts, re-masks and merges, seeded by a
; ld.relaxed.cta load and bracketed by fence.acq_rel.cta.
define i16 @sub_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
; SM70-LABEL: sub_acq_rel_i16_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b32 %r<16>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd2, [sub_acq_rel_i16_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.param.b16 %r6, [sub_acq_rel_i16_global_cta_param_1];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
; SM70-NEXT: and.b32 %r8, %r7, 3;
; SM70-NEXT: shl.b32 %r1, %r8, 3;
; SM70-NEXT: mov.b32 %r9, 65535;
; SM70-NEXT: shl.b32 %r2, %r9, %r1;
; SM70-NEXT: not.b32 %r3, %r2;
; SM70-NEXT: shl.b32 %r4, %r6, %r1;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r15, [%rd1];
; SM70-NEXT: $L__BB9_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: sub.s32 %r10, %r15, %r4;
; SM70-NEXT: and.b32 %r11, %r10, %r2;
; SM70-NEXT: and.b32 %r12, %r15, %r3;
; SM70-NEXT: or.b32 %r13, %r12, %r11;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
; SM70-NEXT: setp.ne.b32 %p1, %r5, %r15;
; SM70-NEXT: mov.b32 %r15, %r5;
; SM70-NEXT: @%p1 bra $L__BB9_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r14, %r5, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
%retval = atomicrmw sub ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
ret i16 %retval
}
; i32 sub, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX negates the operand (neg.s32) and issues a single
; atom.acq_rel.cta.global.add.u32; there is no loop and no separate fence.
define i32 @sub_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: sub_acq_rel_i32_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<4>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [sub_acq_rel_i32_global_cta_param_0];
; SM70-NEXT: ld.param.b32 %r1, [sub_acq_rel_i32_global_cta_param_1];
; SM70-NEXT: neg.s32 %r2, %r1;
; SM70-NEXT: atom.acq_rel.cta.global.add.u32 %r3, [%rd1], %r2;
; SM70-NEXT: st.param.b32 [func_retval0], %r3;
; SM70-NEXT: ret;
%retval = atomicrmw sub ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
ret i32 %retval
}
; i64 sub, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX negates the operand (neg.s64) and issues a single
; atom.acq_rel.cta.global.add.u64; there is no loop and no separate fence.
define i64 @sub_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
; SM70-LABEL: sub_acq_rel_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [sub_acq_rel_i64_global_cta_param_0];
; SM70-NEXT: ld.param.b64 %rd2, [sub_acq_rel_i64_global_cta_param_1];
; SM70-NEXT: neg.s64 %rd3, %rd2;
; SM70-NEXT: atom.acq_rel.cta.global.add.u64 %rd4, [%rd1], %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd4;
; SM70-NEXT: ret;
%retval = atomicrmw sub ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
ret i64 %retval
}
; i8 and, addrspace(1), syncscope("block"), acq_rel.
; No CAS loop is expected here: the shifted operand is OR-ed with the inverted
; lane mask (so bits outside the byte lane are all ones) and applied with a
; single relaxed 32-bit atom.and on the aligned containing word, bracketed by
; fence.acq_rel.cta.
define i8 @and_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: and_acq_rel_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<12>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [and_acq_rel_i8_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.param.b8 %r1, [and_acq_rel_i8_global_cta_param_1];
; SM70-NEXT: and.b64 %rd2, %rd1, -4;
; SM70-NEXT: cvt.u32.u64 %r2, %rd1;
; SM70-NEXT: and.b32 %r3, %r2, 3;
; SM70-NEXT: shl.b32 %r4, %r3, 3;
; SM70-NEXT: mov.b32 %r5, 255;
; SM70-NEXT: shl.b32 %r6, %r5, %r4;
; SM70-NEXT: not.b32 %r7, %r6;
; SM70-NEXT: shl.b32 %r8, %r1, %r4;
; SM70-NEXT: or.b32 %r9, %r8, %r7;
; SM70-NEXT: atom.relaxed.cta.global.and.b32 %r10, [%rd2], %r9;
; SM70-NEXT: shr.u32 %r11, %r10, %r4;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r11;
; SM70-NEXT: ret;
%retval = atomicrmw and ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
ret i8 %retval
}
; i16 and, addrspace(1), syncscope("block"), acq_rel.
; Same shape as the i8 and case with a 16-bit lane mask (65535): the operand is
; padded with ones outside its lane and applied with a single relaxed 32-bit
; atom.and on the aligned containing word, bracketed by fence.acq_rel.cta.
define i16 @and_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
; SM70-LABEL: and_acq_rel_i16_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<12>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [and_acq_rel_i16_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.param.b16 %r1, [and_acq_rel_i16_global_cta_param_1];
; SM70-NEXT: and.b64 %rd2, %rd1, -4;
; SM70-NEXT: cvt.u32.u64 %r2, %rd1;
; SM70-NEXT: and.b32 %r3, %r2, 3;
; SM70-NEXT: shl.b32 %r4, %r3, 3;
; SM70-NEXT: mov.b32 %r5, 65535;
; SM70-NEXT: shl.b32 %r6, %r5, %r4;
; SM70-NEXT: not.b32 %r7, %r6;
; SM70-NEXT: shl.b32 %r8, %r1, %r4;
; SM70-NEXT: or.b32 %r9, %r8, %r7;
; SM70-NEXT: atom.relaxed.cta.global.and.b32 %r10, [%rd2], %r9;
; SM70-NEXT: shr.u32 %r11, %r10, %r4;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r11;
; SM70-NEXT: ret;
%retval = atomicrmw and ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
ret i16 %retval
}
; i32 and, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX is a single atom.acq_rel.cta.global.and.b32, with no
; separate fences and no loop.
define i32 @and_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: and_acq_rel_i32_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<3>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [and_acq_rel_i32_global_cta_param_0];
; SM70-NEXT: ld.param.b32 %r1, [and_acq_rel_i32_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.and.b32 %r2, [%rd1], %r1;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
%retval = atomicrmw and ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
ret i32 %retval
}
; i64 and, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX is a single atom.acq_rel.cta.global.and.b64, with no
; separate fences and no loop.
define i64 @and_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
; SM70-LABEL: and_acq_rel_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<4>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [and_acq_rel_i64_global_cta_param_0];
; SM70-NEXT: ld.param.b64 %rd2, [and_acq_rel_i64_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.and.b64 %rd3, [%rd1], %rd2;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
%retval = atomicrmw and ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
ret i64 %retval
}
; i8 nand, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX is a relaxed 32-bit CAS loop on the aligned containing word:
; each iteration computes and + not against the shifted operand, masks the
; result back into the byte lane and merges it with the untouched bytes. The
; loop is seeded by a ld.relaxed.cta load and bracketed by fence.acq_rel.cta.
define i8 @nand_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: nand_acq_rel_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b32 %r<17>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd2, [nand_acq_rel_i8_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.param.b8 %r6, [nand_acq_rel_i8_global_cta_param_1];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
; SM70-NEXT: and.b32 %r8, %r7, 3;
; SM70-NEXT: shl.b32 %r1, %r8, 3;
; SM70-NEXT: mov.b32 %r9, 255;
; SM70-NEXT: shl.b32 %r2, %r9, %r1;
; SM70-NEXT: not.b32 %r3, %r2;
; SM70-NEXT: shl.b32 %r4, %r6, %r1;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r16, [%rd1];
; SM70-NEXT: $L__BB16_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: and.b32 %r10, %r16, %r4;
; SM70-NEXT: not.b32 %r11, %r10;
; SM70-NEXT: and.b32 %r12, %r11, %r2;
; SM70-NEXT: and.b32 %r13, %r16, %r3;
; SM70-NEXT: or.b32 %r14, %r13, %r12;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
; SM70-NEXT: setp.ne.b32 %p1, %r5, %r16;
; SM70-NEXT: mov.b32 %r16, %r5;
; SM70-NEXT: @%p1 bra $L__BB16_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r15, %r5, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r15;
; SM70-NEXT: ret;
%retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
ret i8 %retval
}
; i16 nand, addrspace(1), syncscope("block"), acq_rel.
; Same shape as the i8 nand case with a 16-bit lane mask (65535): a relaxed
; 32-bit CAS loop computing and + not, re-masking and merging, seeded by a
; ld.relaxed.cta load and bracketed by fence.acq_rel.cta.
define i16 @nand_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
; SM70-LABEL: nand_acq_rel_i16_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b32 %r<17>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd2, [nand_acq_rel_i16_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.param.b16 %r6, [nand_acq_rel_i16_global_cta_param_1];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
; SM70-NEXT: and.b32 %r8, %r7, 3;
; SM70-NEXT: shl.b32 %r1, %r8, 3;
; SM70-NEXT: mov.b32 %r9, 65535;
; SM70-NEXT: shl.b32 %r2, %r9, %r1;
; SM70-NEXT: not.b32 %r3, %r2;
; SM70-NEXT: shl.b32 %r4, %r6, %r1;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r16, [%rd1];
; SM70-NEXT: $L__BB17_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: and.b32 %r10, %r16, %r4;
; SM70-NEXT: not.b32 %r11, %r10;
; SM70-NEXT: and.b32 %r12, %r11, %r2;
; SM70-NEXT: and.b32 %r13, %r16, %r3;
; SM70-NEXT: or.b32 %r14, %r13, %r12;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
; SM70-NEXT: setp.ne.b32 %p1, %r5, %r16;
; SM70-NEXT: mov.b32 %r16, %r5;
; SM70-NEXT: @%p1 bra $L__BB17_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r15, %r5, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r15;
; SM70-NEXT: ret;
%retval = atomicrmw nand ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
ret i16 %retval
}
; i32 nand, addrspace(1), syncscope("block"), acq_rel.
; Unlike the other i32 operations in this file, the expected PTX is a
; full-width relaxed CAS loop (and.b32 + not.b32 feeding atom.cas.b32), seeded
; by a ld.relaxed.cta load and bracketed by fence.acq_rel.cta.
define i32 @nand_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: nand_acq_rel_i32_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b32 %r<6>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r2, [nand_acq_rel_i32_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd1, [nand_acq_rel_i32_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r5, [%rd1];
; SM70-NEXT: $L__BB18_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: and.b32 %r3, %r5, %r2;
; SM70-NEXT: not.b32 %r4, %r3;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
; SM70-NEXT: setp.ne.b32 %p1, %r1, %r5;
; SM70-NEXT: mov.b32 %r5, %r1;
; SM70-NEXT: @%p1 bra $L__BB18_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r1;
; SM70-NEXT: ret;
%retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
ret i32 %retval
}
; i64 nand, addrspace(1), syncscope("block"), acq_rel.
; As in the i32 nand case, the expected PTX is a full-width relaxed CAS loop
; (and.b64 + not.b64 feeding atom.cas.b64), seeded by a ld.relaxed.cta load
; and bracketed by fence.acq_rel.cta.
define i64 @nand_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
; SM70-LABEL: nand_acq_rel_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b64 %rd<7>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd3, [nand_acq_rel_i64_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [nand_acq_rel_i64_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b64 %rd6, [%rd2];
; SM70-NEXT: $L__BB19_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: and.b64 %rd4, %rd6, %rd3;
; SM70-NEXT: not.b64 %rd5, %rd4;
; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd6, %rd5;
; SM70-NEXT: setp.ne.b64 %p1, %rd1, %rd6;
; SM70-NEXT: mov.b64 %rd6, %rd1;
; SM70-NEXT: @%p1 bra $L__BB19_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b64 [func_retval0], %rd1;
; SM70-NEXT: ret;
%retval = atomicrmw nand ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
ret i64 %retval
}
; i8 or, addrspace(1), syncscope("block"), acq_rel.
; No CAS loop and no lane mask are expected: the operand is shifted into its
; byte lane and applied with a single relaxed 32-bit atom.or on the aligned
; containing word, bracketed by fence.acq_rel.cta.
define i8 @or_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: or_acq_rel_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<8>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [or_acq_rel_i8_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.param.b8 %r1, [or_acq_rel_i8_global_cta_param_1];
; SM70-NEXT: and.b64 %rd2, %rd1, -4;
; SM70-NEXT: cvt.u32.u64 %r2, %rd1;
; SM70-NEXT: and.b32 %r3, %r2, 3;
; SM70-NEXT: shl.b32 %r4, %r3, 3;
; SM70-NEXT: shl.b32 %r5, %r1, %r4;
; SM70-NEXT: atom.relaxed.cta.global.or.b32 %r6, [%rd2], %r5;
; SM70-NEXT: shr.u32 %r7, %r6, %r4;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r7;
; SM70-NEXT: ret;
%retval = atomicrmw or ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
ret i8 %retval
}
; i16 or, addrspace(1), syncscope("block"), acq_rel.
; Same shape as the i8 or case: the operand is shifted into its lane and
; applied with a single relaxed 32-bit atom.or on the aligned containing word,
; bracketed by fence.acq_rel.cta.
define i16 @or_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
; SM70-LABEL: or_acq_rel_i16_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<8>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [or_acq_rel_i16_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.param.b16 %r1, [or_acq_rel_i16_global_cta_param_1];
; SM70-NEXT: and.b64 %rd2, %rd1, -4;
; SM70-NEXT: cvt.u32.u64 %r2, %rd1;
; SM70-NEXT: and.b32 %r3, %r2, 3;
; SM70-NEXT: shl.b32 %r4, %r3, 3;
; SM70-NEXT: shl.b32 %r5, %r1, %r4;
; SM70-NEXT: atom.relaxed.cta.global.or.b32 %r6, [%rd2], %r5;
; SM70-NEXT: shr.u32 %r7, %r6, %r4;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r7;
; SM70-NEXT: ret;
%retval = atomicrmw or ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
ret i16 %retval
}
; i32 or, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX is a single atom.acq_rel.cta.global.or.b32, with no
; separate fences and no loop.
define i32 @or_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: or_acq_rel_i32_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<3>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [or_acq_rel_i32_global_cta_param_0];
; SM70-NEXT: ld.param.b32 %r1, [or_acq_rel_i32_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.or.b32 %r2, [%rd1], %r1;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
%retval = atomicrmw or ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
ret i32 %retval
}
; i64 or, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX is a single atom.acq_rel.cta.global.or.b64, with no
; separate fences and no loop.
define i64 @or_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
; SM70-LABEL: or_acq_rel_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<4>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [or_acq_rel_i64_global_cta_param_0];
; SM70-NEXT: ld.param.b64 %rd2, [or_acq_rel_i64_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.or.b64 %rd3, [%rd1], %rd2;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
%retval = atomicrmw or ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
ret i64 %retval
}
; i8 xor, addrspace(1), syncscope("block"), acq_rel.
; No CAS loop and no lane mask are expected: the operand is shifted into its
; byte lane and applied with a single relaxed 32-bit atom.xor on the aligned
; containing word, bracketed by fence.acq_rel.cta.
define i8 @xor_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: xor_acq_rel_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<8>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [xor_acq_rel_i8_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.param.b8 %r1, [xor_acq_rel_i8_global_cta_param_1];
; SM70-NEXT: and.b64 %rd2, %rd1, -4;
; SM70-NEXT: cvt.u32.u64 %r2, %rd1;
; SM70-NEXT: and.b32 %r3, %r2, 3;
; SM70-NEXT: shl.b32 %r4, %r3, 3;
; SM70-NEXT: shl.b32 %r5, %r1, %r4;
; SM70-NEXT: atom.relaxed.cta.global.xor.b32 %r6, [%rd2], %r5;
; SM70-NEXT: shr.u32 %r7, %r6, %r4;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r7;
; SM70-NEXT: ret;
%retval = atomicrmw xor ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
ret i8 %retval
}
; i16 xor, addrspace(1), syncscope("block"), acq_rel.
; Same shape as the i8 xor case: the operand is shifted into its lane and
; applied with a single relaxed 32-bit atom.xor on the aligned containing word,
; bracketed by fence.acq_rel.cta.
define i16 @xor_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
; SM70-LABEL: xor_acq_rel_i16_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<8>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [xor_acq_rel_i16_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.param.b16 %r1, [xor_acq_rel_i16_global_cta_param_1];
; SM70-NEXT: and.b64 %rd2, %rd1, -4;
; SM70-NEXT: cvt.u32.u64 %r2, %rd1;
; SM70-NEXT: and.b32 %r3, %r2, 3;
; SM70-NEXT: shl.b32 %r4, %r3, 3;
; SM70-NEXT: shl.b32 %r5, %r1, %r4;
; SM70-NEXT: atom.relaxed.cta.global.xor.b32 %r6, [%rd2], %r5;
; SM70-NEXT: shr.u32 %r7, %r6, %r4;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r7;
; SM70-NEXT: ret;
%retval = atomicrmw xor ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
ret i16 %retval
}
; i32 xor, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX is a single atom.acq_rel.cta.global.xor.b32, with no
; separate fences and no loop.
define i32 @xor_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: xor_acq_rel_i32_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<3>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [xor_acq_rel_i32_global_cta_param_0];
; SM70-NEXT: ld.param.b32 %r1, [xor_acq_rel_i32_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.xor.b32 %r2, [%rd1], %r1;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
%retval = atomicrmw xor ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
ret i32 %retval
}
; i64 xor, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX is a single atom.acq_rel.cta.global.xor.b64, with no
; separate fences and no loop.
define i64 @xor_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
; SM70-LABEL: xor_acq_rel_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<4>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [xor_acq_rel_i64_global_cta_param_0];
; SM70-NEXT: ld.param.b64 %rd2, [xor_acq_rel_i64_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.xor.b64 %rd3, [%rd1], %rd2;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
%retval = atomicrmw xor ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
ret i64 %retval
}
; i8 signed max, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX is a relaxed 32-bit CAS loop on the aligned containing word.
; Each iteration extracts the byte and sign-extends it (cvt.s8.s32), takes
; max.s16 against the operand that was sign-extended once before the loop
; (cvt.s16.s8), then masks to 8 bits and re-inserts the result into its lane.
; The loop is seeded by a ld.relaxed.cta load and bracketed by
; fence.acq_rel.cta.
define i8 @max_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: max_acq_rel_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<5>;
; SM70-NEXT: .reg .b32 %r<16>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b8 %rs1, [max_acq_rel_i8_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [max_acq_rel_i8_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 255;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r15, [%rd1];
; SM70-NEXT: cvt.s16.s8 %rs3, %rs1;
; SM70-NEXT: $L__BB28_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r15, %r1;
; SM70-NEXT: cvt.s8.s32 %rs2, %r8;
; SM70-NEXT: max.s16 %rs4, %rs2, %rs3;
; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
; SM70-NEXT: and.b32 %r10, %r9, 255;
; SM70-NEXT: shl.b32 %r11, %r10, %r1;
; SM70-NEXT: and.b32 %r12, %r15, %r2;
; SM70-NEXT: or.b32 %r13, %r12, %r11;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
; SM70-NEXT: setp.ne.b32 %p1, %r3, %r15;
; SM70-NEXT: mov.b32 %r15, %r3;
; SM70-NEXT: @%p1 bra $L__BB28_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r14, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
%retval = atomicrmw max ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
ret i8 %retval
}
; i16 signed max, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX is a relaxed 32-bit CAS loop on the aligned containing word.
; Each iteration extracts the halfword (cvt.u16.u32), takes max.s16 against
; the operand, and re-inserts the result into its lane. The loop is seeded by
; a ld.relaxed.cta load and bracketed by fence.acq_rel.cta.
define i16 @max_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
; SM70-LABEL: max_acq_rel_i16_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<4>;
; SM70-NEXT: .reg .b32 %r<15>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [max_acq_rel_i16_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [max_acq_rel_i16_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 65535;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r14, [%rd1];
; SM70-NEXT: $L__BB29_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r14, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: max.s16 %rs3, %rs2, %rs1;
; SM70-NEXT: cvt.u32.u16 %r9, %rs3;
; SM70-NEXT: shl.b32 %r10, %r9, %r1;
; SM70-NEXT: and.b32 %r11, %r14, %r2;
; SM70-NEXT: or.b32 %r12, %r11, %r10;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
; SM70-NEXT: setp.ne.b32 %p1, %r3, %r14;
; SM70-NEXT: mov.b32 %r14, %r3;
; SM70-NEXT: @%p1 bra $L__BB29_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r13, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
%retval = atomicrmw max ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
ret i16 %retval
}
; i32 signed max, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX is a single atom.acq_rel.cta.global.max.s32, with no
; separate fences and no loop.
define i32 @max_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: max_acq_rel_i32_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<3>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [max_acq_rel_i32_global_cta_param_0];
; SM70-NEXT: ld.param.b32 %r1, [max_acq_rel_i32_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.max.s32 %r2, [%rd1], %r1;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
%retval = atomicrmw max ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
ret i32 %retval
}
; i64 signed max, addrspace(1), syncscope("block"), acq_rel.
; The expected PTX is a single atom.acq_rel.cta.global.max.s64, with no
; separate fences and no loop.
define i64 @max_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
; SM70-LABEL: max_acq_rel_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<4>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [max_acq_rel_i64_global_cta_param_0];
; SM70-NEXT: ld.param.b64 %rd2, [max_acq_rel_i64_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.max.s64 %rd3, [%rd1], %rd2;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
%retval = atomicrmw max ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
ret i64 %retval
}
; atomicrmw min (signed) on an i8 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Expected PTX emulates the byte-wide operation on the enclosing 32-bit word:
;   - the word address is the pointer masked with -4, and the bit offset of
;     the byte is (pointer & 3) << 3;
;   - 255 shifted to that offset and then inverted is the mask that keeps the
;     neighbouring bytes;
;   - a ld.relaxed.cta.global.b32 seeds the loop; the operand is widened once
;     with cvt.s16.s8 before the loop, and each iteration extracts the byte
;     with cvt.s8.s32, combines the two with min.s16, masks the result to
;     8 bits and retries atom.relaxed.cta.global.cas.b32 while the word it
;     returns differs from the word it expected;
;   - fence.acq_rel.cta is expected once before the initial load and once
;     after the loop, and the last CAS result shifted right by the bit offset
;     is stored as the return value.
define i8 @min_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: min_acq_rel_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<5>;
; SM70-NEXT: .reg .b32 %r<16>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b8 %rs1, [min_acq_rel_i8_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [min_acq_rel_i8_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 255;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r15, [%rd1];
; SM70-NEXT: cvt.s16.s8 %rs3, %rs1;
; SM70-NEXT: $L__BB32_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r15, %r1;
; SM70-NEXT: cvt.s8.s32 %rs2, %r8;
; SM70-NEXT: min.s16 %rs4, %rs2, %rs3;
; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
; SM70-NEXT: and.b32 %r10, %r9, 255;
; SM70-NEXT: shl.b32 %r11, %r10, %r1;
; SM70-NEXT: and.b32 %r12, %r15, %r2;
; SM70-NEXT: or.b32 %r13, %r12, %r11;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
; SM70-NEXT: setp.ne.b32 %p1, %r3, %r15;
; SM70-NEXT: mov.b32 %r15, %r3;
; SM70-NEXT: @%p1 bra $L__BB32_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r14, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
%retval = atomicrmw min ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
ret i8 %retval
}
; atomicrmw min (signed) on an i16 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Expected PTX emulates the halfword operation on the enclosing 32-bit word:
;   - the word address is the pointer masked with -4, and the bit offset of
;     the halfword is (pointer & 3) << 3;
;   - 65535 shifted to that offset and then inverted is the mask that keeps
;     the other bits of the word;
;   - a ld.relaxed.cta.global.b32 seeds the loop, which extracts the halfword,
;     combines it with the operand using min.s16 and retries
;     atom.relaxed.cta.global.cas.b32 while the word it returns differs from
;     the word it expected;
;   - fence.acq_rel.cta is expected once before the initial load and once
;     after the loop, and the last CAS result shifted right by the bit offset
;     is stored as the return value.
define i16 @min_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
; SM70-LABEL: min_acq_rel_i16_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<4>;
; SM70-NEXT: .reg .b32 %r<15>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [min_acq_rel_i16_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [min_acq_rel_i16_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 65535;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r14, [%rd1];
; SM70-NEXT: $L__BB33_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r14, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: min.s16 %rs3, %rs2, %rs1;
; SM70-NEXT: cvt.u32.u16 %r9, %rs3;
; SM70-NEXT: shl.b32 %r10, %r9, %r1;
; SM70-NEXT: and.b32 %r11, %r14, %r2;
; SM70-NEXT: or.b32 %r12, %r11, %r10;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
; SM70-NEXT: setp.ne.b32 %p1, %r3, %r14;
; SM70-NEXT: mov.b32 %r14, %r3;
; SM70-NEXT: @%p1 bra $L__BB33_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r13, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
%retval = atomicrmw min ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
ret i16 %retval
}
; atomicrmw min (signed) on an i32 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Expected PTX: one atom.acq_rel.cta.global.min.s32 whose result is stored as
; the return value; the checks list no fences and no retry loop for this width.
define i32 @min_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: min_acq_rel_i32_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<3>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [min_acq_rel_i32_global_cta_param_0];
; SM70-NEXT: ld.param.b32 %r1, [min_acq_rel_i32_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.min.s32 %r2, [%rd1], %r1;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
%retval = atomicrmw min ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
ret i32 %retval
}
; atomicrmw min (signed) on an i64 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Expected PTX: one atom.acq_rel.cta.global.min.s64 whose result is stored as
; the return value; the checks list no fences and no retry loop for this width.
define i64 @min_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
; SM70-LABEL: min_acq_rel_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<4>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [min_acq_rel_i64_global_cta_param_0];
; SM70-NEXT: ld.param.b64 %rd2, [min_acq_rel_i64_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.min.s64 %rd3, [%rd1], %rd2;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
%retval = atomicrmw min ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
ret i64 %retval
}
; atomicrmw umax (unsigned max) on an i8 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Expected PTX emulates the byte-wide operation on the enclosing 32-bit word:
;   - the word address is the pointer masked with -4, and the bit offset of
;     the byte is (pointer & 3) << 3;
;   - 255 shifted to that offset and then inverted is the mask that keeps the
;     neighbouring bytes;
;   - a ld.relaxed.cta.global.b32 seeds the loop, which extracts the byte,
;     masks it to 8 bits, combines it with the operand using max.u16 and
;     retries atom.relaxed.cta.global.cas.b32 while the word it returns
;     differs from the word it expected;
;   - fence.acq_rel.cta is expected once before the initial load and once
;     after the loop, and the last CAS result shifted right by the bit offset
;     is stored as the return value.
define i8 @umax_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: umax_acq_rel_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<5>;
; SM70-NEXT: .reg .b32 %r<15>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b8 %rs1, [umax_acq_rel_i8_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [umax_acq_rel_i8_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 255;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r14, [%rd1];
; SM70-NEXT: $L__BB36_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r14, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: and.b16 %rs3, %rs2, 255;
; SM70-NEXT: max.u16 %rs4, %rs3, %rs1;
; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
; SM70-NEXT: shl.b32 %r10, %r9, %r1;
; SM70-NEXT: and.b32 %r11, %r14, %r2;
; SM70-NEXT: or.b32 %r12, %r11, %r10;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
; SM70-NEXT: setp.ne.b32 %p1, %r3, %r14;
; SM70-NEXT: mov.b32 %r14, %r3;
; SM70-NEXT: @%p1 bra $L__BB36_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r13, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
%retval = atomicrmw umax ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
ret i8 %retval
}
; atomicrmw umax (unsigned max) on an i16 through an addrspace(1) pointer,
; with syncscope("block") and acq_rel ordering.
; Expected PTX emulates the halfword operation on the enclosing 32-bit word:
;   - the word address is the pointer masked with -4, and the bit offset of
;     the halfword is (pointer & 3) << 3;
;   - 65535 shifted to that offset and then inverted is the mask that keeps
;     the other bits of the word;
;   - a ld.relaxed.cta.global.b32 seeds the loop, which extracts the halfword,
;     combines it with the operand using max.u16 and retries
;     atom.relaxed.cta.global.cas.b32 while the word it returns differs from
;     the word it expected;
;   - fence.acq_rel.cta is expected once before the initial load and once
;     after the loop, and the last CAS result shifted right by the bit offset
;     is stored as the return value.
define i16 @umax_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
; SM70-LABEL: umax_acq_rel_i16_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<4>;
; SM70-NEXT: .reg .b32 %r<15>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [umax_acq_rel_i16_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [umax_acq_rel_i16_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 65535;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r14, [%rd1];
; SM70-NEXT: $L__BB37_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r14, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: max.u16 %rs3, %rs2, %rs1;
; SM70-NEXT: cvt.u32.u16 %r9, %rs3;
; SM70-NEXT: shl.b32 %r10, %r9, %r1;
; SM70-NEXT: and.b32 %r11, %r14, %r2;
; SM70-NEXT: or.b32 %r12, %r11, %r10;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
; SM70-NEXT: setp.ne.b32 %p1, %r3, %r14;
; SM70-NEXT: mov.b32 %r14, %r3;
; SM70-NEXT: @%p1 bra $L__BB37_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r13, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
%retval = atomicrmw umax ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
ret i16 %retval
}
; atomicrmw umax (unsigned max) on an i32 through an addrspace(1) pointer,
; with syncscope("block") and acq_rel ordering.
; Expected PTX: one atom.acq_rel.cta.global.max.u32 whose result is stored as
; the return value; the checks list no fences and no retry loop for this width.
define i32 @umax_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: umax_acq_rel_i32_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<3>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [umax_acq_rel_i32_global_cta_param_0];
; SM70-NEXT: ld.param.b32 %r1, [umax_acq_rel_i32_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.max.u32 %r2, [%rd1], %r1;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
%retval = atomicrmw umax ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
ret i32 %retval
}
; atomicrmw umax (unsigned max) on an i64 through an addrspace(1) pointer,
; with syncscope("block") and acq_rel ordering.
; Expected PTX: one atom.acq_rel.cta.global.max.u64 whose result is stored as
; the return value; the checks list no fences and no retry loop for this width.
define i64 @umax_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
; SM70-LABEL: umax_acq_rel_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<4>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [umax_acq_rel_i64_global_cta_param_0];
; SM70-NEXT: ld.param.b64 %rd2, [umax_acq_rel_i64_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.max.u64 %rd3, [%rd1], %rd2;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
%retval = atomicrmw umax ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
ret i64 %retval
}
; atomicrmw umin (unsigned min) on an i8 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Expected PTX emulates the byte-wide operation on the enclosing 32-bit word:
;   - the word address is the pointer masked with -4, and the bit offset of
;     the byte is (pointer & 3) << 3;
;   - 255 shifted to that offset and then inverted is the mask that keeps the
;     neighbouring bytes;
;   - a ld.relaxed.cta.global.b32 seeds the loop, which extracts the byte,
;     masks it to 8 bits, combines it with the operand using min.u16 and
;     retries atom.relaxed.cta.global.cas.b32 while the word it returns
;     differs from the word it expected;
;   - fence.acq_rel.cta is expected once before the initial load and once
;     after the loop, and the last CAS result shifted right by the bit offset
;     is stored as the return value.
define i8 @umin_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: umin_acq_rel_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<5>;
; SM70-NEXT: .reg .b32 %r<15>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b8 %rs1, [umin_acq_rel_i8_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [umin_acq_rel_i8_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 255;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r14, [%rd1];
; SM70-NEXT: $L__BB40_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r14, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: and.b16 %rs3, %rs2, 255;
; SM70-NEXT: min.u16 %rs4, %rs3, %rs1;
; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
; SM70-NEXT: shl.b32 %r10, %r9, %r1;
; SM70-NEXT: and.b32 %r11, %r14, %r2;
; SM70-NEXT: or.b32 %r12, %r11, %r10;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
; SM70-NEXT: setp.ne.b32 %p1, %r3, %r14;
; SM70-NEXT: mov.b32 %r14, %r3;
; SM70-NEXT: @%p1 bra $L__BB40_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r13, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
%retval = atomicrmw umin ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
ret i8 %retval
}
; atomicrmw umin (unsigned min) on an i16 through an addrspace(1) pointer,
; with syncscope("block") and acq_rel ordering.
; Expected PTX emulates the halfword operation on the enclosing 32-bit word:
;   - the word address is the pointer masked with -4, and the bit offset of
;     the halfword is (pointer & 3) << 3;
;   - 65535 shifted to that offset and then inverted is the mask that keeps
;     the other bits of the word;
;   - a ld.relaxed.cta.global.b32 seeds the loop, which extracts the halfword,
;     combines it with the operand using min.u16 and retries
;     atom.relaxed.cta.global.cas.b32 while the word it returns differs from
;     the word it expected;
;   - fence.acq_rel.cta is expected once before the initial load and once
;     after the loop, and the last CAS result shifted right by the bit offset
;     is stored as the return value.
define i16 @umin_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
; SM70-LABEL: umin_acq_rel_i16_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<4>;
; SM70-NEXT: .reg .b32 %r<15>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [umin_acq_rel_i16_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [umin_acq_rel_i16_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 65535;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r14, [%rd1];
; SM70-NEXT: $L__BB41_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r14, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: min.u16 %rs3, %rs2, %rs1;
; SM70-NEXT: cvt.u32.u16 %r9, %rs3;
; SM70-NEXT: shl.b32 %r10, %r9, %r1;
; SM70-NEXT: and.b32 %r11, %r14, %r2;
; SM70-NEXT: or.b32 %r12, %r11, %r10;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
; SM70-NEXT: setp.ne.b32 %p1, %r3, %r14;
; SM70-NEXT: mov.b32 %r14, %r3;
; SM70-NEXT: @%p1 bra $L__BB41_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r13, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
%retval = atomicrmw umin ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
ret i16 %retval
}
; atomicrmw umin (unsigned min) on an i32 through an addrspace(1) pointer,
; with syncscope("block") and acq_rel ordering.
; Expected PTX: one atom.acq_rel.cta.global.min.u32 whose result is stored as
; the return value; the checks list no fences and no retry loop for this width.
define i32 @umin_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: umin_acq_rel_i32_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<3>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [umin_acq_rel_i32_global_cta_param_0];
; SM70-NEXT: ld.param.b32 %r1, [umin_acq_rel_i32_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.min.u32 %r2, [%rd1], %r1;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
%retval = atomicrmw umin ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
ret i32 %retval
}
; atomicrmw umin (unsigned min) on an i64 through an addrspace(1) pointer,
; with syncscope("block") and acq_rel ordering.
; Expected PTX: one atom.acq_rel.cta.global.min.u64 whose result is stored as
; the return value; the checks list no fences and no retry loop for this width.
define i64 @umin_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
; SM70-LABEL: umin_acq_rel_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<4>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [umin_acq_rel_i64_global_cta_param_0];
; SM70-NEXT: ld.param.b64 %rd2, [umin_acq_rel_i64_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.min.u64 %rd3, [%rd1], %rd2;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
%retval = atomicrmw umin ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
ret i64 %retval
}
; atomicrmw uinc_wrap on an i8 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Expected PTX emulates the byte-wide operation on the enclosing 32-bit word:
;   - the word address is the pointer masked with -4, and the bit offset of
;     the byte is (pointer & 3) << 3;
;   - 255 shifted to that offset and then inverted is the mask that keeps the
;     neighbouring bytes;
;   - a ld.relaxed.cta.global.b32 seeds the loop, which extracts the byte,
;     selects 0 when the byte (masked to 8 bits) is unsigned >= the operand
;     and the byte plus 1 otherwise, masks that result to 8 bits and retries
;     atom.relaxed.cta.global.cas.b32 while the word it returns differs from
;     the word it expected;
;   - fence.acq_rel.cta is expected once before the initial load and once
;     after the loop, and the last CAS result shifted right by the bit offset
;     is stored as the return value.
define i8 @uinc_wrap_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: uinc_wrap_acq_rel_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<6>;
; SM70-NEXT: .reg .b32 %r<16>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b8 %rs1, [uinc_wrap_acq_rel_i8_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [uinc_wrap_acq_rel_i8_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 255;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r15, [%rd1];
; SM70-NEXT: $L__BB44_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r15, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: and.b16 %rs3, %rs2, 255;
; SM70-NEXT: add.s16 %rs4, %rs2, 1;
; SM70-NEXT: setp.ge.u16 %p1, %rs3, %rs1;
; SM70-NEXT: selp.b16 %rs5, 0, %rs4, %p1;
; SM70-NEXT: cvt.u32.u16 %r9, %rs5;
; SM70-NEXT: and.b32 %r10, %r9, 255;
; SM70-NEXT: shl.b32 %r11, %r10, %r1;
; SM70-NEXT: and.b32 %r12, %r15, %r2;
; SM70-NEXT: or.b32 %r13, %r12, %r11;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
; SM70-NEXT: setp.ne.b32 %p2, %r3, %r15;
; SM70-NEXT: mov.b32 %r15, %r3;
; SM70-NEXT: @%p2 bra $L__BB44_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r14, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
%retval = atomicrmw uinc_wrap ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
ret i8 %retval
}
; atomicrmw uinc_wrap on an i16 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Expected PTX emulates the halfword operation on the enclosing 32-bit word:
;   - the word address is the pointer masked with -4, and the bit offset of
;     the halfword is (pointer & 3) << 3;
;   - 65535 shifted to that offset and then inverted is the mask that keeps
;     the other bits of the word;
;   - a ld.relaxed.cta.global.b32 seeds the loop, which extracts the halfword,
;     selects 0 when it is unsigned >= the operand and the halfword plus 1
;     otherwise, and retries atom.relaxed.cta.global.cas.b32 while the word it
;     returns differs from the word it expected;
;   - fence.acq_rel.cta is expected once before the initial load and once
;     after the loop, and the last CAS result shifted right by the bit offset
;     is stored as the return value.
define i16 @uinc_wrap_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
; SM70-LABEL: uinc_wrap_acq_rel_i16_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<5>;
; SM70-NEXT: .reg .b32 %r<15>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [uinc_wrap_acq_rel_i16_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [uinc_wrap_acq_rel_i16_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 65535;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r14, [%rd1];
; SM70-NEXT: $L__BB45_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r14, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: add.s16 %rs3, %rs2, 1;
; SM70-NEXT: setp.ge.u16 %p1, %rs2, %rs1;
; SM70-NEXT: selp.b16 %rs4, 0, %rs3, %p1;
; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
; SM70-NEXT: shl.b32 %r10, %r9, %r1;
; SM70-NEXT: and.b32 %r11, %r14, %r2;
; SM70-NEXT: or.b32 %r12, %r11, %r10;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
; SM70-NEXT: setp.ne.b32 %p2, %r3, %r14;
; SM70-NEXT: mov.b32 %r14, %r3;
; SM70-NEXT: @%p2 bra $L__BB45_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r13, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
%retval = atomicrmw uinc_wrap ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
ret i16 %retval
}
; atomicrmw uinc_wrap on an i32 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Expected PTX: one atom.acq_rel.cta.global.inc.u32 whose result is stored as
; the return value; the checks list no fences and no retry loop for this width.
define i32 @uinc_wrap_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: uinc_wrap_acq_rel_i32_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<3>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [uinc_wrap_acq_rel_i32_global_cta_param_0];
; SM70-NEXT: ld.param.b32 %r1, [uinc_wrap_acq_rel_i32_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.inc.u32 %r2, [%rd1], %r1;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
%retval = atomicrmw uinc_wrap ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
ret i32 %retval
}
; atomicrmw uinc_wrap on an i64 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Unlike the i32 variant, the expected PTX here is a compare-and-swap loop:
; fence.acq_rel.cta, then a ld.relaxed.cta.global.b64 that seeds the expected
; value. Each iteration selects 0 when the current value is unsigned >= the
; operand (current + 1 otherwise) and attempts atom.relaxed.cta.global.cas.b64,
; looping while the returned value differs from the expected one. A second
; fence.acq_rel.cta follows the loop and the last value returned by the CAS is
; stored as the return value.
define i64 @uinc_wrap_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
; SM70-LABEL: uinc_wrap_acq_rel_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b64 %rd<7>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd3, [uinc_wrap_acq_rel_i64_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [uinc_wrap_acq_rel_i64_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b64 %rd6, [%rd2];
; SM70-NEXT: $L__BB47_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: add.s64 %rd4, %rd6, 1;
; SM70-NEXT: setp.ge.u64 %p1, %rd6, %rd3;
; SM70-NEXT: selp.b64 %rd5, 0, %rd4, %p1;
; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd6, %rd5;
; SM70-NEXT: setp.ne.b64 %p2, %rd1, %rd6;
; SM70-NEXT: mov.b64 %rd6, %rd1;
; SM70-NEXT: @%p2 bra $L__BB47_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b64 [func_retval0], %rd1;
; SM70-NEXT: ret;
%retval = atomicrmw uinc_wrap ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
ret i64 %retval
}
; atomicrmw udec_wrap on an i8 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Expected PTX emulates the byte-wide operation on the enclosing 32-bit word:
;   - the word address is the pointer masked with -4, and the bit offset of
;     the byte is (pointer & 3) << 3;
;   - 255 shifted to that offset and then inverted is the mask that keeps the
;     neighbouring bytes;
;   - a ld.relaxed.cta.global.b32 seeds the loop, which extracts the byte,
;     selects the operand when the byte (masked to 8 bits) is 0 or is
;     unsigned > the operand and the byte minus 1 otherwise, masks that
;     result to 8 bits and retries atom.relaxed.cta.global.cas.b32 while the
;     word it returns differs from the word it expected;
;   - fence.acq_rel.cta is expected once before the initial load and once
;     after the loop, and the last CAS result shifted right by the bit offset
;     is stored as the return value.
define i8 @udec_wrap_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: udec_wrap_acq_rel_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<4>;
; SM70-NEXT: .reg .b16 %rs<7>;
; SM70-NEXT: .reg .b32 %r<16>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b8 %rs1, [udec_wrap_acq_rel_i8_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [udec_wrap_acq_rel_i8_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 255;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r15, [%rd1];
; SM70-NEXT: $L__BB48_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r15, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: and.b16 %rs3, %rs2, 255;
; SM70-NEXT: add.s16 %rs4, %rs2, -1;
; SM70-NEXT: setp.eq.b16 %p1, %rs3, 0;
; SM70-NEXT: setp.gt.u16 %p2, %rs3, %rs1;
; SM70-NEXT: selp.b16 %rs5, %rs1, %rs4, %p2;
; SM70-NEXT: selp.b16 %rs6, %rs1, %rs5, %p1;
; SM70-NEXT: cvt.u32.u16 %r9, %rs6;
; SM70-NEXT: and.b32 %r10, %r9, 255;
; SM70-NEXT: shl.b32 %r11, %r10, %r1;
; SM70-NEXT: and.b32 %r12, %r15, %r2;
; SM70-NEXT: or.b32 %r13, %r12, %r11;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
; SM70-NEXT: setp.ne.b32 %p3, %r3, %r15;
; SM70-NEXT: mov.b32 %r15, %r3;
; SM70-NEXT: @%p3 bra $L__BB48_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r14, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
%retval = atomicrmw udec_wrap ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
ret i8 %retval
}
; atomicrmw udec_wrap on an i16 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Expected PTX emulates the halfword operation on the enclosing 32-bit word:
;   - the word address is the pointer masked with -4, and the bit offset of
;     the halfword is (pointer & 3) << 3;
;   - 65535 shifted to that offset and then inverted is the mask that keeps
;     the other bits of the word;
;   - a ld.relaxed.cta.global.b32 seeds the loop, which extracts the halfword,
;     selects the operand when the halfword is 0 or is unsigned > the operand
;     and the halfword minus 1 otherwise, and retries
;     atom.relaxed.cta.global.cas.b32 while the word it returns differs from
;     the word it expected;
;   - fence.acq_rel.cta is expected once before the initial load and once
;     after the loop, and the last CAS result shifted right by the bit offset
;     is stored as the return value.
define i16 @udec_wrap_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
; SM70-LABEL: udec_wrap_acq_rel_i16_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<4>;
; SM70-NEXT: .reg .b16 %rs<6>;
; SM70-NEXT: .reg .b32 %r<15>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [udec_wrap_acq_rel_i16_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [udec_wrap_acq_rel_i16_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 65535;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r14, [%rd1];
; SM70-NEXT: $L__BB49_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r14, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: add.s16 %rs3, %rs2, -1;
; SM70-NEXT: setp.eq.b16 %p1, %rs2, 0;
; SM70-NEXT: setp.gt.u16 %p2, %rs2, %rs1;
; SM70-NEXT: selp.b16 %rs4, %rs1, %rs3, %p2;
; SM70-NEXT: selp.b16 %rs5, %rs1, %rs4, %p1;
; SM70-NEXT: cvt.u32.u16 %r9, %rs5;
; SM70-NEXT: shl.b32 %r10, %r9, %r1;
; SM70-NEXT: and.b32 %r11, %r14, %r2;
; SM70-NEXT: or.b32 %r12, %r11, %r10;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
; SM70-NEXT: setp.ne.b32 %p3, %r3, %r14;
; SM70-NEXT: mov.b32 %r14, %r3;
; SM70-NEXT: @%p3 bra $L__BB49_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r13, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
%retval = atomicrmw udec_wrap ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
ret i16 %retval
}
; atomicrmw udec_wrap on an i32 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Expected PTX: one atom.acq_rel.cta.global.dec.u32 whose result is stored as
; the return value; the checks list no fences and no retry loop for this width.
define i32 @udec_wrap_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: udec_wrap_acq_rel_i32_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<3>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [udec_wrap_acq_rel_i32_global_cta_param_0];
; SM70-NEXT: ld.param.b32 %r1, [udec_wrap_acq_rel_i32_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.dec.u32 %r2, [%rd1], %r1;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
%retval = atomicrmw udec_wrap ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
ret i32 %retval
}
; atomicrmw udec_wrap on an i64 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Unlike the i32 variant, the expected PTX here is a compare-and-swap loop:
; fence.acq_rel.cta, then a ld.relaxed.cta.global.b64 that seeds the expected
; value. Each iteration selects the operand when the current value is 0 or is
; unsigned > the operand (current - 1 otherwise) and attempts
; atom.relaxed.cta.global.cas.b64, looping while the returned value differs
; from the expected one. A second fence.acq_rel.cta follows the loop and the
; last value returned by the CAS is stored as the return value.
define i64 @udec_wrap_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
; SM70-LABEL: udec_wrap_acq_rel_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<4>;
; SM70-NEXT: .reg .b64 %rd<8>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd3, [udec_wrap_acq_rel_i64_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [udec_wrap_acq_rel_i64_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b64 %rd7, [%rd2];
; SM70-NEXT: $L__BB51_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: add.s64 %rd4, %rd7, -1;
; SM70-NEXT: setp.eq.b64 %p1, %rd7, 0;
; SM70-NEXT: setp.gt.u64 %p2, %rd7, %rd3;
; SM70-NEXT: selp.b64 %rd5, %rd3, %rd4, %p2;
; SM70-NEXT: selp.b64 %rd6, %rd3, %rd5, %p1;
; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd7, %rd6;
; SM70-NEXT: setp.ne.b64 %p3, %rd1, %rd7;
; SM70-NEXT: mov.b64 %rd7, %rd1;
; SM70-NEXT: @%p3 bra $L__BB51_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b64 [func_retval0], %rd1;
; SM70-NEXT: ret;
%retval = atomicrmw udec_wrap ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
ret i64 %retval
}
; atomicrmw usub_cond on an i8 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Expected PTX emulates the byte-wide operation on the enclosing 32-bit word:
;   - the word address is the pointer masked with -4, and the bit offset of
;     the byte is (pointer & 3) << 3;
;   - 255 shifted to that offset and then inverted is the mask that keeps the
;     neighbouring bytes;
;   - a ld.relaxed.cta.global.b32 seeds the loop, which extracts the byte,
;     selects byte minus operand when the byte (masked to 8 bits) is
;     unsigned >= the operand and the unchanged byte otherwise, masks that
;     result to 8 bits and retries atom.relaxed.cta.global.cas.b32 while the
;     word it returns differs from the word it expected;
;   - fence.acq_rel.cta is expected once before the initial load and once
;     after the loop, and the last CAS result shifted right by the bit offset
;     is stored as the return value.
define i8 @usub_cond_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: usub_cond_acq_rel_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<6>;
; SM70-NEXT: .reg .b32 %r<16>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b8 %rs1, [usub_cond_acq_rel_i8_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [usub_cond_acq_rel_i8_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 255;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r15, [%rd1];
; SM70-NEXT: $L__BB52_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r15, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: and.b16 %rs3, %rs2, 255;
; SM70-NEXT: setp.ge.u16 %p1, %rs3, %rs1;
; SM70-NEXT: sub.s16 %rs4, %rs2, %rs1;
; SM70-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
; SM70-NEXT: cvt.u32.u16 %r9, %rs5;
; SM70-NEXT: and.b32 %r10, %r9, 255;
; SM70-NEXT: shl.b32 %r11, %r10, %r1;
; SM70-NEXT: and.b32 %r12, %r15, %r2;
; SM70-NEXT: or.b32 %r13, %r12, %r11;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
; SM70-NEXT: setp.ne.b32 %p2, %r3, %r15;
; SM70-NEXT: mov.b32 %r15, %r3;
; SM70-NEXT: @%p2 bra $L__BB52_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r14, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
%retval = atomicrmw usub_cond ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
ret i8 %retval
}
; atomicrmw usub_cond on an i16 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Expected PTX emulates the halfword operation on the enclosing 32-bit word:
;   - the word address is the pointer masked with -4, and the bit offset of
;     the halfword is (pointer & 3) << 3;
;   - 65535 shifted to that offset and then inverted is the mask that keeps
;     the other bits of the word;
;   - a ld.relaxed.cta.global.b32 seeds the loop, which extracts the halfword,
;     selects halfword minus operand when the halfword is unsigned >= the
;     operand and the unchanged halfword otherwise, and retries
;     atom.relaxed.cta.global.cas.b32 while the word it returns differs from
;     the word it expected;
;   - fence.acq_rel.cta is expected once before the initial load and once
;     after the loop, and the last CAS result shifted right by the bit offset
;     is stored as the return value.
define i16 @usub_cond_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
; SM70-LABEL: usub_cond_acq_rel_i16_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<5>;
; SM70-NEXT: .reg .b32 %r<15>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [usub_cond_acq_rel_i16_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [usub_cond_acq_rel_i16_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 65535;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r14, [%rd1];
; SM70-NEXT: $L__BB53_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r14, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: setp.ge.u16 %p1, %rs2, %rs1;
; SM70-NEXT: sub.s16 %rs3, %rs2, %rs1;
; SM70-NEXT: selp.b16 %rs4, %rs3, %rs2, %p1;
; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
; SM70-NEXT: shl.b32 %r10, %r9, %r1;
; SM70-NEXT: and.b32 %r11, %r14, %r2;
; SM70-NEXT: or.b32 %r12, %r11, %r10;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
; SM70-NEXT: setp.ne.b32 %p2, %r3, %r14;
; SM70-NEXT: mov.b32 %r14, %r3;
; SM70-NEXT: @%p2 bra $L__BB53_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r13, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
%retval = atomicrmw usub_cond ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
ret i16 %retval
}
; atomicrmw usub_cond on an i32 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; The expected PTX is a compare-and-swap loop on the full 32-bit value:
; fence.acq_rel.cta, then a ld.relaxed.cta.global.b32 that seeds the expected
; value. Each iteration selects current - operand when the current value is
; unsigned >= the operand (the unchanged current value otherwise) and attempts
; atom.relaxed.cta.global.cas.b32, looping while the returned value differs
; from the expected one. A second fence.acq_rel.cta follows the loop and the
; last value returned by the CAS is stored as the return value.
define i32 @usub_cond_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: usub_cond_acq_rel_i32_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b32 %r<6>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r2, [usub_cond_acq_rel_i32_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd1, [usub_cond_acq_rel_i32_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r5, [%rd1];
; SM70-NEXT: $L__BB54_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: setp.ge.u32 %p1, %r5, %r2;
; SM70-NEXT: sub.s32 %r3, %r5, %r2;
; SM70-NEXT: selp.b32 %r4, %r3, %r5, %p1;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
; SM70-NEXT: setp.ne.b32 %p2, %r1, %r5;
; SM70-NEXT: mov.b32 %r5, %r1;
; SM70-NEXT: @%p2 bra $L__BB54_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r1;
; SM70-NEXT: ret;
%retval = atomicrmw usub_cond ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
ret i32 %retval
}
; atomicrmw usub_cond on an i64 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; The expected PTX is a compare-and-swap loop on the full 64-bit value:
; fence.acq_rel.cta, then a ld.relaxed.cta.global.b64 that seeds the expected
; value. Each iteration selects current - operand when the current value is
; unsigned >= the operand (the unchanged current value otherwise) and attempts
; atom.relaxed.cta.global.cas.b64, looping while the returned value differs
; from the expected one. A second fence.acq_rel.cta follows the loop and the
; last value returned by the CAS is stored as the return value.
define i64 @usub_cond_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
; SM70-LABEL: usub_cond_acq_rel_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b64 %rd<7>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd3, [usub_cond_acq_rel_i64_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [usub_cond_acq_rel_i64_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b64 %rd6, [%rd2];
; SM70-NEXT: $L__BB55_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: setp.ge.u64 %p1, %rd6, %rd3;
; SM70-NEXT: sub.s64 %rd4, %rd6, %rd3;
; SM70-NEXT: selp.b64 %rd5, %rd4, %rd6, %p1;
; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd6, %rd5;
; SM70-NEXT: setp.ne.b64 %p2, %rd1, %rd6;
; SM70-NEXT: mov.b64 %rd6, %rd1;
; SM70-NEXT: @%p2 bra $L__BB55_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b64 [func_retval0], %rd1;
; SM70-NEXT: ret;
%retval = atomicrmw usub_cond ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
ret i64 %retval
}
; atomicrmw usub_sat on an i8 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Expected PTX emulates the byte-wide operation on the enclosing 32-bit word:
;   - the word address is the pointer masked with -4, and the bit offset of
;     the byte is (pointer & 3) << 3;
;   - 255 shifted to that offset and then inverted is the mask that keeps the
;     neighbouring bytes;
;   - a ld.relaxed.cta.global.b32 seeds the loop, which extracts the byte,
;     masks it to 8 bits, computes max.u16(byte, operand) - operand (which is
;     0 whenever the byte does not exceed the operand) and retries
;     atom.relaxed.cta.global.cas.b32 while the word it returns differs from
;     the word it expected;
;   - fence.acq_rel.cta is expected once before the initial load and once
;     after the loop, and the last CAS result shifted right by the bit offset
;     is stored as the return value.
define i8 @usub_sat_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: usub_sat_acq_rel_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<6>;
; SM70-NEXT: .reg .b32 %r<15>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b8 %rs1, [usub_sat_acq_rel_i8_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [usub_sat_acq_rel_i8_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 255;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r14, [%rd1];
; SM70-NEXT: $L__BB56_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r14, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: and.b16 %rs3, %rs2, 255;
; SM70-NEXT: max.u16 %rs4, %rs3, %rs1;
; SM70-NEXT: sub.s16 %rs5, %rs4, %rs1;
; SM70-NEXT: cvt.u32.u16 %r9, %rs5;
; SM70-NEXT: shl.b32 %r10, %r9, %r1;
; SM70-NEXT: and.b32 %r11, %r14, %r2;
; SM70-NEXT: or.b32 %r12, %r11, %r10;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
; SM70-NEXT: setp.ne.b32 %p1, %r3, %r14;
; SM70-NEXT: mov.b32 %r14, %r3;
; SM70-NEXT: @%p1 bra $L__BB56_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r13, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
%retval = atomicrmw usub_sat ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
ret i8 %retval
}
; atomicrmw usub_sat on an i16 through an addrspace(1) pointer, with
; syncscope("block") and acq_rel ordering.
; Expected PTX emulates the halfword operation on the enclosing 32-bit word:
;   - the word address is the pointer masked with -4, and the bit offset of
;     the halfword is (pointer & 3) << 3;
;   - 65535 shifted to that offset and then inverted is the mask that keeps
;     the other bits of the word;
;   - a ld.relaxed.cta.global.b32 seeds the loop, which extracts the halfword,
;     computes max.u16(halfword, operand) - operand (which is 0 whenever the
;     halfword does not exceed the operand) and retries
;     atom.relaxed.cta.global.cas.b32 while the word it returns differs from
;     the word it expected;
;   - fence.acq_rel.cta is expected once before the initial load and once
;     after the loop, and the last CAS result shifted right by the bit offset
;     is stored as the return value.
define i16 @usub_sat_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
; SM70-LABEL: usub_sat_acq_rel_i16_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<5>;
; SM70-NEXT: .reg .b32 %r<15>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [usub_sat_acq_rel_i16_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [usub_sat_acq_rel_i16_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 65535;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r14, [%rd1];
; SM70-NEXT: $L__BB57_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r14, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: max.u16 %rs3, %rs2, %rs1;
; SM70-NEXT: sub.s16 %rs4, %rs3, %rs1;
; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
; SM70-NEXT: shl.b32 %r10, %r9, %r1;
; SM70-NEXT: and.b32 %r11, %r14, %r2;
; SM70-NEXT: or.b32 %r12, %r11, %r10;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
; SM70-NEXT: setp.ne.b32 %p1, %r3, %r14;
; SM70-NEXT: mov.b32 %r14, %r3;
; SM70-NEXT: @%p1 bra $L__BB57_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r13, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
%retval = atomicrmw usub_sat ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
ret i16 %retval
}
; Test: atomicrmw usub_sat on i32 in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed initial load followed by a relaxed 32-bit
; CAS retry loop, bracketed by fence.acq_rel.cta. The saturating subtract shows
; up as max.u32 followed by sub.s32.
define i32 @usub_sat_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: usub_sat_acq_rel_i32_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b32 %r<6>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r2, [usub_sat_acq_rel_i32_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd1, [usub_sat_acq_rel_i32_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r5, [%rd1];
; SM70-NEXT: $L__BB58_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: max.u32 %r3, %r5, %r2;
; SM70-NEXT: sub.s32 %r4, %r3, %r2;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
; SM70-NEXT: setp.ne.b32 %p1, %r1, %r5;
; SM70-NEXT: mov.b32 %r5, %r1;
; SM70-NEXT: @%p1 bra $L__BB58_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r1;
; SM70-NEXT: ret;
 %retval = atomicrmw usub_sat ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
 ret i32 %retval
}
; Test: atomicrmw usub_sat on i64 in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed initial load followed by a relaxed 64-bit
; CAS retry loop, bracketed by fence.acq_rel.cta. The saturating subtract shows
; up as max.u64 followed by sub.s64.
define i64 @usub_sat_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
; SM70-LABEL: usub_sat_acq_rel_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b64 %rd<7>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd3, [usub_sat_acq_rel_i64_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [usub_sat_acq_rel_i64_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b64 %rd6, [%rd2];
; SM70-NEXT: $L__BB59_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: max.u64 %rd4, %rd6, %rd3;
; SM70-NEXT: sub.s64 %rd5, %rd4, %rd3;
; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd6, %rd5;
; SM70-NEXT: setp.ne.b64 %p1, %rd1, %rd6;
; SM70-NEXT: mov.b64 %rd6, %rd1;
; SM70-NEXT: @%p1 bra $L__BB59_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b64 [func_retval0], %rd1;
; SM70-NEXT: ret;
 %retval = atomicrmw usub_sat ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
 ret i64 %retval
}
; Test: atomicrmw fadd on float in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a single atom.acq_rel.cta.global.add.f32; the checked
; sequence contains no CAS loop and no separate fence instructions.
define float @fadd_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
; SM70-LABEL: fadd_acq_rel_float_global_cta(
; SM70: {
; SM70-NEXT: .reg .b32 %r<3>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [fadd_acq_rel_float_global_cta_param_0];
; SM70-NEXT: ld.param.b32 %r1, [fadd_acq_rel_float_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.add.f32 %r2, [%rd1], %r1;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
 %retval = atomicrmw fadd ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
 ret float %retval
}
; Test: atomicrmw fsub on float in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed initial load followed by a relaxed 32-bit
; CAS retry loop around sub.rn.f32, bracketed by fence.acq_rel.cta.
define float @fsub_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
; SM70-LABEL: fsub_acq_rel_float_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b32 %r<5>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r2, [fsub_acq_rel_float_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd1, [fsub_acq_rel_float_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r4, [%rd1];
; SM70-NEXT: $L__BB61_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: sub.rn.f32 %r3, %r4, %r2;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r4, %r3;
; SM70-NEXT: setp.ne.b32 %p1, %r1, %r4;
; SM70-NEXT: mov.b32 %r4, %r1;
; SM70-NEXT: @%p1 bra $L__BB61_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r1;
; SM70-NEXT: ret;
 %retval = atomicrmw fsub ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
 ret float %retval
}
; Test: atomicrmw fmin on float in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed initial load followed by a relaxed 32-bit
; CAS retry loop around min.f32, bracketed by fence.acq_rel.cta.
define float @fmin_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
; SM70-LABEL: fmin_acq_rel_float_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b32 %r<5>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r2, [fmin_acq_rel_float_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd1, [fmin_acq_rel_float_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r4, [%rd1];
; SM70-NEXT: $L__BB62_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: min.f32 %r3, %r4, %r2;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r4, %r3;
; SM70-NEXT: setp.ne.b32 %p1, %r1, %r4;
; SM70-NEXT: mov.b32 %r4, %r1;
; SM70-NEXT: @%p1 bra $L__BB62_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r1;
; SM70-NEXT: ret;
 %retval = atomicrmw fmin ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
 ret float %retval
}
; Test: atomicrmw fmax on float in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed initial load followed by a relaxed 32-bit
; CAS retry loop around max.f32, bracketed by fence.acq_rel.cta.
define float @fmax_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
; SM70-LABEL: fmax_acq_rel_float_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b32 %r<5>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r2, [fmax_acq_rel_float_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd1, [fmax_acq_rel_float_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r4, [%rd1];
; SM70-NEXT: $L__BB63_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: max.f32 %r3, %r4, %r2;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r4, %r3;
; SM70-NEXT: setp.ne.b32 %p1, %r1, %r4;
; SM70-NEXT: mov.b32 %r4, %r1;
; SM70-NEXT: @%p1 bra $L__BB63_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r1;
; SM70-NEXT: ret;
 %retval = atomicrmw fmax ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
 ret float %retval
}
; Test: atomicrmw fminimum on float in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed 32-bit CAS retry loop bracketed by
; fence.acq_rel.cta. Inside the loop, min.f32 is replaced by the constant
; 0f7FC00000 when setp.nan.f32 is true, and when that result compares equal to
; zero a select chain keyed on bit-pattern equality with -2147483648 picks
; between the operands. The compare on %val is emitted once, before the loop.
define float @fminimum_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
; SM70-LABEL: fminimum_acq_rel_float_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<6>;
; SM70-NEXT: .reg .b32 %r<9>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r2, [fminimum_acq_rel_float_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd1, [fminimum_acq_rel_float_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r8, [%rd1];
; SM70-NEXT: setp.eq.b32 %p3, %r2, -2147483648;
; SM70-NEXT: $L__BB64_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: setp.nan.f32 %p1, %r8, %r2;
; SM70-NEXT: min.f32 %r3, %r8, %r2;
; SM70-NEXT: selp.f32 %r4, 0f7FC00000, %r3, %p1;
; SM70-NEXT: setp.eq.b32 %p2, %r8, -2147483648;
; SM70-NEXT: selp.f32 %r5, %r8, %r4, %p2;
; SM70-NEXT: selp.f32 %r6, %r2, %r5, %p3;
; SM70-NEXT: setp.eq.f32 %p4, %r4, 0f00000000;
; SM70-NEXT: selp.f32 %r7, %r6, %r4, %p4;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r8, %r7;
; SM70-NEXT: setp.ne.b32 %p5, %r1, %r8;
; SM70-NEXT: mov.b32 %r8, %r1;
; SM70-NEXT: @%p5 bra $L__BB64_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r1;
; SM70-NEXT: ret;
 %retval = atomicrmw fminimum ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
 ret float %retval
}
; Test: atomicrmw fmaximum on float in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed 32-bit CAS retry loop bracketed by
; fence.acq_rel.cta. Inside the loop, max.f32 is replaced by the constant
; 0f7FC00000 when setp.nan.f32 is true, and when that result compares equal to
; zero a select chain keyed on bit-pattern equality with 0 picks between the
; operands. The compare on %val is emitted once, before the loop.
define float @fmaximum_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
; SM70-LABEL: fmaximum_acq_rel_float_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<6>;
; SM70-NEXT: .reg .b32 %r<9>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r2, [fmaximum_acq_rel_float_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd1, [fmaximum_acq_rel_float_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r8, [%rd1];
; SM70-NEXT: setp.eq.b32 %p3, %r2, 0;
; SM70-NEXT: $L__BB65_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: setp.nan.f32 %p1, %r8, %r2;
; SM70-NEXT: max.f32 %r3, %r8, %r2;
; SM70-NEXT: selp.f32 %r4, 0f7FC00000, %r3, %p1;
; SM70-NEXT: setp.eq.b32 %p2, %r8, 0;
; SM70-NEXT: selp.f32 %r5, %r8, %r4, %p2;
; SM70-NEXT: selp.f32 %r6, %r2, %r5, %p3;
; SM70-NEXT: setp.eq.f32 %p4, %r4, 0f00000000;
; SM70-NEXT: selp.f32 %r7, %r6, %r4, %p4;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r8, %r7;
; SM70-NEXT: setp.ne.b32 %p5, %r1, %r8;
; SM70-NEXT: mov.b32 %r8, %r1;
; SM70-NEXT: @%p5 bra $L__BB65_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r1;
; SM70-NEXT: ret;
 %retval = atomicrmw fmaximum ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
 ret float %retval
}
; Test: atomicrmw fadd on double in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a single atom.acq_rel.cta.global.add.f64; the checked
; sequence contains no CAS loop and no separate fence instructions.
define double @fadd_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
; SM70-LABEL: fadd_acq_rel_double_global_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<4>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [fadd_acq_rel_double_global_cta_param_0];
; SM70-NEXT: ld.param.b64 %rd2, [fadd_acq_rel_double_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.add.f64 %rd3, [%rd1], %rd2;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
 %retval = atomicrmw fadd ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
 ret double %retval
}
; Test: atomicrmw fsub on double in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed initial load followed by a relaxed 64-bit
; CAS retry loop around sub.rn.f64, bracketed by fence.acq_rel.cta.
define double @fsub_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
; SM70-LABEL: fsub_acq_rel_double_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b64 %rd<6>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd3, [fsub_acq_rel_double_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [fsub_acq_rel_double_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b64 %rd5, [%rd2];
; SM70-NEXT: $L__BB67_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: sub.rn.f64 %rd4, %rd5, %rd3;
; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd5, %rd4;
; SM70-NEXT: setp.ne.b64 %p1, %rd1, %rd5;
; SM70-NEXT: mov.b64 %rd5, %rd1;
; SM70-NEXT: @%p1 bra $L__BB67_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b64 [func_retval0], %rd1;
; SM70-NEXT: ret;
 %retval = atomicrmw fsub ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
 ret double %retval
}
; Test: atomicrmw fmin on double in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed initial load followed by a relaxed 64-bit
; CAS retry loop around min.f64, bracketed by fence.acq_rel.cta.
define double @fmin_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
; SM70-LABEL: fmin_acq_rel_double_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b64 %rd<6>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd3, [fmin_acq_rel_double_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [fmin_acq_rel_double_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b64 %rd5, [%rd2];
; SM70-NEXT: $L__BB68_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: min.f64 %rd4, %rd5, %rd3;
; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd5, %rd4;
; SM70-NEXT: setp.ne.b64 %p1, %rd1, %rd5;
; SM70-NEXT: mov.b64 %rd5, %rd1;
; SM70-NEXT: @%p1 bra $L__BB68_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b64 [func_retval0], %rd1;
; SM70-NEXT: ret;
 %retval = atomicrmw fmin ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
 ret double %retval
}
; Test: atomicrmw fmax on double in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed initial load followed by a relaxed 64-bit
; CAS retry loop around max.f64, bracketed by fence.acq_rel.cta.
define double @fmax_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
; SM70-LABEL: fmax_acq_rel_double_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b64 %rd<6>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd3, [fmax_acq_rel_double_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [fmax_acq_rel_double_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b64 %rd5, [%rd2];
; SM70-NEXT: $L__BB69_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: max.f64 %rd4, %rd5, %rd3;
; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd5, %rd4;
; SM70-NEXT: setp.ne.b64 %p1, %rd1, %rd5;
; SM70-NEXT: mov.b64 %rd5, %rd1;
; SM70-NEXT: @%p1 bra $L__BB69_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b64 [func_retval0], %rd1;
; SM70-NEXT: ret;
 %retval = atomicrmw fmax ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
 ret double %retval
}
; Test: atomicrmw fminimum on double in the global address space
; (addrspace(1)), syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed 64-bit CAS retry loop bracketed by
; fence.acq_rel.cta. Inside the loop, min.f64 is replaced by the constant
; 0d7FF8000000000000 when setp.nan.f64 is true, and when that result compares
; equal to zero a select chain keyed on bit-pattern equality with
; -9223372036854775808 picks between the operands. The compare on %val is
; emitted once, before the loop.
define double @fminimum_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
; SM70-LABEL: fminimum_acq_rel_double_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<6>;
; SM70-NEXT: .reg .b64 %rd<10>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd3, [fminimum_acq_rel_double_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [fminimum_acq_rel_double_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b64 %rd9, [%rd2];
; SM70-NEXT: setp.eq.b64 %p3, %rd3, -9223372036854775808;
; SM70-NEXT: $L__BB70_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: setp.nan.f64 %p1, %rd9, %rd3;
; SM70-NEXT: min.f64 %rd4, %rd9, %rd3;
; SM70-NEXT: selp.f64 %rd5, 0d7FF8000000000000, %rd4, %p1;
; SM70-NEXT: setp.eq.b64 %p2, %rd9, -9223372036854775808;
; SM70-NEXT: selp.f64 %rd6, %rd9, %rd5, %p2;
; SM70-NEXT: selp.f64 %rd7, %rd3, %rd6, %p3;
; SM70-NEXT: setp.eq.f64 %p4, %rd5, 0d0000000000000000;
; SM70-NEXT: selp.f64 %rd8, %rd7, %rd5, %p4;
; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd9, %rd8;
; SM70-NEXT: setp.ne.b64 %p5, %rd1, %rd9;
; SM70-NEXT: mov.b64 %rd9, %rd1;
; SM70-NEXT: @%p5 bra $L__BB70_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b64 [func_retval0], %rd1;
; SM70-NEXT: ret;
 %retval = atomicrmw fminimum ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
 ret double %retval
}
; Test: atomicrmw fmaximum on double in the global address space
; (addrspace(1)), syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed 64-bit CAS retry loop bracketed by
; fence.acq_rel.cta. Inside the loop, max.f64 is replaced by the constant
; 0d7FF8000000000000 when setp.nan.f64 is true, and when that result compares
; equal to zero a select chain keyed on bit-pattern equality with 0 picks
; between the operands. The compare on %val is emitted once, before the loop.
define double @fmaximum_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
; SM70-LABEL: fmaximum_acq_rel_double_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<6>;
; SM70-NEXT: .reg .b64 %rd<10>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd3, [fmaximum_acq_rel_double_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [fmaximum_acq_rel_double_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: ld.relaxed.cta.global.b64 %rd9, [%rd2];
; SM70-NEXT: setp.eq.b64 %p3, %rd3, 0;
; SM70-NEXT: $L__BB71_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: setp.nan.f64 %p1, %rd9, %rd3;
; SM70-NEXT: max.f64 %rd4, %rd9, %rd3;
; SM70-NEXT: selp.f64 %rd5, 0d7FF8000000000000, %rd4, %p1;
; SM70-NEXT: setp.eq.b64 %p2, %rd9, 0;
; SM70-NEXT: selp.f64 %rd6, %rd9, %rd5, %p2;
; SM70-NEXT: selp.f64 %rd7, %rd3, %rd6, %p3;
; SM70-NEXT: setp.eq.f64 %p4, %rd5, 0d0000000000000000;
; SM70-NEXT: selp.f64 %rd8, %rd7, %rd5, %p4;
; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd9, %rd8;
; SM70-NEXT: setp.ne.b64 %p5, %rd1, %rd9;
; SM70-NEXT: mov.b64 %rd9, %rd1;
; SM70-NEXT: @%p5 bra $L__BB71_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b64 [func_retval0], %rd1;
; SM70-NEXT: ret;
 %retval = atomicrmw fmaximum ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
 ret double %retval
}
; Test: atomicrmw fadd on half in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a single atom.acq_rel.cta.global.add.noftz.f16; the
; checked sequence contains no CAS loop and no separate fence instructions.
define half @fadd_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
; SM70-LABEL: fadd_acq_rel_half_global_cta(
; SM70: {
; SM70-NEXT: .reg .b16 %rs<3>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [fadd_acq_rel_half_global_cta_param_0];
; SM70-NEXT: ld.param.b16 %rs1, [fadd_acq_rel_half_global_cta_param_1];
; SM70-NEXT: atom.acq_rel.cta.global.add.noftz.f16 %rs2, [%rd1], %rs1;
; SM70-NEXT: st.param.b16 [func_retval0], %rs2;
; SM70-NEXT: ret;
 %retval = atomicrmw fadd ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
 ret half %retval
}
; Test: atomicrmw fsub on half in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: the address is masked with -4 to reach the enclosing
; 32-bit word, a shift amount and inverted 16-bit mask are derived from the low
; two address bits, and the update runs as a relaxed 32-bit CAS retry loop
; around sub.rn.f16, bracketed by fence.acq_rel.cta.
define half @fsub_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
; SM70-LABEL: fsub_acq_rel_half_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<4>;
; SM70-NEXT: .reg .b32 %r<15>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [fsub_acq_rel_half_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [fsub_acq_rel_half_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 65535;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r14, [%rd1];
; SM70-NEXT: $L__BB73_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r14, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: sub.rn.f16 %rs3, %rs2, %rs1;
; SM70-NEXT: cvt.u32.u16 %r9, %rs3;
; SM70-NEXT: shl.b32 %r10, %r9, %r1;
; SM70-NEXT: and.b32 %r11, %r14, %r2;
; SM70-NEXT: or.b32 %r12, %r11, %r10;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
; SM70-NEXT: setp.ne.b32 %p1, %r3, %r14;
; SM70-NEXT: mov.b32 %r14, %r3;
; SM70-NEXT: @%p1 bra $L__BB73_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r13, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b16 [func_retval0], %r13;
; SM70-NEXT: ret;
 %retval = atomicrmw fsub ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
 ret half %retval
}
; Test: atomicrmw fmin on half in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed 32-bit CAS retry loop on the enclosing
; 4-byte-aligned word (address masked with -4, shift/mask derived from the low
; two address bits), bracketed by fence.acq_rel.cta. The minimum is computed in
; f32: %val is converted once before the loop (cvt.f32.f16), the loaded lane is
; converted each iteration, min.f32 is applied, and the result is converted
; back with cvt.rn.f16.f32.
define half @fmin_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
; SM70-LABEL: fmin_acq_rel_half_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<4>;
; SM70-NEXT: .reg .b32 %r<18>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [fmin_acq_rel_half_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [fmin_acq_rel_half_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 65535;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r17, [%rd1];
; SM70-NEXT: cvt.f32.f16 %r10, %rs1;
; SM70-NEXT: $L__BB74_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r17, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: cvt.f32.f16 %r9, %rs2;
; SM70-NEXT: min.f32 %r11, %r9, %r10;
; SM70-NEXT: cvt.rn.f16.f32 %rs3, %r11;
; SM70-NEXT: cvt.u32.u16 %r12, %rs3;
; SM70-NEXT: shl.b32 %r13, %r12, %r1;
; SM70-NEXT: and.b32 %r14, %r17, %r2;
; SM70-NEXT: or.b32 %r15, %r14, %r13;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r17, %r15;
; SM70-NEXT: setp.ne.b32 %p1, %r3, %r17;
; SM70-NEXT: mov.b32 %r17, %r3;
; SM70-NEXT: @%p1 bra $L__BB74_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r16, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b16 [func_retval0], %r16;
; SM70-NEXT: ret;
 %retval = atomicrmw fmin ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
 ret half %retval
}
; Test: atomicrmw fmax on half in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed 32-bit CAS retry loop on the enclosing
; 4-byte-aligned word (address masked with -4, shift/mask derived from the low
; two address bits), bracketed by fence.acq_rel.cta. The maximum is computed in
; f32: %val is converted once before the loop (cvt.f32.f16), the loaded lane is
; converted each iteration, max.f32 is applied, and the result is converted
; back with cvt.rn.f16.f32.
define half @fmax_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
; SM70-LABEL: fmax_acq_rel_half_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<4>;
; SM70-NEXT: .reg .b32 %r<18>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [fmax_acq_rel_half_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [fmax_acq_rel_half_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 65535;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r17, [%rd1];
; SM70-NEXT: cvt.f32.f16 %r10, %rs1;
; SM70-NEXT: $L__BB75_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r17, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: cvt.f32.f16 %r9, %rs2;
; SM70-NEXT: max.f32 %r11, %r9, %r10;
; SM70-NEXT: cvt.rn.f16.f32 %rs3, %r11;
; SM70-NEXT: cvt.u32.u16 %r12, %rs3;
; SM70-NEXT: shl.b32 %r13, %r12, %r1;
; SM70-NEXT: and.b32 %r14, %r17, %r2;
; SM70-NEXT: or.b32 %r15, %r14, %r13;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r17, %r15;
; SM70-NEXT: setp.ne.b32 %p1, %r3, %r17;
; SM70-NEXT: mov.b32 %r17, %r3;
; SM70-NEXT: @%p1 bra $L__BB75_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r16, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b16 [func_retval0], %r16;
; SM70-NEXT: ret;
 %retval = atomicrmw fmax ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
 ret half %retval
}
; Test: atomicrmw fminimum on half in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed 32-bit CAS retry loop on the enclosing
; 4-byte-aligned word (address masked with -4, shift/mask derived from the low
; two address bits), bracketed by fence.acq_rel.cta. The minimum is built from
; f16 compares and selects: setp.lt.f16 picks the smaller operand, setp.nan.f16
; substitutes the constant 0x7E00, and when that result compares equal to zero
; a select chain keyed on bit-pattern equality with -32768 picks between the
; operands. The compare on %val is emitted once, before the loop.
define half @fminimum_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
; SM70-LABEL: fminimum_acq_rel_half_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<7>;
; SM70-NEXT: .reg .b16 %rs<9>;
; SM70-NEXT: .reg .b32 %r<15>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [fminimum_acq_rel_half_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [fminimum_acq_rel_half_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 65535;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r14, [%rd1];
; SM70-NEXT: setp.eq.b16 %p4, %rs1, -32768;
; SM70-NEXT: $L__BB76_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r14, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: setp.lt.f16 %p1, %rs2, %rs1;
; SM70-NEXT: selp.b16 %rs3, %rs2, %rs1, %p1;
; SM70-NEXT: setp.nan.f16 %p2, %rs2, %rs1;
; SM70-NEXT: selp.b16 %rs4, 0x7E00, %rs3, %p2;
; SM70-NEXT: setp.eq.b16 %p3, %rs2, -32768;
; SM70-NEXT: selp.b16 %rs5, %rs2, %rs4, %p3;
; SM70-NEXT: selp.b16 %rs6, %rs1, %rs5, %p4;
; SM70-NEXT: mov.b16 %rs7, 0x0000;
; SM70-NEXT: setp.eq.f16 %p5, %rs4, %rs7;
; SM70-NEXT: selp.b16 %rs8, %rs6, %rs4, %p5;
; SM70-NEXT: cvt.u32.u16 %r9, %rs8;
; SM70-NEXT: shl.b32 %r10, %r9, %r1;
; SM70-NEXT: and.b32 %r11, %r14, %r2;
; SM70-NEXT: or.b32 %r12, %r11, %r10;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
; SM70-NEXT: setp.ne.b32 %p6, %r3, %r14;
; SM70-NEXT: mov.b32 %r14, %r3;
; SM70-NEXT: @%p6 bra $L__BB76_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r13, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b16 [func_retval0], %r13;
; SM70-NEXT: ret;
 %retval = atomicrmw fminimum ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
 ret half %retval
}
; Test: atomicrmw fmaximum on half in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed 32-bit CAS retry loop on the enclosing
; 4-byte-aligned word (address masked with -4, shift/mask derived from the low
; two address bits), bracketed by fence.acq_rel.cta. The maximum is built from
; f16 compares and selects: setp.gt.f16 picks the larger operand, setp.nan.f16
; substitutes the constant 0x7E00, and when that result compares equal to zero
; a select chain keyed on bit-pattern equality with 0 picks between the
; operands. The compare on %val is emitted once, before the loop.
define half @fmaximum_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
; SM70-LABEL: fmaximum_acq_rel_half_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<7>;
; SM70-NEXT: .reg .b16 %rs<9>;
; SM70-NEXT: .reg .b32 %r<15>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [fmaximum_acq_rel_half_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [fmaximum_acq_rel_half_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 65535;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r14, [%rd1];
; SM70-NEXT: setp.eq.b16 %p4, %rs1, 0;
; SM70-NEXT: $L__BB77_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r14, %r1;
; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
; SM70-NEXT: setp.gt.f16 %p1, %rs2, %rs1;
; SM70-NEXT: selp.b16 %rs3, %rs2, %rs1, %p1;
; SM70-NEXT: setp.nan.f16 %p2, %rs2, %rs1;
; SM70-NEXT: selp.b16 %rs4, 0x7E00, %rs3, %p2;
; SM70-NEXT: setp.eq.b16 %p3, %rs2, 0;
; SM70-NEXT: selp.b16 %rs5, %rs2, %rs4, %p3;
; SM70-NEXT: selp.b16 %rs6, %rs1, %rs5, %p4;
; SM70-NEXT: mov.b16 %rs7, 0x0000;
; SM70-NEXT: setp.eq.f16 %p5, %rs4, %rs7;
; SM70-NEXT: selp.b16 %rs8, %rs6, %rs4, %p5;
; SM70-NEXT: cvt.u32.u16 %r9, %rs8;
; SM70-NEXT: shl.b32 %r10, %r9, %r1;
; SM70-NEXT: and.b32 %r11, %r14, %r2;
; SM70-NEXT: or.b32 %r12, %r11, %r10;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
; SM70-NEXT: setp.ne.b32 %p6, %r3, %r14;
; SM70-NEXT: mov.b32 %r14, %r3;
; SM70-NEXT: @%p6 bra $L__BB77_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r13, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b16 [func_retval0], %r13;
; SM70-NEXT: ret;
 %retval = atomicrmw fmaximum ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
 ret half %retval
}
; Test: atomicrmw fadd on bfloat in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed 32-bit CAS retry loop on the enclosing
; 4-byte-aligned word (address masked with -4, shift/mask derived from the low
; two address bits), bracketed by fence.acq_rel.cta. Both operands are widened
; by shifting left 16 bits (%val once, before the loop), combined with
; add.rn.f32, and narrowed again by the bfe / add 32767 / shr 16 sequence, with
; a setp.nan.f32-controlled select of the result OR-ed with 4194304.
define bfloat @fadd_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
; SM70-LABEL: fadd_acq_rel_bfloat_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
; SM70-NEXT: .reg .b32 %r<24>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [fadd_acq_rel_bfloat_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [fadd_acq_rel_bfloat_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 65535;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r23, [%rd1];
; SM70-NEXT: cvt.u32.u16 %r10, %rs1;
; SM70-NEXT: shl.b32 %r11, %r10, 16;
; SM70-NEXT: $L__BB78_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r23, %r1;
; SM70-NEXT: shl.b32 %r9, %r8, 16;
; SM70-NEXT: add.rn.f32 %r12, %r9, %r11;
; SM70-NEXT: bfe.u32 %r13, %r12, 16, 1;
; SM70-NEXT: add.s32 %r14, %r13, %r12;
; SM70-NEXT: add.s32 %r15, %r14, 32767;
; SM70-NEXT: setp.nan.f32 %p1, %r12, %r12;
; SM70-NEXT: or.b32 %r16, %r12, 4194304;
; SM70-NEXT: selp.b32 %r17, %r16, %r15, %p1;
; SM70-NEXT: shr.u32 %r18, %r17, 16;
; SM70-NEXT: shl.b32 %r19, %r18, %r1;
; SM70-NEXT: and.b32 %r20, %r23, %r2;
; SM70-NEXT: or.b32 %r21, %r20, %r19;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r23, %r21;
; SM70-NEXT: setp.ne.b32 %p2, %r3, %r23;
; SM70-NEXT: mov.b32 %r23, %r3;
; SM70-NEXT: @%p2 bra $L__BB78_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r22, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b16 [func_retval0], %r22;
; SM70-NEXT: ret;
 %retval = atomicrmw fadd ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
 ret bfloat %retval
}
; Test: atomicrmw fsub on bfloat in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed 32-bit CAS retry loop on the enclosing
; 4-byte-aligned word (address masked with -4, shift/mask derived from the low
; two address bits), bracketed by fence.acq_rel.cta. Both operands are widened
; by shifting left 16 bits (%val once, before the loop), combined with
; sub.rn.f32, and narrowed again by the bfe / add 32767 / shr 16 sequence, with
; a setp.nan.f32-controlled select of the result OR-ed with 4194304.
define bfloat @fsub_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
; SM70-LABEL: fsub_acq_rel_bfloat_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
; SM70-NEXT: .reg .b32 %r<24>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [fsub_acq_rel_bfloat_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [fsub_acq_rel_bfloat_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 65535;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r23, [%rd1];
; SM70-NEXT: cvt.u32.u16 %r10, %rs1;
; SM70-NEXT: shl.b32 %r11, %r10, 16;
; SM70-NEXT: $L__BB79_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r23, %r1;
; SM70-NEXT: shl.b32 %r9, %r8, 16;
; SM70-NEXT: sub.rn.f32 %r12, %r9, %r11;
; SM70-NEXT: bfe.u32 %r13, %r12, 16, 1;
; SM70-NEXT: add.s32 %r14, %r13, %r12;
; SM70-NEXT: add.s32 %r15, %r14, 32767;
; SM70-NEXT: setp.nan.f32 %p1, %r12, %r12;
; SM70-NEXT: or.b32 %r16, %r12, 4194304;
; SM70-NEXT: selp.b32 %r17, %r16, %r15, %p1;
; SM70-NEXT: shr.u32 %r18, %r17, 16;
; SM70-NEXT: shl.b32 %r19, %r18, %r1;
; SM70-NEXT: and.b32 %r20, %r23, %r2;
; SM70-NEXT: or.b32 %r21, %r20, %r19;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r23, %r21;
; SM70-NEXT: setp.ne.b32 %p2, %r3, %r23;
; SM70-NEXT: mov.b32 %r23, %r3;
; SM70-NEXT: @%p2 bra $L__BB79_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r22, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b16 [func_retval0], %r22;
; SM70-NEXT: ret;
 %retval = atomicrmw fsub ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
 ret bfloat %retval
}
; Test: atomicrmw fmin on bfloat in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed 32-bit CAS retry loop on the enclosing
; 4-byte-aligned word (address masked with -4, shift/mask derived from the low
; two address bits), bracketed by fence.acq_rel.cta. Both operands are widened
; by shifting left 16 bits (%val once, before the loop), combined with
; min.f32, and narrowed again by the bfe / add 32767 / shr 16 sequence, with
; a setp.nan.f32-controlled select of the result OR-ed with 4194304.
define bfloat @fmin_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
; SM70-LABEL: fmin_acq_rel_bfloat_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
; SM70-NEXT: .reg .b32 %r<24>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [fmin_acq_rel_bfloat_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [fmin_acq_rel_bfloat_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 65535;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r23, [%rd1];
; SM70-NEXT: cvt.u32.u16 %r10, %rs1;
; SM70-NEXT: shl.b32 %r11, %r10, 16;
; SM70-NEXT: $L__BB80_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r23, %r1;
; SM70-NEXT: shl.b32 %r9, %r8, 16;
; SM70-NEXT: min.f32 %r12, %r9, %r11;
; SM70-NEXT: bfe.u32 %r13, %r12, 16, 1;
; SM70-NEXT: add.s32 %r14, %r13, %r12;
; SM70-NEXT: add.s32 %r15, %r14, 32767;
; SM70-NEXT: setp.nan.f32 %p1, %r12, %r12;
; SM70-NEXT: or.b32 %r16, %r12, 4194304;
; SM70-NEXT: selp.b32 %r17, %r16, %r15, %p1;
; SM70-NEXT: shr.u32 %r18, %r17, 16;
; SM70-NEXT: shl.b32 %r19, %r18, %r1;
; SM70-NEXT: and.b32 %r20, %r23, %r2;
; SM70-NEXT: or.b32 %r21, %r20, %r19;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r23, %r21;
; SM70-NEXT: setp.ne.b32 %p2, %r3, %r23;
; SM70-NEXT: mov.b32 %r23, %r3;
; SM70-NEXT: @%p2 bra $L__BB80_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r22, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b16 [func_retval0], %r22;
; SM70-NEXT: ret;
 %retval = atomicrmw fmin ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
 ret bfloat %retval
}
; Test: atomicrmw fmax on bfloat in the global address space (addrspace(1)),
; syncscope("block"), acq_rel ordering.
; Checked sm_70 lowering: a relaxed 32-bit CAS retry loop on the enclosing
; 4-byte-aligned word (address masked with -4, shift/mask derived from the low
; two address bits), bracketed by fence.acq_rel.cta. Both operands are widened
; by shifting left 16 bits (%val once, before the loop), combined with
; max.f32, and narrowed again by the bfe / add 32767 / shr 16 sequence, with
; a setp.nan.f32-controlled select of the result OR-ed with 4194304.
define bfloat @fmax_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
; SM70-LABEL: fmax_acq_rel_bfloat_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
; SM70-NEXT: .reg .b32 %r<24>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [fmax_acq_rel_bfloat_global_cta_param_1];
; SM70-NEXT: ld.param.b64 %rd2, [fmax_acq_rel_bfloat_global_cta_param_0];
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
; SM70-NEXT: and.b32 %r5, %r4, 3;
; SM70-NEXT: shl.b32 %r1, %r5, 3;
; SM70-NEXT: mov.b32 %r6, 65535;
; SM70-NEXT: shl.b32 %r7, %r6, %r1;
; SM70-NEXT: not.b32 %r2, %r7;
; SM70-NEXT: ld.relaxed.cta.global.b32 %r23, [%rd1];
; SM70-NEXT: cvt.u32.u16 %r10, %rs1;
; SM70-NEXT: shl.b32 %r11, %r10, 16;
; SM70-NEXT: $L__BB81_1: // %atomicrmw.start
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: shr.u32 %r8, %r23, %r1;
; SM70-NEXT: shl.b32 %r9, %r8, 16;
; SM70-NEXT: max.f32 %r12, %r9, %r11;
; SM70-NEXT: bfe.u32 %r13, %r12, 16, 1;
; SM70-NEXT: add.s32 %r14, %r13, %r12;
; SM70-NEXT: add.s32 %r15, %r14, 32767;
; SM70-NEXT: setp.nan.f32 %p1, %r12, %r12;
; SM70-NEXT: or.b32 %r16, %r12, 4194304;
; SM70-NEXT: selp.b32 %r17, %r16, %r15, %p1;
; SM70-NEXT: shr.u32 %r18, %r17, 16;
; SM70-NEXT: shl.b32 %r19, %r18, %r1;
; SM70-NEXT: and.b32 %r20, %r23, %r2;
; SM70-NEXT: or.b32 %r21, %r20, %r19;
; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r23, %r21;
; SM70-NEXT: setp.ne.b32 %p2, %r3, %r23;
; SM70-NEXT: mov.b32 %r23, %r3;
; SM70-NEXT: @%p2 bra $L__BB81_1;
; SM70-NEXT: // %bb.2: // %atomicrmw.end
; SM70-NEXT: shr.u32 %r22, %r3, %r1;
; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b16 [func_retval0], %r22;
; SM70-NEXT: ret;
 %retval = atomicrmw fmax ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
 ret bfloat %retval
}
; atomicrmw fminimum on a bfloat in addrspace(1), syncscope("block"), acq_rel.
; Expected lowering: a 32-bit CAS loop over the aligned containing word,
; seeded by an ld.relaxed.cta preheader load and bracketed by
; fence.acq_rel.cta before and after. The minimum is open-coded in the loop:
; setp.lt.f32 selects the smaller operand, an unordered compare substitutes
; 0x7FC0, and when the selected value compares equal to 0.0 an operand whose
; 16-bit pattern is -32768 (0x8000) is chosen instead.
define bfloat @fminimum_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
; SM70-LABEL: fminimum_acq_rel_bfloat_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<7>;
; SM70-NEXT:    .reg .b16 %rs<8>;
; SM70-NEXT:    .reg .b32 %r<20>;
; SM70-NEXT:    .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b16 %rs1, [fminimum_acq_rel_bfloat_global_cta_param_1];
; SM70-NEXT:    ld.param.b64 %rd2, [fminimum_acq_rel_bfloat_global_cta_param_0];
; SM70-NEXT:    fence.acq_rel.cta;
; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
; SM70-NEXT:    cvt.u32.u64 %r4, %rd2;
; SM70-NEXT:    and.b32 %r5, %r4, 3;
; SM70-NEXT:    shl.b32 %r1, %r5, 3;
; SM70-NEXT:    mov.b32 %r6, 65535;
; SM70-NEXT:    shl.b32 %r7, %r6, %r1;
; SM70-NEXT:    not.b32 %r2, %r7;
; SM70-NEXT:    ld.relaxed.cta.global.b32 %r19, [%rd1];
; SM70-NEXT:    cvt.u32.u16 %r10, %rs1;
; SM70-NEXT:    shl.b32 %r11, %r10, 16;
; SM70-NEXT:    setp.eq.b16 %p4, %rs1, -32768;
; SM70-NEXT:  $L__BB82_1: // %atomicrmw.start
; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
; SM70-NEXT:    shr.u32 %r8, %r19, %r1;
; SM70-NEXT:    cvt.u16.u32 %rs2, %r8;
; SM70-NEXT:    shl.b32 %r9, %r8, 16;
; SM70-NEXT:    setp.lt.f32 %p1, %r9, %r11;
; SM70-NEXT:    selp.b16 %rs3, %rs2, %rs1, %p1;
; SM70-NEXT:    setp.nan.f32 %p2, %r9, %r11;
; SM70-NEXT:    selp.b16 %rs4, 0x7FC0, %rs3, %p2;
; SM70-NEXT:    setp.eq.b16 %p3, %rs2, -32768;
; SM70-NEXT:    selp.b16 %rs5, %rs2, %rs4, %p3;
; SM70-NEXT:    selp.b16 %rs6, %rs1, %rs5, %p4;
; SM70-NEXT:    cvt.u32.u16 %r12, %rs4;
; SM70-NEXT:    shl.b32 %r13, %r12, 16;
; SM70-NEXT:    setp.eq.f32 %p5, %r13, 0f00000000;
; SM70-NEXT:    selp.b16 %rs7, %rs6, %rs4, %p5;
; SM70-NEXT:    cvt.u32.u16 %r14, %rs7;
; SM70-NEXT:    shl.b32 %r15, %r14, %r1;
; SM70-NEXT:    and.b32 %r16, %r19, %r2;
; SM70-NEXT:    or.b32 %r17, %r16, %r15;
; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r19, %r17;
; SM70-NEXT:    setp.ne.b32 %p6, %r3, %r19;
; SM70-NEXT:    mov.b32 %r19, %r3;
; SM70-NEXT:    @%p6 bra $L__BB82_1;
; SM70-NEXT:  // %bb.2: // %atomicrmw.end
; SM70-NEXT:    shr.u32 %r18, %r3, %r1;
; SM70-NEXT:    fence.acq_rel.cta;
; SM70-NEXT:    st.param.b16 [func_retval0], %r18;
; SM70-NEXT:    ret;
  %retval = atomicrmw fminimum ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
  ret bfloat %retval
}
; atomicrmw fmaximum on a bfloat in addrspace(1), syncscope("block"), acq_rel.
; Expected lowering: a 32-bit CAS loop over the aligned containing word,
; seeded by an ld.relaxed.cta preheader load and bracketed by
; fence.acq_rel.cta before and after. The maximum is open-coded in the loop:
; setp.gt.f32 selects the larger operand, an unordered compare substitutes
; 0x7FC0, and when the selected value compares equal to 0.0 an operand whose
; 16-bit pattern is 0 is chosen instead.
define bfloat @fmaximum_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
; SM70-LABEL: fmaximum_acq_rel_bfloat_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<7>;
; SM70-NEXT:    .reg .b16 %rs<8>;
; SM70-NEXT:    .reg .b32 %r<20>;
; SM70-NEXT:    .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b16 %rs1, [fmaximum_acq_rel_bfloat_global_cta_param_1];
; SM70-NEXT:    ld.param.b64 %rd2, [fmaximum_acq_rel_bfloat_global_cta_param_0];
; SM70-NEXT:    fence.acq_rel.cta;
; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
; SM70-NEXT:    cvt.u32.u64 %r4, %rd2;
; SM70-NEXT:    and.b32 %r5, %r4, 3;
; SM70-NEXT:    shl.b32 %r1, %r5, 3;
; SM70-NEXT:    mov.b32 %r6, 65535;
; SM70-NEXT:    shl.b32 %r7, %r6, %r1;
; SM70-NEXT:    not.b32 %r2, %r7;
; SM70-NEXT:    ld.relaxed.cta.global.b32 %r19, [%rd1];
; SM70-NEXT:    cvt.u32.u16 %r10, %rs1;
; SM70-NEXT:    shl.b32 %r11, %r10, 16;
; SM70-NEXT:    setp.eq.b16 %p4, %rs1, 0;
; SM70-NEXT:  $L__BB83_1: // %atomicrmw.start
; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
; SM70-NEXT:    shr.u32 %r8, %r19, %r1;
; SM70-NEXT:    cvt.u16.u32 %rs2, %r8;
; SM70-NEXT:    shl.b32 %r9, %r8, 16;
; SM70-NEXT:    setp.gt.f32 %p1, %r9, %r11;
; SM70-NEXT:    selp.b16 %rs3, %rs2, %rs1, %p1;
; SM70-NEXT:    setp.nan.f32 %p2, %r9, %r11;
; SM70-NEXT:    selp.b16 %rs4, 0x7FC0, %rs3, %p2;
; SM70-NEXT:    setp.eq.b16 %p3, %rs2, 0;
; SM70-NEXT:    selp.b16 %rs5, %rs2, %rs4, %p3;
; SM70-NEXT:    selp.b16 %rs6, %rs1, %rs5, %p4;
; SM70-NEXT:    cvt.u32.u16 %r12, %rs4;
; SM70-NEXT:    shl.b32 %r13, %r12, 16;
; SM70-NEXT:    setp.eq.f32 %p5, %r13, 0f00000000;
; SM70-NEXT:    selp.b16 %rs7, %rs6, %rs4, %p5;
; SM70-NEXT:    cvt.u32.u16 %r14, %rs7;
; SM70-NEXT:    shl.b32 %r15, %r14, %r1;
; SM70-NEXT:    and.b32 %r16, %r19, %r2;
; SM70-NEXT:    or.b32 %r17, %r16, %r15;
; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r19, %r17;
; SM70-NEXT:    setp.ne.b32 %p6, %r3, %r19;
; SM70-NEXT:    mov.b32 %r19, %r3;
; SM70-NEXT:    @%p6 bra $L__BB83_1;
; SM70-NEXT:  // %bb.2: // %atomicrmw.end
; SM70-NEXT:    shr.u32 %r18, %r3, %r1;
; SM70-NEXT:    fence.acq_rel.cta;
; SM70-NEXT:    st.param.b16 [func_retval0], %r18;
; SM70-NEXT:    ret;
  %retval = atomicrmw fmaximum ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
  ret bfloat %retval
}
; atomicrmw add on an i8 in addrspace(1), syncscope("block"), monotonic.
; Expected lowering: a 32-bit CAS loop over the 4-byte-aligned containing word
; (address masked with -4, shift (addr & 3) * 8, byte mask 255 << shift). The
; operand is pre-shifted into its lane, added to the whole word, masked, and
; merged with the untouched bytes. The preheader load is ld.relaxed.cta, the
; CAS is atom.relaxed.cta, and no fence is expected for this ordering.
define i8 @add_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: add_monotonic_i8_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b32 %r<16>;
; SM70-NEXT:    .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b64 %rd2, [add_monotonic_i8_global_cta_param_0];
; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
; SM70-NEXT:    ld.param.b8 %r6, [add_monotonic_i8_global_cta_param_1];
; SM70-NEXT:    cvt.u32.u64 %r7, %rd2;
; SM70-NEXT:    and.b32 %r8, %r7, 3;
; SM70-NEXT:    shl.b32 %r1, %r8, 3;
; SM70-NEXT:    mov.b32 %r9, 255;
; SM70-NEXT:    shl.b32 %r2, %r9, %r1;
; SM70-NEXT:    not.b32 %r3, %r2;
; SM70-NEXT:    shl.b32 %r4, %r6, %r1;
; SM70-NEXT:    ld.relaxed.cta.global.b32 %r15, [%rd1];
; SM70-NEXT:  $L__BB84_1: // %atomicrmw.start
; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
; SM70-NEXT:    add.s32 %r10, %r15, %r4;
; SM70-NEXT:    and.b32 %r11, %r10, %r2;
; SM70-NEXT:    and.b32 %r12, %r15, %r3;
; SM70-NEXT:    or.b32 %r13, %r12, %r11;
; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
; SM70-NEXT:    setp.ne.b32 %p1, %r5, %r15;
; SM70-NEXT:    mov.b32 %r15, %r5;
; SM70-NEXT:    @%p1 bra $L__BB84_1;
; SM70-NEXT:  // %bb.2: // %atomicrmw.end
; SM70-NEXT:    shr.u32 %r14, %r5, %r1;
; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
; SM70-NEXT:    ret;
  %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") monotonic
  ret i8 %retval
}
; atomicrmw add on an i8 in addrspace(1), syncscope("block"), acquire.
; Expected lowering: the masked 32-bit CAS loop over the aligned containing
; word (ld.relaxed.cta preheader load, atom.relaxed.cta CAS), followed by a
; single fence.acq_rel.cta after the loop; no fence is expected before it.
define i8 @add_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: add_acquire_i8_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b32 %r<16>;
; SM70-NEXT:    .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b64 %rd2, [add_acquire_i8_global_cta_param_0];
; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
; SM70-NEXT:    ld.param.b8 %r6, [add_acquire_i8_global_cta_param_1];
; SM70-NEXT:    cvt.u32.u64 %r7, %rd2;
; SM70-NEXT:    and.b32 %r8, %r7, 3;
; SM70-NEXT:    shl.b32 %r1, %r8, 3;
; SM70-NEXT:    mov.b32 %r9, 255;
; SM70-NEXT:    shl.b32 %r2, %r9, %r1;
; SM70-NEXT:    not.b32 %r3, %r2;
; SM70-NEXT:    shl.b32 %r4, %r6, %r1;
; SM70-NEXT:    ld.relaxed.cta.global.b32 %r15, [%rd1];
; SM70-NEXT:  $L__BB85_1: // %atomicrmw.start
; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
; SM70-NEXT:    add.s32 %r10, %r15, %r4;
; SM70-NEXT:    and.b32 %r11, %r10, %r2;
; SM70-NEXT:    and.b32 %r12, %r15, %r3;
; SM70-NEXT:    or.b32 %r13, %r12, %r11;
; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
; SM70-NEXT:    setp.ne.b32 %p1, %r5, %r15;
; SM70-NEXT:    mov.b32 %r15, %r5;
; SM70-NEXT:    @%p1 bra $L__BB85_1;
; SM70-NEXT:  // %bb.2: // %atomicrmw.end
; SM70-NEXT:    shr.u32 %r14, %r5, %r1;
; SM70-NEXT:    fence.acq_rel.cta;
; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
; SM70-NEXT:    ret;
  %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") acquire
  ret i8 %retval
}
; atomicrmw add on an i8 in addrspace(1), syncscope("block"), release.
; Expected lowering: a single fence.acq_rel.cta ahead of the masked 32-bit CAS
; loop over the aligned containing word (ld.relaxed.cta preheader load,
; atom.relaxed.cta CAS); no fence is expected after the loop.
define i8 @add_release_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: add_release_i8_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b32 %r<16>;
; SM70-NEXT:    .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b64 %rd2, [add_release_i8_global_cta_param_0];
; SM70-NEXT:    fence.acq_rel.cta;
; SM70-NEXT:    ld.param.b8 %r6, [add_release_i8_global_cta_param_1];
; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
; SM70-NEXT:    cvt.u32.u64 %r7, %rd2;
; SM70-NEXT:    and.b32 %r8, %r7, 3;
; SM70-NEXT:    shl.b32 %r1, %r8, 3;
; SM70-NEXT:    mov.b32 %r9, 255;
; SM70-NEXT:    shl.b32 %r2, %r9, %r1;
; SM70-NEXT:    not.b32 %r3, %r2;
; SM70-NEXT:    shl.b32 %r4, %r6, %r1;
; SM70-NEXT:    ld.relaxed.cta.global.b32 %r15, [%rd1];
; SM70-NEXT:  $L__BB86_1: // %atomicrmw.start
; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
; SM70-NEXT:    add.s32 %r10, %r15, %r4;
; SM70-NEXT:    and.b32 %r11, %r10, %r2;
; SM70-NEXT:    and.b32 %r12, %r15, %r3;
; SM70-NEXT:    or.b32 %r13, %r12, %r11;
; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
; SM70-NEXT:    setp.ne.b32 %p1, %r5, %r15;
; SM70-NEXT:    mov.b32 %r15, %r5;
; SM70-NEXT:    @%p1 bra $L__BB86_1;
; SM70-NEXT:  // %bb.2: // %atomicrmw.end
; SM70-NEXT:    shr.u32 %r14, %r5, %r1;
; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
; SM70-NEXT:    ret;
  %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") release
  ret i8 %retval
}
; atomicrmw add on an i8 in addrspace(1), syncscope("block"), seq_cst.
; Expected lowering: fence.sc.cta ahead of the masked 32-bit CAS loop over the
; aligned containing word (ld.relaxed.cta preheader load, atom.relaxed.cta
; CAS), and fence.acq_rel.cta after the loop.
define i8 @add_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: add_seq_cst_i8_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b32 %r<16>;
; SM70-NEXT:    .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b64 %rd2, [add_seq_cst_i8_global_cta_param_0];
; SM70-NEXT:    fence.sc.cta;
; SM70-NEXT:    ld.param.b8 %r6, [add_seq_cst_i8_global_cta_param_1];
; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
; SM70-NEXT:    cvt.u32.u64 %r7, %rd2;
; SM70-NEXT:    and.b32 %r8, %r7, 3;
; SM70-NEXT:    shl.b32 %r1, %r8, 3;
; SM70-NEXT:    mov.b32 %r9, 255;
; SM70-NEXT:    shl.b32 %r2, %r9, %r1;
; SM70-NEXT:    not.b32 %r3, %r2;
; SM70-NEXT:    shl.b32 %r4, %r6, %r1;
; SM70-NEXT:    ld.relaxed.cta.global.b32 %r15, [%rd1];
; SM70-NEXT:  $L__BB87_1: // %atomicrmw.start
; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
; SM70-NEXT:    add.s32 %r10, %r15, %r4;
; SM70-NEXT:    and.b32 %r11, %r10, %r2;
; SM70-NEXT:    and.b32 %r12, %r15, %r3;
; SM70-NEXT:    or.b32 %r13, %r12, %r11;
; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
; SM70-NEXT:    setp.ne.b32 %p1, %r5, %r15;
; SM70-NEXT:    mov.b32 %r15, %r5;
; SM70-NEXT:    @%p1 bra $L__BB87_1;
; SM70-NEXT:  // %bb.2: // %atomicrmw.end
; SM70-NEXT:    shr.u32 %r14, %r5, %r1;
; SM70-NEXT:    fence.acq_rel.cta;
; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
; SM70-NEXT:    ret;
  %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") seq_cst
  ret i8 %retval
}
; atomicrmw add on an i32 in addrspace(1), syncscope("block"), monotonic.
; Expected lowering: one atom.relaxed.cta.global.add.u32, with no CAS loop and
; no fence.
define i32 @add_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: add_monotonic_i32_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .b32 %r<3>;
; SM70-NEXT:    .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b64 %rd1, [add_monotonic_i32_global_cta_param_0];
; SM70-NEXT:    ld.param.b32 %r1, [add_monotonic_i32_global_cta_param_1];
; SM70-NEXT:    atom.relaxed.cta.global.add.u32 %r2, [%rd1], %r1;
; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
; SM70-NEXT:    ret;
  %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") monotonic
  ret i32 %retval
}
; atomicrmw add on an i32 in addrspace(1), syncscope("block"), acquire.
; Expected lowering: one atom.acquire.cta.global.add.u32, with no CAS loop and
; no separate fence.
define i32 @add_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: add_acquire_i32_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .b32 %r<3>;
; SM70-NEXT:    .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b64 %rd1, [add_acquire_i32_global_cta_param_0];
; SM70-NEXT:    ld.param.b32 %r1, [add_acquire_i32_global_cta_param_1];
; SM70-NEXT:    atom.acquire.cta.global.add.u32 %r2, [%rd1], %r1;
; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
; SM70-NEXT:    ret;
  %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") acquire
  ret i32 %retval
}
; atomicrmw add on an i32 in addrspace(1), syncscope("block"), release.
; Expected lowering: one atom.release.cta.global.add.u32, with no CAS loop and
; no separate fence.
define i32 @add_release_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: add_release_i32_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .b32 %r<3>;
; SM70-NEXT:    .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b64 %rd1, [add_release_i32_global_cta_param_0];
; SM70-NEXT:    ld.param.b32 %r1, [add_release_i32_global_cta_param_1];
; SM70-NEXT:    atom.release.cta.global.add.u32 %r2, [%rd1], %r1;
; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
; SM70-NEXT:    ret;
  %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") release
  ret i32 %retval
}
; atomicrmw add on an i32 in addrspace(1), syncscope("block"), seq_cst.
; Expected lowering: fence.sc.cta followed by one
; atom.acquire.cta.global.add.u32; no CAS loop and no trailing fence.
define i32 @add_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: add_seq_cst_i32_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .b32 %r<3>;
; SM70-NEXT:    .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b64 %rd1, [add_seq_cst_i32_global_cta_param_0];
; SM70-NEXT:    fence.sc.cta;
; SM70-NEXT:    ld.param.b32 %r1, [add_seq_cst_i32_global_cta_param_1];
; SM70-NEXT:    atom.acquire.cta.global.add.u32 %r2, [%rd1], %r1;
; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
; SM70-NEXT:    ret;
  %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") seq_cst
  ret i32 %retval
}
; atomicrmw nand on an i8 in addrspace(1), syncscope("block"), monotonic.
; Expected lowering: a 32-bit CAS loop over the 4-byte-aligned containing word
; (address masked with -4, shift (addr & 3) * 8, byte mask 255 << shift). Each
; iteration ANDs the word with the pre-shifted operand, inverts it, masks the
; lane, and merges it with the untouched bytes. The preheader load is
; ld.relaxed.cta, the CAS is atom.relaxed.cta, and no fence is expected.
define i8 @nand_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: nand_monotonic_i8_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b32 %r<17>;
; SM70-NEXT:    .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b64 %rd2, [nand_monotonic_i8_global_cta_param_0];
; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
; SM70-NEXT:    ld.param.b8 %r6, [nand_monotonic_i8_global_cta_param_1];
; SM70-NEXT:    cvt.u32.u64 %r7, %rd2;
; SM70-NEXT:    and.b32 %r8, %r7, 3;
; SM70-NEXT:    shl.b32 %r1, %r8, 3;
; SM70-NEXT:    mov.b32 %r9, 255;
; SM70-NEXT:    shl.b32 %r2, %r9, %r1;
; SM70-NEXT:    not.b32 %r3, %r2;
; SM70-NEXT:    shl.b32 %r4, %r6, %r1;
; SM70-NEXT:    ld.relaxed.cta.global.b32 %r16, [%rd1];
; SM70-NEXT:  $L__BB92_1: // %atomicrmw.start
; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
; SM70-NEXT:    and.b32 %r10, %r16, %r4;
; SM70-NEXT:    not.b32 %r11, %r10;
; SM70-NEXT:    and.b32 %r12, %r11, %r2;
; SM70-NEXT:    and.b32 %r13, %r16, %r3;
; SM70-NEXT:    or.b32 %r14, %r13, %r12;
; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
; SM70-NEXT:    setp.ne.b32 %p1, %r5, %r16;
; SM70-NEXT:    mov.b32 %r16, %r5;
; SM70-NEXT:    @%p1 bra $L__BB92_1;
; SM70-NEXT:  // %bb.2: // %atomicrmw.end
; SM70-NEXT:    shr.u32 %r15, %r5, %r1;
; SM70-NEXT:    st.param.b32 [func_retval0], %r15;
; SM70-NEXT:    ret;
  %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") monotonic
  ret i8 %retval
}
; atomicrmw nand on an i8 in addrspace(1), syncscope("block"), acquire.
; Expected lowering: the masked 32-bit CAS loop over the aligned containing
; word (ld.relaxed.cta preheader load, atom.relaxed.cta CAS), followed by a
; single fence.acq_rel.cta after the loop; no fence is expected before it.
define i8 @nand_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: nand_acquire_i8_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b32 %r<17>;
; SM70-NEXT:    .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b64 %rd2, [nand_acquire_i8_global_cta_param_0];
; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
; SM70-NEXT:    ld.param.b8 %r6, [nand_acquire_i8_global_cta_param_1];
; SM70-NEXT:    cvt.u32.u64 %r7, %rd2;
; SM70-NEXT:    and.b32 %r8, %r7, 3;
; SM70-NEXT:    shl.b32 %r1, %r8, 3;
; SM70-NEXT:    mov.b32 %r9, 255;
; SM70-NEXT:    shl.b32 %r2, %r9, %r1;
; SM70-NEXT:    not.b32 %r3, %r2;
; SM70-NEXT:    shl.b32 %r4, %r6, %r1;
; SM70-NEXT:    ld.relaxed.cta.global.b32 %r16, [%rd1];
; SM70-NEXT:  $L__BB93_1: // %atomicrmw.start
; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
; SM70-NEXT:    and.b32 %r10, %r16, %r4;
; SM70-NEXT:    not.b32 %r11, %r10;
; SM70-NEXT:    and.b32 %r12, %r11, %r2;
; SM70-NEXT:    and.b32 %r13, %r16, %r3;
; SM70-NEXT:    or.b32 %r14, %r13, %r12;
; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
; SM70-NEXT:    setp.ne.b32 %p1, %r5, %r16;
; SM70-NEXT:    mov.b32 %r16, %r5;
; SM70-NEXT:    @%p1 bra $L__BB93_1;
; SM70-NEXT:  // %bb.2: // %atomicrmw.end
; SM70-NEXT:    shr.u32 %r15, %r5, %r1;
; SM70-NEXT:    fence.acq_rel.cta;
; SM70-NEXT:    st.param.b32 [func_retval0], %r15;
; SM70-NEXT:    ret;
  %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") acquire
  ret i8 %retval
}
; atomicrmw nand on an i8 in addrspace(1), syncscope("block"), release.
; Expected lowering: a single fence.acq_rel.cta ahead of the masked 32-bit CAS
; loop over the aligned containing word (ld.relaxed.cta preheader load,
; atom.relaxed.cta CAS); no fence is expected after the loop.
define i8 @nand_release_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: nand_release_i8_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b32 %r<17>;
; SM70-NEXT:    .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b64 %rd2, [nand_release_i8_global_cta_param_0];
; SM70-NEXT:    fence.acq_rel.cta;
; SM70-NEXT:    ld.param.b8 %r6, [nand_release_i8_global_cta_param_1];
; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
; SM70-NEXT:    cvt.u32.u64 %r7, %rd2;
; SM70-NEXT:    and.b32 %r8, %r7, 3;
; SM70-NEXT:    shl.b32 %r1, %r8, 3;
; SM70-NEXT:    mov.b32 %r9, 255;
; SM70-NEXT:    shl.b32 %r2, %r9, %r1;
; SM70-NEXT:    not.b32 %r3, %r2;
; SM70-NEXT:    shl.b32 %r4, %r6, %r1;
; SM70-NEXT:    ld.relaxed.cta.global.b32 %r16, [%rd1];
; SM70-NEXT:  $L__BB94_1: // %atomicrmw.start
; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
; SM70-NEXT:    and.b32 %r10, %r16, %r4;
; SM70-NEXT:    not.b32 %r11, %r10;
; SM70-NEXT:    and.b32 %r12, %r11, %r2;
; SM70-NEXT:    and.b32 %r13, %r16, %r3;
; SM70-NEXT:    or.b32 %r14, %r13, %r12;
; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
; SM70-NEXT:    setp.ne.b32 %p1, %r5, %r16;
; SM70-NEXT:    mov.b32 %r16, %r5;
; SM70-NEXT:    @%p1 bra $L__BB94_1;
; SM70-NEXT:  // %bb.2: // %atomicrmw.end
; SM70-NEXT:    shr.u32 %r15, %r5, %r1;
; SM70-NEXT:    st.param.b32 [func_retval0], %r15;
; SM70-NEXT:    ret;
  %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") release
  ret i8 %retval
}
; atomicrmw nand on an i8 in addrspace(1), syncscope("block"), seq_cst.
; Expected lowering: fence.sc.cta ahead of the masked 32-bit CAS loop over the
; aligned containing word (ld.relaxed.cta preheader load, atom.relaxed.cta
; CAS), and fence.acq_rel.cta after the loop.
define i8 @nand_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: nand_seq_cst_i8_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b32 %r<17>;
; SM70-NEXT:    .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b64 %rd2, [nand_seq_cst_i8_global_cta_param_0];
; SM70-NEXT:    fence.sc.cta;
; SM70-NEXT:    ld.param.b8 %r6, [nand_seq_cst_i8_global_cta_param_1];
; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
; SM70-NEXT:    cvt.u32.u64 %r7, %rd2;
; SM70-NEXT:    and.b32 %r8, %r7, 3;
; SM70-NEXT:    shl.b32 %r1, %r8, 3;
; SM70-NEXT:    mov.b32 %r9, 255;
; SM70-NEXT:    shl.b32 %r2, %r9, %r1;
; SM70-NEXT:    not.b32 %r3, %r2;
; SM70-NEXT:    shl.b32 %r4, %r6, %r1;
; SM70-NEXT:    ld.relaxed.cta.global.b32 %r16, [%rd1];
; SM70-NEXT:  $L__BB95_1: // %atomicrmw.start
; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
; SM70-NEXT:    and.b32 %r10, %r16, %r4;
; SM70-NEXT:    not.b32 %r11, %r10;
; SM70-NEXT:    and.b32 %r12, %r11, %r2;
; SM70-NEXT:    and.b32 %r13, %r16, %r3;
; SM70-NEXT:    or.b32 %r14, %r13, %r12;
; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
; SM70-NEXT:    setp.ne.b32 %p1, %r5, %r16;
; SM70-NEXT:    mov.b32 %r16, %r5;
; SM70-NEXT:    @%p1 bra $L__BB95_1;
; SM70-NEXT:  // %bb.2: // %atomicrmw.end
; SM70-NEXT:    shr.u32 %r15, %r5, %r1;
; SM70-NEXT:    fence.acq_rel.cta;
; SM70-NEXT:    st.param.b32 [func_retval0], %r15;
; SM70-NEXT:    ret;
  %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") seq_cst
  ret i8 %retval
}
; atomicrmw nand on an i32 in addrspace(1), syncscope("block"), monotonic.
; Expected lowering: a full-word CAS loop with no lane masking. An
; ld.relaxed.cta preheader load seeds the loop, each iteration computes
; not(old & val) and retries atom.relaxed.cta cas.b32 until the returned value
; matches the expected one. No fence is expected for this ordering.
define i32 @nand_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: nand_monotonic_i32_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b32 %r<6>;
; SM70-NEXT:    .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b32 %r2, [nand_monotonic_i32_global_cta_param_1];
; SM70-NEXT:    ld.param.b64 %rd1, [nand_monotonic_i32_global_cta_param_0];
; SM70-NEXT:    ld.relaxed.cta.global.b32 %r5, [%rd1];
; SM70-NEXT:  $L__BB96_1: // %atomicrmw.start
; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
; SM70-NEXT:    and.b32 %r3, %r5, %r2;
; SM70-NEXT:    not.b32 %r4, %r3;
; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
; SM70-NEXT:    setp.ne.b32 %p1, %r1, %r5;
; SM70-NEXT:    mov.b32 %r5, %r1;
; SM70-NEXT:    @%p1 bra $L__BB96_1;
; SM70-NEXT:  // %bb.2: // %atomicrmw.end
; SM70-NEXT:    st.param.b32 [func_retval0], %r1;
; SM70-NEXT:    ret;
  %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") monotonic
  ret i32 %retval
}
; atomicrmw nand on an i32 in addrspace(1), syncscope("block"), acquire.
; Expected lowering: a full-word CAS loop (ld.relaxed.cta preheader load,
; atom.relaxed.cta cas.b32 on not(old & val)), followed by a single
; fence.acq_rel.cta after the loop; no fence is expected before it.
define i32 @nand_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: nand_acquire_i32_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b32 %r<6>;
; SM70-NEXT:    .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b32 %r2, [nand_acquire_i32_global_cta_param_1];
; SM70-NEXT:    ld.param.b64 %rd1, [nand_acquire_i32_global_cta_param_0];
; SM70-NEXT:    ld.relaxed.cta.global.b32 %r5, [%rd1];
; SM70-NEXT:  $L__BB97_1: // %atomicrmw.start
; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
; SM70-NEXT:    and.b32 %r3, %r5, %r2;
; SM70-NEXT:    not.b32 %r4, %r3;
; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
; SM70-NEXT:    setp.ne.b32 %p1, %r1, %r5;
; SM70-NEXT:    mov.b32 %r5, %r1;
; SM70-NEXT:    @%p1 bra $L__BB97_1;
; SM70-NEXT:  // %bb.2: // %atomicrmw.end
; SM70-NEXT:    fence.acq_rel.cta;
; SM70-NEXT:    st.param.b32 [func_retval0], %r1;
; SM70-NEXT:    ret;
  %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") acquire
  ret i32 %retval
}
; atomicrmw nand on an i32 in addrspace(1), syncscope("block"), release.
; Expected lowering: a single fence.acq_rel.cta ahead of a full-word CAS loop
; (ld.relaxed.cta preheader load, atom.relaxed.cta cas.b32 on
; not(old & val)); no fence is expected after the loop.
define i32 @nand_release_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: nand_release_i32_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b32 %r<6>;
; SM70-NEXT:    .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b32 %r2, [nand_release_i32_global_cta_param_1];
; SM70-NEXT:    ld.param.b64 %rd1, [nand_release_i32_global_cta_param_0];
; SM70-NEXT:    fence.acq_rel.cta;
; SM70-NEXT:    ld.relaxed.cta.global.b32 %r5, [%rd1];
; SM70-NEXT:  $L__BB98_1: // %atomicrmw.start
; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
; SM70-NEXT:    and.b32 %r3, %r5, %r2;
; SM70-NEXT:    not.b32 %r4, %r3;
; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
; SM70-NEXT:    setp.ne.b32 %p1, %r1, %r5;
; SM70-NEXT:    mov.b32 %r5, %r1;
; SM70-NEXT:    @%p1 bra $L__BB98_1;
; SM70-NEXT:  // %bb.2: // %atomicrmw.end
; SM70-NEXT:    st.param.b32 [func_retval0], %r1;
; SM70-NEXT:    ret;
  %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") release
  ret i32 %retval
}
; atomicrmw nand on an i32 in addrspace(1), syncscope("block"), seq_cst.
; Expected lowering: fence.sc.cta ahead of a full-word CAS loop
; (ld.relaxed.cta preheader load, atom.relaxed.cta cas.b32 on
; not(old & val)), and fence.acq_rel.cta after the loop.
define i32 @nand_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
; SM70-LABEL: nand_seq_cst_i32_global_cta(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b32 %r<6>;
; SM70-NEXT:    .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b32 %r2, [nand_seq_cst_i32_global_cta_param_1];
; SM70-NEXT:    ld.param.b64 %rd1, [nand_seq_cst_i32_global_cta_param_0];
; SM70-NEXT:    fence.sc.cta;
; SM70-NEXT:    ld.relaxed.cta.global.b32 %r5, [%rd1];
; SM70-NEXT:  $L__BB99_1: // %atomicrmw.start
; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
; SM70-NEXT:    and.b32 %r3, %r5, %r2;
; SM70-NEXT:    not.b32 %r4, %r3;
; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
; SM70-NEXT:    setp.ne.b32 %p1, %r1, %r5;
; SM70-NEXT:    mov.b32 %r5, %r1;
; SM70-NEXT:    @%p1 bra $L__BB99_1;
; SM70-NEXT:  // %bb.2: // %atomicrmw.end
; SM70-NEXT:    fence.acq_rel.cta;
; SM70-NEXT:    st.param.b32 [func_retval0], %r1;
; SM70-NEXT:    ret;
  %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") seq_cst
  ret i32 %retval
}