; This is a continuation of the previously reverted
; https://github.com/llvm/llvm-project/pull/178420.
;
; The patch removes the custom AMDGPUISD::FFBH_I32 SelectionDAG node. Call sites
; that need raw hardware semantics (LowerINT_TO_FP32, legalizeITOFP) now use the
; amdgcn_sffbh intrinsic directly. ISD::CTLS is added as a Custom operation for
; i32.
;
; The previous attempt had an issue: the hardware v_ffbh_i32 instruction
; (v_cls_i32 on newer targets) has different semantics than ISD::CTLS:
;   - sffbh returns [1, BitWidth-1] for normal values, -1 for all-same-bits
;   - CTLS returns [0, BitWidth-2] for normal values, BitWidth-1 for all-same-bits
; LowerCTLS now handles this as: sffbh -> umin(sffbh, BitWidth) -> sub 1.
;
; The current patch also adds a DAG combine to recognize the common CTLS idiom
;   sub(ctlz(xor(x, sra(x, BitWidth-1))), 1) -> ctls(x)
; and an optimization in performMinMaxCombine to fold away the umin when the
; input is known not to be all-same-bits.
;
; Partially addresses #177635.
625 lines
24 KiB
LLVM
625 lines
24 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s

; Scalar intrinsics used by the tests in this file.
declare i32 @llvm.ctlz.i32(i32, i1)
declare i64 @llvm.ctlz.i64(i64, i1)
declare i32 @llvm.amdgcn.sffbh.i32(i32)

; Basic i32 case. The DAG combiner forms ctls(x) from the idiom
;   ctlz(x ^ ashr(x, 31)) - 1
; and ctls(x) is then lowered to umin(ffbh_i32(x), 32) - 1. The umin clamps the
; hardware's -1 result for all-same-bits inputs to the bit width.
define i32 @ctls_i32(i32 %x) {
; GFX6-LABEL: ctls_i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: ctls_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cls_i32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %a = ashr i32 %x, 31
  %b = xor i32 %x, %a
  %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
  %d = sub i32 %c, 1
  ret i32 %d
}

; The sign bit is known to be zero (x & 0x7fffffff), but the input can still be
; all zeros, so the umin clamp must stay in the output.
define i32 @ctls_i32_known_positive(i32 %x) {
; GFX6-LABEL: ctls_i32_known_positive:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: ctls_i32_known_positive:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cls_i32_e32 v0, v0
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %pos = and i32 %x, 2147483647
  %a = ashr i32 %pos, 31
  %b = xor i32 %pos, %a
  %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
  %d = sub i32 %c, 1
  ret i32 %d
}

; Same idiom with the xor operands commuted:
;   sub(ctlz(xor(sra(x, 31), x)), 1) -> ctls(x)
define i32 @ctls_i32_xor_commuted(i32 %x) {
; GFX6-LABEL: ctls_i32_xor_commuted:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: ctls_i32_xor_commuted:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cls_i32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %a = ashr i32 %x, 31
  %b = xor i32 %a, %x ; note: reversed order compared to ctls_i32
  %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
  %d = sub i32 %c, 1
  ret i32 %d
}

; Same idiom, but the ctlz call passes i1 true (zero_undef). The expected
; output is the same instruction sequence as for ctls_i32.
define i32 @ctls_i32_zero_undef(i32 %x) {
; GFX6-LABEL: ctls_i32_zero_undef:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: ctls_i32_zero_undef:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cls_i32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %a = ashr i32 %x, 31
  %b = xor i32 %x, %a
  %c = call i32 @llvm.ctlz.i32(i32 %b, i1 true) ; zero_undef = true
  %d = sub i32 %c, 1
  ret i32 %d
}

; The input is known to be neither all-0s nor all-1s, so the clamp is dropped:
;   umin(ffbh_i32(x), 32) -> ffbh_i32(x)
define i32 @ctls_i32_known_mixed_bits(i32 %x) {
; GFX6-LABEL: ctls_i32_known_mixed_bits:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_or_b32_e32 v0, 1, v0
; GFX6-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: ctls_i32_known_mixed_bits:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_or_b32_e32 v0, 1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX11-NEXT: v_cls_i32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
; Force bit 31 = 0 and bit 0 = 1, so value is neither all-0s nor all-1s
  %cleared = and i32 %x, 2147483647 ; clear bit 31
  %mixed = or i32 %cleared, 1 ; set bit 0
  %a = ashr i32 %mixed, 31
  %b = xor i32 %mixed, %a
  %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
  %d = sub i32 %c, 1
  ret i32 %d
}

; Test for i64 CTLS. The expected output expands the idiom through the unsigned
; count-leading-zeros instruction (v_ffbh_u32 / v_clz_i32_u32) on the xor'ed
; halves; the signed v_ffbh_i32 / v_cls_i32 instruction is not used.
define i32 @ctls_i64(i64 %x) {
; GFX6-LABEL: ctls_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX6-NEXT: v_ffbh_u32_e32 v0, v0
; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2
; GFX6-NEXT: v_min_u32_e32 v0, 0xffffffdf, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX6-NEXT: v_ffbh_u32_e32 v1, v1
; GFX6-NEXT: v_min3_u32 v0, v0, v1, 64
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: ctls_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v2
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min3_u32 v0, v0, v1, 64
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %a = ashr i64 %x, 63
  %b = xor i64 %x, %a
  %c = call i64 @llvm.ctlz.i64(i64 %b, i1 false)
  %d = sub i64 %c, 1
  %e = trunc i64 %d to i32
  ret i32 %e
}

; i16 CTLS via the sub(ctlz(xor(x, sra(x, 15))), 1) pattern.
; On tahiti the expected output sign-extends the value (v_bfe_i32), uses
; v_ffbh_i32 with the umin clamp and subtracts 17. On gfx1100 the expected
; output keeps the ashr/xor/clz expansion in 16-bit registers.
declare i16 @llvm.ctlz.i16(i16, i1)

define i16 @ctls_i16(i16 %x) {
; GFX6-LABEL: ctls_i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, 17, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: ctls_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_ashrrev_i16 v0.h, 15, v0.l
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_xor_b16 v0.l, v0.l, v0.h
; GFX11-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
; GFX11-NEXT: v_add_nc_u32_e32 v0, -16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u16 v0.l, v0.l, -1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %a = ashr i16 %x, 15
  %b = xor i16 %x, %a
  %c = call i16 @llvm.ctlz.i16(i16 %b, i1 false)
  %d = sub i16 %c, 1
  ret i16 %d
}

; A uniform (inreg) input should use the scalar form of sffbh: s_flbit_i32 on
; tahiti and s_cls_i32 on gfx1100, followed by the scalar umin clamp and add -1.
define amdgpu_ps i32 @ctls_i32_salu(i32 inreg %x) {
; GFX6-LABEL: ctls_i32_salu:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_flbit_i32 s0, s0
; GFX6-NEXT: s_min_u32 s0, s0, 32
; GFX6-NEXT: s_add_i32 s0, s0, -1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: ctls_i32_salu:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_cls_i32 s0, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_min_u32 s0, s0, 32
; GFX11-NEXT: s_add_i32 s0, s0, -1
; GFX11-NEXT: ; return to shader part epilog
  %a = ashr i32 %x, 31
  %b = xor i32 %x, %a
  %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
  %d = sub i32 %c, 1
  ret i32 %d
}

declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1)

; <2 x i32> version of the idiom. The expected output is the per-element
; ashr/xor/ctlz expansion (v_ffbh_u32 / v_clz_i32_u32) with the umin clamp; the
; signed v_ffbh_i32 / v_cls_i32 instruction is not used here.
define <2 x i32> @ctls_v2i32(<2 x i32> %x) {
; GFX6-LABEL: ctls_v2i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v3
; GFX6-NEXT: v_ffbh_u32_e32 v1, v1
; GFX6-NEXT: v_ffbh_u32_e32 v0, v0
; GFX6-NEXT: v_min_u32_e32 v1, 32, v1
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: ctls_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
; GFX11-NEXT: v_min_u32_e32 v1, 32, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %a = ashr <2 x i32> %x, <i32 31, i32 31>
  %b = xor <2 x i32> %x, %a
  %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %b, i1 false)
  %d = sub <2 x i32> %c, <i32 1, i32 1>
  ret <2 x i32> %d
}

declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)

; <4 x i32> version of the idiom. The expected output has one
; v_ffbh_i32 / v_cls_i32, one umin clamp and one add of -1 per element.
define <4 x i32> @ctls_v4i32(<4 x i32> %x) {
; GFX6-LABEL: ctls_v4i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
; GFX6-NEXT: v_ffbh_i32_e32 v1, v1
; GFX6-NEXT: v_ffbh_i32_e32 v2, v2
; GFX6-NEXT: v_ffbh_i32_e32 v3, v3
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_min_u32_e32 v1, 32, v1
; GFX6-NEXT: v_min_u32_e32 v2, 32, v2
; GFX6-NEXT: v_min_u32_e32 v3, 32, v3
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1
; GFX6-NEXT: v_add_i32_e32 v2, vcc, -1, v2
; GFX6-NEXT: v_add_i32_e32 v3, vcc, -1, v3
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: ctls_v4i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cls_i32_e32 v0, v0
; GFX11-NEXT: v_cls_i32_e32 v1, v1
; GFX11-NEXT: v_cls_i32_e32 v2, v2
; GFX11-NEXT: v_cls_i32_e32 v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
; GFX11-NEXT: v_min_u32_e32 v1, 32, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_min_u32_e32 v2, 32, v2
; GFX11-NEXT: v_min_u32_e32 v3, 32, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add_nc_u32_e32 v2, -1, v2
; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %a = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
  %b = xor <4 x i32> %x, %a
  %c = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %b, i1 false)
  %d = sub <4 x i32> %c, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %d
}

; Vector input known to be neither all-0s nor all-1s (bit 31 cleared, bit 0
; set): the umin should be folded away for each element.
define <2 x i32> @ctls_v2i32_known_mixed_bits(<2 x i32> %x) {
; GFX6-LABEL: ctls_v2i32_known_mixed_bits:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_or_b32_e32 v1, 1, v1
; GFX6-NEXT: v_or_b32_e32 v0, 1, v0
; GFX6-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX6-NEXT: v_ffbh_u32_e32 v1, v1
; GFX6-NEXT: v_ffbh_u32_e32 v0, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: ctls_v2i32_known_mixed_bits:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_or_b32_e32 v0, 1, v0
; GFX11-NEXT: v_or_b32_e32 v1, 1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %cleared = and <2 x i32> %x, <i32 2147483647, i32 2147483647>
  %mixed = or <2 x i32> %cleared, <i32 1, i32 1>
  %a = ashr <2 x i32> %mixed, <i32 31, i32 31>
  %b = xor <2 x i32> %mixed, %a
  %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %b, i1 false)
  %d = sub <2 x i32> %c, <i32 1, i32 1>
  ret <2 x i32> %d
}

; Vector with ctlz_zero_undef (the ctlz call passes i1 true). The expected
; output contains no umin clamp after the count-leading-zeros instructions.
define <2 x i32> @ctls_v2i32_zero_undef(<2 x i32> %x) {
; GFX6-LABEL: ctls_v2i32_zero_undef:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v3
; GFX6-NEXT: v_ffbh_u32_e32 v1, v1
; GFX6-NEXT: v_ffbh_u32_e32 v0, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: ctls_v2i32_zero_undef:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %a = ashr <2 x i32> %x, <i32 31, i32 31>
  %b = xor <2 x i32> %x, %a
  %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %b, i1 true)
  %d = sub <2 x i32> %c, <i32 1, i32 1>
  ret <2 x i32> %d
}

; Vector version of the idiom with the xor operands commuted (sra result
; first). The expected output keeps the ashr/xor/ctlz expansion and the umin.
define <2 x i32> @ctls_v2i32_xor_commuted(<2 x i32> %x) {
; GFX6-LABEL: ctls_v2i32_xor_commuted:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; GFX6-NEXT: v_xor_b32_e32 v1, v2, v1
; GFX6-NEXT: v_xor_b32_e32 v0, v3, v0
; GFX6-NEXT: v_ffbh_u32_e32 v1, v1
; GFX6-NEXT: v_ffbh_u32_e32 v0, v0
; GFX6-NEXT: v_min_u32_e32 v1, 32, v1
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: ctls_v2i32_xor_commuted:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX11-NEXT: v_xor_b32_e32 v1, v3, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
; GFX11-NEXT: v_min_u32_e32 v1, 32, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %a = ashr <2 x i32> %x, <i32 31, i32 31>
  %b = xor <2 x i32> %a, %x
  %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %b, i1 false)
  %d = sub <2 x i32> %c, <i32 1, i32 1>
  ret <2 x i32> %d
}

; Vector known positive (sign bit masked off): an element can still be zero,
; so the umin should NOT be folded.
define <2 x i32> @ctls_v2i32_known_positive(<2 x i32> %x) {
; GFX6-LABEL: ctls_v2i32_known_positive:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX6-NEXT: v_ffbh_u32_e32 v1, v1
; GFX6-NEXT: v_ffbh_u32_e32 v0, v0
; GFX6-NEXT: v_min_u32_e32 v1, 32, v1
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: ctls_v2i32_known_positive:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
; GFX11-NEXT: v_min_u32_e32 v1, 32, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %pos = and <2 x i32> %x, <i32 2147483647, i32 2147483647>
  %a = ashr <2 x i32> %pos, <i32 31, i32 31>
  %b = xor <2 x i32> %pos, %a
  %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %b, i1 false)
  %d = sub <2 x i32> %c, <i32 1, i32 1>
  ret <2 x i32> %d
}

; @llvm.amdgcn.sffbh must still produce the raw hardware result: a single
; v_ffbh_i32 / v_cls_i32 with no umin clamp and no -1 adjustment.
define i32 @sffbh_intrinsic(i32 %x) {
; GFX6-LABEL: sffbh_intrinsic:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: sffbh_intrinsic:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cls_i32_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %x)
  ret i32 %r
}

; sitofp i64 to f32 uses sffbh(Hi) - 1, not CTLS: the expected output applies
; the add of -1 directly to the v_ffbh_i32 / v_cls_i32 result, without first
; clamping it to 32.
define float @sitofp_i64_to_f32(i64 %x) {
; GFX6-LABEL: sitofp_i64_to_f32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_xor_b32_e32 v2, v0, v1
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX6-NEXT: v_ffbh_i32_e32 v3, v1
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GFX6-NEXT: v_add_i32_e32 v3, vcc, -1, v3
; GFX6-NEXT: v_min_u32_e32 v2, v3, v2
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
; GFX6-NEXT: v_min_u32_e32 v0, 1, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: sitofp_i64_to_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_xor_b32_e32 v2, v0, v1
; GFX11-NEXT: v_cls_i32_e32 v3, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v2, 32, v2
; GFX11-NEXT: v_min_u32_e32 v2, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %r = sitofp i64 %x to float
  ret float %r
}

; Negative tests:

; The shift amount is 30 rather than BitWidth-1, so this is not the ctls idiom.
; The expected output keeps the ashr/xor/ctlz sequence.
define i32 @no_ctls_wrong_shift(i32 %x) {
; GFX6-LABEL: no_ctls_wrong_shift:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 30, v0
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX6-NEXT: v_ffbh_u32_e32 v0, v0
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: no_ctls_wrong_shift:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 30, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %a = ashr i32 %x, 30
  %b = xor i32 %x, %a
  %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
  %d = sub i32 %c, 1
  ret i32 %d
}

; Negative test: the ashr is applied to %y while the xor uses %x, so the
; pattern does not describe a single value and is not the ctls idiom. The
; expected output keeps the ashr/xor/ctlz sequence.
define i32 @no_ctls_xor_different_value(i32 %x, i32 %y) {
; GFX6-LABEL: no_ctls_xor_different_value:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX6-NEXT: v_ffbh_u32_e32 v0, v0
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: no_ctls_xor_different_value:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %a = ashr i32 %y, 31
  %b = xor i32 %x, %a
  %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
  %d = sub i32 %c, 1
  ret i32 %d
}

; The constant subtracted from the ctlz result is 2 instead of 1.
; NOTE(review): despite the "no_ctls" name, the expected output below still
; selects v_ffbh_i32 / v_cls_i32 with the umin clamp and an add of -2, i.e. the
; idiom is still recognised here (as ctls(x) - 1). Confirm this is the intended
; behaviour for a test listed under the negative tests.
define i32 @no_ctls_sub_2(i32 %x) {
; GFX6-LABEL: no_ctls_sub_2:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -2, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: no_ctls_sub_2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cls_i32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
; GFX11-NEXT: v_add_nc_u32_e32 v0, -2, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %a = ashr i32 %x, 31
  %b = xor i32 %x, %a
  %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
  %d = sub i32 %c, 2
  ret i32 %d
}