IR should be a splat of 7 as this compares vector of elements with 7 (`vec[i]!=7`). Having `zeroinitializer` goes against this comparison. Co-authored-by: himadhith <himadhith.v@ibm.com>
75 lines
3.0 KiB
LLVM
75 lines
3.0 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
|
|
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
|
|
; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64LE
|
|
|
|
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \
|
|
; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64
|
|
|
|
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
|
|
; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_32
|
|
|
|
; The current codegen compares the input vector in v2 against a splat of 7 in v3 (vcmpequh). The comparison result in v2 is then negated (xxlnor) and masked with 1 (xxland), which converts:
|
|
; 0xFFFF (element equal to 7) -> 0
|
|
; 0 (element not equal to 7) -> 1
|
|
; This NFC patch only records the current codegen; an optimized version is to follow.
|
|
|
|
; Counts how many of the four i16 lanes of %wide.load are not equal to 7:
; each lane is compared against splat(7), the i1 results are zero-extended
; to i32, and the lanes are summed with llvm.vector.reduce.add.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
define i32 @cols_needed(<4 x i16> %wide.load) {
|
|
; POWERPC_64LE-LABEL: cols_needed:
|
|
; POWERPC_64LE: # %bb.0: # %entry
|
|
; POWERPC_64LE-NEXT: vspltish v3, 7
|
|
; POWERPC_64LE-NEXT: li r3, 0
|
|
; POWERPC_64LE-NEXT: vcmpequh v2, v2, v3
|
|
; POWERPC_64LE-NEXT: vspltisw v3, 1
|
|
; POWERPC_64LE-NEXT: xxlnor v2, v2, v2
|
|
; POWERPC_64LE-NEXT: vmrglh v2, v2, v2
|
|
; POWERPC_64LE-NEXT: xxland v2, v2, v3
|
|
; POWERPC_64LE-NEXT: xxswapd v3, v2
|
|
; POWERPC_64LE-NEXT: vadduwm v2, v2, v3
|
|
; POWERPC_64LE-NEXT: xxspltw v3, v2, 2
|
|
; POWERPC_64LE-NEXT: vadduwm v2, v2, v3
|
|
; POWERPC_64LE-NEXT: vextuwrx r3, r3, v2
|
|
; POWERPC_64LE-NEXT: blr
|
|
;
|
|
; POWERPC_64-LABEL: cols_needed:
|
|
; POWERPC_64: # %bb.0: # %entry
|
|
; POWERPC_64-NEXT: vspltish v3, 7
|
|
; POWERPC_64-NEXT: li r3, 0
|
|
; POWERPC_64-NEXT: vcmpequh v2, v2, v3
|
|
; POWERPC_64-NEXT: vspltisw v3, 1
|
|
; POWERPC_64-NEXT: xxlnor v2, v2, v2
|
|
; POWERPC_64-NEXT: vmrghh v2, v2, v2
|
|
; POWERPC_64-NEXT: xxland v2, v2, v3
|
|
; POWERPC_64-NEXT: xxswapd v3, v2
|
|
; POWERPC_64-NEXT: vadduwm v2, v2, v3
|
|
; POWERPC_64-NEXT: xxspltw v3, v2, 1
|
|
; POWERPC_64-NEXT: vadduwm v2, v2, v3
|
|
; POWERPC_64-NEXT: vextuwlx r3, r3, v2
|
|
; POWERPC_64-NEXT: blr
|
|
;
|
|
; POWERPC_32-LABEL: cols_needed:
|
|
; POWERPC_32: # %bb.0: # %entry
|
|
; POWERPC_32-NEXT: vspltish v3, 7
|
|
; POWERPC_32-NEXT: vcmpequh v2, v2, v3
|
|
; POWERPC_32-NEXT: vspltisw v3, 1
|
|
; POWERPC_32-NEXT: xxlnor v2, v2, v2
|
|
; POWERPC_32-NEXT: vmrghh v2, v2, v2
|
|
; POWERPC_32-NEXT: xxland v2, v2, v3
|
|
; POWERPC_32-NEXT: xxswapd v3, v2
|
|
; POWERPC_32-NEXT: vadduwm v2, v2, v3
|
|
; POWERPC_32-NEXT: xxspltw v3, v2, 1
|
|
; POWERPC_32-NEXT: vadduwm v2, v2, v3
|
|
; POWERPC_32-NEXT: stxv v2, -16(r1)
|
|
; POWERPC_32-NEXT: lwz r3, -16(r1)
|
|
; POWERPC_32-NEXT: blr
|
|
entry:
|
|
; Per-lane predicate: true where the i16 element differs from 7.
%0 = icmp ne <4 x i16> %wide.load, splat (i16 7)
|
|
; Widen each i1 to i32 so that a true lane contributes exactly 1 to the sum.
%1 = zext <4 x i1> %0 to <4 x i32>
|
|
; Add the four lanes together; the result is the number of lanes != 7.
%2 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
|
|
ret i32 %2
|
|
}
|
|
|
|
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
; Intrinsic called by @cols_needed: takes a <4 x i32> vector and returns a
; single i32 (the add reduction of the lanes, per the LLVM LangRef).
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #0
|
|
|
|
; Attribute group #0, referenced by the intrinsic declaration above.
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|