From d57c4e4c3c929169583627c44f98ba701d2771e4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 30 Apr 2026 13:37:35 +0100 Subject: [PATCH] [X86] Add basic ISD::VECREDUCE_AND/OR/XOR handling (#195063) Custom lower ISD::VECREDUCE_AND/OR/XOR using vector logic ops Handling of any_of/all_of/parity patterns will happen later once we start dismantling combinePredicateReduction() --- llvm/lib/Target/X86/X86ISelLowering.cpp | 49 +++++ .../CodeGen/X86/vector-extract-last-active.ll | 176 ++++++++---------- llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 67 +++---- 3 files changed, 145 insertions(+), 147 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 78a1dc9e790c..79929a66fd19 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1159,6 +1159,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom); setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom); setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom); + setOperationAction(ISD::VECREDUCE_AND, VT, Custom); + setOperationAction(ISD::VECREDUCE_OR, VT, Custom); + setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); } // SSE2 can use basic vector unrolling. @@ -1552,6 +1555,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::ABDS, VT, Custom); setOperationAction(ISD::ABDU, VT, Custom); + setOperationAction(ISD::VECREDUCE_AND, VT, Custom); + setOperationAction(ISD::VECREDUCE_OR, VT, Custom); + setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); if (VT == MVT::v4i64) continue; setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); @@ -2021,6 +2027,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ABDS, VT, Custom); setOperationAction(ISD::ABDU, VT, Custom); setOperationAction(ISD::BITREVERSE, VT, Custom); + setOperationAction(ISD::VECREDUCE_AND, VT, Custom); + setOperationAction(ISD::VECREDUCE_OR, VT, Custom); + setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. @@ -29648,6 +29657,43 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); } +// Generic x86 vector reduction expansion. +static SDValue LowerVECREDUCE(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + ISD::NodeType BinOp = ISD::getVecReduceBaseOpcode(Op.getOpcode()); + assert(DAG.getTargetLoweringInfo().isBinOp(BinOp) && + "Only binops expected to be used by reductions"); + + EVT ExtractVT = Op.getValueType(); + SDValue Src = Op.getOperand(0); + EVT SrcVT = Src.getValueType(); + EVT SrcSVT = SrcVT.getScalarType(); + SDLoc DL(Op); + + if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0) + return SDValue(); + + // Split vector down to 128-bits, performing bin to lo/hi subvectors. + while (SrcVT.getSizeInBits() > 128) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = splitVector(Src, DAG, DL); + SrcVT = Lo.getValueType(); + Src = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi); + } + assert(SrcVT.is128BitVector() && "Unexpected value type"); + + // Expand 128-bit shuffle tree + reduction binops. 
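+  // e.g. For a v4i32 OR reduction: OR the vector with a <2,3,u,u> shuffle of
+  // itself (folding lane 2 into lane 0 and lane 3 into lane 1), then with a
+  // <1,u,u,u> shuffle (folding lane 1 into lane 0), and read element 0.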
+  unsigned NumSrcElts = SrcVT.getVectorNumElements();
+  for (unsigned NumElts = NumSrcElts; NumElts != 1; NumElts /= 2) {
+    SmallVector<int> Mask(NumSrcElts, -1);
+    std::iota(Mask.begin(), Mask.begin() + (NumElts / 2), NumElts / 2);
+    SDValue Upper =
+        DAG.getVectorShuffle(SrcVT, DL, Src, DAG.getUNDEF(SrcVT), Mask);
+    Src = DAG.getNode(BinOp, DL, SrcVT, Src, Upper);
+  }
+  return DAG.getExtractVectorElt(DL, ExtractVT, Src, 0);
+}
+
 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
   MVT VT = Op.getSimpleValueType();
@@ -34480,6 +34526,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::VECREDUCE_SMIN:
   case ISD::VECREDUCE_UMAX:
   case ISD::VECREDUCE_UMIN:  return LowerMINMAX_REDUCE(Op, Subtarget, DAG);
+  case ISD::VECREDUCE_AND:
+  case ISD::VECREDUCE_OR:
+  case ISD::VECREDUCE_XOR:   return LowerVECREDUCE(Op, Subtarget, DAG);
   case ISD::FMINIMUM:
   case ISD::FMAXIMUM:
   case ISD::FMINIMUMNUM:
diff --git a/llvm/test/CodeGen/X86/vector-extract-last-active.ll b/llvm/test/CodeGen/X86/vector-extract-last-active.ll
index c8a59562f950..3f622c5c9807 100644
--- a/llvm/test/CodeGen/X86/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/X86/vector-extract-last-active.ll
@@ -8,35 +8,30 @@
 define i32 @extract_last_active_v4i32(<4 x i32> %a, <4 x i1> %c) {
 ; CHECK-LABEL: extract_last_active_v4i32:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; CHECK-NEXT: movd %xmm2, %eax
 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; CHECK-NEXT: movd %xmm2, %ecx
-; CHECK-NEXT: movd %xmm1, %edx
-; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; CHECK-NEXT: movd %xmm2, %esi
+; CHECK-NEXT: por %xmm1, %xmm2
 ; CHECK-NEXT: pslld $31, %xmm1
 ; CHECK-NEXT: psrad $31, %xmm1
 ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; CHECK-NEXT: movd %xmm0, %edi
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; CHECK-NEXT: movd %xmm0, %r8d
-; CHECK-NEXT: cmpl %edi, %r8d
-; CHECK-NEXT: cmoval %r8d, %edi
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
-; CHECK-NEXT: movd %xmm0, %r8d
-; CHECK-NEXT: cmpl %r8d, %edi
-; CHECK-NEXT: cmovbel %r8d, %edi
-; CHECK-NEXT: orl %edx, %esi
-; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
+; CHECK-NEXT: por %xmm2, %xmm0
+; CHECK-NEXT: movd %xmm0, %ecx
 ; CHECK-NEXT: andb $1, %cl
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: cmpb $1, %cl
 ; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: orl -24(%rsp,%rdi,4), %eax
+; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; CHECK-NEXT: movd %xmm0, %ecx
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; CHECK-NEXT: movd %xmm0, %edx
+; CHECK-NEXT: cmpl %ecx, %edx
+; CHECK-NEXT: cmoval %edx, %ecx
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; CHECK-NEXT: movd %xmm0, %edx
+; CHECK-NEXT: cmpl %edx, %ecx
+; CHECK-NEXT: cmovbel %edx, %ecx
+; CHECK-NEXT: orl -24(%rsp,%rcx,4), %eax
 ; CHECK-NEXT: retq
   %res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %a, <4 x i1> %c, i32 -1)
   ret i32 %res
@@ -70,11 +65,10 @@ define i32 @extract_last_active_v2i32(<2 x i32> %a, <2 x i1> %c) {
 ; CHECK-LABEL: extract_last_active_v2i32:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; CHECK-NEXT: movq %xmm1, %rax
+; CHECK-NEXT: por %xmm1, %xmm2
 ; CHECK-NEXT: psllq $63, %xmm1
-; CHECK-NEXT: movq %xmm2,
%rcx ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: orl %eax, %ecx +; CHECK-NEXT: movq %xmm2, %rcx ; CHECK-NEXT: andb $1, %cl ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpb $1, %cl @@ -94,28 +88,31 @@ define i32 @extract_last_active_v2i32(<2 x i32> %a, <2 x i1> %c) { define i32 @extract_last_active_v3i32(<3 x i32> %a, <3 x i1> %c) { ; CHECK-LABEL: extract_last_active_v3i32: ; CHECK: # %bb.0: -; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movd %edx, %xmm0 ; CHECK-NEXT: movd %esi, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,1] -; CHECK-NEXT: pslld $31, %xmm1 -; CHECK-NEXT: psrad $31, %xmm1 -; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; CHECK-NEXT: movd %edi, %xmm2 +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-NEXT: movd %edx, %xmm4 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: pslld $31, %xmm3 +; CHECK-NEXT: psrad $31, %xmm3 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; CHECK-NEXT: movd %xmm0, %ecx -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: cmpl %ecx, %eax ; CHECK-NEXT: cmoval %eax, %ecx -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] -; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: cmpl %eax, %ecx -; CHECK-NEXT: cmovbel %eax, %ecx -; CHECK-NEXT: orl %esi, %edi -; CHECK-NEXT: orl %edx, %edi -; CHECK-NEXT: andb $1, %dil ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpb $1, %dil +; CHECK-NEXT: cmpl $0, %ecx +; CHECK-NEXT: cmovbel %eax, %ecx +; CHECK-NEXT: por %xmm4, %xmm2 +; CHECK-NEXT: por %xmm1, %xmm2 +; CHECK-NEXT: movd %xmm2, %edx +; CHECK-NEXT: andb $1, %dl +; CHECK-NEXT: cmpb $1, %dl ; CHECK-NEXT: sbbl %eax, %eax ; CHECK-NEXT: orl -24(%rsp,%rcx,4), %eax ; CHECK-NEXT: retq @@ -127,21 +124,18 @@ define i32 @extract_last_active_v3i32(<3 x i32> %a, <3 x i1> %c) { define i32 @extract_last_active_v8i32(<8 x i32> %a, <8 x i1> %c) { ; CHECK-LABEL: extract_last_active_v8i32: ; CHECK: # %bb.0: +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; CHECK-NEXT: por %xmm2, %xmm3 ; CHECK-NEXT: psllw $15, %xmm2 ; CHECK-NEXT: psraw $15, %xmm2 ; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; CHECK-NEXT: psubusw %xmm2, %xmm0 -; CHECK-NEXT: paddw %xmm2, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; CHECK-NEXT: psubusw %xmm0, %xmm1 -; CHECK-NEXT: paddw %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrld $16, %xmm0 -; CHECK-NEXT: psubusw %xmm1, %xmm0 -; CHECK-NEXT: paddw %xmm1, %xmm0 -; CHECK-NEXT: movd %xmm0, %ecx +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; CHECK-NEXT: por %xmm3, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $16, %xmm1 +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: movd %xmm1, %ecx ; CHECK-NEXT: andb $1, %cl ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpb $1, %cl @@ -169,31 +163,18 @@ define i32 @extract_last_active_v8i32(<8 x i32> %a, <8 x i1> %c) { define i32 @extract_last_active_v16i32(<16 x i32> %a, <16 x i1> %c) { ; CHECK-LABEL: extract_last_active_v16i32: ; CHECK: # %bb.0: +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; CHECK-NEXT: por 
%xmm4, %xmm5 ; CHECK-NEXT: psllw $7, %xmm4 -; CHECK-NEXT: pxor %xmm5, %xmm5 -; CHECK-NEXT: pcmpgtb %xmm4, %xmm5 +; CHECK-NEXT: pxor %xmm6, %xmm6 +; CHECK-NEXT: pcmpgtb %xmm4, %xmm6 ; CHECK-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] -; CHECK-NEXT: pmaxub %xmm5, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; CHECK-NEXT: pmaxub %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrld $16, %xmm0 -; CHECK-NEXT: pmaxub %xmm1, %xmm0 -; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlw $8, %xmm1 -; CHECK-NEXT: pmaxub %xmm0, %xmm1 -; CHECK-NEXT: movd %xmm1, %ecx -; CHECK-NEXT: andb $1, %cl -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpb $1, %cl -; CHECK-NEXT: sbbl %eax, %eax -; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] -; CHECK-NEXT: pmaxub %xmm5, %xmm0 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; CHECK-NEXT: pmaxub %xmm6, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: pmaxub %xmm0, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 @@ -204,6 +185,19 @@ define i32 @extract_last_active_v16i32(<16 x i32> %a, <16 x i1> %c) { ; CHECK-NEXT: pmaxub %xmm0, %xmm1 ; CHECK-NEXT: movd %xmm1, %ecx ; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; CHECK-NEXT: por %xmm5, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $16, %xmm1 +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: psrlw $8, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: movd %xmm0, %edx +; CHECK-NEXT: andb $1, %dl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpb $1, %dl +; CHECK-NEXT: sbbl %eax, %eax ; CHECK-NEXT: orl -72(%rsp,%rcx,4), %eax ; CHECK-NEXT: retq %res = call i32 @llvm.experimental.vector.extract.last.active.v16i32(<16 x i32> %a, <16 x i1> %c, i32 -1) @@ -239,9 +233,6 @@ define i8 @extract_last_active_split(<32 x i8> %data, <32 x i8> %mask, i8 %passt ; CHECK-NEXT: por %xmm3, %xmm4 ; CHECK-NEXT: pxor %xmm5, %xmm5 ; CHECK-NEXT: pcmpeqb %xmm5, %xmm3 -; CHECK-NEXT: pcmpeqd %xmm6, %xmm6 -; CHECK-NEXT: movdqa %xmm3, %xmm7 -; CHECK-NEXT: pxor %xmm6, %xmm7 ; CHECK-NEXT: pcmpeqb %xmm5, %xmm2 ; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) @@ -258,6 +249,7 @@ define i8 @extract_last_active_split(<32 x i8> %data, <32 x i8> %mask, i8 %passt ; CHECK-NEXT: psrlw $8, %xmm2 ; CHECK-NEXT: pmaxub %xmm1, %xmm2 ; CHECK-NEXT: movd %xmm2, %eax +; CHECK-NEXT: pmovmskb %xmm3, %ecx ; CHECK-NEXT: pandn %xmm0, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; CHECK-NEXT: pmaxub %xmm3, %xmm0 @@ -269,37 +261,15 @@ define i8 @extract_last_active_split(<32 x i8> %data, <32 x i8> %mask, i8 %passt ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $8, %xmm1 ; CHECK-NEXT: pmaxub %xmm0, %xmm1 -; CHECK-NEXT: movd %xmm1, %ecx -; CHECK-NEXT: addl $16, %ecx -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; CHECK-NEXT: pmaxub %xmm7, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; CHECK-NEXT: pmaxub %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrld $16, %xmm0 -; CHECK-NEXT: pmaxub %xmm1, %xmm0 -; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlw $8, %xmm1 -; CHECK-NEXT: pmaxub %xmm0, %xmm1 ; CHECK-NEXT: movd %xmm1, %edx -; CHECK-NEXT: testb 
$1, %dl -; CHECK-NEXT: cmoveq %rax, %rcx -; CHECK-NEXT: andl $31, %ecx -; CHECK-NEXT: movzbl -40(%rsp,%rcx), %eax +; CHECK-NEXT: addl $16, %edx +; CHECK-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; CHECK-NEXT: cmoveq %rax, %rdx +; CHECK-NEXT: andl $31, %edx +; CHECK-NEXT: movzbl -40(%rsp,%rdx), %eax ; CHECK-NEXT: pcmpeqb %xmm5, %xmm4 -; CHECK-NEXT: pxor %xmm6, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; CHECK-NEXT: pmaxub %xmm4, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; CHECK-NEXT: pmaxub %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrld $16, %xmm0 -; CHECK-NEXT: pmaxub %xmm1, %xmm0 -; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlw $8, %xmm1 -; CHECK-NEXT: pmaxub %xmm0, %xmm1 -; CHECK-NEXT: movd %xmm1, %ecx -; CHECK-NEXT: testb $1, %cl +; CHECK-NEXT: pmovmskb %xmm4, %ecx +; CHECK-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; CHECK-NEXT: cmovel %edi, %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll index 044327d94c0e..f82fa90f78e8 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -1146,56 +1146,35 @@ define zeroext i1 @PR44781(ptr %0) { } define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) { -; SSE2-LABEL: mask_v3i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: orl %eax, %ecx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB30_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: retq -; SSE2-NEXT: .LBB30_2: -; SSE2-NEXT: movl $1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: mask_v3i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pextrd $1, %xmm1, %eax -; SSE41-NEXT: movd %xmm1, %ecx -; SSE41-NEXT: orl %eax, %ecx -; SSE41-NEXT: pextrd $2, %xmm1, %eax -; SSE41-NEXT: orl %ecx, %eax -; SSE41-NEXT: testb $1, %al -; SSE41-NEXT: je .LBB30_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: retq -; SSE41-NEXT: .LBB30_2: -; SSE41-NEXT: movl $1, %eax -; SSE41-NEXT: retq +; SSE-LABEL: mask_v3i1: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: testb $1, %al +; SSE-NEXT: je .LBB30_2 +; SSE-NEXT: # %bb.1: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: retq +; SSE-NEXT: .LBB30_2: +; SSE-NEXT: movl $1, %eax +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: mask_v3i1: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpextrd $1, %xmm0, %eax -; AVX1OR2-NEXT: vmovd %xmm0, %ecx -; AVX1OR2-NEXT: orl %eax, %ecx -; AVX1OR2-NEXT: vpextrd $2, %xmm0, %eax -; AVX1OR2-NEXT: orl %ecx, %eax +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm1 +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1OR2-NEXT: vpor %xmm0, %xmm1, 
%xmm0 +; AVX1OR2-NEXT: vmovd %xmm0, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je .LBB30_2 ; AVX1OR2-NEXT: # %bb.1:
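
Note on the lowering strategy (a standalone illustration, not code from the patch): LowerVECREDUCE keeps the whole reduction in vector registers by first halving vectors wider than 128 bits with the reduction binop, then folding the upper half of the remaining active lanes onto the lower half until a single lane is left, and finally extracting element 0. A minimal C++ sketch of that reduction order, assuming a power-of-two lane count and using OR for concreteness (the function and variable names below are illustrative only):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Mirrors the shuffle-tree order used by LowerVECREDUCE: each outer
    // iteration models one shuffle (move the upper half down) plus one vector OR.
    uint32_t reduceOrShuffleTree(std::vector<uint32_t> Lanes) {
      assert(!Lanes.empty() && (Lanes.size() & (Lanes.size() - 1)) == 0 &&
             "power-of-two lane count expected");
      for (size_t Active = Lanes.size(); Active != 1; Active /= 2)
        for (size_t I = 0; I != Active / 2; ++I)
          Lanes[I] |= Lanes[I + Active / 2];
      return Lanes[0]; // models extracting element 0 of the final vector
    }

This is the shape the updated CHECK lines above reflect: chains of pshufd+por on xmm registers replace the old scalar movd/orl/cmov sequences, with a single movd at the end to move the reduced lane into a GPR.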