Files
llvm-project/llvm/test/Transforms/LoopVectorize/iv-select-cmp-fold-tail.ll
Florian Hahn ec14a1f40c [VPlan] Add transform to replace VPWidenCanonicalIV with wide IV. (#194267)
Add a new cost-based transform that replaces VPWidenCanonicalIVRecipe
with a canonical VPWidenIntOrFpInductionPHIRecipe, if it does not
increase spills.

The main benefit of VPWidenCanonicalIVRecipe is that it has shorter
live-ranges than wide IV phis. The new transform introduces wide IV
unless VPWidenCanonicalIVRecipe is cheaper or the wide IV introduces
additional spills.

This introduces wide IVs in a number of cases where we previously had
VPWidenCanonicalIVRecipe, because there was no existing wide canonical
IV we could re-use. It should also help avoid somewhat unrelated changes
in https://github.com/llvm/llvm-project/pull/190191.

PR: https://github.com/llvm/llvm-project/pull/194267
2026-04-29 19:54:35 +00:00

138 lines
8.7 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
; RUN: opt -p loop-vectorize -tail-folding-policy=must-fold-tail -force-vector-width=4 -force-target-supports-masked-memory-ops -S %s | FileCheck %s
; Tail-folded vectorization of a "find last matching index" reduction: the
; scalar loop below scans src[0..n] and records trunc(iv) whenever src[iv] == 0,
; returning the last such (truncated) index, or 0 if no element matched.
; The autogenerated CHECK lines verify that the vector plan uses wide IV phis
; (VEC_IND for the i64 mask IV compared against trip-count-1, VEC_IND1 for the
; truncated i32 data IV fed into the select) instead of a
; VPWidenCanonicalIV-derived IV, and that the final value is recovered with
; llvm.experimental.vector.extract.last.active on the masked select results.
define i32 @find_last_trunc_iv(ptr %src, i64 %n) {
; CHECK-LABEL: define i32 @find_last_trunc_iv(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[LOOP:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY1:.*]]
; CHECK: [[VECTOR_BODY1]]:
; CHECK-NEXT: [[TMP22:%.*]] = phi i64 [ 0, %[[VECTOR_BODY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY1]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_BODY]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY1]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP32:%.*]], %[[VECTOR_BODY1]] ]
; CHECK-NEXT: [[TMP1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP31:%.*]], %[[VECTOR_BODY1]] ]
; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_BODY]] ], [ [[VEC_IND_NEXT8:%.*]], %[[VECTOR_BODY1]] ]
; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]]
; CHECK-NEXT: [[TMP26:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP23]], <4 x i1> [[TMP2]], <4 x i32> poison)
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq <4 x i32> [[TMP26]], zeroinitializer
; CHECK-NEXT: [[TMP28:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP27]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP29:%.*]] = freeze <4 x i1> [[TMP28]]
; CHECK-NEXT: [[TMP30:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP29]])
; CHECK-NEXT: [[TMP31]] = select i1 [[TMP30]], <4 x i1> [[TMP28]], <4 x i1> [[TMP1]]
; CHECK-NEXT: [[TMP32]] = select i1 [[TMP30]], <4 x i32> [[VEC_IND1]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[TMP22]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[VEC_IND_NEXT8]] = add <4 x i32> [[VEC_IND1]], splat (i32 4)
; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP32]], <4 x i1> [[TMP31]], i32 0)
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret i32 [[RDX_NEXT_LCSSA]]
;
; Scalar source loop: iterates iv = 0..n inclusive (exit when iv == n), so the
; trip count is n + 1 and tail folding is required by -tail-folding-policy.
entry:
br label %loop
loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ]
%gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
%l = load i32, ptr %gep.src
%cmp103 = icmp eq i32 %l, 0
; Truncate the i64 IV to i32 before recording it; the select keeps the
; previous candidate when the element is non-zero.
%0 = trunc i64 %iv to i32
%rdx.next = select i1 %cmp103, i32 %0, i32 %rdx
%iv.next = add i64 %iv, 1
%ec = icmp eq i64 %iv, %n
br i1 %ec, label %exit, label %loop
exit:
ret i32 %rdx.next
}
; Decreasing-IV variant of the select reduction with a non-constant start
; value: the scalar loop counts iv down from %n to 1, compares a[iv-1] with
; b[iv-1], and records iv-1 whenever a > b, otherwise keeping the incoming
; %rdx.start-seeded accumulator. The autogenerated CHECK lines verify that the
; decreasing data IV is widened (VEC_IND, stepping by splat(-4) from
; n - <0,1,2,3>) alongside a separate increasing mask IV (VEC_IV), that the
; reverse masked loads use a shuffled mask, and that the middle block combines
; a smin reduction (sentinel splat(i64 INT64_MAX)) with an any-lane-selected
; or-reduction to fall back to %rdx.start when no lane matched.
define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 %rdx.start, i64 %n) {
; CHECK-LABEL: define i64 @select_decreasing_induction_icmp_non_const_start(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1)
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP1]], 3
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[TMP3]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP52:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP50:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IV:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT8:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP38:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP38]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP39]], i64 -3
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 8 [[TMP5]], <4 x i1> [[REVERSE]], <4 x i64> poison)
; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x i64> [[WIDE_MASKED_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP38]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i64 -3
; CHECK-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 8 [[TMP7]], <4 x i1> [[REVERSE]], <4 x i64> poison)
; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <4 x i64> [[WIDE_MASKED_LOAD8]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP48:%.*]] = icmp sgt <4 x i64> [[TMP46]], [[TMP47]]
; CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP52]] = select <4 x i1> [[TMP49]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-NEXT: [[TMP50]] = or <4 x i1> [[VEC_PHI3]], [[TMP49]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4)
; CHECK-NEXT: [[VEC_IND_NEXT8]] = add <4 x i64> [[VEC_IV]], splat (i64 4)
; CHECK-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP51]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP53:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP52]])
; CHECK-NEXT: [[TMP54:%.*]] = add nsw i64 [[TMP53]], -1
; CHECK-NEXT: [[TMP56:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP50]])
; CHECK-NEXT: [[TMP57:%.*]] = freeze i1 [[TMP56]]
; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP57]], i64 [[TMP54]], i64 [[RDX_START]]
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret i64 [[RDX_SELECT]]
;
; Scalar source loop: iv starts at %n and decrements; the loop repeats while
; iv > 1 (unsigned), so it visits iv.next values n-1 down to 0 when n >= 1.
entry:
br label %loop
loop:
%iv = phi i64 [ %iv.next, %loop ], [ %n, %entry ]
%rdx = phi i64 [ %cond, %loop ], [ %rdx.start, %entry ]
%iv.next = add nsw i64 %iv, -1
%gep.a.iv = getelementptr inbounds i64, ptr %a, i64 %iv.next
%ld.a = load i64, ptr %gep.a.iv, align 8
%gep.b.iv = getelementptr inbounds i64, ptr %b, i64 %iv.next
%ld.b = load i64, ptr %gep.b.iv, align 8
; Signed compare selects the decremented IV as the new candidate; otherwise
; the accumulator (seeded with %rdx.start) is carried forward unchanged.
%cmp.a.b = icmp sgt i64 %ld.a, %ld.b
%cond = select i1 %cmp.a.b, i64 %iv.next, i64 %rdx
%exit.cond = icmp ugt i64 %iv, 1
br i1 %exit.cond, label %loop, label %exit
exit:
ret i64 %cond
}