Files
David Sherwood 1d9775f684 [LV] Change VPLane::getAsRuntimeExpr to use constant 64-bit indices (#193206)
The canonical form preferred by instcombine is to use 64-bit values for
the index when it is a constant. We should try to do the same where
possible in the loop vectoriser as this reduces churn in the compiler.

It also makes other work easier, such as removing extra unnecessary
passes on the RUN line in the test directory which I plan to do
afterwards.
2026-04-22 11:33:42 +01:00

226 lines
12 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 5
; RUN: opt -S -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Check the scenario where an unknown Stride is also the loop iteration count.
; Specializing the loop for the Stride==1 case would then imply that the loop
; executes at most a single iteration, as in the following example:
;
; unsigned int N;
; int tmp = 0;
; for(unsigned int k=0;k<N;k++) {
; tmp+=(int)B[k*N+j];
; }
;
; We check here that the following runtime scev guard for Stride==1 is NOT generated:
; vector.scevcheck:
; %ident.check = icmp ne i32 %N, 1
; %0 = or i1 false, %ident.check
; br i1 %0, label %scalar.ph, label %vector.ph
; Instead the loop is vectorized with an unknown stride.
;
; @foo1 sums the sign-extended i16 elements B[k*N + j] for k = 0 .. N-1 and
; returns the i32 total, or 0 when N == 0. %N is used both as the index stride
; and as the loop bound; %A and %i are not referenced in the body.
; The assertions below expect a vector.scevcheck block built around a
; umul.with.overflow test of the strided index, with no comparison of %N
; against 1, followed by a vector body that uses <4 x i32> index vectors and
; four scalar i16 loads per iteration.
define i32 @foo1(i32 %N, ptr nocapture readnone %A, ptr nocapture readonly %B, i32 %i, i32 %j) {
; CHECK-LABEL: define i32 @foo1(
; CHECK-SAME: i32 [[N:%.*]], ptr readnone captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i32 [[I:%.*]], i32 [[J:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0
; CHECK-NEXT: br i1 [[CMP8]], [[FOR_END:label %.*]], label %[[FOR_BODY_LR_PH:.*]]
; CHECK: [[FOR_BODY_LR_PH]]:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
; CHECK: [[VECTOR_SCEVCHECK]]:
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[N]]
; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 [[N]], 0
; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[N]]
; CHECK-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP0]])
; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[J]], [[MUL_RESULT]]
; CHECK-NEXT: [[TMP5:%.*]] = sub i32 [[J]], [[MUL_RESULT]]
; CHECK-NEXT: [[TMP6:%.*]] = icmp slt i32 [[TMP4]], [[J]]
; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[J]]
; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP2]], i1 [[TMP7]], i1 [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
; CHECK-NEXT: br i1 [[TMP9]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[J]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[BROADCAST_SPLAT3]]
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i64 0
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i64 1
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP11]], i64 2
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP11]], i64 3
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP12]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP16]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = load i16, ptr [[TMP13]], align 2
; CHECK-NEXT: [[TMP21:%.*]] = load i16, ptr [[TMP15]], align 2
; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[TMP17]], align 2
; CHECK-NEXT: [[TMP23:%.*]] = load i16, ptr [[TMP19]], align 2
; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> poison, i16 [[TMP20]], i32 0
; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP21]], i32 1
; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> [[TMP25]], i16 [[TMP22]], i32 2
; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP23]], i32 3
; CHECK-NEXT: [[TMP28:%.*]] = sext <4 x i16> [[TMP27]] to <4 x i32>
; CHECK-NEXT: [[TMP29]] = add <4 x i32> [[VEC_PHI]], [[TMP28]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP29]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
;
entry:
; Bypass the loop entirely when N is zero; for.end then yields 0.
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.end, label %for.body.lr.ph
for.body.lr.ph:
; This block only branches into the loop.
br label %for.body
for.body:
; %tmp.010 is the running sum and %k.09 the loop counter k; both start at 0.
%tmp.010 = phi i32 [ 0, %for.body.lr.ph ], [ %add1, %for.body ]
%k.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
; Element index k*N + j: %N acts as the stride here. The multiply and add
; carry no nuw/nsw flags.
%mul = mul i32 %k.09, %N
%add = add i32 %mul, %j
%arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
%0 = load i16, ptr %arrayidx, align 2
; Sign-extend the loaded i16 to i32 before adding it to the sum.
%conv = sext i16 %0 to i32
%add1 = add nsw i32 %tmp.010, %conv
%inc = add nuw i32 %k.09, 1
; %N is also the loop bound: leave the loop once k+1 equals N.
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit:
; Single-entry phi that carries the final sum out of the loop.
%add1.lcssa = phi i32 [ %add1, %for.body ]
br label %for.end
for.end:
; 0 when the loop was bypassed from entry, otherwise the accumulated sum.
%tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add1.lcssa, %for.end.loopexit ]
ret i32 %tmp.0.lcssa
}
; Check the same, but also where the Stride and the loop iteration count
; are not of the same data type.
;
; unsigned short N;
; int tmp = 0;
; for(unsigned int k=0;k<N;k++) {
; tmp+=(int)B[k*N+j];
; }
;
; We check here that the following runtime scev guard for Stride==1 is NOT generated:
; vector.scevcheck:
; %ident.check = icmp ne i16 %N, 1
; %0 = or i1 false, %ident.check
; br i1 %0, label %scalar.ph, label %vector.ph
; Instead the loop is vectorized with an unknown stride.
;
; @foo2 sums the sign-extended i16 elements B[k*N + j] and returns the i32
; total, or 0 when N == 0. %N arrives as a zeroext i16 and is widened once to
; i32 (%conv); %conv is then used both as the index stride and as the loop
; bound. %A and %i are not referenced in the body.
; The expected vector.scevcheck block below has no sign test or select on the
; stride, unlike the one expected for @foo1.
; NOTE(review): presumably this is because %conv is a zext and so known
; non-negative - confirm if these assertions are regenerated.
define i32 @foo2(i16 zeroext %N, ptr nocapture readnone %A, ptr nocapture readonly %B, i32 %i, i32 %j) {
; CHECK-LABEL: define i32 @foo2(
; CHECK-SAME: i16 zeroext [[N:%.*]], ptr readnone captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i32 [[I:%.*]], i32 [[J:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[N]] to i32
; CHECK-NEXT: [[CMP11:%.*]] = icmp eq i16 [[N]], 0
; CHECK-NEXT: br i1 [[CMP11]], [[FOR_END:label %.*]], label %[[FOR_BODY_LR_PH:.*]]
; CHECK: [[FOR_BODY_LR_PH]]:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[CONV]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
; CHECK: [[VECTOR_SCEVCHECK]]:
; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[CONV]], -1
; CHECK-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[CONV]], i32 [[TMP0]])
; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[J]], [[MUL_RESULT]]
; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP1]], [[J]]
; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[MUL_OVERFLOW]]
; CHECK-NEXT: br i1 [[TMP4]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[CONV]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[CONV]], [[N_MOD_VF]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[CONV]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[J]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = mul nuw <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[BROADCAST_SPLAT3]]
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i64 0
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP8]], i64 1
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP8]], i64 2
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP8]], i64 3
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP9]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP11]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = load i16, ptr [[TMP10]], align 2
; CHECK-NEXT: [[TMP18:%.*]] = load i16, ptr [[TMP12]], align 2
; CHECK-NEXT: [[TMP19:%.*]] = load i16, ptr [[TMP14]], align 2
; CHECK-NEXT: [[TMP20:%.*]] = load i16, ptr [[TMP16]], align 2
; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i16> poison, i16 [[TMP17]], i32 0
; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> [[TMP21]], i16 [[TMP18]], i32 1
; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP19]], i32 2
; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> [[TMP23]], i16 [[TMP20]], i32 3
; CHECK-NEXT: [[TMP25:%.*]] = sext <4 x i16> [[TMP24]] to <4 x i32>
; CHECK-NEXT: [[TMP26]] = add <4 x i32> [[VEC_PHI]], [[TMP25]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP26]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[CONV]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
;
entry:
; Widen N to i32 once up front; the zero test is done on the original i16.
%conv = zext i16 %N to i32
%cmp11 = icmp eq i16 %N, 0
br i1 %cmp11, label %for.end, label %for.body.lr.ph
for.body.lr.ph:
; This block only branches into the loop.
br label %for.body
for.body:
; %tmp.013 is the running sum and %k.012 the loop counter k; both start at 0.
%tmp.013 = phi i32 [ 0, %for.body.lr.ph ], [ %add4, %for.body ]
%k.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
; Element index k*N + j using the widened stride %conv; the multiply is
; marked nuw in this variant, the add carries no wrap flags.
%mul = mul nuw i32 %k.012, %conv
%add = add i32 %mul, %j
%arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
%0 = load i16, ptr %arrayidx, align 2
; Sign-extend the loaded i16 to i32 before adding it to the sum.
%conv3 = sext i16 %0 to i32
%add4 = add nsw i32 %tmp.013, %conv3
%inc = add nuw nsw i32 %k.012, 1
; %conv is also the loop bound: leave the loop once k+1 equals it.
%exitcond = icmp eq i32 %inc, %conv
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit:
; Single-entry phi that carries the final sum out of the loop.
%add4.lcssa = phi i32 [ %add4, %for.body ]
br label %for.end
for.end:
; 0 when the loop was bypassed from entry, otherwise the accumulated sum.
%tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add4.lcssa, %for.end.loopexit ]
ret i32 %tmp.0.lcssa
}