Reapply "[VPlan] Handle calls in VPInstruction:opcodeMayReadOrWriteFromMemory." (#191886)

This reverts commit 3bf9639ec0. The reapply adds trivial support for ExtractValue and InsertValue to fix the crash that caused the revert. Original message: Retrieve the called function and check its memory attributes to determine whether a VPInstruction calling a function reads or writes memory. Use it to strengthen the assert in areAllLoadsDereferenceable. PR: https://github.com/llvm/llvm-project/pull/190681
2026-04-13 22:43:33 +01:00
parent c9f175bed4
commit ca318abfe6
4 changed files with 175 additions and 10 deletions
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -951,12 +951,14 @@ static bool areAllLoadsDereferenceable(VPBasicBlock *HeaderVPBB,
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(HeaderVPBB))) {
    // Skip blocks outside the loop (exit blocks and their successors).
-    if (VPBB == MiddleVPBB)
+    if (VPBB == MiddleVPBB || isa<VPIRBasicBlock>(VPBB))
      continue;
    for (VPRecipeBase &R : *VPBB) {
      auto *VPI = dyn_cast<VPInstructionWithType>(&R);
-      if (!VPI || VPI->getOpcode() != Instruction::Load)
+      if (!VPI || VPI->getOpcode() != Instruction::Load) {
+        assert(!R.mayReadFromMemory() && "unexpected recipe reading memory");
        continue;
+      }

      // Get the pointer SCEV for dereferenceability checking.
      VPValue *Ptr = VPI->getOperand(0);
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -440,6 +440,27 @@ VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
         "number of operands does not match opcode");
 }

+/// Return the operand index of the callee of call VPInstruction \p VPI: the
+/// last operand for unmasked calls, or the second-to-last operand for masked
+/// calls. Asserts that \p VPI is a call and that the operand is a Function.
+static unsigned getCalledFnOperandIndex(const VPInstruction &VPI) {
+  assert(VPI.getOpcode() == Instruction::Call && "must be a call");
+  unsigned NumOps = VPI.getNumOperands();
+  auto *LastOp = dyn_cast<VPIRValue>(VPI.getOperand(NumOps - 1));
+  if (LastOp && isa<Function>(LastOp->getValue()))
+    return NumOps - 1; // Unmasked call: the callee is the last operand.
+  assert(
+      isa<Function>(cast<VPIRValue>(VPI.getOperand(NumOps - 2))->getValue()) &&
+      "expected function operand");
+  return NumOps - 2; // Masked call: the callee precedes the last operand.
+}
+
+/// Return the IR Function called by \p VPI, which must be a call VPInstruction.
+static Function *getCalledFunction(const VPInstruction &VPI) {
+  unsigned Idx = getCalledFnOperandIndex(VPI);
+  return cast<Function>(cast<VPIRValue>(VPI.getOperand(Idx))->getValue()); // cast<> asserts the operand kinds.
+}
+
 unsigned VPInstruction::getNumOperandsForOpcode() const {
  if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
    return 1;
@@ -486,14 +507,8 @@ unsigned VPInstruction::getNumOperandsForOpcode() const {
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ReductionStartVector:
    return 3;
-  case Instruction::Call: {
-    // For unmasked calls, the last argument will the called function. Use that
-    // to compute the number of operands without the mask.
-    VPValue *LastOp = getOperand(getNumOperands() - 1);
-    if (isa<VPIRValue>(LastOp) && isa<Function>(LastOp->getLiveInIRValue()))
-      return getNumOperands();
-    return getNumOperands() - 1;
-  }
+  case Instruction::Call:
+    return getCalledFnOperandIndex(*this) + 1;
  case Instruction::GetElementPtr:
  case Instruction::PHI:
  case Instruction::Switch:
@@ -1318,6 +1333,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
      Instruction::isUnaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return false;
  switch (getOpcode()) {
+  case Instruction::ExtractValue:
+  case Instruction::InsertValue:
  case Instruction::GetElementPtr:
  case Instruction::ExtractElement:
  case Instruction::Freeze:
@@ -1358,6 +1375,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
  case VPInstruction::VScale:
  case VPInstruction::Unpack:
    return false;
+  case Instruction::Call:
+    return !getCalledFunction(*this)->doesNotAccessMemory();
  default:
    return true;
  }
--- a/llvm/test/Transforms/LoopVectorize/early-exit-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/early-exit-calls.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+define i32 @early_exit_with_extractvalue(ptr dereferenceable(1024) align 8 %src, i32 noundef %x) {
+; CHECK-LABEL: define i32 @early_exit_with_extractvalue(
+; CHECK-SAME: ptr align 8 dereferenceable(1024) [[SRC:%.*]], i32 noundef [[X:%.*]]) {
+; CHECK-NEXT:  [[LOOP_PREHEADER:.*:]]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[LATCH:.*]]
+; CHECK:       [[LATCH]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[LOOP]] ], [ [[IV_NEXT:%.*]], %[[EARLY_EXIT:.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[LOOP]] ], [ [[VEC_IND_NEXT:%.*]], %[[EARLY_EXIT]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> [[VEC_IND]], <4 x i32> [[BROADCAST_SPLAT]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP1]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[IV_NEXT]], 60
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[EARLY_EXIT]]
+; CHECK:       [[EARLY_EXIT]]:
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[LATCH]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[EXIT_LOOPEXIT:.*]]
+; CHECK:       [[VECTOR_EARLY_EXIT]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 false)
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
+; CHECK-NEXT:    [[IV_LCSSA1:%.*]] = add i32 [[INDEX]], [[TMP8]]
+; CHECK-NEXT:    br label %[[EARLY_EXIT1:.*]]
+; CHECK:       [[EXIT_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 60, %[[EXIT_LOOPEXIT]] ], [ [[IV_NEXT1:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[MUL_OV:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[IV]], i32 [[X]])
+; CHECK-NEXT:    [[MUL_OV_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL_OV]], 1
+; CHECK-NEXT:    br i1 [[MUL_OV_OVERFLOW]], label %[[EARLY_EXIT1]], label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[IV_NEXT1]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT1]], 63
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EARLY_EXIT1]]:
+; CHECK-NEXT:    [[IV_LCSSA2:%.*]] = phi i32 [ [[IV]], %[[LOOP_HEADER]] ], [ [[IV_LCSSA1]], %[[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i32 [[IV_LCSSA2]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i32 [ [[VAL]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT:    ret i32 [[IV_LCSSA]]
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %mul.ov = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %iv, i32 %x) ; intrinsic call returning a { result, overflow } struct
+  %mul.ov.overflow = extractvalue { i32, i1 } %mul.ov, 1 ; field 1 is the overflow bit
+  br i1 %mul.ov.overflow, label %early.exit, label %loop.latch ; leave the loop early on overflow
+
+loop.latch:
+  %gep = getelementptr inbounds i32, ptr %src, i32 %iv
+  %val = load i32, ptr %gep, align 4 ; the only load in the loop; it sits after the early-exit branch
+  %iv.next = add nuw nsw i32 %iv, 1
+  %ec = icmp eq i32 %iv.next, 63
+  br i1 %ec, label %exit, label %loop.header
+
+early.exit:
+  ret i32 %iv
+
+exit:
+  ret i32 %val
+}
--- a/llvm/test/Transforms/LoopVectorize/early-exit-unary-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/early-exit-unary-ops.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+define i64 @early_exit_with_fneg(ptr dereferenceable(1024) align 8 %src, i1 %cond) {
+; CHECK-LABEL: define i64 @early_exit_with_fneg(
+; CHECK-SAME: ptr align 8 dereferenceable(1024) [[SRC:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[LOOP:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY1:.*]]
+; CHECK:       [[VECTOR_BODY1]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_BODY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY_INTERIM:.*]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <4 x double> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp oeq <4 x double> [[TMP1]], splat (double 1.000000e+01)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 124
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[VECTOR_BODY_INTERIM]]
+; CHECK:       [[VECTOR_BODY_INTERIM]]:
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[SCALAR_PH:.*]]
+; CHECK:       [[VECTOR_EARLY_EXIT]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 false)
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[IV]], [[TMP6]]
+; CHECK-NEXT:    br label %[[EARLY_EXIT:.*]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ], [ 124, %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[VAL:%.*]] = load double, ptr [[GEP1]], align 8
+; CHECK-NEXT:    [[NEG:%.*]] = fneg double [[VAL]]
+; CHECK-NEXT:    [[C_1:%.*]] = fcmp une double [[NEG]], 1.000000e+01
+; CHECK-NEXT:    br i1 [[C_1]], label %[[LATCH]], label %[[EARLY_EXIT]]
+; CHECK:       [[LATCH]]:
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 127
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EARLY_EXIT]]:
+; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ [[TMP7]], %[[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[IV_LCSSA]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret i64 10
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %iv.next, %latch ], [ 0, %entry ]
+  %gep = getelementptr inbounds double, ptr %src, i64 %iv
+  %val = load double, ptr %gep, align 8 ; the load sits ahead of the early-exit branch
+  %neg = fneg double %val ; unary op between the load and the exit compare
+  %c.1 = fcmp une double %neg, 10.0
+  br i1 %c.1, label %latch, label %early.exit ; early exit once %neg compares equal to 10.0
+
+latch:
+  %iv.next = add nuw i64 %iv, 1
+  %exit.cond = icmp eq i64 %iv.next, 127
+  br i1 %exit.cond, label %exit, label %loop.header
+
+early.exit:
+  ret i64 %iv
+
+exit:
+  ret i64 10
+}