llvm-project/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
Florian Hahn 7509cad693 [VPlan] Support masked VPInsts, use for predication (NFC) (#142285)
Add support for mask operands to most VPInstructions, using
getNumOperandsForOpcode.

This allows VPlan predication to predicate VPInstructions directly. The
mask will then be dropped or handled when creating wide recipes.
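
A sketch of the intent (illustrative only, not output from this patch): an
unpredicated VPInstruction such as

    EMIT vp<%a> = add vp<%x>, vp<%y>

can now carry the block mask as an extra trailing operand,

    EMIT vp<%a> = add vp<%x>, vp<%y>, vp<%mask>

which is later dropped or folded in when the wide recipes are created.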

Depends on https://github.com/llvm/llvm-project/pull/142284.
Depends on https://github.com/llvm/llvm-project/pull/168784.

PR: https://github.com/llvm/llvm-project/pull/142285
2026-02-08 18:23:36 +00:00

//===- VPlanUtils.cpp - VPlan-related utilities ---------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "VPlanUtils.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanPatternMatch.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
using namespace llvm;
using namespace llvm::VPlanPatternMatch;
using namespace llvm::SCEVPatternMatch;
bool vputils::onlyFirstLaneUsed(const VPValue *Def) {
return all_of(Def->users(),
[Def](const VPUser *U) { return U->usesFirstLaneOnly(Def); });
}
bool vputils::onlyFirstPartUsed(const VPValue *Def) {
return all_of(Def->users(),
[Def](const VPUser *U) { return U->usesFirstPartOnly(Def); });
}
bool vputils::onlyScalarValuesUsed(const VPValue *Def) {
return all_of(Def->users(),
[Def](const VPUser *U) { return U->usesScalars(Def); });
}
VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) {
if (auto *E = dyn_cast<SCEVConstant>(Expr))
return Plan.getOrAddLiveIn(E->getValue());
// Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction
// value. Otherwise the value may be defined in a loop and using it directly
// will break LCSSA form. The SCEV expansion takes care of preserving LCSSA
// form.
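// For example (illustrative): a SCEVUnknown wrapping a function argument can
// be used directly as a live-in, whereas one wrapping an instruction defined
// inside a loop must go through expansion so LCSSA form is preserved.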
auto *U = dyn_cast<SCEVUnknown>(Expr);
if (U && !isa<Instruction>(U->getValue()))
return Plan.getOrAddLiveIn(U->getValue());
auto *Expanded = new VPExpandSCEVRecipe(Expr);
Plan.getEntry()->appendRecipe(Expanded);
return Expanded;
}
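// A header mask takes one of a few recognized forms; two common ones,
// sketched in VPlan notation (illustrative, not exhaustive):
//   EMIT vp<%m> = active lane mask vp<%iv-steps>, vp<%trip-count>
//   EMIT vp<%m> = icmp ule vp<%wide-canonical-iv>, vp<%backedge-taken-count>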
bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) {
if (isa<VPActiveLaneMaskPHIRecipe>(V))
return true;
auto IsWideCanonicalIV = [](VPValue *A) {
return isa<VPWidenCanonicalIVRecipe>(A) ||
(isa<VPWidenIntOrFpInductionRecipe>(A) &&
cast<VPWidenIntOrFpInductionRecipe>(A)->isCanonical());
};
VPValue *A, *B;
auto m_CanonicalScalarIVSteps =
m_ScalarIVSteps(m_Specific(Plan.getVectorLoopRegion()->getCanonicalIV()),
m_One(), m_Specific(&Plan.getVF()));
if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_One())))
return B == Plan.getTripCount() &&
(match(A, m_CanonicalScalarIVSteps) || IsWideCanonicalIV(A));
// For scalar plans, the header mask uses the scalar steps.
if (match(V, m_ICmp(m_CanonicalScalarIVSteps,
m_Specific(Plan.getBackedgeTakenCount())))) {
assert(Plan.hasScalarVFOnly() &&
"Non-scalar VF using scalar IV steps for header mask?");
return true;
}
return match(V, m_ICmp(m_VPValue(A), m_VPValue(B))) && IsWideCanonicalIV(A) &&
B == Plan.getBackedgeTakenCount();
}
/// Returns true if \p R propagates poison from any operand to its result.
static bool propagatesPoisonFromRecipeOp(const VPRecipeBase *R) {
return TypeSwitch<const VPRecipeBase *, bool>(R)
.Case<VPWidenGEPRecipe, VPWidenCastRecipe>(
[](const VPRecipeBase *) { return true; })
.Case([](const VPReplicateRecipe *Rep) {
// GEP and casts propagate poison from all operands.
unsigned Opcode = Rep->getOpcode();
return Opcode == Instruction::GetElementPtr ||
Instruction::isCast(Opcode);
})
.Default([](const VPRecipeBase *) { return false; });
}
/// Returns true if \p V being poison is guaranteed to trigger UB because it
/// propagates to the address of a memory recipe.
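/// For example (illustrative IR, not from this function): if %x is poison in
///   %gep = getelementptr inbounds i32, ptr %p, i64 %x
///   %v = load i32, ptr %gep
/// the poison propagates through the GEP into the load's address, so
/// executing the load is guaranteed UB.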
static bool poisonGuaranteesUB(const VPValue *V) {
SmallPtrSet<const VPValue *, 8> Visited;
SmallVector<const VPValue *, 16> Worklist;
Worklist.push_back(V);
while (!Worklist.empty()) {
const VPValue *Current = Worklist.pop_back_val();
if (!Visited.insert(Current).second)
continue;
for (VPUser *U : Current->users()) {
// Check if Current is used as an address operand for load/store.
if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
if (MemR->getAddr() == Current)
return true;
continue;
}
if (auto *Rep = dyn_cast<VPReplicateRecipe>(U)) {
unsigned Opcode = Rep->getOpcode();
if ((Opcode == Instruction::Load && Rep->getOperand(0) == Current) ||
(Opcode == Instruction::Store && Rep->getOperand(1) == Current))
return true;
}
// Check if poison propagates through this recipe to any of its users.
auto *R = cast<VPRecipeBase>(U);
for (const VPValue *Op : R->operands()) {
if (Op == Current && propagatesPoisonFromRecipeOp(R)) {
Worklist.push_back(R->getVPSingleValue());
break;
}
}
}
}
return false;
}
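/// Return a SCEV expression for \p V, recursing into the recipes defining it,
/// or SCEVCouldNotCompute if no expression can be constructed.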
const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V,
PredicatedScalarEvolution &PSE,
const Loop *L) {
ScalarEvolution &SE = *PSE.getSE();
if (isa<VPIRValue, VPSymbolicValue>(V)) {
Value *LiveIn = V->getUnderlyingValue();
if (LiveIn && SE.isSCEVable(LiveIn->getType()))
return SE.getSCEV(LiveIn);
return SE.getCouldNotCompute();
}
// Helper to create SCEVs for binary and unary operations.
auto CreateSCEV =
[&](ArrayRef<VPValue *> Ops,
function_ref<const SCEV *(ArrayRef<const SCEV *>)> CreateFn)
-> const SCEV * {
SmallVector<const SCEV *, 2> SCEVOps;
for (VPValue *Op : Ops) {
const SCEV *S = getSCEVExprForVPValue(Op, PSE, L);
if (isa<SCEVCouldNotCompute>(S))
return SE.getCouldNotCompute();
SCEVOps.push_back(S);
}
return CreateFn(SCEVOps);
};
VPValue *LHSVal, *RHSVal;
if (match(V, m_Add(m_VPValue(LHSVal), m_VPValue(RHSVal))))
return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef<const SCEV *> Ops) {
return SE.getAddExpr(Ops[0], Ops[1], SCEV::FlagAnyWrap, 0);
});
if (match(V, m_Sub(m_VPValue(LHSVal), m_VPValue(RHSVal))))
return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef<const SCEV *> Ops) {
return SE.getMinusSCEV(Ops[0], Ops[1], SCEV::FlagAnyWrap, 0);
});
if (match(V, m_Not(m_VPValue(LHSVal)))) {
// not X = xor X, -1 = -1 - X
return CreateSCEV({LHSVal}, [&](ArrayRef<const SCEV *> Ops) {
return SE.getMinusSCEV(SE.getMinusOne(Ops[0]->getType()), Ops[0]);
});
}
if (match(V, m_Mul(m_VPValue(LHSVal), m_VPValue(RHSVal))))
return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef<const SCEV *> Ops) {
return SE.getMulExpr(Ops[0], Ops[1], SCEV::FlagAnyWrap, 0);
});
if (match(V,
m_Binary<Instruction::UDiv>(m_VPValue(LHSVal), m_VPValue(RHSVal))))
return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef<const SCEV *> Ops) {
return SE.getUDivExpr(Ops[0], Ops[1]);
});
// Handle AND with constant mask: x & (2^n - 1) can be represented as x % 2^n.
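// E.g., with Mask = 7 (Mask + 1 == 8, a power of 2), x & 7 becomes
// (x urem 8).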
const APInt *Mask;
if (match(V, m_c_BinaryAnd(m_VPValue(LHSVal), m_APInt(Mask))) &&
(*Mask + 1).isPowerOf2())
return CreateSCEV({LHSVal}, [&](ArrayRef<const SCEV *> Ops) {
return SE.getURemExpr(Ops[0], SE.getConstant(*Mask + 1));
});
if (match(V, m_Trunc(m_VPValue(LHSVal)))) {
const VPlan *Plan = V->getDefiningRecipe()->getParent()->getPlan();
Type *DestTy = VPTypeAnalysis(*Plan).inferScalarType(V);
return CreateSCEV({LHSVal}, [&](ArrayRef<const SCEV *> Ops) {
return SE.getTruncateExpr(Ops[0], DestTy);
});
}
if (match(V, m_ZExt(m_VPValue(LHSVal)))) {
const VPlan *Plan = V->getDefiningRecipe()->getParent()->getPlan();
Type *DestTy = VPTypeAnalysis(*Plan).inferScalarType(V);
return CreateSCEV({LHSVal}, [&](ArrayRef<const SCEV *> Ops) {
return SE.getZeroExtendExpr(Ops[0], DestTy);
});
}
if (match(V, m_SExt(m_VPValue(LHSVal)))) {
const VPlan *Plan = V->getDefiningRecipe()->getParent()->getPlan();
Type *DestTy = VPTypeAnalysis(*Plan).inferScalarType(V);
// Mirror SCEV's createSCEV handling for sext(sub nsw): push sign extension
// onto the operands before computing the subtraction.
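// E.g., (sext i32 (sub nsw %a, %b) to i64) becomes
// ((sext %a) - (sext %b))<nsw> in i64, provided poison from the sub is
// guaranteed to trigger UB.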
VPValue *SubLHS, *SubRHS;
auto *SubR = dyn_cast<VPRecipeWithIRFlags>(LHSVal);
if (match(LHSVal, m_Sub(m_VPValue(SubLHS), m_VPValue(SubRHS))) && SubR &&
SubR->hasNoSignedWrap() && poisonGuaranteesUB(LHSVal)) {
const SCEV *V1 = getSCEVExprForVPValue(SubLHS, PSE, L);
const SCEV *V2 = getSCEVExprForVPValue(SubRHS, PSE, L);
if (!isa<SCEVCouldNotCompute>(V1) && !isa<SCEVCouldNotCompute>(V2))
return SE.getMinusSCEV(SE.getSignExtendExpr(V1, DestTy),
SE.getSignExtendExpr(V2, DestTy), SCEV::FlagNSW);
}
return CreateSCEV({LHSVal}, [&](ArrayRef<const SCEV *> Ops) {
return SE.getSignExtendExpr(Ops[0], DestTy);
});
}
if (match(V,
m_Intrinsic<Intrinsic::umax>(m_VPValue(LHSVal), m_VPValue(RHSVal))))
return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef<const SCEV *> Ops) {
return SE.getUMaxExpr(Ops[0], Ops[1]);
});
if (match(V,
m_Intrinsic<Intrinsic::smax>(m_VPValue(LHSVal), m_VPValue(RHSVal))))
return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef<const SCEV *> Ops) {
return SE.getSMaxExpr(Ops[0], Ops[1]);
});
if (match(V,
m_Intrinsic<Intrinsic::umin>(m_VPValue(LHSVal), m_VPValue(RHSVal))))
return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef<const SCEV *> Ops) {
return SE.getUMinExpr(Ops[0], Ops[1]);
});
if (match(V,
m_Intrinsic<Intrinsic::smin>(m_VPValue(LHSVal), m_VPValue(RHSVal))))
return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef<const SCEV *> Ops) {
return SE.getSMinExpr(Ops[0], Ops[1]);
});
ArrayRef<VPValue *> Ops;
Type *SourceElementType;
if (match(V, m_GetElementPtr(SourceElementType, Ops))) {
const SCEV *GEPExpr = CreateSCEV(Ops, [&](ArrayRef<const SCEV *> Ops) {
return SE.getGEPExpr(Ops.front(), Ops.drop_front(), SourceElementType);
});
return PSE.getPredicatedSCEV(GEPExpr);
}
// TODO: Support constructing SCEVs for more recipes as needed.
const VPRecipeBase *DefR = V->getDefiningRecipe();
const SCEV *Expr =
TypeSwitch<const VPRecipeBase *, const SCEV *>(DefR)
.Case([](const VPExpandSCEVRecipe *R) { return R->getSCEV(); })
.Case([&SE, &PSE, L](const VPCanonicalIVPHIRecipe *R) {
if (!L)
return SE.getCouldNotCompute();
const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), PSE, L);
return SE.getAddRecExpr(Start, SE.getOne(Start->getType()), L,
SCEV::FlagAnyWrap);
})
.Case([&SE, &PSE, L](const VPWidenIntOrFpInductionRecipe *R) {
const SCEV *Step = getSCEVExprForVPValue(R->getStepValue(), PSE, L);
if (!L || isa<SCEVCouldNotCompute>(Step))
return SE.getCouldNotCompute();
const SCEV *Start =
getSCEVExprForVPValue(R->getStartValue(), PSE, L);
const SCEV *AddRec =
SE.getAddRecExpr(Start, Step, L, SCEV::FlagAnyWrap);
if (R->getTruncInst())
return SE.getTruncateExpr(AddRec, R->getScalarType());
return AddRec;
})
.Case([&SE, &PSE, L](const VPWidenPointerInductionRecipe *R) {
const SCEV *Start =
getSCEVExprForVPValue(R->getStartValue(), PSE, L);
if (!L || isa<SCEVCouldNotCompute>(Start))
return SE.getCouldNotCompute();
const SCEV *Step = getSCEVExprForVPValue(R->getStepValue(), PSE, L);
if (isa<SCEVCouldNotCompute>(Step))
return SE.getCouldNotCompute();
return SE.getAddRecExpr(Start, Step, L, SCEV::FlagAnyWrap);
})
.Case([&SE, &PSE, L](const VPDerivedIVRecipe *R) {
const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), PSE, L);
const SCEV *IV = getSCEVExprForVPValue(R->getOperand(1), PSE, L);
const SCEV *Scale = getSCEVExprForVPValue(R->getOperand(2), PSE, L);
if (any_of(ArrayRef({Start, IV, Scale}),
IsaPred<SCEVCouldNotCompute>))
return SE.getCouldNotCompute();
return SE.getAddExpr(
SE.getTruncateOrSignExtend(Start, IV->getType()),
SE.getMulExpr(
IV, SE.getTruncateOrSignExtend(Scale, IV->getType())));
})
.Case([&SE, &PSE, L](const VPScalarIVStepsRecipe *R) {
const SCEV *IV = getSCEVExprForVPValue(R->getOperand(0), PSE, L);
const SCEV *Step = getSCEVExprForVPValue(R->getOperand(1), PSE, L);
if (isa<SCEVCouldNotCompute>(IV) || !isa<SCEVConstant>(Step))
return SE.getCouldNotCompute();
return SE.getTruncateOrSignExtend(IV, Step->getType());
})
.Default(
[&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); });
return PSE.getPredicatedSCEV(Expr);
}
bool vputils::isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE,
const Loop *L) {
// If the address is a SCEVAddExpr, require that all of its operands are
// either loop-invariant or a (possibly sign-extended) affine AddRec.
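// E.g., (%invariant.base + {0,+,4}<%loop>) qualifies, while an add with a
// non-affine operand such as {0,+,%n,+,1}<%loop> does not.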
if (auto *PtrAdd = dyn_cast<SCEVAddExpr>(Addr)) {
return all_of(PtrAdd->operands(), [&SE, L](const SCEV *Op) {
return SE.isLoopInvariant(Op, L) ||
match(Op, m_scev_SExt(m_scev_AffineAddRec(m_SCEV(), m_SCEV()))) ||
match(Op, m_scev_AffineAddRec(m_SCEV(), m_SCEV()));
});
}
// Otherwise, check if address is loop invariant or an affine add recurrence.
return SE.isLoopInvariant(Addr, L) ||
match(Addr, m_scev_AffineAddRec(m_SCEV(), m_SCEV()));
}
/// Returns true if \p Opcode preserves uniformity, i.e., if all operands are
/// uniform, the result will also be uniform.
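/// E.g., if both operands of an add are uniform, every lane computes the same
/// sum, so the add itself is uniform.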
static bool preservesUniformity(unsigned Opcode) {
if (Instruction::isBinaryOp(Opcode) || Instruction::isCast(Opcode))
return true;
switch (Opcode) {
case Instruction::Freeze:
case Instruction::GetElementPtr:
case Instruction::ICmp:
case Instruction::FCmp:
case Instruction::Select:
case VPInstruction::Not:
case VPInstruction::Broadcast:
case VPInstruction::PtrAdd:
return true;
default:
return false;
}
}
bool vputils::isSingleScalar(const VPValue *VPV) {
// A live-in must be uniform across the scope of the VPlan.
if (isa<VPIRValue, VPSymbolicValue>(VPV))
return true;
if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV)) {
const VPRegionBlock *RegionOfR = Rep->getRegion();
// Don't consider recipes in replicate regions as uniform yet; their first
// lane cannot be accessed when executing the replicate region for other
// lanes.
if (RegionOfR && RegionOfR->isReplicator())
return false;
return Rep->isSingleScalar() || (preservesUniformity(Rep->getOpcode()) &&
all_of(Rep->operands(), isSingleScalar));
}
if (isa<VPWidenGEPRecipe, VPDerivedIVRecipe, VPBlendRecipe>(VPV))
return all_of(VPV->getDefiningRecipe()->operands(), isSingleScalar);
if (auto *WidenR = dyn_cast<VPWidenRecipe>(VPV)) {
return preservesUniformity(WidenR->getOpcode()) &&
all_of(WidenR->operands(), isSingleScalar);
}
if (auto *VPI = dyn_cast<VPInstruction>(VPV))
return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
(preservesUniformity(VPI->getOpcode()) &&
all_of(VPI->operands(), isSingleScalar));
if (auto *RR = dyn_cast<VPReductionRecipe>(VPV))
return !RR->isPartialReduction();
if (isa<VPCanonicalIVPHIRecipe, VPVectorPointerRecipe,
VPVectorEndPointerRecipe>(VPV))
return true;
if (auto *Expr = dyn_cast<VPExpressionRecipe>(VPV))
return Expr->isSingleScalar();
// VPExpandSCEVRecipes must be placed in the entry and are always uniform.
return isa<VPExpandSCEVRecipe>(VPV);
}
bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
// Live-ins are uniform.
if (isa<VPIRValue, VPSymbolicValue>(V))
return true;
VPRecipeBase *R = V->getDefiningRecipe();
if (R && V->isDefinedOutsideLoopRegions()) {
if (match(R, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()))
return false;
return all_of(R->operands(), isUniformAcrossVFsAndUFs);
}
auto *CanonicalIV =
R->getParent()->getEnclosingLoopRegion()->getCanonicalIV();
// Canonical IV chain is uniform.
if (V == CanonicalIV || V == CanonicalIV->getBackedgeValue())
return true;
return TypeSwitch<const VPRecipeBase *, bool>(R)
.Case([](const VPDerivedIVRecipe *R) { return true; })
.Case([](const VPReplicateRecipe *R) {
// Be conservative about side-effects, except for the
// known-side-effecting assumes and stores, which we know will be
// uniform.
return R->isSingleScalar() &&
(!R->mayHaveSideEffects() ||
isa<AssumeInst, StoreInst>(R->getUnderlyingInstr())) &&
all_of(R->operands(), isUniformAcrossVFsAndUFs);
})
.Case([](const VPWidenRecipe *R) {
return preservesUniformity(R->getOpcode()) &&
all_of(R->operands(), isUniformAcrossVFsAndUFs);
})
.Case([](const VPInstruction *VPI) {
return (VPI->isScalarCast() &&
isUniformAcrossVFsAndUFs(VPI->getOperand(0))) ||
(preservesUniformity(VPI->getOpcode()) &&
all_of(VPI->operands(), isUniformAcrossVFsAndUFs));
})
.Case([](const VPWidenCastRecipe *R) {
// A cast is uniform if its operand is uniform.
return isUniformAcrossVFsAndUFs(R->getOperand(0));
})
.Default([](const VPRecipeBase *) { // A value is considered non-uniform
// unless proven otherwise.
return false;
});
}
VPBasicBlock *vputils::getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT) {
auto DepthFirst = vp_depth_first_shallow(Plan.getEntry());
auto I = find_if(DepthFirst, [&VPDT](VPBlockBase *VPB) {
return VPBlockUtils::isHeader(VPB, VPDT);
});
return I == DepthFirst.end() ? nullptr : cast<VPBasicBlock>(*I);
}
unsigned vputils::getVFScaleFactor(VPRecipeBase *R) {
if (!R)
return 1;
if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
return RR->getVFScaleFactor();
if (auto *RR = dyn_cast<VPReductionRecipe>(R))
return RR->getVFScaleFactor();
if (auto *ER = dyn_cast<VPExpressionRecipe>(R))
return ER->getVFScaleFactor();
assert(
(!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
VPInstruction::ReductionStartVector) &&
"getting scaling factor of reduction-start-vector not implemented yet");
return 1;
}
std::optional<VPValue *>
vputils::getRecipesForUncountableExit(VPlan &Plan,
SmallVectorImpl<VPRecipeBase *> &Recipes,
SmallVectorImpl<VPRecipeBase *> &GEPs) {
// Given a VPlan like the following (showing only the recipes that contribute
// to the loop-control exit, not the actual work), we want to match the
// recipes feeding the uncountable exit condition comparison (here, vp<%4>)
// back to either live-ins or the address nodes of the load used in that
// comparison, so that we can copy them to a preheader and rotate the address
// in the loop to the next vector iteration.
//
// Currently, the address of the load is restricted to a GEP with 2 operands
// and a live-in base address. This constraint may be relaxed later.
//
// VPlan ' for UF>=1' {
// Live-in vp<%0> = VF
// Live-in ir<64> = original trip-count
//
// entry:
// Successor(s): preheader, vector.ph
//
// vector.ph:
// Successor(s): vector loop
//
// <x1> vector loop: {
// vector.body:
// EMIT vp<%2> = CANONICAL-INDUCTION ir<0>
// vp<%3> = SCALAR-STEPS vp<%2>, ir<1>, vp<%0>
// CLONE ir<%ee.addr> = getelementptr ir<0>, vp<%3>
// WIDEN ir<%ee.load> = load ir<%ee.addr>
// WIDEN vp<%4> = icmp eq ir<%ee.load>, ir<0>
// EMIT vp<%5> = any-of vp<%4>
// EMIT vp<%6> = add vp<%2>, vp<%0>
// EMIT vp<%7> = icmp eq vp<%6>, ir<64>
// EMIT branch-on-two-conds vp<%5>, vp<%7>
// No successors
// }
// Successor(s): early.exit, middle.block
//
// middle.block:
// Successor(s): preheader
//
// preheader:
// No successors
// }
// Find the uncountable loop exit condition.
auto *Region = Plan.getVectorLoopRegion();
VPValue *UncountableCondition = nullptr;
if (!match(Region->getExitingBasicBlock()->getTerminator(),
m_BranchOnTwoConds(m_AnyOf(m_VPValue(UncountableCondition)),
m_VPValue())))
return std::nullopt;
SmallVector<VPValue *, 4> Worklist;
Worklist.push_back(UncountableCondition);
while (!Worklist.empty()) {
VPValue *V = Worklist.pop_back_val();
// Any value defined outside the loop does not need to be copied.
if (V->isDefinedOutsideLoopRegions())
continue;
// FIXME: Remove the single user restriction; it's here because we're
// starting with the simplest set of loops we can, and multiple
// users means needing to add PHI nodes in the transform.
if (V->getNumUsers() > 1)
return std::nullopt;
VPValue *Op1, *Op2;
// Walk back through recipes until we find at least one load from memory.
if (match(V, m_ICmp(m_VPValue(Op1), m_VPValue(Op2)))) {
Worklist.push_back(Op1);
Worklist.push_back(Op2);
Recipes.push_back(V->getDefiningRecipe());
} else if (auto *Load = dyn_cast<VPWidenLoadRecipe>(V)) {
// Reject masked loads for the time being; they make the exit condition
// more complex.
if (Load->isMasked())
return std::nullopt;
VPValue *GEP = Load->getAddr();
if (!match(GEP, m_GetElementPtr(m_LiveIn(), m_VPValue())))
return std::nullopt;
Recipes.push_back(Load);
Recipes.push_back(GEP->getDefiningRecipe());
GEPs.push_back(GEP->getDefiningRecipe());
} else
return std::nullopt;
}
return UncountableCondition;
}
VPSingleDefRecipe *vputils::findHeaderMask(VPlan &Plan) {
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
SmallVector<VPValue *> WideCanonicalIVs;
auto *FoundWidenCanonicalIVUser = find_if(
LoopRegion->getCanonicalIV()->users(), IsaPred<VPWidenCanonicalIVRecipe>);
assert(count_if(LoopRegion->getCanonicalIV()->users(),
IsaPred<VPWidenCanonicalIVRecipe>) <= 1 &&
"Must have at most one VPWideCanonicalIVRecipe");
if (FoundWidenCanonicalIVUser !=
LoopRegion->getCanonicalIV()->users().end()) {
auto *WideCanonicalIV =
cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
WideCanonicalIVs.push_back(WideCanonicalIV);
}
// Also include VPWidenIntOrFpInductionRecipes that represent a widened
// version of the canonical induction.
VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
if (WidenOriginalIV && WidenOriginalIV->isCanonical())
WideCanonicalIVs.push_back(WidenOriginalIV);
}
// Walk users of wide canonical IVs and find the single compare of the form
// (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
VPSingleDefRecipe *HeaderMask = nullptr;
for (auto *Wide : WideCanonicalIVs) {
for (VPUser *U : Wide->users()) {
auto *VPI = dyn_cast<VPInstruction>(U);
if (!VPI || !vputils::isHeaderMask(VPI, Plan))
continue;
assert(VPI->getOperand(0) == Wide &&
"WidenCanonicalIV must be the first operand of the compare");
assert(!HeaderMask && "Multiple header masks found?");
HeaderMask = VPI;
}
}
return HeaderMask;
}
bool VPBlockUtils::isHeader(const VPBlockBase *VPB,
const VPDominatorTree &VPDT) {
auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
if (!VPBB)
return false;
// If VPBB is in a region R, VPBB is a loop header if R is a loop region with
// VPBB as its entry, i.e., VPBB has no predecessors.
if (auto *R = VPBB->getParent())
return !R->isReplicator() && !VPBB->hasPredecessors();
// A header dominates its second predecessor (the latch), with the other
// predecessor being the preheader.
return VPB->getPredecessors().size() == 2 &&
VPDT.dominates(VPB, VPB->getPredecessors()[1]);
}
bool VPBlockUtils::isLatch(const VPBlockBase *VPB,
const VPDominatorTree &VPDT) {
// A latch has a header as its second successor, with its other successor
// leaving the loop. A preheader, on the other hand, has a header as its
// first (and only) successor.
return VPB->getNumSuccessors() == 2 &&
VPBlockUtils::isHeader(VPB->getSuccessors()[1], VPDT);
}
std::optional<MemoryLocation>
vputils::getMemoryLocation(const VPRecipeBase &R) {
auto *M = dyn_cast<VPIRMetadata>(&R);
if (!M)
return std::nullopt;
MemoryLocation Loc;
// Populate noalias metadata from VPIRMetadata.
if (MDNode *NoAliasMD = M->getMetadata(LLVMContext::MD_noalias))
Loc.AATags.NoAlias = NoAliasMD;
if (MDNode *AliasScopeMD = M->getMetadata(LLVMContext::MD_alias_scope))
Loc.AATags.Scope = AliasScopeMD;
return Loc;
}
/// Find the ComputeReductionResult recipe for \p PhiR, looking through selects
/// inserted for predicated reductions or tail folding.
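/// E.g., with tail folding the update is used via
///   vp<%sel> = select vp<%header-mask>, vp<%red.next>, vp<%red.phi>
/// and compute-reduction-result is a user of vp<%sel> instead of
/// vp<%red.next>.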
VPInstruction *vputils::findComputeReductionResult(VPReductionPHIRecipe *PhiR) {
VPValue *BackedgeVal = PhiR->getBackedgeValue();
if (auto *Res = vputils::findUserOf<VPInstruction::ComputeReductionResult>(
BackedgeVal))
return Res;
// Look through selects inserted for tail folding or predicated reductions.
VPRecipeBase *SelR = vputils::findUserOf(
BackedgeVal, m_Select(m_VPValue(), m_VPValue(), m_VPValue()));
if (!SelR)
return nullptr;
return vputils::findUserOf<VPInstruction::ComputeReductionResult>(
cast<VPSingleDefRecipe>(SelR));
}