//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "LoopVectorizationPlanner.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <cassert>

using namespace llvm;
using namespace llvm::VPlanPatternMatch;

using VectorParts = SmallVector<Value *, 2>;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPRecipeID()) {
  case VPExpressionSC:
    return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
  case VPInstructionSC: {
    auto *VPI = cast<VPInstruction>(this);
    // Loads read from memory but don't write to memory.
    if (VPI->getOpcode() == Instruction::Load)
      return false;
    return VPI->opcodeMayReadOrWriteFromMemory();
  }
  case VPInterleaveEVLSC:
  case VPInterleaveSC:
    return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
  case VPActiveLaneMaskPHISC:
  case VPCurrentIterationPHISC:
  case VPBranchOnMaskSC:
  case VPDerivedIVSC:
  case VPFirstOrderRecurrencePHISC:
  case VPReductionPHISC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPRecipeID()) {
  case VPExpressionSC:
    return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
  case VPInstructionSC:
    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPDerivedIVSC:
  case VPCurrentIterationPHISC:
  case VPFirstOrderRecurrencePHISC:
  case VPReductionPHISC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    // FIXME: Return false if the recipe represents an interleaved store.
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPRecipeID()) {
  case VPExpressionSC:
    return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
  case VPActiveLaneMaskPHISC:
  case VPDerivedIVSC:
  case VPCurrentIterationPHISC:
  case VPFirstOrderRecurrencePHISC:
  case VPReductionPHISC:
  case VPPredInstPHISC:
  case VPVectorEndPointerSC:
    return false;
  case VPInstructionSC: {
    auto *VPI = cast<VPInstruction>(this);
    return mayWriteToMemory() ||
           VPI->getOpcode() == VPInstruction::BranchOnCount ||
           VPI->getOpcode() == VPInstruction::BranchOnCond ||
           VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
  }
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveEVLSC:
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  // Get the underlying instruction for the recipe, if there is one. It is
  // used to
  //   * decide if cost computation should be skipped for this recipe,
  //   * apply forced target instruction cost.
  Instruction *UI = nullptr;
  if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
    UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
    UI = IG->getInsertPos();
  else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
    UI = &WidenMem->getIngredient();

  InstructionCost RecipeCost;
  if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
    RecipeCost = 0;
  } else {
    RecipeCost = computeCost(VF, Ctx);
    if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
        RecipeCost.isValid()) {
      if (UI)
        RecipeCost = InstructionCost(ForceTargetInstructionCost);
      else
        RecipeCost = InstructionCost(0);
    }
  }

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}

InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  llvm_unreachable("subclasses should implement computeCost");
}

bool VPRecipeBase::isPhi() const {
  return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) ||
         isa<VPPhi, VPIRPhi>(this);
}

bool VPRecipeBase::isScalarCast() const {
  auto *VPI = dyn_cast<VPInstruction>(this);
  return VPI && Instruction::isCast(VPI->getOpcode());
}

void VPIRFlags::intersectFlags(const VPIRFlags &Other) {
  assert(OpType == Other.OpType && "OpType must match");
  switch (OpType) {
  case OperationType::OverflowingBinOp:
    WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
    WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
    break;
  case OperationType::Trunc:
    TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
    TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
    break;
  case OperationType::DisjointOp:
    DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
    break;
  case OperationType::PossiblyExactOp:
    ExactFlags.IsExact &= Other.ExactFlags.IsExact;
    break;
  case OperationType::GEPOp:
    GEPFlagsStorage &= Other.GEPFlagsStorage;
    break;
  case OperationType::FPMathOp:
  case OperationType::FCmp:
    assert((OpType != OperationType::FCmp ||
            FCmpFlags.CmpPredStorage == Other.FCmpFlags.CmpPredStorage) &&
           "Cannot drop CmpPredicate");
    getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
    getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
    break;
  case OperationType::NonNegOp:
    NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
    break;
  case OperationType::Cmp:
    assert(CmpPredStorage == Other.CmpPredStorage &&
           "Cannot drop CmpPredicate");
    break;
  case OperationType::ReductionOp:
    assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
           "Cannot change RecurKind");
    assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
           "Cannot change IsOrdered");
    assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
           "Cannot change IsInLoop");
    getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
    getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
    break;
  case OperationType::Other:
    break;
  }
}

FastMathFlags VPIRFlags::getFastMathFlags() const {
  assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp ||
          OpType == OperationType::ReductionOp ||
          OpType == OperationType::Other) &&
         "recipe doesn't have fast math flags");
  if (OpType == OperationType::Other)
    return FastMathFlags();
  const FastMathFlagsTy &F = getFMFsRef();
  FastMathFlags Res;
  Res.setAllowReassoc(F.AllowReassoc);
  Res.setNoNaNs(F.NoNaNs);
  Res.setNoInfs(F.NoInfs);
  Res.setNoSignedZeros(F.NoSignedZeros);
  Res.setAllowReciprocal(F.AllowReciprocal);
  Res.setAllowContract(F.AllowContract);
  Res.setApproxFunc(F.ApproxFunc);
  return Res;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPSingleDefRecipe::dump() const { VPRecipeBase::dump(); }

void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
                         VPSlotTracker &SlotTracker) const {
  printRecipe(O, Indent, SlotTracker);
  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }

  if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
    Metadata->print(O, SlotTracker);
}
#endif

template <unsigned PartOpIdx>
VPValue *
VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(const VPUser &U) const {
  if (U.getNumOperands() == PartOpIdx + 1)
    return U.getOperand(PartOpIdx);
  return nullptr;
}

template <unsigned PartOpIdx>
unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(const VPUser &U) const {
  if (auto *UnrollPartOp = getUnrollPartOperand(U))
    return cast<VPConstantInt>(UnrollPartOp)->getZExtValue();
  return 0;
}

namespace llvm {
template class VPUnrollPartAccessor<1>;
template class VPUnrollPartAccessor<2>;
template class VPUnrollPartAccessor<3>;
}

VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                             const VPIRFlags &Flags, const VPIRMetadata &MD,
                             DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPRecipeBase::VPInstructionSC, Operands, Flags, DL),
      VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  assert(hasRequiredFlagsForOpcode(getOpcode()) &&
         "Opcode requires specific flags to be set");
  assert((getNumOperandsForOpcode() == -1u ||
          getNumOperandsForOpcode() == getNumOperands() ||
          (isMasked() && getNumOperandsForOpcode() + 1 == getNumOperands())) &&
         "number of operands does not match opcode");
}

/// For call VPInstructions, return the operand index of the called function.
/// The function is either the last operand (for unmasked calls) or the
/// second-to-last operand (for masked calls).
static unsigned getCalledFnOperandIndex(const VPInstruction &VPI) {
  assert(VPI.getOpcode() == Instruction::Call && "must be a call");
  unsigned NumOps = VPI.getNumOperands();
  auto *LastOp = dyn_cast<VPIRValue>(VPI.getOperand(NumOps - 1));
  if (LastOp && isa<Function>(LastOp->getValue()))
    return NumOps - 1;
  assert(
      isa<Function>(cast<VPIRValue>(VPI.getOperand(NumOps - 2))->getValue()) &&
      "expected function operand");
  return NumOps - 2;
}

/// For call VPInstructions, return the called function.
static Function *getCalledFunction(const VPInstruction &VPI) {
  unsigned Idx = getCalledFnOperandIndex(VPI);
  return cast<Function>(cast<VPIRValue>(VPI.getOperand(Idx))->getValue());
}

unsigned VPInstruction::getNumOperandsForOpcode() const {
  if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
    return 1;

  if (Instruction::isBinaryOp(Opcode))
    return 2;

  switch (Opcode) {
  case VPInstruction::StepVector:
  case VPInstruction::VScale:
    return 0;
  case Instruction::Alloca:
  case Instruction::ExtractValue:
  case Instruction::Freeze:
  case Instruction::Load:
  case VPInstruction::BranchOnCond:
  case VPInstruction::Broadcast:
  case VPInstruction::ExitingIVValue:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::ExtractLastLane:
  case VPInstruction::ExtractLastPart:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::MaskedCond:
  case VPInstruction::Not:
  case VPInstruction::ResumeForEpilogue:
  case VPInstruction::Reverse:
  case VPInstruction::Unpack:
    return 1;
  case Instruction::ICmp:
  case Instruction::FCmp:
  case Instruction::ExtractElement:
  case Instruction::Store:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::LogicalOr:
  case VPInstruction::PtrAdd:
  case VPInstruction::WidePtrAdd:
  case VPInstruction::WideIVStep:
  case VPInstruction::CalculateTripCountMinusVF:
    return 2;
  case Instruction::Select:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ReductionStartVector:
    return 3;
  case Instruction::Call:
    return getCalledFnOperandIndex(*this) + 1;
  case Instruction::GetElementPtr:
  case Instruction::PHI:
  case Instruction::Switch:
  case VPInstruction::AnyOf:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ComputeReductionResult:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::LastActiveLane:
  case VPInstruction::ExtractLane:
  case VPInstruction::ExtractLastActive:
    // Cannot determine the number of operands from the opcode.
    return -1u;
  }
  llvm_unreachable("all cases should be handled above");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::Freeze:
  case Instruction::ICmp:
  case Instruction::PHI:
  case Instruction::Select:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::AnyOf:
  case VPInstruction::Not:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generate(VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      applyFlags(*I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ExtractElement: {
    assert(State.VF.isVector() && "Only extract elements from vectors");
    if (auto *Idx = dyn_cast<VPConstantInt>(getOperand(1)))
      return State.get(getOperand(0), VPLane(Idx->getZExtValue()));
    Value *Vec = State.get(getOperand(0));
    Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
    return Builder.CreateExtractElement(Vec, Idx, Name);
  }
  case Instruction::Freeze: {
    Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this));
    return Builder.CreateFreeze(Op, Name);
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::PHI: {
    llvm_unreachable("should be handled by VPPhi::execute");
  }
  case Instruction::Select: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *Cond =
        State.get(getOperand(0),
                  OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0)));
    Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
    Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
    return Builder.CreateSelectFMF(Cond, Op1, Op2, getFastMathFlags(), Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPLane(0));

    // If this part of the active lane mask is scalar, generate the CMP
    // directly to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

    ElementCount EC = State.VF.multiplyCoefficientBy(
        cast<VPConstantInt>(getOperand(2))->getZExtValue());
    auto *PredTy = VectorType::get(Builder.getInt1Ty(), EC);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    auto *V1 = State.get(getOperand(0));
    if (!V1->getType()->isVectorTy())
      return V1;
    Value *V2 = State.get(getOperand(1));
    return Builder.CreateVectorSpliceRight(V1, V2, 1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    Value *ScalarTC = State.get(getOperand(0), VPLane(0));
    Value *VFxUF = State.get(getOperand(1), VPLane(0));
    Value *Sub = Builder.CreateSub(ScalarTC, VFxUF);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, VFxUF);
    Value *Zero = ConstantInt::getNullValue(ScalarTC->getType());
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
    // Compute EVL
    assert(AVL->getType()->isIntegerTy() &&
           "Requested vector length should be an integer.");

    assert(State.VF.isScalable() && "Expected scalable vector factor.");
    Value *VFArg = Builder.getInt32(State.VF.getKnownMinValue());

    Value *EVL = Builder.CreateIntrinsic(
        Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
        {AVL, VFArg, Builder.getTrue()});
    return EVL;
  }
  case VPInstruction::BranchOnCond: {
    Value *Cond = State.get(getOperand(0), VPLane(0));
    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for latch blocks now, and
    // to forward destination(s) later when they are created.
    // Second successor may be backwards - iff it is already in VPBB2IRBB.
    VPBasicBlock *SecondVPSucc =
        cast<VPBasicBlock>(getParent()->getSuccessors()[1]);
    BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
    BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
    auto *Br = Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
    // First successor is always forward, reset it to nullptr.
    Br->setSuccessor(0, nullptr);
    IRBB->getTerminator()->eraseFromParent();
    applyMetadata(*Br);
    return Br;
  }
  case VPInstruction::Broadcast: {
    return Builder.CreateVectorSplat(
        State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
  }
  case VPInstruction::BuildStructVector: {
    // For struct types, we need to build a new 'wide' struct type, where each
    // element is widened, i.e., we create a struct of vectors.
    auto *StructTy =
        cast<StructType>(State.TypeAnalysis.inferScalarType(getOperand(0)));
    Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
    for (const auto &[LaneIndex, Op] : enumerate(operands())) {
      for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
           FieldIndex++) {
        Value *ScalarValue =
            Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
        Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
        VectorValue =
            Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
        Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
      }
    }
    return Res;
  }
  case VPInstruction::BuildVector: {
    auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
    auto NumOfElements = ElementCount::getFixed(getNumOperands());
    Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
    for (const auto &[Idx, Op] : enumerate(operands()))
      Res = Builder.CreateInsertElement(Res, State.get(Op, true),
                                        Builder.getInt32(Idx));
    return Res;
  }
  case VPInstruction::ReductionStartVector: {
    if (State.VF.isScalar())
      return State.get(getOperand(0), true);
    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
    Builder.setFastMathFlags(getFastMathFlags());
    // If this start vector is scaled then it should produce a vector with
    // fewer elements than the VF.
    ElementCount VF = State.VF.divideCoefficientBy(
        cast<VPConstantInt>(getOperand(2))->getZExtValue());
    auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
    return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
                                       Builder.getInt32(0));
  }
  case VPInstruction::ComputeReductionResult: {
    RecurKind RK = getRecurKind();
    bool IsOrdered = isReductionOrdered();
    bool IsInLoop = isReductionInLoop();
    assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
           "FindIV should use min/max reduction kinds");

    // The recipe may have multiple operands to be reduced together.
    unsigned NumOperandsToReduce = getNumOperands();
    VectorParts RdxParts(NumOperandsToReduce);
    for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
      RdxParts[Part] = State.get(getOperand(Part), IsInLoop);

    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
    Builder.setFastMathFlags(getFastMathFlags());

    // Reduce multiple operands into one.
    Value *ReducedPartRdx = RdxParts[0];
    if (IsOrdered) {
      ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
    } else {
      // Floating-point operations should have some FMF to enable the
      // reduction.
      for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
        else {
          // For sub-recurrences, each part's reduction variable is already
          // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
          Instruction::BinaryOps Opcode =
              RK == RecurKind::Sub
                  ? Instruction::Add
                  : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK);
          ReducedPartRdx =
              Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx");
        }
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if (State.VF.isVector() && !IsInLoop) {
      // TODO: Support in-order reductions based on the recurrence descriptor.
      // All ops in the reduction inherit fast-math-flags from the recurrence
      // descriptor.
      ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractLastLane:
  case VPInstruction::ExtractPenultimateElement: {
    unsigned Offset =
        getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1;
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
    } else {
      // TODO: Remove ExtractLastLane for scalar VFs.
      assert(Offset <= 1 && "invalid offset to extract from");
      Res = State.get(getOperand(0));
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::LogicalOr: {
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    return Builder.CreateLogicalOr(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), VPLane(0));
    Value *Addend = State.get(getOperand(1), VPLane(0));
    return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
  }
  case VPInstruction::WidePtrAdd: {
    Value *Ptr =
        State.get(getOperand(0), vputils::isSingleScalar(getOperand(0)));
    Value *Addend = State.get(getOperand(1));
    return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
  }
  case VPInstruction::AnyOf: {
    Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
    for (VPValue *Op : drop_begin(operands()))
      Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
    return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
  }
  case VPInstruction::ExtractLane: {
    assert(getNumOperands() != 2 && "ExtractLane from single source should be "
                                    "simplified to ExtractElement.");
    Value *LaneToExtract = State.get(getOperand(0), true);
    Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0));
    Value *Res = nullptr;
    Value *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);

    for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
      Value *VectorStart =
          Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
      Value *VectorIdx = Idx == 1
                             ? LaneToExtract
                             : Builder.CreateSub(LaneToExtract, VectorStart);
      Value *Ext = State.VF.isScalar()
                       ? State.get(getOperand(Idx))
                       : Builder.CreateExtractElement(
                             State.get(getOperand(Idx)), VectorIdx);
      if (Res) {
        Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
        Res = Builder.CreateSelect(Cmp, Ext, Res);
      } else {
        Res = Ext;
      }
    }
    return Res;
  }
  case VPInstruction::FirstActiveLane: {
    Type *Ty = State.TypeAnalysis.inferScalarType(this);
    if (getNumOperands() == 1) {
      Value *Mask = State.get(getOperand(0));
      return Builder.CreateCountTrailingZeroElems(Ty, Mask,
                                                  /*ZeroIsPoison=*/false, Name);
    }
    // If there are multiple operands, create a chain of selects to pick the
    // first operand with an active lane and add the number of lanes of the
    // preceding operands.
    Value *RuntimeVF = getRuntimeVF(Builder, Ty, State.VF);
    unsigned LastOpIdx = getNumOperands() - 1;
    Value *Res = nullptr;
    for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
      Value *TrailingZeros =
          State.VF.isScalar()
              ? Builder.CreateZExt(
                    Builder.CreateICmpEQ(State.get(getOperand(Idx)),
                                         Builder.getFalse()),
                    Ty)
              : Builder.CreateCountTrailingZeroElems(
                    Ty, State.get(getOperand(Idx)),
                    /*ZeroIsPoison=*/false, Name);
      Value *Current = Builder.CreateAdd(
          Builder.CreateMul(RuntimeVF, ConstantInt::get(Ty, Idx)),
          TrailingZeros);
      if (Res) {
        Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
        Res = Builder.CreateSelect(Cmp, Current, Res);
      } else {
        Res = Current;
      }
    }

    return Res;
  }
  case VPInstruction::ResumeForEpilogue:
    return State.get(getOperand(0), true);
  case VPInstruction::Reverse:
    return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
  case VPInstruction::ExtractLastActive: {
    Value *Result = State.get(getOperand(0), /*IsScalar=*/true);
    for (unsigned Idx = 1; Idx < getNumOperands(); Idx += 2) {
      Value *Data = State.get(getOperand(Idx));
      Value *Mask = State.get(getOperand(Idx + 1));
      Type *VTy = Data->getType();

      if (State.VF.isScalar())
        Result = Builder.CreateSelect(Mask, Data, Result);
      else
        Result = Builder.CreateIntrinsic(
            Intrinsic::experimental_vector_extract_last_active, {VTy},
            {Data, Mask, Result});
    }

    return Result;
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
    unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
  Type *ScalarTy = Ctx.Types.inferScalarType(this);
  Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
  switch (Opcode) {
  case Instruction::FNeg:
    return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Certain instructions can be cheaper if they have a constant second
    // operand. One example of this are shifts on x86.
    VPValue *RHS = getOperand(1);
    TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS);

    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
        getOperand(1)->isDefinedOutsideLoopRegions())
      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;

    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
    SmallVector<const Value *, 4> Operands;
    if (CtxI)
      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, ResultTy, Ctx.CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        RHSInfo, Operands, CtxI, &Ctx.TLI);
  }
  case Instruction::Freeze:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy,
                                          Ctx.CostKind);
  case Instruction::ExtractValue:
    return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
                                             Ctx.CostKind);
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0));
    Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
    return Ctx.TTI.getCmpSelInstrCost(
        Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
        Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
        {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
  }
  case Instruction::BitCast: {
    Type *ScalarTy = Ctx.Types.inferScalarType(this);
    if (ScalarTy->isPointerTy())
      return 0;
    [[fallthrough]];
  }
  case Instruction::SExt:
  case Instruction::ZExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::PtrToAddr:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::AddrSpaceCast: {
    // Computes the CastContextHint from a recipe that may access memory.
    auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
      if (isa<VPInterleaveBase>(R))
        return TTI::CastContextHint::Interleave;
      if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) {
        // Only compute CCH for memory operations, matching the legacy model
        // which only considers loads/stores for cast context hints.
        auto *UI = cast<Instruction>(ReplicateRecipe->getUnderlyingValue());
        if (!isa<LoadInst, StoreInst>(UI))
          return TTI::CastContextHint::None;
        return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
                                               : TTI::CastContextHint::Normal;
      }
      const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
      if (WidenMemoryRecipe == nullptr)
        return TTI::CastContextHint::None;
      if (VF.isScalar())
        return TTI::CastContextHint::Normal;
      if (!WidenMemoryRecipe->isConsecutive())
        return TTI::CastContextHint::GatherScatter;
      if (WidenMemoryRecipe->isMasked())
        return TTI::CastContextHint::Masked;
      return TTI::CastContextHint::Normal;
    };

    VPValue *Operand = getOperand(0);
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    bool IsReverse = false;
    // For Trunc/FPTrunc, get the context from the only user.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
        if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
          return nullptr;
        return dyn_cast<VPRecipeBase>(*R->user_begin());
      };
      if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
        if (match(Recipe,
                  m_CombineOr(
                      m_Reverse(m_VPValue()),
                      m_Intrinsic<Intrinsic::experimental_vp_reverse>()))) {
          Recipe = GetOnlyUser(cast<VPSingleDefRecipe>(Recipe));
          IsReverse = true;
        }
        if (Recipe)
          CCH = ComputeCCH(Recipe);
      }
    }
    // For Z/Sext, get the context from the operand.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (auto *Recipe = Operand->getDefiningRecipe()) {
        VPValue *ReverseOp;
        if (match(Recipe,
                  m_CombineOr(m_Reverse(m_VPValue(ReverseOp)),
                              m_Intrinsic<Intrinsic::experimental_vp_reverse>(
                                  m_VPValue(ReverseOp))))) {
          Recipe = ReverseOp->getDefiningRecipe();
          IsReverse = true;
        }
        if (Recipe)
          CCH = ComputeCCH(Recipe);
      }
    }
    if (IsReverse && CCH != TTI::CastContextHint::None)
      CCH = TTI::CastContextHint::Reversed;

    auto *ScalarSrcTy = Ctx.Types.inferScalarType(Operand);
    Type *SrcTy = VF.isVector() ? toVectorTy(ScalarSrcTy, VF) : ScalarSrcTy;
    // Arm TTI will use the underlying instruction to determine the cost.
    return Ctx.TTI.getCastInstrCost(
        Opcode, ResultTy, SrcTy, CCH, Ctx.CostKind,
        dyn_cast_if_present<Instruction>(getUnderlyingValue()));
  }
  case Instruction::Select: {
    SelectInst *SI = cast_or_null<SelectInst>(getUnderlyingValue());
    bool IsScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
    Type *ScalarTy = Ctx.Types.inferScalarType(this);

    VPValue *Op0, *Op1;
    bool IsLogicalAnd =
        match(this, m_c_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1)));
    bool IsLogicalOr =
        match(this, m_c_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
    // Also match the inverted forms:
    //   select x, false, y --> !x & y (still AND)
    //   select x, y, true  --> !x | y (still OR)
    IsLogicalAnd |=
        match(this, m_Select(m_VPValue(Op0), m_False(), m_VPValue(Op1)));
    IsLogicalOr |=
        match(this, m_Select(m_VPValue(Op0), m_VPValue(Op1), m_True()));

    if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
        (IsLogicalAnd || IsLogicalOr)) {
      // select x, y, false --> x & y
      // select x, true, y  --> x | y
      const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
      const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);

      SmallVector<const Value *, 2> Operands;
      if (SI && all_of(operands(),
                       [](VPValue *Op) { return Op->getUnderlyingValue(); }))
        append_range(Operands, SI->operands());
      return Ctx.TTI.getArithmeticInstrCost(
          IsLogicalOr ? Instruction::Or : Instruction::And, ResultTy,
          Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
    }

    Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
    if (!IsScalarCond && VF.isVector())
      CondTy = VectorType::get(CondTy, VF);

    llvm::CmpPredicate Pred;
    if (!match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue())))
      if (auto *CondIRV = dyn_cast<VPIRValue>(getOperand(0)))
        if (auto *Cmp = dyn_cast<CmpInst>(CondIRV->getValue()))
          Pred = Cmp->getPredicate();
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getCmpSelInstrCost(
        Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
        {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
  }
  }
  llvm_unreachable("called for unsupported opcode");
}

InstructionCost VPInstruction::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  if (Instruction::isBinaryOp(getOpcode())) {
    if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
      // TODO: Compute cost for VPInstructions without underlying values once
      // the legacy cost model has been retired.
      return 0;
    }

    assert(!doesGeneratePerAllLanes() &&
           "Should only generate a vector value or single scalar, not scalars "
           "for all lanes.");
    return getCostForRecipeWithOpcode(
        getOpcode(),
        vputils::onlyFirstLaneUsed(this) ? ElementCount::getFixed(1) : VF, Ctx);
  }

  switch (getOpcode()) {
  case Instruction::Select: {
    llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
    match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue()));
    auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
    auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
    if (!vputils::onlyFirstLaneUsed(this)) {
      CondTy = toVectorTy(CondTy, VF);
      VecTy = toVectorTy(VecTy, VF);
    }
    return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
                                      Ctx.CostKind);
  }
  case Instruction::ExtractElement:
  case VPInstruction::ExtractLane: {
    if (VF.isScalar()) {
      // ExtractLane with VF=1 takes care of handling extraction across
      // multiple parts.
      return 0;
    }

    // Add on the cost of extracting the element.
    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                      Ctx.CostKind);
  }
  case VPInstruction::AnyOf: {
    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getArithmeticReductionCost(
        Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
  }
  case VPInstruction::FirstActiveLane: {
    Type *Ty = Ctx.Types.inferScalarType(this);
    Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
    if (VF.isScalar())
      return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
                                        CmpInst::makeCmpResultType(ScalarTy),
                                        CmpInst::ICMP_EQ, Ctx.CostKind);
    // Calculate the cost of determining the lane index.
    auto *PredTy = toVectorTy(ScalarTy, VF);
    IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
                                  {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  case VPInstruction::LastActiveLane: {
    Type *Ty = Ctx.Types.inferScalarType(this);
    Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
    if (VF.isScalar())
      return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
                                        CmpInst::makeCmpResultType(ScalarTy),
                                        CmpInst::ICMP_EQ, Ctx.CostKind);
    // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
    auto *PredTy = toVectorTy(ScalarTy, VF);
    IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
                                  {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
    InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
    // Add cost of NOT operation on the predicate.
    Cost += Ctx.TTI.getArithmeticInstrCost(
        Instruction::Xor, PredTy, Ctx.CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        {TargetTransformInfo::OK_UniformConstantValue,
         TargetTransformInfo::OP_None});
    // Add cost of SUB operation on the index.
    Cost += Ctx.TTI.getArithmeticInstrCost(Instruction::Sub, Ty, Ctx.CostKind);
    return Cost;
  }
  case VPInstruction::ExtractLastActive: {
    Type *ScalarTy = Ctx.Types.inferScalarType(this);
    Type *VecTy = toVectorTy(ScalarTy, VF);
    Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
    IntrinsicCostAttributes ICA(
        Intrinsic::experimental_vector_extract_last_active, ScalarTy,
        {VecTy, MaskTy, ScalarTy});
    return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getShuffleCost(
        TargetTransformInfo::SK_Splice, cast<VectorType>(VectorTy),
        cast<VectorType>(VectorTy), {}, Ctx.CostKind, -1);
  }
  case VPInstruction::ActiveLaneMask: {
    Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
    unsigned Multiplier = cast<VPConstantInt>(getOperand(2))->getZExtValue();
    Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
    IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
                                  {ArgTy, ArgTy});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  case VPInstruction::ExplicitVectorLength: {
    Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
    Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
    Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
    IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
                                  I32Ty, {Arg0Ty, I32Ty, I1Ty});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  case VPInstruction::Reverse: {
    assert(VF.isVector() && "Reverse operation must be vector type");
    Type *EltTy = Ctx.Types.inferScalarType(this);
    // Skip the reverse operation cost for the mask.
    // FIXME: Remove this once redundant mask reverse operations can be
    // eliminated by VPlanTransforms::cse before cost computation.
    if (EltTy->isIntegerTy(1))
      return 0;
    auto *VectorTy = cast<VectorType>(toVectorTy(EltTy, VF));
    return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
                                  VectorTy, /*Mask=*/{}, Ctx.CostKind,
                                  /*Index=*/0);
  }
  case VPInstruction::ExtractLastLane: {
    // Add on the cost of extracting the element.
    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
                                                    VecTy, Ctx.CostKind, 0);
  }
  case VPInstruction::ExtractPenultimateElement:
    if (VF == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();
    [[fallthrough]];
  default:
    // TODO: Compute cost for other VPInstructions once the legacy cost model
    // has been retired.
    assert(!getUnderlyingValue() &&
           "unexpected VPInstruction with underlying value");
    return 0;
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractLastLane ||
         getOpcode() == VPInstruction::ExtractPenultimateElement ||
         getOpcode() == Instruction::ExtractElement ||
         getOpcode() == VPInstruction::ExtractLane ||
         getOpcode() == VPInstruction::FirstActiveLane ||
         getOpcode() == VPInstruction::LastActiveLane ||
         getOpcode() == VPInstruction::ExtractLastActive ||
         getOpcode() == VPInstruction::ComputeReductionResult ||
         getOpcode() == VPInstruction::AnyOf;
}

bool VPInstruction::isSingleScalar() const {
  switch (getOpcode()) {
  case Instruction::Load:
  case Instruction::PHI:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::ResumeForEpilogue:
  case VPInstruction::VScale:
    return true;
  default:
    return isScalarCast();
  }
}

void VPInstruction::execute(VPTransformState &State) {
  assert(!isMasked() && "cannot execute masked VPInstruction");
  assert(!State.Lane && "VPInstruction executing a Lane");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  assert(hasRequiredFlagsForOpcode(getOpcode()) &&
         "Opcode requires specific flags to be set");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  Value *GeneratedValue = generate(State);
  if (!hasResult())
    return;
  assert(GeneratedValue && "generate must produce a value");
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  assert((((GeneratedValue->getType()->isVectorTy() ||
            GeneratedValue->getType()->isStructTy()) ==
           !GeneratesPerFirstLaneOnly) ||
          State.VF.isScalar()) &&
         "scalar value but not only first lane defined");
  State.set(this, GeneratedValue,
            /*IsScalar*/ GeneratesPerFirstLaneOnly);
  if (getOpcode() == VPInstruction::ResumeForEpilogue) {
    // FIXME: This is a workaround to enable reliable updates of the scalar
    // loop resume phis when vectorizing the epilogue. Must be removed once
    // epilogue vectorization explicitly connects VPlans.
    setUnderlyingValue(GeneratedValue);
  }
}

bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
  if (Instruction::isBinaryOp(getOpcode()) ||
      Instruction::isUnaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return false;
  switch (getOpcode()) {
  case Instruction::ExtractValue:
  case Instruction::InsertValue:
  case Instruction::GetElementPtr:
  case Instruction::ExtractElement:
  case Instruction::Freeze:
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::PHI:
  case VPInstruction::AnyOf:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::BranchOnCount:
  case VPInstruction::Broadcast:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ExtractLane:
  case VPInstruction::ExtractLastLane:
  case VPInstruction::ExtractLastPart:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExitingIVValue:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::LastActiveLane:
  case VPInstruction::ExtractLastActive:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::LogicalOr:
  case VPInstruction::MaskedCond:
  case VPInstruction::Not:
  case VPInstruction::PtrAdd:
  case VPInstruction::WideIVStep:
  case VPInstruction::WidePtrAdd:
  case VPInstruction::StepVector:
  case VPInstruction::ReductionStartVector:
  case VPInstruction::Reverse:
  case VPInstruction::VScale:
  case VPInstruction::Unpack:
    return false;
  case Instruction::Call:
    return !getCalledFunction(*this)->doesNotAccessMemory();
  default:
    return true;
  }
}

bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ExtractElement:
    return Op == getOperand(1);
  case Instruction::PHI:
    return true;
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::Or:
  case Instruction::Freeze:
  case VPInstruction::Not:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case Instruction::Load:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::Broadcast:
  case VPInstruction::ReductionStartVector:
    return true;
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
    // Before replicating by VF, Build(Struct)Vector uses all lanes of the
    // operand; after replicating its operands, only the first lane is used.
    // Before replicating, it will have only a single operand.
    return getNumOperands() > 1;
  case VPInstruction::PtrAdd:
    return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
  case VPInstruction::WidePtrAdd:
    // WidePtrAdd supports scalar and vector base addresses.
    return false;
  case VPInstruction::ExitingIVValue:
  case VPInstruction::ExtractLane:
    return Op == getOperand(0);
  }
  llvm_unreachable("switch should return");
}

bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  }
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  printRecipe(dbgs(), "", SlotTracker);
}

void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::BranchOnTwoConds:
    O << "branch-on-two-conds";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::Broadcast:
    O << "broadcast";
    break;
  case VPInstruction::BuildStructVector:
    O << "buildstructvector";
    break;
  case VPInstruction::BuildVector:
    O << "buildvector";
    break;
  case VPInstruction::ExitingIVValue:
    O << "exiting-iv-value";
    break;
  case VPInstruction::MaskedCond:
    O << "masked-cond";
    break;
  case VPInstruction::ExtractLane:
    O << "extract-lane";
    break;
  case VPInstruction::ExtractLastLane:
    O << "extract-last-lane";
    break;
  case VPInstruction::ExtractLastPart:
    O << "extract-last-part";
    break;
  case VPInstruction::ExtractPenultimateElement:
    O << "extract-penultimate-element";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::LogicalOr:
    O << "logical-or";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  case VPInstruction::WidePtrAdd:
    O << "wide-ptradd";
    break;
  case VPInstruction::AnyOf:
    O << "any-of";
    break;
  case VPInstruction::FirstActiveLane:
    O << "first-active-lane";
    break;
  case VPInstruction::LastActiveLane:
    O << "last-active-lane";
    break;
  case VPInstruction::ReductionStartVector:
    O << "reduction-start-vector";
    break;
  case VPInstruction::ResumeForEpilogue:
    O << "resume-for-epilogue";
    break;
  case VPInstruction::Reverse:
    O << "reverse";
    break;
  case VPInstruction::Unpack:
    O << "unpack";
    break;
  case VPInstruction::ExtractLastActive:
    O << "extract-last-active";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPInstructionWithType::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  if (isScalarCast()) {
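    // A scalar cast produces a single scalar value, generated for lane 0 only.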
    Value *Op = State.get(getOperand(0), VPLane(0));
    Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
                                           Op, ResultTy);
    State.set(this, Cast, VPLane(0));
    return;
  }
  switch (getOpcode()) {
  case VPInstruction::StepVector: {
    Value *StepVector =
        State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
    State.set(this, StepVector);
    break;
  }
  case VPInstruction::VScale: {
    Value *VScale = State.Builder.CreateVScale(ResultTy);
    State.set(this, VScale, true);
    break;
  }

  default:
    llvm_unreachable("opcode not implemented yet");
  }
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = ";
|
|
|
|
switch (getOpcode()) {
|
|
case VPInstruction::WideIVStep:
|
|
O << "wide-iv-step ";
|
|
printOperands(O, SlotTracker);
|
|
break;
|
|
case VPInstruction::StepVector:
|
|
O << "step-vector " << *ResultTy;
|
|
break;
|
|
case VPInstruction::VScale:
|
|
O << "vscale " << *ResultTy;
|
|
break;
|
|
case Instruction::Load:
|
|
O << "load ";
|
|
printOperands(O, SlotTracker);
|
|
break;
|
|
default:
|
|
assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
|
|
O << Instruction::getOpcodeName(getOpcode()) << " ";
|
|
printOperands(O, SlotTracker);
|
|
O << " to " << *ResultTy;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
void VPPhi::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  PHINode *NewPhi = State.Builder.CreatePHI(
      State.TypeAnalysis.inferScalarType(this), 2, getName());
  unsigned NumIncoming = getNumIncoming();
  // Detect header phis: the parent block dominates its second incoming block
  // (the latch). The IR value incoming from the latch has not been generated
  // yet and is added later, once the latch has been executed.
  if (NumIncoming == 2 &&
      State.VPDT.dominates(getParent(), getIncomingBlock(1))) {
    NumIncoming = 1;
  }
  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
    Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
    BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
    NewPhi->addIncoming(IncV, PredBB);
  }
  State.set(this, NewPhi, VPLane(0));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
                        VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
  printAsOperand(O, SlotTracker);
  O << " = phi";
  printFlags(O);
  printPhiOperands(O, SlotTracker);
}
#endif

VPIRInstruction *VPIRInstruction::create(Instruction &I) {
  if (auto *Phi = dyn_cast<PHINode>(&I))
    return new VPIRPhi(*Phi);
  return new VPIRInstruction(I);
}

void VPIRInstruction::execute(VPTransformState &State) {
  assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
         "PHINodes must be handled by VPIRPhi");
  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
}
|
|
|
|
InstructionCost VPIRInstruction::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
// The recipe wraps an existing IR instruction on the border of VPlan's scope,
|
|
// hence it does not contribute to the cost-modeling for the VPlan.
|
|
return 0;
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPIRInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "IR " << I;
|
|
}
|
|
#endif
|
|
|
|
void VPIRPhi::execute(VPTransformState &State) {
  PHINode *Phi = &getIRPhi();
  for (const auto &[Idx, Op] : enumerate(operands())) {
    VPValue *ExitValue = Op;
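    // Single-scalar (uniform) exit values are read from lane 0; otherwise the
    // exiting value is taken from the last lane of the vector.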
    auto Lane = vputils::isSingleScalar(ExitValue)
                    ? VPLane::getFirstLane()
                    : VPLane::getLastLaneForVF(State.VF);
    VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
    auto *PredVPBB = Pred->getExitingBasicBlock();
    BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
    // Set insertion point in PredBB in case an extract needs to be generated.
    // TODO: Model extracts explicitly.
    State.Builder.SetInsertPoint(PredBB->getTerminator());
    Value *V = State.get(ExitValue, VPLane(Lane));
    // If there is no existing block for PredBB in the phi, add a new incoming
    // value. Otherwise update the existing incoming value for PredBB.
    if (Phi->getBasicBlockIndex(PredBB) == -1)
      Phi->addIncoming(V, PredBB);
    else
      Phi->setIncomingValueForBlock(PredBB, V);
  }

  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
}
|
|
|
|
void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const {
|
|
VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
|
|
assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
|
|
"Number of phi operands must match number of predecessors");
|
|
unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
|
|
R->removeOperand(Position);
|
|
}
|
|
|
|
VPValue *
|
|
VPPhiAccessors::getIncomingValueForBlock(const VPBasicBlock *VPBB) const {
|
|
VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
|
|
return getIncomingValue(R->getParent()->getIndexForPredecessor(VPBB));
|
|
}
|
|
|
|
void VPPhiAccessors::setIncomingValueForBlock(const VPBasicBlock *VPBB,
|
|
VPValue *V) const {
|
|
VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
|
|
R->setOperand(R->getParent()->getIndexForPredecessor(VPBB), V);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPPhiAccessors::printPhiOperands(raw_ostream &O,
|
|
VPSlotTracker &SlotTracker) const {
|
|
interleaveComma(enumerate(getAsRecipe()->operands()), O,
|
|
[this, &O, &SlotTracker](auto Op) {
|
|
O << "[ ";
|
|
Op.value()->printAsOperand(O, SlotTracker);
|
|
O << ", ";
|
|
getIncomingBlock(Op.index())->printAsOperand(O);
|
|
O << " ]";
|
|
});
|
|
}
|
|
#endif
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
VPIRInstruction::printRecipe(O, Indent, SlotTracker);
|
|
|
|
if (getNumOperands() != 0) {
|
|
O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
|
|
interleaveComma(incoming_values_and_blocks(), O,
|
|
[&O, &SlotTracker](auto Op) {
|
|
std::get<0>(Op)->printAsOperand(O, SlotTracker);
|
|
O << " from ";
|
|
std::get<1>(Op)->printAsOperand(O);
|
|
});
|
|
O << ")";
|
|
}
|
|
}
|
|
#endif
|
|
|
|
void VPIRMetadata::applyMetadata(Instruction &I) const {
  for (const auto &[Kind, Node] : Metadata)
    I.setMetadata(Kind, Node);
}

void VPIRMetadata::intersect(const VPIRMetadata &Other) {
  SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
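  // Keep only the (kind, node) pairs that appear identically in both sets.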
  for (const auto &[KindA, MDA] : Metadata) {
    for (const auto &[KindB, MDB] : Other.Metadata) {
      if (KindA == KindB && MDA == MDB) {
        MetadataIntersection.emplace_back(KindA, MDA);
        break;
      }
    }
  }
  Metadata = std::move(MetadataIntersection);
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPIRMetadata::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
|
|
const Module *M = SlotTracker.getModule();
|
|
if (Metadata.empty() || !M)
|
|
return;
|
|
|
|
ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
|
|
O << " (";
|
|
interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
|
|
auto [Kind, Node] = KindNodePair;
|
|
assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
|
|
"Unexpected unnamed metadata kind");
|
|
O << "!" << MDNames[Kind] << " ";
|
|
Node->printAsOperand(O, M);
|
|
});
|
|
O << ")";
|
|
}
|
|
#endif
|
|
|
|
void VPWidenCallRecipe::execute(VPTransformState &State) {
|
|
assert(State.VF.isVector() && "not widening");
|
|
assert(Variant != nullptr && "Can't create vector function.");
|
|
|
|
FunctionType *VFTy = Variant->getFunctionType();
|
|
  // Collect the call arguments, using scalar values where the vector function
  // variant expects a scalar parameter.
|
|
SmallVector<Value *, 4> Args;
|
|
for (const auto &I : enumerate(args())) {
|
|
Value *Arg;
|
|
// Some vectorized function variants may also take a scalar argument,
|
|
// e.g. linear parameters for pointers. This needs to be the scalar value
|
|
// from the start of the respective part when interleaving.
|
|
if (!VFTy->getParamType(I.index())->isVectorTy())
|
|
Arg = State.get(I.value(), VPLane(0));
|
|
else
|
|
Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
|
|
Args.push_back(Arg);
|
|
}
|
|
|
|
auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
|
|
SmallVector<OperandBundleDef, 1> OpBundles;
|
|
if (CI)
|
|
CI->getOperandBundlesAsDefs(OpBundles);
|
|
|
|
CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
|
|
applyFlags(*V);
|
|
applyMetadata(*V);
|
|
V->setCallingConv(Variant->getCallingConv());
|
|
|
|
if (!V->getType()->isVoidTy())
|
|
State.set(this, V);
|
|
}
|
|
|
|
InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
|
|
Variant->getFunctionType()->params(),
|
|
Ctx.CostKind);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenCallRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN-CALL ";
|
|
|
|
Function *CalledFn = getCalledScalarFunction();
|
|
if (CalledFn->getReturnType()->isVoidTy())
|
|
O << "void ";
|
|
else {
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = ";
|
|
}
|
|
|
|
O << "call";
|
|
printFlags(O);
|
|
O << " @" << CalledFn->getName() << "(";
|
|
interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
|
|
Op->printAsOperand(O, SlotTracker);
|
|
});
|
|
O << ")";
|
|
|
|
O << " (using library function";
|
|
if (Variant->hasName())
|
|
O << ": " << Variant->getName();
|
|
O << ")";
|
|
}
|
|
#endif
|
|
|
|
void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
|
|
assert(State.VF.isVector() && "not widening");
|
|
|
|
SmallVector<Type *, 2> TysForDecl;
|
|
// Add return type if intrinsic is overloaded on it.
|
|
if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1,
|
|
State.TTI)) {
|
|
Type *RetTy = toVectorizedTy(getResultType(), State.VF);
|
|
ArrayRef<Type *> ContainedTys = getContainedTypes(RetTy);
|
|
for (auto [Idx, Ty] : enumerate(ContainedTys)) {
|
|
if (isVectorIntrinsicWithStructReturnOverloadAtField(VectorIntrinsicID,
|
|
Idx, State.TTI))
|
|
TysForDecl.push_back(Ty);
|
|
}
|
|
}
|
|
SmallVector<Value *, 4> Args;
|
|
for (const auto &I : enumerate(operands())) {
|
|
// Some intrinsics have a scalar argument - don't replace it with a
|
|
// vector.
|
|
Value *Arg;
|
|
if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
|
|
State.TTI))
|
|
Arg = State.get(I.value(), VPLane(0));
|
|
else
|
|
Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
|
|
if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
|
|
State.TTI))
|
|
TysForDecl.push_back(Arg->getType());
|
|
Args.push_back(Arg);
|
|
}
|
|
|
|
// Use vector version of the intrinsic.
|
|
Module *M = State.Builder.GetInsertBlock()->getModule();
|
|
Function *VectorF =
|
|
Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
|
|
assert(VectorF &&
|
|
"Can't retrieve vector intrinsic or vector-predication intrinsics.");
|
|
|
|
auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
|
|
SmallVector<OperandBundleDef, 1> OpBundles;
|
|
if (CI)
|
|
CI->getOperandBundlesAsDefs(OpBundles);
|
|
|
|
CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
|
|
|
|
applyFlags(*V);
|
|
applyMetadata(*V);
|
|
|
|
if (!V->getType()->isVoidTy())
|
|
State.set(this, V);
|
|
}
|
|
|
|
/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
|
|
static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
|
|
ArrayRef<const VPValue *> Operands,
|
|
const VPRecipeWithIRFlags &R,
|
|
ElementCount VF,
|
|
VPCostContext &Ctx) {
|
|
Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
|
|
// Skip the reverse operation cost for the mask.
|
|
// FIXME: Remove this once redundant mask reverse operations can be eliminated
|
|
// by VPlanTransforms::cse before cost computation.
|
|
if (ID == Intrinsic::experimental_vp_reverse && ScalarRetTy->isIntegerTy(1))
|
|
return InstructionCost(0);
|
|
|
|
// Some backends analyze intrinsic arguments to determine cost. Use the
|
|
// underlying value for the operand if it has one. Otherwise try to use the
|
|
// operand of the underlying call instruction, if there is one. Otherwise
|
|
// clear Arguments.
|
|
// TODO: Rework TTI interface to be independent of concrete IR values.
|
|
SmallVector<const Value *> Arguments;
|
|
for (const auto &[Idx, Op] : enumerate(Operands)) {
|
|
auto *V = Op->getUnderlyingValue();
|
|
if (!V) {
|
|
if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
|
|
Arguments.push_back(UI->getArgOperand(Idx));
|
|
continue;
|
|
}
|
|
Arguments.clear();
|
|
break;
|
|
}
|
|
Arguments.push_back(V);
|
|
}
|
|
|
|
Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
|
|
SmallVector<Type *> ParamTys;
|
|
for (const VPValue *Op : Operands) {
|
|
ParamTys.push_back(VF.isVector()
|
|
? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
|
|
: Ctx.Types.inferScalarType(Op));
|
|
}
|
|
|
|
// TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
|
|
IntrinsicCostAttributes CostAttrs(
|
|
ID, RetTy, Arguments, ParamTys, R.getFastMathFlags(),
|
|
dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
|
|
InstructionCost::getInvalid());
|
|
return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
|
|
}
|
|
|
|
InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
SmallVector<const VPValue *> ArgOps(operands());
|
|
return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
|
|
}
|
|
|
|
StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
|
|
return Intrinsic::getBaseName(VectorIntrinsicID);
|
|
}
|
|
|
|
bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
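  // Only the first lane of Op is used if every operand position holding Op
  // expects a scalar argument for this intrinsic.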
  return all_of(enumerate(operands()), [this, &Op](const auto &X) {
    auto [Idx, V] = X;
    return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(),
                                                         Idx, nullptr);
  });
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN-INTRINSIC ";
|
|
if (ResultTy->isVoidTy()) {
|
|
O << "void ";
|
|
} else {
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = ";
|
|
}
|
|
|
|
O << "call";
|
|
printFlags(O);
|
|
O << getIntrinsicName() << "(";
|
|
|
|
interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
|
|
Op->printAsOperand(O, SlotTracker);
|
|
});
|
|
O << ")";
|
|
}
|
|
#endif
|
|
|
|
void VPHistogramRecipe::execute(VPTransformState &State) {
|
|
IRBuilderBase &Builder = State.Builder;
|
|
|
|
Value *Address = State.get(getOperand(0));
|
|
Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
|
|
VectorType *VTy = cast<VectorType>(Address->getType());
|
|
|
|
// The histogram intrinsic requires a mask even if the recipe doesn't;
|
|
// if the mask operand was omitted then all lanes should be executed and
|
|
// we just need to synthesize an all-true mask.
|
|
Value *Mask = nullptr;
|
|
if (VPValue *VPMask = getMask())
|
|
Mask = State.get(VPMask);
|
|
else
|
|
Mask =
|
|
Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
|
|
|
|
// If this is a subtract, we want to invert the increment amount. We may
|
|
  // add a separate intrinsic in the future, but for now we'll try this.
|
|
if (Opcode == Instruction::Sub)
|
|
IncAmt = Builder.CreateNeg(IncAmt);
|
|
else
|
|
assert(Opcode == Instruction::Add && "only add or sub supported for now");
|
|
|
|
State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
|
|
{VTy, IncAmt->getType()},
|
|
{Address, IncAmt, Mask});
|
|
}
|
|
|
|
InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
// FIXME: Take the gather and scatter into account as well. For now we're
|
|
// generating the same cost as the fallback path, but we'll likely
|
|
// need to create a new TTI method for determining the cost, including
|
|
// whether we can use base + vec-of-smaller-indices or just
|
|
// vec-of-pointers.
|
|
assert(VF.isVector() && "Invalid VF for histogram cost");
|
|
Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
|
|
VPValue *IncAmt = getOperand(1);
|
|
Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
|
|
VectorType *VTy = VectorType::get(IncTy, VF);
|
|
|
|
// Assume that a non-constant update value (or a constant != 1) requires
|
|
// a multiply, and add that into the cost.
|
|
InstructionCost MulCost =
|
|
Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
|
|
if (match(IncAmt, m_One()))
|
|
MulCost = TTI::TCC_Free;
|
|
|
|
// Find the cost of the histogram operation itself.
|
|
Type *PtrTy = VectorType::get(AddressTy, VF);
|
|
Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
|
|
IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
|
|
Type::getVoidTy(Ctx.LLVMCtx),
|
|
{PtrTy, IncTy, MaskTy});
|
|
|
|
// Add the costs together with the add/sub operation.
|
|
return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
|
|
Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPHistogramRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN-HISTOGRAM buckets: ";
|
|
getOperand(0)->printAsOperand(O, SlotTracker);
|
|
|
|
if (Opcode == Instruction::Sub)
|
|
O << ", dec: ";
|
|
else {
|
|
assert(Opcode == Instruction::Add);
|
|
O << ", inc: ";
|
|
}
|
|
getOperand(1)->printAsOperand(O, SlotTracker);
|
|
|
|
if (VPValue *Mask = getMask()) {
|
|
O << ", mask: ";
|
|
Mask->printAsOperand(O, SlotTracker);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
|
|
AllowReassoc = FMF.allowReassoc();
|
|
NoNaNs = FMF.noNaNs();
|
|
NoInfs = FMF.noInfs();
|
|
NoSignedZeros = FMF.noSignedZeros();
|
|
AllowReciprocal = FMF.allowReciprocal();
|
|
AllowContract = FMF.allowContract();
|
|
ApproxFunc = FMF.approxFunc();
|
|
}
|
|
|
|
VPIRFlags VPIRFlags::getDefaultFlags(unsigned Opcode) {
|
|
switch (Opcode) {
|
|
case Instruction::Add:
|
|
case Instruction::Sub:
|
|
case Instruction::Mul:
|
|
case Instruction::Shl:
|
|
case VPInstruction::CanonicalIVIncrementForPart:
|
|
return WrapFlagsTy(false, false);
|
|
case Instruction::Trunc:
|
|
return TruncFlagsTy(false, false);
|
|
case Instruction::Or:
|
|
return DisjointFlagsTy(false);
|
|
case Instruction::AShr:
|
|
case Instruction::LShr:
|
|
case Instruction::UDiv:
|
|
case Instruction::SDiv:
|
|
return ExactFlagsTy(false);
|
|
case Instruction::GetElementPtr:
|
|
case VPInstruction::PtrAdd:
|
|
case VPInstruction::WidePtrAdd:
|
|
return GEPNoWrapFlags::none();
|
|
case Instruction::ZExt:
|
|
case Instruction::UIToFP:
|
|
return NonNegFlagsTy(false);
|
|
case Instruction::FAdd:
|
|
case Instruction::FSub:
|
|
case Instruction::FMul:
|
|
case Instruction::FDiv:
|
|
case Instruction::FRem:
|
|
case Instruction::FNeg:
|
|
case Instruction::FPExt:
|
|
case Instruction::FPTrunc:
|
|
return FastMathFlags();
|
|
case Instruction::ICmp:
|
|
case Instruction::FCmp:
|
|
case VPInstruction::ComputeReductionResult:
|
|
llvm_unreachable("opcode requires explicit flags");
|
|
default:
|
|
return VPIRFlags();
|
|
}
|
|
}
|
|
|
|
#if !defined(NDEBUG)
|
|
bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
|
|
switch (OpType) {
|
|
case OperationType::OverflowingBinOp:
|
|
return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
|
|
Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
|
|
           Opcode == VPInstruction::CanonicalIVIncrementForPart;
|
|
case OperationType::Trunc:
|
|
return Opcode == Instruction::Trunc;
|
|
case OperationType::DisjointOp:
|
|
return Opcode == Instruction::Or;
|
|
case OperationType::PossiblyExactOp:
|
|
return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
|
|
Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
|
|
case OperationType::GEPOp:
|
|
return Opcode == Instruction::GetElementPtr ||
|
|
Opcode == VPInstruction::PtrAdd ||
|
|
Opcode == VPInstruction::WidePtrAdd;
|
|
case OperationType::FPMathOp:
|
|
return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
|
|
Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
|
|
Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
|
|
Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
|
|
Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI ||
|
|
Opcode == Instruction::Select ||
|
|
Opcode == VPInstruction::WideIVStep ||
|
|
Opcode == VPInstruction::ReductionStartVector;
|
|
case OperationType::FCmp:
|
|
return Opcode == Instruction::FCmp;
|
|
case OperationType::NonNegOp:
|
|
return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
|
|
case OperationType::Cmp:
|
|
return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
|
|
case OperationType::ReductionOp:
|
|
return Opcode == VPInstruction::ComputeReductionResult;
|
|
case OperationType::Other:
|
|
return true;
|
|
}
|
|
llvm_unreachable("Unknown OperationType enum");
|
|
}
|
|
|
|
bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const {
|
|
// Handle opcodes without default flags.
|
|
if (Opcode == Instruction::ICmp)
|
|
return OpType == OperationType::Cmp;
|
|
if (Opcode == Instruction::FCmp)
|
|
return OpType == OperationType::FCmp;
|
|
if (Opcode == VPInstruction::ComputeReductionResult)
|
|
return OpType == OperationType::ReductionOp;
|
|
|
|
OperationType Required = getDefaultFlags(Opcode).OpType;
|
|
return Required == OperationType::Other || Required == OpType;
|
|
}
|
|
#endif
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPIRFlags::printFlags(raw_ostream &O) const {
|
|
switch (OpType) {
|
|
case OperationType::Cmp:
|
|
O << " " << CmpInst::getPredicateName(getPredicate());
|
|
break;
|
|
case OperationType::FCmp:
|
|
O << " " << CmpInst::getPredicateName(getPredicate());
|
|
getFastMathFlags().print(O);
|
|
break;
|
|
case OperationType::DisjointOp:
|
|
if (DisjointFlags.IsDisjoint)
|
|
O << " disjoint";
|
|
break;
|
|
case OperationType::PossiblyExactOp:
|
|
if (ExactFlags.IsExact)
|
|
O << " exact";
|
|
break;
|
|
case OperationType::OverflowingBinOp:
|
|
if (WrapFlags.HasNUW)
|
|
O << " nuw";
|
|
if (WrapFlags.HasNSW)
|
|
O << " nsw";
|
|
break;
|
|
case OperationType::Trunc:
|
|
if (TruncFlags.HasNUW)
|
|
O << " nuw";
|
|
if (TruncFlags.HasNSW)
|
|
O << " nsw";
|
|
break;
|
|
case OperationType::FPMathOp:
|
|
getFastMathFlags().print(O);
|
|
break;
|
|
case OperationType::GEPOp: {
|
|
GEPNoWrapFlags Flags = getGEPNoWrapFlags();
|
|
if (Flags.isInBounds())
|
|
O << " inbounds";
|
|
else if (Flags.hasNoUnsignedSignedWrap())
|
|
O << " nusw";
|
|
if (Flags.hasNoUnsignedWrap())
|
|
O << " nuw";
|
|
break;
|
|
}
|
|
case OperationType::NonNegOp:
|
|
if (NonNegFlags.NonNeg)
|
|
O << " nneg";
|
|
break;
|
|
case OperationType::ReductionOp: {
|
|
RecurKind RK = getRecurKind();
|
|
O << " (";
|
|
switch (RK) {
|
|
case RecurKind::AnyOf:
|
|
O << "any-of";
|
|
break;
|
|
case RecurKind::FindLast:
|
|
O << "find-last";
|
|
break;
|
|
case RecurKind::SMax:
|
|
O << "smax";
|
|
break;
|
|
case RecurKind::SMin:
|
|
O << "smin";
|
|
break;
|
|
case RecurKind::UMax:
|
|
O << "umax";
|
|
break;
|
|
case RecurKind::UMin:
|
|
O << "umin";
|
|
break;
|
|
case RecurKind::FMinNum:
|
|
O << "fminnum";
|
|
break;
|
|
case RecurKind::FMaxNum:
|
|
O << "fmaxnum";
|
|
break;
|
|
case RecurKind::FMinimum:
|
|
O << "fminimum";
|
|
break;
|
|
case RecurKind::FMaximum:
|
|
O << "fmaximum";
|
|
break;
|
|
case RecurKind::FMinimumNum:
|
|
O << "fminimumnum";
|
|
break;
|
|
case RecurKind::FMaximumNum:
|
|
O << "fmaximumnum";
|
|
break;
|
|
default:
|
|
O << Instruction::getOpcodeName(RecurrenceDescriptor::getOpcode(RK));
|
|
break;
|
|
}
|
|
if (isReductionInLoop())
|
|
O << ", in-loop";
|
|
if (isReductionOrdered())
|
|
O << ", ordered";
|
|
O << ")";
|
|
getFastMathFlags().print(O);
|
|
break;
|
|
}
|
|
case OperationType::Other:
|
|
break;
|
|
}
|
|
O << " ";
|
|
}
|
|
#endif
|
|
|
|
void VPWidenRecipe::execute(VPTransformState &State) {
|
|
auto &Builder = State.Builder;
|
|
switch (Opcode) {
|
|
case Instruction::Call:
|
|
case Instruction::UncondBr:
|
|
case Instruction::CondBr:
|
|
case Instruction::PHI:
|
|
case Instruction::GetElementPtr:
|
|
llvm_unreachable("This instruction is handled by a different recipe.");
|
|
case Instruction::UDiv:
|
|
case Instruction::SDiv:
|
|
case Instruction::SRem:
|
|
case Instruction::URem:
|
|
case Instruction::Add:
|
|
case Instruction::FAdd:
|
|
case Instruction::Sub:
|
|
case Instruction::FSub:
|
|
case Instruction::FNeg:
|
|
case Instruction::Mul:
|
|
case Instruction::FMul:
|
|
case Instruction::FDiv:
|
|
case Instruction::FRem:
|
|
case Instruction::Shl:
|
|
case Instruction::LShr:
|
|
case Instruction::AShr:
|
|
case Instruction::And:
|
|
case Instruction::Or:
|
|
case Instruction::Xor: {
|
|
// Just widen unops and binops.
|
|
SmallVector<Value *, 2> Ops;
|
|
for (VPValue *VPOp : operands())
|
|
Ops.push_back(State.get(VPOp));
|
|
|
|
Value *V = Builder.CreateNAryOp(Opcode, Ops);
|
|
|
|
if (auto *VecOp = dyn_cast<Instruction>(V)) {
|
|
applyFlags(*VecOp);
|
|
applyMetadata(*VecOp);
|
|
}
|
|
|
|
// Use this vector value for all users of the original instruction.
|
|
State.set(this, V);
|
|
break;
|
|
}
|
|
case Instruction::ExtractValue: {
|
|
assert(getNumOperands() == 2 && "expected single level extractvalue");
|
|
Value *Op = State.get(getOperand(0));
|
|
Value *Extract = Builder.CreateExtractValue(
|
|
Op, cast<VPConstantInt>(getOperand(1))->getZExtValue());
|
|
State.set(this, Extract);
|
|
break;
|
|
}
|
|
case Instruction::Freeze: {
|
|
Value *Op = State.get(getOperand(0));
|
|
Value *Freeze = Builder.CreateFreeze(Op);
|
|
State.set(this, Freeze);
|
|
break;
|
|
}
|
|
case Instruction::ICmp:
|
|
case Instruction::FCmp: {
|
|
// Widen compares. Generate vector compares.
|
|
bool FCmp = Opcode == Instruction::FCmp;
|
|
Value *A = State.get(getOperand(0));
|
|
Value *B = State.get(getOperand(1));
|
|
Value *C = nullptr;
|
|
if (FCmp) {
|
|
C = Builder.CreateFCmp(getPredicate(), A, B);
|
|
} else {
|
|
C = Builder.CreateICmp(getPredicate(), A, B);
|
|
}
|
|
if (auto *I = dyn_cast<Instruction>(C)) {
|
|
applyFlags(*I);
|
|
applyMetadata(*I);
|
|
}
|
|
State.set(this, C);
|
|
break;
|
|
}
|
|
case Instruction::Select: {
|
|
VPValue *CondOp = getOperand(0);
|
|
Value *Cond = State.get(CondOp, vputils::isSingleScalar(CondOp));
|
|
Value *Op0 = State.get(getOperand(1));
|
|
Value *Op1 = State.get(getOperand(2));
|
|
Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
|
|
State.set(this, Sel);
|
|
if (auto *I = dyn_cast<Instruction>(Sel)) {
|
|
if (isa<FPMathOperator>(I))
|
|
applyFlags(*I);
|
|
applyMetadata(*I);
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
// This instruction is not vectorized by simple widening.
|
|
LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
|
|
<< Instruction::getOpcodeName(Opcode));
|
|
llvm_unreachable("Unhandled instruction!");
|
|
} // end of switch.
|
|
|
|
#if !defined(NDEBUG)
|
|
// Verify that VPlan type inference results agree with the type of the
|
|
// generated values.
|
|
assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
|
|
State.get(this)->getType() &&
|
|
"inferred type and type from generated instructions do not match");
|
|
#endif
|
|
}
|
|
|
|
InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
switch (Opcode) {
|
|
case Instruction::UDiv:
|
|
case Instruction::SDiv:
|
|
case Instruction::SRem:
|
|
case Instruction::URem:
|
|
// If the div/rem operation isn't safe to speculate and requires
|
|
// predication, then the only way we can even create a vplan is to insert
|
|
// a select on the second input operand to ensure we use the value of 1
|
|
// for the inactive lanes. The select will be costed separately.
|
|
case Instruction::FNeg:
|
|
case Instruction::Add:
|
|
case Instruction::FAdd:
|
|
case Instruction::Sub:
|
|
case Instruction::FSub:
|
|
case Instruction::Mul:
|
|
case Instruction::FMul:
|
|
case Instruction::FDiv:
|
|
case Instruction::FRem:
|
|
case Instruction::Shl:
|
|
case Instruction::LShr:
|
|
case Instruction::AShr:
|
|
case Instruction::And:
|
|
case Instruction::Or:
|
|
case Instruction::Xor:
|
|
case Instruction::Freeze:
|
|
case Instruction::ExtractValue:
|
|
case Instruction::ICmp:
|
|
case Instruction::FCmp:
|
|
case Instruction::Select:
|
|
return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
|
|
default:
|
|
llvm_unreachable("Unsupported opcode for instruction");
|
|
}
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = " << Instruction::getOpcodeName(Opcode);
|
|
printFlags(O);
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
void VPWidenCastRecipe::execute(VPTransformState &State) {
|
|
auto &Builder = State.Builder;
|
|
  // Vectorize casts.
|
|
assert(State.VF.isVector() && "Not vectorizing?");
|
|
Type *DestTy = VectorType::get(getResultType(), State.VF);
|
|
VPValue *Op = getOperand(0);
|
|
Value *A = State.get(Op);
|
|
Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
|
|
State.set(this, Cast);
|
|
if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
|
|
applyFlags(*CastOp);
|
|
applyMetadata(*CastOp);
|
|
}
|
|
}
|
|
|
|
InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
// TODO: In some cases, VPWidenCastRecipes are created but not considered in
|
|
// the legacy cost model, including truncates/extends when evaluating a
|
|
// reduction in a smaller type.
|
|
if (!getUnderlyingValue())
|
|
return 0;
|
|
return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenCastRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN-CAST ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = " << Instruction::getOpcodeName(Opcode);
|
|
printFlags(O);
|
|
printOperands(O, SlotTracker);
|
|
O << " to " << *getResultType();
|
|
}
|
|
#endif
|
|
|
|
InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenIntOrFpInductionRecipe::printRecipe(
|
|
raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
|
|
O << Indent;
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = WIDEN-INDUCTION";
|
|
printFlags(O);
|
|
printOperands(O, SlotTracker);
|
|
|
|
if (auto *TI = getTruncInst())
|
|
O << " (truncated to " << *TI->getType() << ")";
|
|
}
|
|
#endif
|
|
|
|
bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
|
|
// The step may be defined by a recipe in the preheader (e.g. if it requires
|
|
// SCEV expansion), but for the canonical induction the step is required to be
|
|
// 1, which is represented as live-in.
|
|
return match(getStartValue(), m_ZeroInt()) &&
|
|
match(getStepValue(), m_One()) &&
|
|
getScalarType() == getRegion()->getCanonicalIVType();
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPDerivedIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent;
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = DERIVED-IV ";
|
|
getStartValue()->printAsOperand(O, SlotTracker);
|
|
O << " + ";
|
|
getOperand(1)->printAsOperand(O, SlotTracker);
|
|
O << " * ";
|
|
getStepValue()->printAsOperand(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  State.Builder.setFastMathFlags(getFastMathFlags());

  // Compute scalar induction steps: the scalar IV (operand 0) is the value on
  // which to base the steps, and the step operand gives the size of each step.

  Value *BaseIV = State.get(getOperand(0), VPLane(0));
  Value *Step = State.get(getStepValue(), VPLane(0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.

  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Lane) {
    StartLane = State.Lane->getKnownLane();
    EndLane = StartLane + 1;
  }
  Value *StartIdx0 = getStartIndex() ? State.get(getStartIndex(), true)
                                     : Constant::getNullValue(BaseIVTy);
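  // Each lane's value is computed as BaseIV + (StartIdx0 + Lane) * Step.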
  for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
    // It is okay if the induction variable type cannot hold the lane number;
    // we expect truncation in this case.
    Constant *LaneValue =
        BaseIVTy->isIntegerTy()
            ? ConstantInt::get(BaseIVTy, Lane, /*IsSigned=*/false,
                               /*ImplicitTrunc=*/true)
            : ConstantFP::get(BaseIVTy, Lane);
    Value *StartIdx = Builder.CreateBinOp(AddOp, StartIdx0, LaneValue);
    assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
           "Expected StartIdx to be folded to a constant when VF is not "
           "scalable");
    auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
    auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
    State.set(this, Add, VPLane(Lane));
  }
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPScalarIVStepsRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent;
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = SCALAR-STEPS ";
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
bool VPWidenGEPRecipe::usesFirstLaneOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
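  // The widened GEP only reads lane 0 of operands that are single scalars
  // (uniform across lanes).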
  return vputils::isSingleScalar(Op);
}
|
|
|
|
void VPWidenGEPRecipe::execute(VPTransformState &State) {
|
|
assert(State.VF.isVector() && "not widening");
|
|
// Construct a vector GEP by widening the operands of the scalar GEP as
|
|
// necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
|
|
// results in a vector of pointers when at least one operand of the GEP
|
|
// is vector-typed. Thus, to keep the representation compact, we only use
|
|
// vector-typed operands for loop-varying values.
|
|
|
|
bool AllOperandsAreInvariant = all_of(operands(), [](VPValue *Op) {
|
|
return Op->isDefinedOutsideLoopRegions();
|
|
});
|
|
if (AllOperandsAreInvariant) {
|
|
// If we are vectorizing, but the GEP has only loop-invariant operands,
|
|
// the GEP we build (by only using vector-typed operands for
|
|
// loop-varying values) would be a scalar pointer. Thus, to ensure we
|
|
// produce a vector of pointers, we need to either arbitrarily pick an
|
|
// operand to broadcast, or broadcast a clone of the original GEP.
|
|
// Here, we broadcast a clone of the original.
|
|
|
|
SmallVector<Value *> Ops;
|
|
for (unsigned I = 0, E = getNumOperands(); I != E; I++)
|
|
Ops.push_back(State.get(getOperand(I), VPLane(0)));
|
|
|
|
auto *NewGEP =
|
|
State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops),
|
|
"", getGEPNoWrapFlags());
|
|
Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
|
|
State.set(this, Splat);
|
|
return;
|
|
}
|
|
|
|
// If the GEP has at least one loop-varying operand, we are sure to
|
|
// produce a vector of pointers unless VF is scalar.
|
|
// The pointer operand of the new GEP. If it's loop-invariant, we
|
|
// won't broadcast it.
|
|
auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
|
|
|
|
// Collect all the indices for the new GEP. If any index is
|
|
// loop-invariant, we won't broadcast it.
|
|
SmallVector<Value *, 4> Indices;
|
|
for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
|
|
VPValue *Operand = getOperand(I);
|
|
Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
|
|
}
|
|
|
|
// Create the new GEP. Note that this GEP may be a scalar if VF == 1,
|
|
  // but otherwise it should be a vector.
|
|
auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
|
|
"", getGEPNoWrapFlags());
|
|
assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
|
|
"NewGEP is not a pointer vector");
|
|
State.set(this, NewGEP);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenGEPRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN-GEP ";
|
|
O << (isPointerLoopInvariant() ? "Inv" : "Var");
|
|
for (size_t I = 0; I < getNumOperands() - 1; ++I)
|
|
O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
|
|
|
|
O << " ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = getelementptr";
|
|
printFlags(O);
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
void VPVectorEndPointerRecipe::materializeOffset(unsigned Part) {
|
|
assert(!getOffset() && "Unexpected offset operand");
|
|
VPBuilder Builder(this);
|
|
VPlan &Plan = *getParent()->getPlan();
|
|
VPValue *VFVal = getVFValue();
|
|
VPTypeAnalysis TypeInfo(Plan);
|
|
const DataLayout &DL = Plan.getDataLayout();
|
|
Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(this));
|
|
VPValue *Stride =
|
|
Plan.getConstantInt(IndexTy, getStride(), /*IsSigned=*/true);
|
|
Type *VFTy = TypeInfo.inferScalarType(VFVal);
|
|
VPValue *VF = Builder.createScalarZExtOrTrunc(VFVal, IndexTy, VFTy,
|
|
DebugLoc::getUnknown());
|
|
|
|
// Offset for Part0 = Offset0 = Stride * (VF - 1).
|
|
VPInstruction *VFMinusOne =
|
|
Builder.createSub(VF, Plan.getConstantInt(IndexTy, 1u),
|
|
DebugLoc::getUnknown(), "", {true, true});
|
|
VPInstruction *Offset0 =
|
|
Builder.createOverflowingOp(Instruction::Mul, {VFMinusOne, Stride});
|
|
|
|
// Offset for PartN = Offset0 + Part * Stride * VF.
|
|
VPValue *PartxStride =
|
|
Plan.getConstantInt(IndexTy, Part * getStride(), /*IsSigned=*/true);
|
|
VPValue *Offset = Builder.createAdd(
|
|
Offset0,
|
|
Builder.createOverflowingOp(Instruction::Mul, {PartxStride, VF}));
|
|
addOperand(Offset);
|
|
}
|
|
|
|
void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
|
|
auto &Builder = State.Builder;
|
|
assert(getOffset() && "Expected prior materialization of offset");
|
|
Value *Ptr = State.get(getPointer(), true);
|
|
Value *Offset = State.get(getOffset(), true);
|
|
Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
|
|
getGEPNoWrapFlags());
|
|
State.set(this, ResultPtr, /*IsScalar*/ true);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPVectorEndPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent;
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = vector-end-pointer";
|
|
printFlags(O);
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
void VPVectorPointerRecipe::execute(VPTransformState &State) {
|
|
auto &Builder = State.Builder;
|
|
assert(getOffset() &&
|
|
"Expected prior simplification of recipe without offset");
|
|
Value *Ptr = State.get(getOperand(0), VPLane(0));
|
|
Value *Offset = State.get(getOffset(), true);
|
|
Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
|
|
getGEPNoWrapFlags());
|
|
State.set(this, ResultPtr, /*IsScalar*/ true);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPVectorPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent;
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = vector-pointer";
|
|
printFlags(O);
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  // A blend will be expanded to a select VPInstruction, which will generate a
  // scalar select if only the first lane is used.
  if (vputils::onlyFirstLaneUsed(this))
    VF = ElementCount::getFixed(1);

  Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
  Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
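  // A blend with N incoming values is expanded to N - 1 chained selects.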
  return (getNumIncomingValues() - 1) *
         Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
                                    CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "BLEND ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " =";
|
|
printFlags(O);
|
|
if (getNumIncomingValues() == 1) {
|
|
// Not a User of any mask: not really blending, this is a
|
|
// single-predecessor phi.
|
|
getIncomingValue(0)->printAsOperand(O, SlotTracker);
|
|
} else {
|
|
for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
|
|
if (I != 0)
|
|
O << " ";
|
|
getIncomingValue(I)->printAsOperand(O, SlotTracker);
|
|
if (I == 0 && isNormalized())
|
|
continue;
|
|
O << "/";
|
|
getMask(I)->printAsOperand(O, SlotTracker);
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Reduction being replicated.");
  RecurKind Kind = getRecurrenceKind();
  assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
         "In-loop AnyOf reductions aren't currently supported");
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(getFastMathFlags());
  Value *NewVecOp = State.get(getVecOp());
  if (VPValue *Cond = getCondOp()) {
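    // For conditional reductions, replace masked-off lanes with the
    // reduction's identity value so they do not affect the reduced result.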
    Value *NewCond = State.get(Cond, State.VF.isScalar());
    VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
    Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();

    Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
    if (State.VF.isVector())
      Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);

    Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
    NewVecOp = Select;
  }
|
|
Value *NewRed;
|
|
Value *NextInChain;
|
|
if (isOrdered()) {
|
|
Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
|
|
if (State.VF.isVector())
|
|
NewRed =
|
|
createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
|
|
else
|
|
NewRed = State.Builder.CreateBinOp(
|
|
(Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
|
|
PrevInChain, NewVecOp);
|
|
PrevInChain = NewRed;
|
|
NextInChain = NewRed;
|
|
} else if (isPartialReduction()) {
|
|
assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
|
|
"Unexpected partial reduction kind");
|
|
Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ false);
|
|
NewRed = State.Builder.CreateIntrinsic(
|
|
PrevInChain->getType(),
|
|
Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
|
|
: Intrinsic::vector_partial_reduce_fadd,
|
|
{PrevInChain, NewVecOp}, State.Builder.getFastMathFlags(),
|
|
"partial.reduce");
|
|
PrevInChain = NewRed;
|
|
NextInChain = NewRed;
|
|
} else {
|
|
assert(isInLoop() &&
|
|
"The reduction must either be ordered, partial or in-loop");
|
|
Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
|
|
NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
|
|
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
|
|
NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
|
|
else
|
|
NextInChain = State.Builder.CreateBinOp(
|
|
(Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
|
|
PrevInChain, NewRed);
|
|
}
|
|
State.set(this, NextInChain, /*IsScalar*/ !isPartialReduction());
|
|
}
|
|
|
|
void VPReductionEVLRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Reduction being replicated.");

  auto &Builder = State.Builder;
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
  Builder.setFastMathFlags(getFastMathFlags());

  RecurKind Kind = getRecurrenceKind();
  Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
  Value *VecOp = State.get(getVecOp());
  Value *EVL = State.get(getEVL(), VPLane(0));

  Value *Mask;
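  // The EVL-based reduction always takes a mask; synthesize an all-true mask
  // when the recipe has no condition operand.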
  if (VPValue *CondOp = getCondOp())
    Mask = State.get(CondOp);
  else
    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());

  Value *NewRed;
  if (isOrdered()) {
    NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
  } else {
    NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
      NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
    else
      NewRed = Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed,
          Prev);
  }
  State.set(this, NewRed, /*IsScalar*/ true);
}
|
|
|
|
InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
RecurKind RdxKind = getRecurrenceKind();
|
|
Type *ElementTy = Ctx.Types.inferScalarType(this);
|
|
auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
|
|
unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
|
|
FastMathFlags FMFs = getFastMathFlags();
|
|
std::optional<FastMathFlags> OptionalFMF =
|
|
ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
|
|
|
|
if (isPartialReduction()) {
|
|
InstructionCost CondCost = 0;
|
|
if (isConditional()) {
|
|
CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
|
|
auto *CondTy = cast<VectorType>(
|
|
toVectorTy(Ctx.Types.inferScalarType(getCondOp()), VF));
|
|
CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy,
|
|
CondTy, Pred, Ctx.CostKind);
|
|
}
|
|
return CondCost + Ctx.TTI.getPartialReductionCost(
|
|
Opcode, ElementTy, ElementTy, ElementTy, VF,
|
|
TTI::PR_None, TTI::PR_None, {}, Ctx.CostKind,
|
|
OptionalFMF);
|
|
}
|
|
|
|
// TODO: Support any-of reductions.
|
|
assert(
|
|
(!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
|
|
ForceTargetInstructionCost.getNumOccurrences() > 0) &&
|
|
"Any-of reduction not implemented in VPlan-based cost model currently.");
|
|
|
|
// Note that TTI should model the cost of moving result to the scalar register
|
|
// and the BinOp cost in the getMinMaxReductionCost().
|
|
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
|
|
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
|
|
return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
|
|
}
|
|
|
|
// Note that TTI should model the cost of moving result to the scalar register
|
|
// and the BinOp cost in the getArithmeticReductionCost().
|
|
return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
|
|
Ctx.CostKind);
|
|
}
|
|
|
|
VPExpressionRecipe::VPExpressionRecipe(
|
|
ExpressionTypes ExpressionType,
|
|
ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
|
|
: VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {}, {}),
|
|
ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
|
|
assert(!ExpressionRecipes.empty() && "Nothing to combine?");
|
|
assert(
|
|
none_of(ExpressionRecipes,
|
|
[](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
|
|
"expression cannot contain recipes with side-effects");
|
|
|
|
// Maintain a copy of the expression recipes as a set of users.
|
|
SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
|
|
for (auto *R : ExpressionRecipes)
|
|
ExpressionRecipesAsSetOfUsers.insert(R);
|
|
|
|
// Recipes in the expression, except the last one, must only be used by
|
|
// (other) recipes inside the expression. If there are other users, external
|
|
// to the expression, use a clone of the recipe for external users.
|
|
for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
|
|
if (R != ExpressionRecipes.back() &&
|
|
any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
|
|
return !ExpressionRecipesAsSetOfUsers.contains(U);
|
|
})) {
|
|
// There are users outside of the expression. Clone the recipe and use the
|
|
      // clone for those external users.
|
|
VPSingleDefRecipe *CopyForExtUsers = R->clone();
|
|
R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
|
|
VPUser &U, unsigned) {
|
|
return !ExpressionRecipesAsSetOfUsers.contains(&U);
|
|
});
|
|
CopyForExtUsers->insertBefore(R);
|
|
}
|
|
if (R->getParent())
|
|
R->removeFromParent();
|
|
}
|
|
|
|
// Internalize all external operands to the expression recipes. To do so,
|
|
// create new temporary VPValues for all operands defined by a recipe outside
|
|
// the expression. The original operands are added as operands of the
|
|
// VPExpressionRecipe itself.
|
|
for (auto *R : ExpressionRecipes) {
|
|
for (const auto &[Idx, Op] : enumerate(R->operands())) {
|
|
auto *Def = Op->getDefiningRecipe();
|
|
if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
|
|
continue;
|
|
addOperand(Op);
|
|
LiveInPlaceholders.push_back(new VPSymbolicValue());
|
|
}
|
|
}
|
|
|
|
// Replace each external operand with the first one created for it in
|
|
// LiveInPlaceholders.
|
|
for (auto *R : ExpressionRecipes)
|
|
for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
|
|
R->replaceUsesOfWith(LiveIn, Tmp);
|
|
}
|
|
|
|
void VPExpressionRecipe::decompose() {
|
|
for (auto *R : ExpressionRecipes)
|
|
// Since the list could contain duplicates, make sure the recipe hasn't
|
|
// already been inserted.
|
|
if (!R->getParent())
|
|
R->insertBefore(this);
|
|
|
|
for (const auto &[Idx, Op] : enumerate(operands()))
|
|
LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
|
|
|
|
replaceAllUsesWith(ExpressionRecipes.back());
|
|
ExpressionRecipes.clear();
|
|
}
|
|
|
|
InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
  Type *RedTy = Ctx.Types.inferScalarType(this);
  auto *SrcVecTy = cast<VectorType>(
      toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
  unsigned Opcode = RecurrenceDescriptor::getOpcode(
      cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
  switch (ExpressionType) {
  case ExpressionTypes::ExtendedReduction: {
    unsigned Opcode = RecurrenceDescriptor::getOpcode(
        cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
    auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
    auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());

    if (RedR->isPartialReduction())
      return Ctx.TTI.getPartialReductionCost(
          Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr, RedTy, VF,
          TargetTransformInfo::getPartialReductionExtendKind(ExtR->getOpcode()),
          TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
          RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
                                     : std::nullopt);
    else if (!RedTy->isFloatingPointTy())
      // TTI::getExtendedReductionCost only supports integer types.
      return Ctx.TTI.getExtendedReductionCost(
          Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy, SrcVecTy,
          std::nullopt, Ctx.CostKind);
    else
      return InstructionCost::getInvalid();
  }
  case ExpressionTypes::MulAccReduction:
    return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
                                          Ctx.CostKind);

  case ExpressionTypes::ExtNegatedMulAccReduction:
    assert(Opcode == Instruction::Add && "Unexpected opcode");
    Opcode = Instruction::Sub;
    [[fallthrough]];
  case ExpressionTypes::ExtMulAccReduction: {
    auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
    if (RedR->isPartialReduction()) {
      auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
      auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
      auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
      return Ctx.TTI.getPartialReductionCost(
          Opcode, Ctx.Types.inferScalarType(getOperand(0)),
          Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
          TargetTransformInfo::getPartialReductionExtendKind(
              Ext0R->getOpcode()),
          TargetTransformInfo::getPartialReductionExtendKind(
              Ext1R->getOpcode()),
          Mul->getOpcode(), Ctx.CostKind,
          RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
                                     : std::nullopt);
    }
    return Ctx.TTI.getMulAccReductionCost(
        cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
            Instruction::ZExt,
        Opcode, RedTy, SrcVecTy, Ctx.CostKind);
  }
  }
  llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
}

bool VPExpressionRecipe::mayReadOrWriteMemory() const {
  return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
    return R->mayReadFromMemory() || R->mayWriteToMemory();
  });
}

bool VPExpressionRecipe::mayHaveSideEffects() const {
  assert(
      none_of(ExpressionRecipes,
              [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
      "expression cannot contain recipes with side-effects");
  return false;
}

bool VPExpressionRecipe::isSingleScalar() const {
  // Cannot use vputils::isSingleScalar(), because all external operands
  // of the expression will be live-ins while bundled.
  auto *RR = dyn_cast<VPReductionRecipe>(ExpressionRecipes.back());
  return RR && !RR->isPartialReduction();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "EXPRESSION ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = ";
|
|
auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
|
|
unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
|
|
|
|
switch (ExpressionType) {
|
|
case ExpressionTypes::ExtendedReduction: {
|
|
getOperand(1)->printAsOperand(O, SlotTracker);
|
|
O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
|
|
O << Instruction::getOpcodeName(Opcode) << " (";
|
|
getOperand(0)->printAsOperand(O, SlotTracker);
|
|
Red->printFlags(O);
|
|
|
|
auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
|
|
O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
|
|
<< *Ext0->getResultType();
|
|
if (Red->isConditional()) {
|
|
O << ", ";
|
|
Red->getCondOp()->printAsOperand(O, SlotTracker);
|
|
}
|
|
O << ")";
|
|
break;
|
|
}
|
|
case ExpressionTypes::ExtNegatedMulAccReduction: {
|
|
getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
|
|
O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
|
|
O << Instruction::getOpcodeName(
|
|
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
|
|
<< " (sub (0, mul";
|
|
auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
|
|
Mul->printFlags(O);
|
|
O << "(";
|
|
getOperand(0)->printAsOperand(O, SlotTracker);
|
|
auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
|
|
O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
|
|
<< *Ext0->getResultType() << "), (";
|
|
getOperand(1)->printAsOperand(O, SlotTracker);
|
|
auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
|
|
O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
|
|
<< *Ext1->getResultType() << ")";
|
|
if (Red->isConditional()) {
|
|
O << ", ";
|
|
Red->getCondOp()->printAsOperand(O, SlotTracker);
|
|
}
|
|
O << "))";
|
|
break;
|
|
}
|
|
case ExpressionTypes::MulAccReduction:
|
|
case ExpressionTypes::ExtMulAccReduction: {
|
|
getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
|
|
O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
|
|
O << Instruction::getOpcodeName(
|
|
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
|
|
<< " (";
|
|
O << "mul";
|
|
bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
|
|
auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
|
|
: ExpressionRecipes[0]);
|
|
Mul->printFlags(O);
|
|
if (IsExtended)
|
|
O << "(";
|
|
getOperand(0)->printAsOperand(O, SlotTracker);
|
|
if (IsExtended) {
|
|
auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
|
|
O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
|
|
<< *Ext0->getResultType() << "), (";
|
|
} else {
|
|
O << ", ";
|
|
}
|
|
getOperand(1)->printAsOperand(O, SlotTracker);
|
|
if (IsExtended) {
|
|
auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
|
|
O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
|
|
<< *Ext1->getResultType() << ")";
|
|
}
|
|
if (Red->isConditional()) {
|
|
O << ", ";
|
|
Red->getCondOp()->printAsOperand(O, SlotTracker);
|
|
}
|
|
O << ")";
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
void VPReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
if (isPartialReduction())
|
|
O << Indent << "PARTIAL-REDUCE ";
|
|
else
|
|
O << Indent << "REDUCE ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = ";
|
|
getChainOp()->printAsOperand(O, SlotTracker);
|
|
O << " +";
|
|
printFlags(O);
|
|
O << " reduce."
|
|
<< Instruction::getOpcodeName(
|
|
RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
|
|
<< " (";
|
|
getVecOp()->printAsOperand(O, SlotTracker);
|
|
if (isConditional()) {
|
|
O << ", ";
|
|
getCondOp()->printAsOperand(O, SlotTracker);
|
|
}
|
|
O << ")";
|
|
}
|
|
|
|
void VPReductionEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "REDUCE ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = ";
|
|
getChainOp()->printAsOperand(O, SlotTracker);
|
|
O << " +";
|
|
printFlags(O);
|
|
O << " vp.reduce."
|
|
<< Instruction::getOpcodeName(
|
|
RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
|
|
<< " (";
|
|
getVecOp()->printAsOperand(O, SlotTracker);
|
|
O << ", ";
|
|
getEVL()->printAsOperand(O, SlotTracker);
|
|
if (isConditional()) {
|
|
O << ", ";
|
|
getCondOp()->printAsOperand(O, SlotTracker);
|
|
}
|
|
O << ")";
|
|
}
|
|
|
|
#endif
|
|
|
|
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
/// operands from \p RepRecipe instead of \p Instr's operands.
static void scalarizeInstruction(const Instruction *Instr,
                                 VPReplicateRecipe *RepRecipe,
                                 const VPLane &Lane, VPTransformState &State) {
  assert((!Instr->getType()->isAggregateType() ||
          canVectorizeTy(Instr->getType())) &&
         "Expected vectorizable or non-aggregate type.");

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy) {
    Cloned->setName(Instr->getName() + ".cloned");
    Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe);
    // The operands of the replicate recipe may have been narrowed, resulting
    // in a narrower result type. Update the type of the cloned instruction to
    // the correct type.
    if (ResultTy != Cloned->getType())
      Cloned->mutateType(ResultTy);
  }

  RepRecipe->applyFlags(*Cloned);
  RepRecipe->applyMetadata(*Cloned);

  if (RepRecipe->hasPredicate())
    cast<CmpInst>(Cloned)->setPredicate(RepRecipe->getPredicate());

  if (auto DL = RepRecipe->getDebugLoc())
    State.setDebugLocFrom(DL);

  // Replace the operands of the cloned instruction with their scalar
  // equivalents in the new loop.
  for (const auto &I : enumerate(RepRecipe->operands())) {
    auto InputLane = Lane;
    VPValue *Operand = I.value();
    if (vputils::isSingleScalar(Operand))
      InputLane = VPLane::getFirstLane();
    Cloned->setOperand(I.index(), State.get(Operand, InputLane));
  }

  // Place the cloned scalar in the new loop.
  State.Builder.Insert(Cloned);

  State.set(RepRecipe, Cloned, Lane);

  // If we just cloned a new assumption, add it to the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Cloned))
    State.AC->registerAssumption(II);

  assert(
      (RepRecipe->getRegion() ||
       !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
       all_of(RepRecipe->operands(),
              [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
      "Expected a recipe is either within a region or all of its operands "
      "are defined outside the vectorized region.");
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  Instruction *UI = getUnderlyingInstr();

  if (!State.Lane) {
    assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
                             "must have already been unrolled");
    scalarizeInstruction(UI, this, VPLane(0), State);
    return;
  }

  assert((State.VF.isScalar() || !isSingleScalar()) &&
         "uniform recipe shouldn't be predicated");
  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
  scalarizeInstruction(UI, this, *State.Lane, State);
  // Insert scalar instance packing it into a vector.
  if (State.VF.isVector() && shouldPack()) {
    Value *WideValue =
        State.Lane->isFirstLane()
            ? PoisonValue::get(toVectorizedTy(UI->getType(), State.VF))
            : State.get(this);
    State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
                                                        *State.Lane));
  }
}

bool VPReplicateRecipe::shouldPack() const {
  // Find if the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return !vputils::onlyScalarValuesUsed(PredR);
    return false;
  });
}

/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
/// which the legacy cost model computes a SCEV expression when computing the
/// address cost. Computing SCEVs for VPValues is incomplete and returns
/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
                                        PredicatedScalarEvolution &PSE,
                                        const Loop *L) {
  const SCEV *Addr = vputils::getSCEVExprForVPValue(Ptr, PSE, L);
  if (isa<SCEVCouldNotCompute>(Addr))
    return Addr;

  return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), L) ? Addr : nullptr;
}

/// Returns true if \p V is used as part of the address of another load or
/// store.
static bool isUsedByLoadStoreAddress(const VPUser *V) {
  SmallPtrSet<const VPUser *, 4> Seen;
  SmallVector<const VPUser *> WorkList = {V};

  while (!WorkList.empty()) {
    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
    if (!Cur || !Seen.insert(Cur).second)
      continue;

    auto *Blend = dyn_cast<VPBlendRecipe>(Cur);
    // Skip blends that use V only through a compare by checking if any
    // incoming value was already visited.
    if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()),
                         [&](unsigned I) {
                           return Seen.contains(
                               Blend->getIncomingValue(I)->getDefiningRecipe());
                         }))
      continue;

    for (VPUser *U : Cur->users()) {
      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
        if (InterleaveR->getAddr() == Cur)
          return true;
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
        if (RepR->getOpcode() == Instruction::Load &&
            RepR->getOperand(0) == Cur)
          return true;
        if (RepR->getOpcode() == Instruction::Store &&
            RepR->getOperand(1) == Cur)
          return true;
      }
      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
        if (MemR->getAddr() == Cur && MemR->isConsecutive())
          return true;
      }
    }

    // The legacy cost model only supports scalarized loads/stores with phi
    // addresses, if the phi is directly used as load/store address. Don't
    // traverse further for Blends.
    if (Blend)
      continue;

    append_range(WorkList, Cur->users());
  }
  return false;
}

/// Return true if \p R is a predicated load/store with a loop-invariant
/// address only masked by the header mask.
static bool isPredicatedUniformMemOpAfterTailFolding(const VPReplicateRecipe &R,
                                                     const SCEV *PtrSCEV,
                                                     VPCostContext &Ctx) {
  const VPRegionBlock *ParentRegion = R.getRegion();
  if (!ParentRegion || !ParentRegion->isReplicator() || !PtrSCEV ||
      !Ctx.PSE.getSE()->isLoopInvariant(PtrSCEV, Ctx.L))
    return false;
  auto *BOM =
      cast<VPBranchOnMaskRecipe>(&ParentRegion->getEntryBasicBlock()->front());
  return vputils::isHeaderMask(BOM->getOperand(0), *ParentRegion->getPlan());
}

InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
Instruction *UI = cast<Instruction>(getUnderlyingValue());
|
|
  // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
  // transform; avoid computing its cost multiple times for now.
  Ctx.SkipCostComputation.insert(UI);
|
|
|
|
if (VF.isScalable() && !isSingleScalar())
|
|
return InstructionCost::getInvalid();
|
|
|
|
switch (UI->getOpcode()) {
|
|
case Instruction::Alloca:
|
|
if (VF.isScalable())
|
|
return InstructionCost::getInvalid();
|
|
return Ctx.TTI.getArithmeticInstrCost(
|
|
Instruction::Mul, Ctx.Types.inferScalarType(this), Ctx.CostKind);
|
|
case Instruction::GetElementPtr:
|
|
// We mark this instruction as zero-cost because the cost of GEPs in
|
|
// vectorized code depends on whether the corresponding memory instruction
|
|
// is scalarized or not. Therefore, we handle GEPs with the memory
|
|
// instruction cost.
|
|
return 0;
|
|
case Instruction::Call: {
|
|
auto *CalledFn =
|
|
cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
|
|
|
|
SmallVector<const VPValue *> ArgOps(drop_end(operands()));
|
|
SmallVector<Type *, 4> Tys;
|
|
for (const VPValue *ArgOp : ArgOps)
|
|
Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
|
|
|
|
if (CalledFn->isIntrinsic())
|
|
// Various pseudo-intrinsics with costs of 0 are scalarized instead of
|
|
// vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
|
|
switch (CalledFn->getIntrinsicID()) {
|
|
case Intrinsic::assume:
|
|
case Intrinsic::lifetime_end:
|
|
case Intrinsic::lifetime_start:
|
|
case Intrinsic::sideeffect:
|
|
case Intrinsic::pseudoprobe:
|
|
case Intrinsic::experimental_noalias_scope_decl: {
|
|
assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
|
|
ElementCount::getFixed(1), Ctx) == 0 &&
|
|
"scalarizing intrinsic should be free");
|
|
return InstructionCost(0);
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
Type *ResultTy = Ctx.Types.inferScalarType(this);
|
|
InstructionCost ScalarCallCost =
|
|
Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
|
|
if (isSingleScalar()) {
|
|
if (CalledFn->isIntrinsic())
|
|
ScalarCallCost = std::min(
|
|
ScalarCallCost,
|
|
getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
|
|
ElementCount::getFixed(1), Ctx));
|
|
return ScalarCallCost;
|
|
}
|
|
|
|
return ScalarCallCost * VF.getFixedValue() +
|
|
Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
|
|
}
|
|
case Instruction::Add:
|
|
case Instruction::Sub:
|
|
case Instruction::FAdd:
|
|
case Instruction::FSub:
|
|
case Instruction::Mul:
|
|
case Instruction::FMul:
|
|
case Instruction::FDiv:
|
|
case Instruction::FRem:
|
|
case Instruction::Shl:
|
|
case Instruction::LShr:
|
|
case Instruction::AShr:
|
|
case Instruction::And:
|
|
case Instruction::Or:
|
|
case Instruction::Xor:
|
|
case Instruction::ICmp:
|
|
case Instruction::FCmp:
|
|
return getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1),
|
|
Ctx) *
|
|
(isSingleScalar() ? 1 : VF.getFixedValue());
|
|
case Instruction::SDiv:
|
|
case Instruction::UDiv:
|
|
case Instruction::SRem:
|
|
case Instruction::URem: {
|
|
InstructionCost ScalarCost =
|
|
getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), Ctx);
|
|
if (isSingleScalar())
|
|
return ScalarCost;
|
|
|
|
// If any of the operands is from a different replicate region and has its
|
|
// cost skipped, it may have been forced to scalar. Fall back to legacy cost
|
|
// model to avoid cost mis-match.
|
|
if (any_of(operands(), [&Ctx, VF](VPValue *Op) {
|
|
auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Op);
|
|
if (!PredR)
|
|
return false;
|
|
return Ctx.skipCostComputation(
|
|
dyn_cast_or_null<Instruction>(
|
|
PredR->getOperand(0)->getUnderlyingValue()),
|
|
VF.isVector());
|
|
}))
|
|
break;
|
|
|
|
ScalarCost = ScalarCost * VF.getFixedValue() +
|
|
Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this),
|
|
to_vector(operands()), VF);
|
|
// If the recipe is not predicated (i.e. not in a replicate region), return
|
|
// the scalar cost. Otherwise handle predicated cost.
|
|
if (!getRegion()->isReplicator())
|
|
return ScalarCost;
|
|
|
|
// Account for the phi nodes that we will create.
|
|
ScalarCost += VF.getFixedValue() *
|
|
Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
|
|
// Scale the cost by the probability of executing the predicated blocks.
|
|
// This assumes the predicated block for each vector lane is equally
|
|
// likely.
|
|
ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent());
|
|
return ScalarCost;
|
|
}
|
|
case Instruction::Load:
|
|
case Instruction::Store: {
|
|
bool IsLoad = UI->getOpcode() == Instruction::Load;
|
|
const VPValue *PtrOp = getOperand(!IsLoad);
|
|
const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.PSE, Ctx.L);
|
|
if (isa_and_nonnull<SCEVCouldNotCompute>(PtrSCEV))
|
|
break;
|
|
|
|
Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
|
|
Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
|
|
const Align Alignment = getLoadStoreAlignment(UI);
|
|
unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
|
|
TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
|
|
bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
|
|
bool UsedByLoadStoreAddress =
|
|
!PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
|
|
InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
|
|
UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo,
|
|
UsedByLoadStoreAddress ? UI : nullptr);
|
|
|
|
// Check if this is a predicated load/store with a loop-invariant address
|
|
// only masked by the header mask. If so, return the uniform mem op cost.
|
|
if (isPredicatedUniformMemOpAfterTailFolding(*this, PtrSCEV, Ctx)) {
|
|
InstructionCost UniformCost =
|
|
ScalarMemOpCost +
|
|
Ctx.TTI.getAddressComputationCost(ScalarPtrTy, /*SE=*/nullptr,
|
|
/*Ptr=*/nullptr, Ctx.CostKind);
|
|
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
|
|
if (IsLoad) {
|
|
return UniformCost +
|
|
Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
|
|
VectorTy, VectorTy, {}, Ctx.CostKind);
|
|
}
|
|
|
|
VPValue *StoredVal = getOperand(0);
|
|
if (!StoredVal->isDefinedOutsideLoopRegions())
|
|
UniformCost += Ctx.TTI.getIndexedVectorInstrCostFromEnd(
|
|
Instruction::ExtractElement, VectorTy, Ctx.CostKind, 0);
|
|
return UniformCost;
|
|
}
|
|
|
|
Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
|
|
InstructionCost ScalarCost =
|
|
ScalarMemOpCost +
|
|
Ctx.TTI.getAddressComputationCost(
|
|
PtrTy, UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), PtrSCEV,
|
|
Ctx.CostKind);
|
|
if (isSingleScalar())
|
|
return ScalarCost;
|
|
|
|
SmallVector<const VPValue *> OpsToScalarize;
|
|
Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
|
|
// Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
|
|
// don't assign scalarization overhead in general, if the target prefers
|
|
// vectorized addressing or the loaded value is used as part of an address
|
|
// of another load or store.
|
|
if (!UsedByLoadStoreAddress) {
|
|
bool EfficientVectorLoadStore =
|
|
Ctx.TTI.supportsEfficientVectorElementLoadStore();
|
|
if (!(IsLoad && !PreferVectorizedAddressing) &&
|
|
!(!IsLoad && EfficientVectorLoadStore))
|
|
append_range(OpsToScalarize, operands());
|
|
|
|
if (!EfficientVectorLoadStore)
|
|
ResultTy = Ctx.Types.inferScalarType(this);
|
|
}
|
|
|
|
TTI::VectorInstrContext VIC =
|
|
IsLoad ? TTI::VectorInstrContext::Load : TTI::VectorInstrContext::Store;
|
|
InstructionCost Cost =
|
|
(ScalarCost * VF.getFixedValue()) +
|
|
Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, VIC, true);
|
|
|
|
const VPRegionBlock *ParentRegion = getRegion();
|
|
if (ParentRegion && ParentRegion->isReplicator()) {
|
|
if (!PtrSCEV)
|
|
break;
|
|
Cost /= Ctx.getPredBlockCostDivisor(UI->getParent());
|
|
Cost += Ctx.TTI.getCFInstrCost(Instruction::CondBr, Ctx.CostKind);
|
|
|
|
auto *VecI1Ty = VectorType::get(
|
|
IntegerType::getInt1Ty(Ctx.L->getHeader()->getContext()), VF);
|
|
Cost += Ctx.TTI.getScalarizationOverhead(
|
|
VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
|
|
/*Insert=*/false, /*Extract=*/true, Ctx.CostKind);
|
|
|
|
if (Ctx.useEmulatedMaskMemRefHack(this, VF)) {
|
|
// Artificially setting to a high enough value to practically disable
|
|
// vectorization with such operations.
|
|
return 3000000;
|
|
}
|
|
}
|
|
return Cost;
|
|
}
|
|
case Instruction::SExt:
|
|
case Instruction::ZExt:
|
|
case Instruction::FPToUI:
|
|
case Instruction::FPToSI:
|
|
case Instruction::FPExt:
|
|
case Instruction::PtrToInt:
|
|
case Instruction::PtrToAddr:
|
|
case Instruction::IntToPtr:
|
|
case Instruction::SIToFP:
|
|
case Instruction::UIToFP:
|
|
case Instruction::Trunc:
|
|
case Instruction::FPTrunc:
|
|
case Instruction::Select:
|
|
case Instruction::AddrSpaceCast: {
|
|
return getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1),
|
|
Ctx) *
|
|
(isSingleScalar() ? 1 : VF.getFixedValue());
|
|
}
|
|
case Instruction::ExtractValue:
|
|
case Instruction::InsertValue:
|
|
return Ctx.TTI.getInsertExtractValueCost(getOpcode(), Ctx.CostKind);
|
|
}
|
|
|
|
return Ctx.getLegacyCost(UI, VF);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPReplicateRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
|
|
|
|
if (!getUnderlyingInstr()->getType()->isVoidTy()) {
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = ";
|
|
}
|
|
if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
|
|
O << "call";
|
|
printFlags(O);
|
|
O << "@" << CB->getCalledFunction()->getName() << "(";
|
|
interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
|
|
O, [&O, &SlotTracker](VPValue *Op) {
|
|
Op->printAsOperand(O, SlotTracker);
|
|
});
|
|
O << ")";
|
|
} else {
|
|
O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
|
|
printFlags(O);
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
|
|
if (shouldPack())
|
|
O << " (S->V)";
|
|
}
|
|
#endif
|
|
|
|
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
|
|
assert(State.Lane && "Branch on Mask works only on single instance.");
|
|
|
|
VPValue *BlockInMask = getOperand(0);
|
|
Value *ConditionBit = State.get(BlockInMask, *State.Lane);
|
|
|
|
// Replace the temporary unreachable terminator with a new conditional branch,
|
|
// whose two destinations will be set later when they are created.
|
|
auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
|
|
assert(isa<UnreachableInst>(CurrentTerminator) &&
|
|
"Expected to replace unreachable terminator with conditional branch.");
|
|
auto CondBr =
|
|
State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr);
|
|
CondBr->setSuccessor(0, nullptr);
|
|
CurrentTerminator->eraseFromParent();
|
|
}
|
|
|
|
InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
// The legacy cost model doesn't assign costs to branches for individual
|
|
// replicate regions. Match the current behavior in the VPlan cost model for
|
|
// now.
|
|
return 0;
|
|
}
|
|
|
|
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
|
|
assert(State.Lane && "Predicated instruction PHI works per instance.");
|
|
Instruction *ScalarPredInst =
|
|
cast<Instruction>(State.get(getOperand(0), *State.Lane));
|
|
BasicBlock *PredicatedBB = ScalarPredInst->getParent();
|
|
BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
|
|
assert(PredicatingBB && "Predicated block has no single predecessor.");
|
|
assert(isa<VPReplicateRecipe>(getOperand(0)) &&
|
|
"operand must be VPReplicateRecipe");
|
|
|
|
// By current pack/unpack logic we need to generate only a single phi node: if
|
|
// a vector value for the predicated instruction exists at this point it means
|
|
// the instruction has vector users only, and a phi for the vector value is
|
|
// needed. In this case the recipe of the predicated instruction is marked to
|
|
// also do that packing, thereby "hoisting" the insert-element sequence.
|
|
// Otherwise, a phi node for the scalar value is needed.
|
|
if (State.hasVectorValue(getOperand(0))) {
|
|
auto *VecI = cast<Instruction>(State.get(getOperand(0)));
|
|
assert((isa<InsertElementInst, InsertValueInst>(VecI)) &&
|
|
"Packed operands must generate an insertelement or insertvalue");
|
|
|
|
// If VectorI is a struct, it will be a sequence like:
|
|
// %1 = insertvalue %unmodified, %x, 0
|
|
// %2 = insertvalue %1, %y, 1
|
|
// %VectorI = insertvalue %2, %z, 2
|
|
// To get the unmodified vector we need to look through the chain.
|
|
if (auto *StructTy = dyn_cast<StructType>(VecI->getType()))
|
|
for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
|
|
VecI = cast<InsertValueInst>(VecI->getOperand(0));
|
|
|
|
PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2);
|
|
VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector.
|
|
VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element.
|
|
if (State.hasVectorValue(this))
|
|
State.reset(this, VPhi);
|
|
else
|
|
State.set(this, VPhi);
|
|
// NOTE: Currently we need to update the value of the operand, so the next
|
|
// predicated iteration inserts its generated value in the correct vector.
|
|
State.reset(getOperand(0), VPhi);
|
|
} else {
|
|
if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
|
|
return;
|
|
|
|
Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
|
|
PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
|
|
Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
|
|
PredicatingBB);
|
|
Phi->addIncoming(ScalarPredInst, PredicatedBB);
|
|
if (State.hasScalarValue(this, *State.Lane))
|
|
State.reset(this, Phi, *State.Lane);
|
|
else
|
|
State.set(this, Phi, *State.Lane);
|
|
// NOTE: Currently we need to update the value of the operand, so the next
|
|
// predicated iteration inserts its generated value in the correct vector.
|
|
State.reset(getOperand(0), Phi, *State.Lane);
|
|
}
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPPredInstPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "PHI-PREDICATED-INSTRUCTION ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = ";
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
|
|
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
|
|
->getAddressSpace();
|
|
unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
|
|
? Instruction::Load
|
|
: Instruction::Store;
|
|
|
|
if (!Consecutive) {
|
|
// TODO: Using the original IR may not be accurate.
|
|
// Currently, ARM will use the underlying IR to calculate gather/scatter
|
|
// instruction cost.
|
|
[[maybe_unused]] auto IsReverseMask = [this]() {
|
|
VPValue *Mask = getMask();
|
|
if (!Mask)
|
|
return false;
|
|
|
|
if (isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(this))
|
|
return match(Mask, m_Intrinsic<Intrinsic::experimental_vp_reverse>());
|
|
|
|
return match(Mask, m_Reverse(m_VPValue()));
|
|
};
|
|
assert(!IsReverseMask() &&
|
|
"Inconsecutive memory access should not have reverse order");
|
|
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
|
|
Type *PtrTy = Ptr->getType();
|
|
|
|
// If the address value is uniform across all lanes, then the address can be
|
|
// calculated with scalar type and broadcast.
|
|
if (!vputils::isSingleScalar(getAddr()))
|
|
PtrTy = toVectorTy(PtrTy, VF);
|
|
|
|
unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_gather
|
|
: isa<VPWidenStoreRecipe>(this) ? Intrinsic::masked_scatter
|
|
: isa<VPWidenLoadEVLRecipe>(this) ? Intrinsic::vp_gather
|
|
: Intrinsic::vp_scatter;
|
|
return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
|
|
Ctx.CostKind) +
|
|
Ctx.TTI.getMemIntrinsicInstrCost(
|
|
MemIntrinsicCostAttributes(IID, Ty, Ptr, IsMasked, Alignment,
|
|
&Ingredient),
|
|
Ctx.CostKind);
|
|
}
|
|
|
|
InstructionCost Cost = 0;
|
|
if (IsMasked) {
|
|
unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_load
|
|
: Intrinsic::masked_store;
|
|
Cost += Ctx.TTI.getMemIntrinsicInstrCost(
|
|
MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), Ctx.CostKind);
|
|
} else {
|
|
TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
|
|
isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) ? getOperand(0)
|
|
: getOperand(1));
|
|
Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
|
|
OpInfo, &Ingredient);
|
|
}
|
|
return Cost;
|
|
}
|
|
|
|
void VPWidenLoadRecipe::execute(VPTransformState &State) {
|
|
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
|
|
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
|
|
bool CreateGather = !isConsecutive();
|
|
|
|
auto &Builder = State.Builder;
|
|
Value *Mask = nullptr;
|
|
if (auto *VPMask = getMask())
|
|
Mask = State.get(VPMask);
|
|
|
|
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
|
|
Value *NewLI;
|
|
if (CreateGather) {
|
|
NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
|
|
"wide.masked.gather");
|
|
} else if (Mask) {
|
|
NewLI =
|
|
Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
|
|
PoisonValue::get(DataTy), "wide.masked.load");
|
|
} else {
|
|
NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
|
|
}
|
|
applyMetadata(*cast<Instruction>(NewLI));
|
|
State.set(this, NewLI);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = load ";
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
|
|
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
|
|
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
|
|
bool CreateGather = !isConsecutive();
|
|
|
|
auto &Builder = State.Builder;
|
|
CallInst *NewLI;
|
|
Value *EVL = State.get(getEVL(), VPLane(0));
|
|
Value *Addr = State.get(getAddr(), !CreateGather);
|
|
Value *Mask = nullptr;
|
|
if (VPValue *VPMask = getMask())
|
|
Mask = State.get(VPMask);
|
|
else
|
|
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
|
|
|
|
if (CreateGather) {
|
|
NewLI =
|
|
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
|
|
nullptr, "wide.masked.gather");
|
|
} else {
|
|
NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
|
|
{Addr, Mask, EVL}, nullptr, "vp.op.load");
|
|
}
|
|
NewLI->addParamAttr(
|
|
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
|
|
applyMetadata(*NewLI);
|
|
Instruction *Res = NewLI;
|
|
State.set(this, Res);
|
|
}
|
|
|
|
InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes use EVL to replace the tail mask, while the
  // legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once we
  // no longer need to compare against the legacy cost model.
  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
                    ->getAddressSpace();
  return Ctx.TTI.getMemIntrinsicInstrCost(
      MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
      Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = vp.load ";
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
void VPWidenStoreRecipe::execute(VPTransformState &State) {
|
|
VPValue *StoredVPValue = getStoredValue();
|
|
bool CreateScatter = !isConsecutive();
|
|
|
|
auto &Builder = State.Builder;
|
|
|
|
Value *Mask = nullptr;
|
|
if (auto *VPMask = getMask())
|
|
Mask = State.get(VPMask);
|
|
|
|
Value *StoredVal = State.get(StoredVPValue);
|
|
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
|
|
Instruction *NewSI = nullptr;
|
|
if (CreateScatter)
|
|
NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
|
|
else if (Mask)
|
|
NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
|
|
else
|
|
NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
|
|
applyMetadata(*NewSI);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenStoreRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN store ";
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
|
|
VPValue *StoredValue = getStoredValue();
|
|
bool CreateScatter = !isConsecutive();
|
|
|
|
auto &Builder = State.Builder;
|
|
|
|
CallInst *NewSI = nullptr;
|
|
Value *StoredVal = State.get(StoredValue);
|
|
Value *EVL = State.get(getEVL(), VPLane(0));
|
|
Value *Mask = nullptr;
|
|
if (VPValue *VPMask = getMask())
|
|
Mask = State.get(VPMask);
|
|
else
|
|
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
|
|
|
|
Value *Addr = State.get(getAddr(), !CreateScatter);
|
|
if (CreateScatter) {
|
|
NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
|
|
Intrinsic::vp_scatter,
|
|
{StoredVal, Addr, Mask, EVL});
|
|
} else {
|
|
NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
|
|
Intrinsic::vp_store,
|
|
{StoredVal, Addr, Mask, EVL});
|
|
}
|
|
NewSI->addParamAttr(
|
|
1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
|
|
applyMetadata(*NewSI);
|
|
}
|
|
|
|
InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
                                                   VPCostContext &Ctx) const {
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes use EVL to replace the tail mask, while the
  // legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once we
  // no longer need to compare against the legacy cost model.
  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
                    ->getAddressSpace();
  return Ctx.TTI.getMemIntrinsicInstrCost(
      MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
      Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenStoreEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN vp.store ";
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
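
/// Cast \p V, which must be a vector with the same number of elements and the
/// same element bit width as \p DstVTy, to \p DstVTy element-wise. If the
/// element types are not directly castable, cast through an intermediate
/// integer vector of the same element width.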
static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
                                     VectorType *DstVTy, const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  auto VF = DstVTy->getElementCount();
  auto *SrcVecTy = cast<VectorType>(V->getType());
  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstVTy);
  }
  // V cannot be directly casted to desired vector type.
  // May happen when V is a floating point vector but DstVTy is a vector of
  // pointers or vice-versa. Handle this using a two-step bitcast using an
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  auto *VecIntTy = VectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}

/// Return a vector containing interleaved elements from multiple
|
|
/// smaller input vectors.
|
|
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
|
|
const Twine &Name) {
|
|
unsigned Factor = Vals.size();
|
|
assert(Factor > 1 && "Tried to interleave invalid number of vectors");
|
|
|
|
VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
|
|
#ifndef NDEBUG
|
|
for (Value *Val : Vals)
|
|
assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
|
|
#endif
|
|
|
|
// Scalable vectors cannot use arbitrary shufflevectors (only splats), so
|
|
// must use intrinsics to interleave.
|
|
if (VecTy->isScalableTy()) {
|
|
assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
|
|
return Builder.CreateVectorInterleave(Vals, Name);
|
|
}
|
|
|
|
// Fixed length. Start by concatenating all vectors into a wide vector.
|
|
Value *WideVec = concatenateVectors(Builder, Vals);
|
|
|
|
// Interleave the elements into the wide vector.
|
|
const unsigned NumElts = VecTy->getElementCount().getFixedValue();
|
|
return Builder.CreateShuffleVector(
|
|
WideVec, createInterleaveMask(NumElts, Factor), Name);
|
|
}
|
|
|
|
// Try to vectorize the interleave group that \p Instr belongs to.
|
|
//
|
|
// E.g. Translate following interleaved load group (factor = 3):
|
|
// for (i = 0; i < N; i+=3) {
|
|
// R = Pic[i]; // Member of index 0
|
|
// G = Pic[i+1]; // Member of index 1
|
|
// B = Pic[i+2]; // Member of index 2
|
|
// ... // do something to R, G, B
|
|
// }
|
|
// To:
|
|
// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
|
|
// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
|
|
// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
|
|
// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
|
|
//
|
|
// Or translate following interleaved store group (factor = 3):
|
|
// for (i = 0; i < N; i+=3) {
|
|
// ... do something to R, G, B
|
|
// Pic[i] = R; // Member of index 0
|
|
// Pic[i+1] = G; // Member of index 1
|
|
// Pic[i+2] = B; // Member of index 2
|
|
// }
|
|
// To:
|
|
// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
|
|
// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
|
|
// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
|
|
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
|
|
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
|
|
void VPInterleaveRecipe::execute(VPTransformState &State) {
|
|
assert(!State.Lane && "Interleave group being replicated.");
|
|
assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
|
|
"Masking gaps for scalable vectors is not yet supported.");
|
|
const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
|
|
Instruction *Instr = Group->getInsertPos();
|
|
|
|
// Prepare for the vector type of the interleaved load/store.
|
|
Type *ScalarTy = getLoadStoreType(Instr);
|
|
unsigned InterleaveFactor = Group->getFactor();
|
|
auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
|
|
|
|
VPValue *BlockInMask = getMask();
|
|
VPValue *Addr = getAddr();
|
|
Value *ResAddr = State.get(Addr, VPLane(0));
|
|
|
|
auto CreateGroupMask = [&BlockInMask, &State,
|
|
&InterleaveFactor](Value *MaskForGaps) -> Value * {
|
|
if (State.VF.isScalable()) {
|
|
assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
|
|
assert(InterleaveFactor <= 8 &&
|
|
"Unsupported deinterleave factor for scalable vectors");
|
|
auto *ResBlockInMask = State.get(BlockInMask);
|
|
SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
|
|
return interleaveVectors(State.Builder, Ops, "interleaved.mask");
|
|
}
|
|
|
|
if (!BlockInMask)
|
|
return MaskForGaps;
|
|
|
|
Value *ResBlockInMask = State.get(BlockInMask);
|
|
Value *ShuffledMask = State.Builder.CreateShuffleVector(
|
|
ResBlockInMask,
|
|
createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
|
|
"interleaved.mask");
|
|
return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
|
|
ShuffledMask, MaskForGaps)
|
|
: ShuffledMask;
|
|
};
|
|
|
|
const DataLayout &DL = Instr->getDataLayout();
|
|
// Vectorize the interleaved load group.
|
|
if (isa<LoadInst>(Instr)) {
|
|
Value *MaskForGaps = nullptr;
|
|
if (needsMaskForGaps()) {
|
|
MaskForGaps =
|
|
createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
|
|
assert(MaskForGaps && "Mask for Gaps is required but it is null");
|
|
}
|
|
|
|
Instruction *NewLoad;
|
|
if (BlockInMask || MaskForGaps) {
|
|
Value *GroupMask = CreateGroupMask(MaskForGaps);
|
|
Value *PoisonVec = PoisonValue::get(VecTy);
|
|
NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
|
|
Group->getAlign(), GroupMask,
|
|
PoisonVec, "wide.masked.vec");
|
|
} else
|
|
NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
|
|
Group->getAlign(), "wide.vec");
|
|
applyMetadata(*NewLoad);
|
|
// TODO: Also manage existing metadata using VPIRMetadata.
|
|
Group->addMetadata(NewLoad);
|
|
|
|
ArrayRef<VPRecipeValue *> VPDefs = definedValues();
|
|
if (VecTy->isScalableTy()) {
|
|
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
|
|
// so must use intrinsics to deinterleave.
|
|
assert(InterleaveFactor <= 8 &&
|
|
"Unsupported deinterleave factor for scalable vectors");
|
|
NewLoad = State.Builder.CreateIntrinsic(
|
|
Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
|
|
NewLoad->getType(), NewLoad,
|
|
/*FMFSource=*/nullptr, "strided.vec");
|
|
}
|
|
|
|
auto CreateStridedVector = [&InterleaveFactor, &State,
|
|
&NewLoad](unsigned Index) -> Value * {
|
|
assert(Index < InterleaveFactor && "Illegal group index");
|
|
if (State.VF.isScalable())
|
|
return State.Builder.CreateExtractValue(NewLoad, Index);
|
|
|
|
// For fixed length VF, use shuffle to extract the sub-vectors from the
|
|
// wide load.
|
|
auto StrideMask =
|
|
createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
|
|
return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
|
|
"strided.vec");
|
|
};
|
|
|
|
for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
|
|
Instruction *Member = Group->getMember(I);
|
|
|
|
// Skip the gaps in the group.
|
|
if (!Member)
|
|
continue;
|
|
|
|
Value *StridedVec = CreateStridedVector(I);
|
|
|
|
// If this member has different type, cast the result type.
|
|
if (Member->getType() != ScalarTy) {
|
|
VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
|
|
StridedVec =
|
|
createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
|
|
}
|
|
|
|
if (Group->isReverse())
|
|
StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
|
|
|
|
State.set(VPDefs[J], StridedVec);
|
|
++J;
|
|
}
|
|
return;
|
|
}
|
|
|
|
// The sub vector type for current instruction.
|
|
auto *SubVT = VectorType::get(ScalarTy, State.VF);
|
|
|
|
// Vectorize the interleaved store group.
|
|
Value *MaskForGaps =
|
|
createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
|
|
assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
|
|
"Mismatch between NeedsMaskForGaps and MaskForGaps");
|
|
ArrayRef<VPValue *> StoredValues = getStoredValues();
|
|
// Collect the stored vector from each member.
|
|
SmallVector<Value *, 4> StoredVecs;
|
|
unsigned StoredIdx = 0;
|
|
for (unsigned i = 0; i < InterleaveFactor; i++) {
|
|
assert((Group->getMember(i) || MaskForGaps) &&
|
|
"Fail to get a member from an interleaved store group");
|
|
Instruction *Member = Group->getMember(i);
|
|
|
|
// Skip the gaps in the group.
|
|
if (!Member) {
|
|
Value *Undef = PoisonValue::get(SubVT);
|
|
StoredVecs.push_back(Undef);
|
|
continue;
|
|
}
|
|
|
|
Value *StoredVec = State.get(StoredValues[StoredIdx]);
|
|
++StoredIdx;
|
|
|
|
if (Group->isReverse())
|
|
StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
|
|
|
|
// If this member has different type, cast it to a unified type.
|
|
|
|
if (StoredVec->getType() != SubVT)
|
|
StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
|
|
|
|
StoredVecs.push_back(StoredVec);
|
|
}
|
|
|
|
// Interleave all the smaller vectors into one wider vector.
|
|
Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
|
|
Instruction *NewStoreInstr;
|
|
if (BlockInMask || MaskForGaps) {
|
|
Value *GroupMask = CreateGroupMask(MaskForGaps);
|
|
NewStoreInstr = State.Builder.CreateMaskedStore(
|
|
IVec, ResAddr, Group->getAlign(), GroupMask);
|
|
} else
|
|
NewStoreInstr =
|
|
State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
|
|
|
|
applyMetadata(*NewStoreInstr);
|
|
// TODO: Also manage existing metadata using VPIRMetadata.
|
|
Group->addMetadata(NewStoreInstr);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPInterleaveRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
|
|
O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
|
|
IG->getInsertPos()->printAsOperand(O, false);
|
|
O << ", ";
|
|
getAddr()->printAsOperand(O, SlotTracker);
|
|
VPValue *Mask = getMask();
|
|
if (Mask) {
|
|
O << ", ";
|
|
Mask->printAsOperand(O, SlotTracker);
|
|
}
|
|
|
|
unsigned OpIdx = 0;
|
|
for (unsigned i = 0; i < IG->getFactor(); ++i) {
|
|
if (!IG->getMember(i))
|
|
continue;
|
|
if (getNumStoreOperands() > 0) {
|
|
O << "\n" << Indent << " store ";
|
|
getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
|
|
O << " to index " << i;
|
|
} else {
|
|
O << "\n" << Indent << " ";
|
|
getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
|
|
O << " = load from index " << i;
|
|
}
|
|
++OpIdx;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
|
|
assert(!State.Lane && "Interleave group being replicated.");
|
|
assert(State.VF.isScalable() &&
|
|
"Only support scalable VF for EVL tail-folding.");
|
|
assert(!needsMaskForGaps() &&
|
|
"Masking gaps for scalable vectors is not yet supported.");
|
|
const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
|
|
Instruction *Instr = Group->getInsertPos();
|
|
|
|
// Prepare for the vector type of the interleaved load/store.
|
|
Type *ScalarTy = getLoadStoreType(Instr);
|
|
unsigned InterleaveFactor = Group->getFactor();
|
|
assert(InterleaveFactor <= 8 &&
|
|
"Unsupported deinterleave/interleave factor for scalable vectors");
|
|
ElementCount WideVF = State.VF * InterleaveFactor;
|
|
auto *VecTy = VectorType::get(ScalarTy, WideVF);
|
|
|
|
VPValue *Addr = getAddr();
|
|
Value *ResAddr = State.get(Addr, VPLane(0));
|
|
Value *EVL = State.get(getEVL(), VPLane(0));
|
|
Value *InterleaveEVL = State.Builder.CreateMul(
|
|
EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
|
|
/* NUW= */ true, /* NSW= */ true);
|
|
LLVMContext &Ctx = State.Builder.getContext();
|
|
|
|
Value *GroupMask = nullptr;
|
|
if (VPValue *BlockInMask = getMask()) {
|
|
SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
|
|
GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
|
|
} else {
|
|
GroupMask =
|
|
State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
|
|
}
|
|
|
|
// Vectorize the interleaved load group.
|
|
if (isa<LoadInst>(Instr)) {
|
|
CallInst *NewLoad = State.Builder.CreateIntrinsic(
|
|
VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
|
|
"wide.vp.load");
|
|
NewLoad->addParamAttr(0,
|
|
Attribute::getWithAlignment(Ctx, Group->getAlign()));
|
|
|
|
applyMetadata(*NewLoad);
|
|
// TODO: Also manage existing metadata using VPIRMetadata.
|
|
Group->addMetadata(NewLoad);
|
|
|
|
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
|
|
// so must use intrinsics to deinterleave.
|
|
NewLoad = State.Builder.CreateIntrinsic(
|
|
Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
|
|
NewLoad->getType(), NewLoad,
|
|
/*FMFSource=*/nullptr, "strided.vec");
|
|
|
|
const DataLayout &DL = Instr->getDataLayout();
|
|
for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
|
|
Instruction *Member = Group->getMember(I);
|
|
// Skip the gaps in the group.
|
|
if (!Member)
|
|
continue;
|
|
|
|
Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
|
|
// If this member has different type, cast the result type.
|
|
if (Member->getType() != ScalarTy) {
|
|
VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
|
|
StridedVec =
|
|
createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
|
|
}
|
|
|
|
State.set(getVPValue(J), StridedVec);
|
|
++J;
|
|
}
|
|
return;
|
|
} // End for interleaved load.
|
|
|
|
// The sub vector type for current instruction.
|
|
auto *SubVT = VectorType::get(ScalarTy, State.VF);
|
|
// Vectorize the interleaved store group.
|
|
ArrayRef<VPValue *> StoredValues = getStoredValues();
|
|
// Collect the stored vector from each member.
|
|
SmallVector<Value *, 4> StoredVecs;
|
|
const DataLayout &DL = Instr->getDataLayout();
|
|
for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
|
|
Instruction *Member = Group->getMember(I);
|
|
// Skip the gaps in the group.
|
|
if (!Member) {
|
|
StoredVecs.push_back(PoisonValue::get(SubVT));
|
|
continue;
|
|
}
|
|
|
|
Value *StoredVec = State.get(StoredValues[StoredIdx]);
|
|
// If this member has different type, cast it to a unified type.
|
|
if (StoredVec->getType() != SubVT)
|
|
StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
|
|
|
|
StoredVecs.push_back(StoredVec);
|
|
++StoredIdx;
|
|
}
|
|
|
|
// Interleave all the smaller vectors into one wider vector.
|
|
Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
|
|
CallInst *NewStore =
|
|
State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
|
|
{IVec, ResAddr, GroupMask, InterleaveEVL});
|
|
NewStore->addParamAttr(1,
|
|
Attribute::getWithAlignment(Ctx, Group->getAlign()));
|
|
|
|
applyMetadata(*NewStore);
|
|
// TODO: Also manage existing metadata using VPIRMetadata.
|
|
Group->addMetadata(NewStore);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPInterleaveEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
|
|
O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
|
|
IG->getInsertPos()->printAsOperand(O, false);
|
|
O << ", ";
|
|
getAddr()->printAsOperand(O, SlotTracker);
|
|
O << ", ";
|
|
getEVL()->printAsOperand(O, SlotTracker);
|
|
if (VPValue *Mask = getMask()) {
|
|
O << ", ";
|
|
Mask->printAsOperand(O, SlotTracker);
|
|
}
|
|
|
|
unsigned OpIdx = 0;
|
|
for (unsigned i = 0; i < IG->getFactor(); ++i) {
|
|
if (!IG->getMember(i))
|
|
continue;
|
|
if (getNumStoreOperands() > 0) {
|
|
O << "\n" << Indent << " vp.store ";
|
|
getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker);
|
|
O << " to index " << i;
|
|
} else {
|
|
O << "\n" << Indent << " ";
|
|
getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
|
|
O << " = vp.load from index " << i;
|
|
}
|
|
++OpIdx;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
Instruction *InsertPos = getInsertPos();
|
|
// Find the VPValue index of the interleave group. We need to skip gaps.
|
|
unsigned InsertPosIdx = 0;
|
|
for (unsigned Idx = 0; IG->getFactor(); ++Idx)
|
|
if (auto *Member = IG->getMember(Idx)) {
|
|
if (Member == InsertPos)
|
|
break;
|
|
InsertPosIdx++;
|
|
}
|
|
Type *ValTy = Ctx.Types.inferScalarType(
|
|
getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
|
|
: getStoredValues()[InsertPosIdx]);
|
|
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
|
|
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
|
|
->getAddressSpace();
|
|
|
|
unsigned InterleaveFactor = IG->getFactor();
|
|
auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
|
|
|
|
// Holds the indices of existing members in the interleaved group.
|
|
SmallVector<unsigned, 4> Indices;
|
|
for (unsigned IF = 0; IF < InterleaveFactor; IF++)
|
|
if (IG->getMember(IF))
|
|
Indices.push_back(IF);
|
|
|
|
// Calculate the cost of the whole interleaved group.
|
|
InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
|
|
InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
|
|
IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
|
|
|
|
if (!IG->isReverse())
|
|
return Cost;
|
|
|
|
return Cost + IG->getNumMembers() *
|
|
Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
|
|
VectorTy, VectorTy, {}, Ctx.CostKind,
|
|
0);
|
|
}
|
|
|
|
bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
|
|
return vputils::onlyScalarValuesUsed(this) &&
|
|
(!IsScalable || vputils::onlyFirstLaneUsed(this));
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenPointerInductionRecipe::printRecipe(
|
|
raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
|
|
assert((getNumOperands() == 3 || getNumOperands() == 5) &&
|
|
"unexpected number of operands");
|
|
O << Indent << "EMIT ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = WIDEN-POINTER-INDUCTION ";
|
|
getStartValue()->printAsOperand(O, SlotTracker);
|
|
O << ", ";
|
|
getStepValue()->printAsOperand(O, SlotTracker);
|
|
O << ", ";
|
|
getOperand(2)->printAsOperand(O, SlotTracker);
|
|
if (getNumOperands() == 5) {
|
|
O << ", ";
|
|
getOperand(3)->printAsOperand(O, SlotTracker);
|
|
O << ", ";
|
|
getOperand(4)->printAsOperand(O, SlotTracker);
|
|
}
|
|
}
|
|
|
|
void VPExpandSCEVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "EMIT ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = EXPAND SCEV " << *Expr;
|
|
}
|
|
#endif
|
|
|
|
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
|
|
Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
|
|
Type *STy = CanonicalIV->getType();
|
|
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
|
|
ElementCount VF = State.VF;
|
|
Value *VStart = VF.isScalar()
|
|
? CanonicalIV
|
|
: Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
|
|
Value *VStep = Builder.CreateElementCount(
|
|
STy, VF.multiplyCoefficientBy(getUnrollPart(*this)));
|
|
if (VF.isVector()) {
|
|
VStep = Builder.CreateVectorSplat(VF, VStep);
|
|
VStep =
|
|
Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
|
|
}
|
|
Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
|
|
State.set(this, CanonicalVectorIV);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenCanonicalIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "EMIT ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = WIDEN-CANONICAL-INDUCTION ";
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
|
|
auto &Builder = State.Builder;
|
|
// Create a vector from the initial value.
|
|
auto *VectorInit = getStartValue()->getLiveInIRValue();
|
|
|
|
Type *VecTy = State.VF.isScalar()
|
|
? VectorInit->getType()
|
|
: VectorType::get(VectorInit->getType(), State.VF);
|
|
|
|
BasicBlock *VectorPH =
|
|
State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
|
|
if (State.VF.isVector()) {
|
|
auto *IdxTy = Builder.getInt32Ty();
|
|
auto *One = ConstantInt::get(IdxTy, 1);
|
|
IRBuilder<>::InsertPointGuard Guard(Builder);
|
|
Builder.SetInsertPoint(VectorPH->getTerminator());
|
|
auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
|
|
auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
|
|
VectorInit = Builder.CreateInsertElement(
|
|
PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
|
|
}
|
|
|
|
// Create a phi node for the new recurrence.
|
|
PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
|
|
Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
|
|
Phi->addIncoming(VectorInit, VectorPH);
|
|
State.set(this, Phi);
|
|
}
|
|
|
|
InstructionCost
VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
  if (VF.isScalar())
    return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);

  return 0;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPReductionPHIRecipe::execute(VPTransformState &State) {
  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
  bool ScalarPHI = State.VF.isScalar() || isInLoop();
  Value *StartV = State.get(StartVPV, ScalarPHI);
  Type *VecTy = StartV->getType();

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
  Phi->insertBefore(HeaderBB->getFirstInsertionPt());
  State.set(this, Phi, isInLoop());

  Phi->addIncoming(StartV, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi";
  printFlags(O);
  printOperands(O, SlotTracker);
  if (getVFScaleFactor() > 1)
    O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
}
#endif

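// Only the first lane of an operand is demanded when the blend itself is only
// used for its first lane.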
bool VPBlendRecipe::usesFirstLaneOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) &&
         "Op must be an operand of the recipe");
  return vputils::onlyFirstLaneUsed(this);
}

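// Widen a phi by creating an empty vector phi of the first operand's type; the
// incoming values are expected to be filled in later, once the predecessor
// blocks have been emitted.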
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  Value *Op0 = State.get(getOperand(0));
  Type *VecTy = Op0->getType();
  Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
  State.set(this, VecPhi);
}

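// Cost the widened phi as a single PHI control-flow instruction.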
InstructionCost VPWidenPHIRecipe::computeCost(ElementCount VF,
                                              VPCostContext &Ctx) const {
  return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printPhiOperands(O, SlotTracker);
}
#endif

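// Create the header phi for the active lane mask, seeded with the mask
// computed in the preheader; the value coming from the loop latch is filled in
// later.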
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
  Value *StartMask = State.get(getOperand(0));
  PHINode *Phi =
      State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
  Phi->addIncoming(StartMask, VectorPH);
  State.set(this, Phi);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCurrentIterationPHIRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  O << Indent << "CURRENT-ITERATION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif
