//===- VPlanUtils.cpp - VPlan-related utilities ---------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "VPlanUtils.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanPatternMatch.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"

using namespace llvm;
using namespace llvm::VPlanPatternMatch;
using namespace llvm::SCEVPatternMatch;

bool vputils::onlyFirstLaneUsed(const VPValue *Def) {
  return all_of(Def->users(),
                [Def](const VPUser *U) { return U->usesFirstLaneOnly(Def); });
}

bool vputils::onlyFirstPartUsed(const VPValue *Def) {
  return all_of(Def->users(),
                [Def](const VPUser *U) { return U->usesFirstPartOnly(Def); });
}

bool vputils::onlyScalarValuesUsed(const VPValue *Def) {
  return all_of(Def->users(),
                [Def](const VPUser *U) { return U->usesScalars(Def); });
}

VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) {
  if (auto *E = dyn_cast<SCEVConstant>(Expr))
    return Plan.getOrAddLiveIn(E->getValue());
  // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction
  // value. Otherwise the value may be defined in a loop and using it directly
  // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA
  // form.
  auto *U = dyn_cast<SCEVUnknown>(Expr);
  if (U && !isa<Instruction>(U->getValue()))
    return Plan.getOrAddLiveIn(U->getValue());
  auto *Expanded = new VPExpandSCEVRecipe(Expr);
  Plan.getEntry()->appendRecipe(Expanded);
  return Expanded;
}

bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) {
  if (isa<VPActiveLaneMaskPHIRecipe>(V))
    return true;

  auto IsWideCanonicalIV = [](VPValue *A) {
    return isa<VPWidenCanonicalIVRecipe>(A) ||
           (isa<VPWidenIntOrFpInductionRecipe>(A) &&
            cast<VPWidenIntOrFpInductionRecipe>(A)->isCanonical());
  };

  VPValue *A, *B;
  auto m_CanonicalScalarIVSteps = m_ScalarIVSteps(
      m_CombineOr(m_CanonicalIV(),
                  m_DerivedIV(m_ZeroInt(), m_CanonicalIV(), m_One())),
      m_One(), m_Specific(&Plan.getVF()));

  if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_One())))
    return B == Plan.getTripCount() &&
           (match(A, m_CanonicalScalarIVSteps) || IsWideCanonicalIV(A));

  // For scalar plans, the header mask uses the scalar steps.
  if (match(V, m_ICmp(m_CanonicalScalarIVSteps,
                      m_Specific(Plan.getBackedgeTakenCount())))) {
    assert(Plan.hasScalarVFOnly() &&
           "Non-scalar VF using scalar IV steps for header mask?");
    return true;
  }

  return match(V, m_ICmp(m_VPValue(A), m_VPValue(B))) && IsWideCanonicalIV(A) &&
         B == Plan.getBackedgeTakenCount();
}

/// Returns true if \p R propagates poison from any operand to its result.
static bool propagatesPoisonFromRecipeOp(const VPRecipeBase *R) {
  return TypeSwitch<const VPRecipeBase *, bool>(R)
      .Case<VPWidenRecipe, VPWidenGEPRecipe, VPWidenCastRecipe>(
          [](const VPRecipeBase *) { return true; })
      .Case<VPReplicateRecipe>([](const VPReplicateRecipe *Rep) {
        // GEPs and casts propagate poison from all operands.
        unsigned Opcode = Rep->getOpcode();
        return Opcode == Instruction::GetElementPtr ||
               Instruction::isCast(Opcode);
      })
      .Default([](const VPRecipeBase *) { return false; });
}

/// Returns true if \p V being poison is guaranteed to trigger UB because it
/// propagates to the address of a memory recipe.
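/// For example, if \p V (possibly via poison-propagating GEPs or casts) feeds
/// the address operand of a widened or replicated load or store, executing
/// that access with a poison address is immediate UB.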
static bool poisonGuaranteesUB(const VPValue *V) {
  SmallPtrSet<const VPValue *, 8> Visited;
  SmallVector<const VPValue *> Worklist;
  Worklist.push_back(V);

  while (!Worklist.empty()) {
    const VPValue *Current = Worklist.pop_back_val();
    if (!Visited.insert(Current).second)
      continue;

    for (VPUser *U : Current->users()) {
      // Check if Current is used as an address operand for load/store.
      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
        if (MemR->getAddr() == Current)
          return true;
        continue;
      }
      if (auto *Rep = dyn_cast<VPReplicateRecipe>(U)) {
        unsigned Opcode = Rep->getOpcode();
        if ((Opcode == Instruction::Load && Rep->getOperand(0) == Current) ||
            (Opcode == Instruction::Store && Rep->getOperand(1) == Current))
          return true;
      }
      // Check if poison propagates through this recipe to any of its users.
      auto *R = cast<VPRecipeBase>(U);
      for (const VPValue *Op : R->operands()) {
        if (Op == Current && propagatesPoisonFromRecipeOp(R)) {
          Worklist.push_back(R->getVPSingleValue());
          break;
        }
      }
    }
  }
  return false;
}

const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V,
                                           PredicatedScalarEvolution &PSE,
                                           const Loop *L) {
  ScalarEvolution &SE = *PSE.getSE();
  if (isa<VPIRValue, VPSymbolicValue>(V)) {
    Value *LiveIn = V->getUnderlyingValue();
    if (LiveIn && SE.isSCEVable(LiveIn->getType()))
      return SE.getSCEV(LiveIn);
    return SE.getCouldNotCompute();
  }

  if (auto *RV = dyn_cast<VPRegionValue>(V)) {
    assert(RV == RV->getDefiningRegion()->getCanonicalIV() &&
           "RegionValue must be canonical IV");
    if (!L)
      return SE.getCouldNotCompute();
    return SE.getAddRecExpr(SE.getZero(RV->getType()),
                            SE.getOne(RV->getType()), L, SCEV::FlagAnyWrap);
  }

  // Helper to create SCEVs for binary and unary operations.
  auto CreateSCEV =
      [&](ArrayRef<VPValue *> Ops,
          function_ref<const SCEV *(ArrayRef<const SCEV *>)> CreateFn)
      -> const SCEV * {
    SmallVector<const SCEV *> SCEVOps;
    for (VPValue *Op : Ops) {
      const SCEV *S = getSCEVExprForVPValue(Op, PSE, L);
      if (isa<SCEVCouldNotCompute>(S))
        return SE.getCouldNotCompute();
      SCEVOps.push_back(S);
    }
    return CreateFn(SCEVOps);
  };

  VPValue *LHSVal, *RHSVal;
  if (match(V, m_Add(m_VPValue(LHSVal), m_VPValue(RHSVal))))
    return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef<const SCEV *> Ops) {
      return SE.getAddExpr(Ops[0], Ops[1], SCEV::FlagAnyWrap, 0);
    });
  if (match(V, m_Sub(m_VPValue(LHSVal), m_VPValue(RHSVal))))
    return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef<const SCEV *> Ops) {
      return SE.getMinusSCEV(Ops[0], Ops[1], SCEV::FlagAnyWrap, 0);
    });
  if (match(V, m_Not(m_VPValue(LHSVal)))) {
    // not X = xor X, -1 = -1 - X
    return CreateSCEV({LHSVal}, [&](ArrayRef<const SCEV *> Ops) {
      return SE.getMinusSCEV(SE.getMinusOne(Ops[0]->getType()), Ops[0]);
    });
  }
  if (match(V, m_Mul(m_VPValue(LHSVal), m_VPValue(RHSVal))))
    return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef<const SCEV *> Ops) {
      return SE.getMulExpr(Ops[0], Ops[1], SCEV::FlagAnyWrap, 0);
    });
  if (match(V, m_Binary<Instruction::UDiv>(m_VPValue(LHSVal),
                                           m_VPValue(RHSVal))))
    return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef<const SCEV *> Ops) {
      return SE.getUDivExpr(Ops[0], Ops[1]);
    });

  // Handle AND with a constant mask: x & (2^n - 1) can be represented as
  // x % 2^n.
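  // E.g. with a mask of 7, the mask plus one (8) is a power of two, so x & 7
  // becomes x urem 8.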
  const APInt *Mask;
  if (match(V, m_c_BinaryAnd(m_VPValue(LHSVal), m_APInt(Mask))) &&
      (*Mask + 1).isPowerOf2())
    return CreateSCEV({LHSVal}, [&](ArrayRef<const SCEV *> Ops) {
      return SE.getURemExpr(Ops[0], SE.getConstant(*Mask + 1));
    });

  if (match(V, m_Trunc(m_VPValue(LHSVal)))) {
    const VPlan *Plan = V->getDefiningRecipe()->getParent()->getPlan();
    Type *DestTy = VPTypeAnalysis(*Plan).inferScalarType(V);
    return CreateSCEV({LHSVal}, [&](ArrayRef<const SCEV *> Ops) {
      return SE.getTruncateExpr(Ops[0], DestTy);
    });
  }
  if (match(V, m_ZExt(m_VPValue(LHSVal)))) {
    const VPlan *Plan = V->getDefiningRecipe()->getParent()->getPlan();
    Type *DestTy = VPTypeAnalysis(*Plan).inferScalarType(V);
    return CreateSCEV({LHSVal}, [&](ArrayRef<const SCEV *> Ops) {
      return SE.getZeroExtendExpr(Ops[0], DestTy);
    });
  }
  if (match(V, m_SExt(m_VPValue(LHSVal)))) {
    const VPlan *Plan = V->getDefiningRecipe()->getParent()->getPlan();
    Type *DestTy = VPTypeAnalysis(*Plan).inferScalarType(V);
    // Mirror SCEV's createSCEV handling for sext(sub nsw): push the sign
    // extension onto the operands before computing the subtraction.
    VPValue *SubLHS, *SubRHS;
    auto *SubR = dyn_cast<VPRecipeWithIRFlags>(LHSVal);
    if (match(LHSVal, m_Sub(m_VPValue(SubLHS), m_VPValue(SubRHS))) && SubR &&
        SubR->hasNoSignedWrap() && poisonGuaranteesUB(LHSVal)) {
      const SCEV *V1 = getSCEVExprForVPValue(SubLHS, PSE, L);
      const SCEV *V2 = getSCEVExprForVPValue(SubRHS, PSE, L);
      if (!isa<SCEVCouldNotCompute>(V1) && !isa<SCEVCouldNotCompute>(V2))
        return SE.getMinusSCEV(SE.getSignExtendExpr(V1, DestTy),
                               SE.getSignExtendExpr(V2, DestTy),
                               SCEV::FlagNSW);
    }
    return CreateSCEV({LHSVal}, [&](ArrayRef<const SCEV *> Ops) {
      return SE.getSignExtendExpr(Ops[0], DestTy);
    });
  }
  if (match(V, m_Intrinsic<Intrinsic::umax>(m_VPValue(LHSVal),
                                            m_VPValue(RHSVal))))
    return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef<const SCEV *> Ops) {
      return SE.getUMaxExpr(Ops[0], Ops[1]);
    });
  if (match(V, m_Intrinsic<Intrinsic::smax>(m_VPValue(LHSVal),
                                            m_VPValue(RHSVal))))
    return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef<const SCEV *> Ops) {
      return SE.getSMaxExpr(Ops[0], Ops[1]);
    });
  if (match(V, m_Intrinsic<Intrinsic::umin>(m_VPValue(LHSVal),
                                            m_VPValue(RHSVal))))
    return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef<const SCEV *> Ops) {
      return SE.getUMinExpr(Ops[0], Ops[1]);
    });
  if (match(V, m_Intrinsic<Intrinsic::smin>(m_VPValue(LHSVal),
                                            m_VPValue(RHSVal))))
    return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef<const SCEV *> Ops) {
      return SE.getSMinExpr(Ops[0], Ops[1]);
    });

  ArrayRef<VPValue *> Ops;
  Type *SourceElementType;
  if (match(V, m_GetElementPtr(SourceElementType, Ops))) {
    const SCEV *GEPExpr = CreateSCEV(Ops, [&](ArrayRef<const SCEV *> Ops) {
      return SE.getGEPExpr(Ops.front(), Ops.drop_front(), SourceElementType);
    });
    return PSE.getPredicatedSCEV(GEPExpr);
  }

  // TODO: Support constructing SCEVs for more recipes as needed.
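  // Recipes that directly model inductions are handled via the per-recipe
  // dispatch below; anything unhandled conservatively yields
  // SCEVCouldNotCompute.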
  const VPRecipeBase *DefR = V->getDefiningRecipe();
  const SCEV *Expr =
      TypeSwitch<const VPRecipeBase *, const SCEV *>(DefR)
          .Case<VPExpandSCEVRecipe>(
              [](const VPExpandSCEVRecipe *R) { return R->getSCEV(); })
          .Case<VPWidenIntOrFpInductionRecipe>(
              [&SE, &PSE, L](const VPWidenIntOrFpInductionRecipe *R) {
                const SCEV *Step =
                    getSCEVExprForVPValue(R->getStepValue(), PSE, L);
                if (!L || isa<SCEVCouldNotCompute>(Step))
                  return SE.getCouldNotCompute();
                const SCEV *Start =
                    getSCEVExprForVPValue(R->getStartValue(), PSE, L);
                const SCEV *AddRec =
                    SE.getAddRecExpr(Start, Step, L, SCEV::FlagAnyWrap);
                if (R->getTruncInst())
                  return SE.getTruncateExpr(AddRec, R->getScalarType());
                return AddRec;
              })
          .Case<VPWidenPointerInductionRecipe>(
              [&SE, &PSE, L](const VPWidenPointerInductionRecipe *R) {
                const SCEV *Start =
                    getSCEVExprForVPValue(R->getStartValue(), PSE, L);
                if (!L || isa<SCEVCouldNotCompute>(Start))
                  return SE.getCouldNotCompute();
                const SCEV *Step =
                    getSCEVExprForVPValue(R->getStepValue(), PSE, L);
                if (isa<SCEVCouldNotCompute>(Step))
                  return SE.getCouldNotCompute();
                return SE.getAddRecExpr(Start, Step, L, SCEV::FlagAnyWrap);
              })
          .Case<VPDerivedIVRecipe>([&SE, &PSE, L](const VPDerivedIVRecipe *R) {
            const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), PSE, L);
            const SCEV *IV = getSCEVExprForVPValue(R->getOperand(1), PSE, L);
            const SCEV *Scale = getSCEVExprForVPValue(R->getOperand(2), PSE, L);
            if (any_of(ArrayRef<const SCEV *>({Start, IV, Scale}),
                       IsaPred<SCEVCouldNotCompute>))
              return SE.getCouldNotCompute();

            return SE.getAddExpr(
                SE.getTruncateOrSignExtend(Start, IV->getType()),
                SE.getMulExpr(
                    IV, SE.getTruncateOrSignExtend(Scale, IV->getType())));
          })
          .Case<VPScalarIVStepsRecipe>(
              [&SE, &PSE, L](const VPScalarIVStepsRecipe *R) {
                const SCEV *IV =
                    getSCEVExprForVPValue(R->getOperand(0), PSE, L);
                const SCEV *Step =
                    getSCEVExprForVPValue(R->getOperand(1), PSE, L);
                if (isa<SCEVCouldNotCompute>(IV) || !isa<SCEVConstant>(Step))
                  return SE.getCouldNotCompute();
                return SE.getTruncateOrSignExtend(IV, Step->getType());
              })
          .Default(
              [&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); });
  return PSE.getPredicatedSCEV(Expr);
}

bool vputils::isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE,
                                   const Loop *L) {
  // If the address is a SCEVAddExpr, require that all its operands are either
  // loop-invariant or a (possibly sign-extended) affine AddRec.
  if (auto *PtrAdd = dyn_cast<SCEVAddExpr>(Addr)) {
    return all_of(PtrAdd->operands(), [&SE, L](const SCEV *Op) {
      return SE.isLoopInvariant(Op, L) ||
             match(Op, m_scev_SExt(m_scev_AffineAddRec(m_SCEV(), m_SCEV()))) ||
             match(Op, m_scev_AffineAddRec(m_SCEV(), m_SCEV()));
    });
  }

  // Otherwise, check if the address is loop-invariant or an affine add
  // recurrence.
  return SE.isLoopInvariant(Addr, L) ||
         match(Addr, m_scev_AffineAddRec(m_SCEV(), m_SCEV()));
}

/// Returns true if \p Opcode preserves uniformity, i.e., if all operands are
/// uniform, the result will also be uniform.
static bool preservesUniformity(unsigned Opcode) {
  if (Instruction::isBinaryOp(Opcode) || Instruction::isCast(Opcode))
    return true;
  switch (Opcode) {
  case Instruction::Freeze:
  case Instruction::GetElementPtr:
  case Instruction::ICmp:
  case Instruction::FCmp:
  case Instruction::Select:
  case VPInstruction::Not:
  case VPInstruction::Broadcast:
  case VPInstruction::MaskedCond:
  case VPInstruction::PtrAdd:
    return true;
  default:
    return false;
  }
}

bool vputils::isSingleScalar(const VPValue *VPV) {
  // Live-in, symbolic and region values represent single-scalar values.
  if (isa<VPIRValue, VPSymbolicValue, VPRegionValue>(VPV))
    return true;

  if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV)) {
    const VPRegionBlock *RegionOfR = Rep->getRegion();
    // Don't consider recipes in replicate regions as uniform yet; their first
    // lane cannot be accessed when executing the replicate region for other
    // lanes.
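    // For example, a predicated load replicated inside such a region produces
    // its value lane by lane, so there is no single scalar value that all
    // lanes could reuse.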
    if (RegionOfR && RegionOfR->isReplicator())
      return false;
    return Rep->isSingleScalar() || (preservesUniformity(Rep->getOpcode()) &&
                                     all_of(Rep->operands(), isSingleScalar));
  }
  if (isa<VPWidenGEPRecipe, VPBlendRecipe>(VPV))
    return all_of(VPV->getDefiningRecipe()->operands(), isSingleScalar);
  if (auto *WidenR = dyn_cast<VPWidenRecipe>(VPV)) {
    return preservesUniformity(WidenR->getOpcode()) &&
           all_of(WidenR->operands(), isSingleScalar);
  }
  if (auto *VPI = dyn_cast<VPInstruction>(VPV))
    return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
           (preservesUniformity(VPI->getOpcode()) &&
            all_of(VPI->operands(), isSingleScalar));
  if (auto *RR = dyn_cast<VPReductionRecipe>(VPV))
    return !RR->isPartialReduction();
  if (isa<VPDerivedIVRecipe, VPEVLBasedIVPHIRecipe>(VPV))
    return true;
  if (auto *Expr = dyn_cast<VPExpressionRecipe>(VPV))
    return Expr->isSingleScalar();

  // VPExpandSCEVRecipes must be placed in the entry and are always uniform.
  return isa<VPExpandSCEVRecipe>(VPV);
}

bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
  // Live-ins and region values are uniform.
  if (isa<VPIRValue, VPRegionValue>(V))
    return true;

  VPRecipeBase *R = V->getDefiningRecipe();
  VPBasicBlock *VPBB = R ? R->getParent() : nullptr;
  VPlan *Plan = VPBB ? VPBB->getPlan() : nullptr;
  if (VPBB) {
    if (VPBB == Plan->getVectorPreheader() || VPBB == Plan->getEntry()) {
      if (match(V->getDefiningRecipe(),
                m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()))
        return false;
      return all_of(R->operands(), isUniformAcrossVFsAndUFs);
    }
  }

  return TypeSwitch<const VPRecipeBase *, bool>(R)
      .Case<VPDerivedIVRecipe>([](const VPDerivedIVRecipe *R) { return true; })
      .Case<VPReplicateRecipe>([](const VPReplicateRecipe *R) {
        // Be conservative about side-effects, except for the
        // known-side-effecting assumes and stores, which we know will be
        // uniform.
        return R->isSingleScalar() &&
               (!R->mayHaveSideEffects() ||
                isa<AssumeInst, StoreInst>(R->getUnderlyingInstr())) &&
               all_of(R->operands(), isUniformAcrossVFsAndUFs);
      })
      .Case<VPWidenRecipe>([](const VPWidenRecipe *R) {
        return preservesUniformity(R->getOpcode()) &&
               all_of(R->operands(), isUniformAcrossVFsAndUFs);
      })
      .Case<VPInstruction>([](const VPInstruction *VPI) {
        return preservesUniformity(VPI->getOpcode()) &&
               all_of(VPI->operands(), isUniformAcrossVFsAndUFs);
      })
      .Case<VPWidenCastRecipe>([](const VPWidenCastRecipe *R) {
        // A cast is uniform according to its operand.
        return isUniformAcrossVFsAndUFs(R->getOperand(0));
      })
      .Default([](const VPRecipeBase *) {
        // A value is considered non-uniform unless proven otherwise.
        return false;
      });
}

VPBasicBlock *vputils::getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT) {
  auto DepthFirst = vp_depth_first_shallow(Plan.getEntry());
  auto I = find_if(DepthFirst, [&VPDT](VPBlockBase *VPB) {
    return VPBlockUtils::isHeader(VPB, VPDT);
  });
  return I == DepthFirst.end() ? nullptr : cast<VPBasicBlock>(*I);
}

unsigned vputils::getVFScaleFactor(VPRecipeBase *R) {
  if (!R)
    return 1;
  if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
    return RR->getVFScaleFactor();
  if (auto *RR = dyn_cast<VPReductionRecipe>(R))
    return RR->getVFScaleFactor();
  if (auto *ER = dyn_cast<VPExpressionRecipe>(R))
    return ER->getVFScaleFactor();
  assert(
      (!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
                                     VPInstruction::ReductionStartVector) &&
      "getting scaling factor of reduction-start-vector not implemented yet");
  return 1;
}

bool vputils::cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking) {
  // Assumes don't alias anything or throw; as long as they're guaranteed to
  // execute, they're safe to hoist. They should however not be sunk, as that
  // would destroy information.
  if (match(&R, m_Intrinsic<Intrinsic::assume>()))
    return Sinking;
  // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
  // memory location is not modified in the vector loop.
  if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
    return true;

  // Allocas cannot be hoisted.
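  // Hoisting an alloca out of the loop would give every iteration the same
  // stack slot, which is not generally equivalent to allocating per iteration.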
  auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
  return RepR && RepR->getOpcode() == Instruction::Alloca;
}

std::optional<VPValue *> vputils::getRecipesForUncountableExit(
    SmallVectorImpl<VPSingleDefRecipe *> &Recipes,
    SmallVectorImpl<VPInstruction *> &GEPs, VPBasicBlock *LatchVPBB) {
  // Given a plain CFG VPlan loop with countable latch exiting block
  // \p LatchVPBB, we're looking to match the recipes contributing to the
  // uncountable exit condition comparison (here, vp<%4>) back to either
  // live-ins or the address nodes for the load used as part of the uncountable
  // exit comparison, so that we can either move them within the loop or copy
  // them to the preheader, depending on the chosen method for dealing with
  // stores in uncountable exit loops.
  //
  // Currently, the address of the load is restricted to a GEP with 2 operands
  // and a live-in base address. This constraint may be relaxed later.
  //
  // VPlan ' for UF>=1' {
  // Live-in vp<%0> = VF * UF
  // Live-in vp<%1> = vector-trip-count
  // Live-in ir<20> = original trip-count
  //
  // ir-bb<entry>:
  // Successor(s): scalar.ph, vector.ph
  //
  // vector.ph:
  // Successor(s): for.body
  //
  // for.body:
  //   EMIT vp<%2> = phi ir<0>, vp<%index.next>
  //   EMIT-SCALAR ir<%iv> = phi [ ir<0>, vector.ph ], [ ir<%iv.next>, for.inc ]
  //   EMIT ir<%uncountable.addr> = getelementptr inbounds nuw ir<%pred>, ir<%iv>
  //   EMIT ir<%uncountable.val> = load ir<%uncountable.addr>
  //   EMIT ir<%uncountable.cond> = icmp sgt ir<%uncountable.val>, ir<500>
  //   EMIT vp<%3> = masked-cond ir<%uncountable.cond>
  // Successor(s): for.inc
  //
  // for.inc:
  //   EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>
  //   EMIT ir<%countable.cond> = icmp eq ir<%iv.next>, ir<20>
  //   EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
  //   EMIT vp<%4> = any-of vp<%3>
  //   EMIT vp<%5> = icmp eq vp<%index.next>, vp<%1>
  //   EMIT vp<%6> = or vp<%4>, vp<%5>
  //   EMIT branch-on-cond vp<%6>
  // Successor(s): middle.block, for.body
  //
  // middle.block:
  // Successor(s): ir-bb<exit>, scalar.ph
  //
  // ir-bb<exit>:
  // No successors
  //
  // scalar.ph:
  // }

  // Find the uncountable loop exit condition.
  VPValue *UncountableCondition = nullptr;
  if (!match(LatchVPBB->getTerminator(),
             m_BranchOnCond(m_c_BinaryOr(
                 m_AnyOf(m_VPValue(UncountableCondition)), m_VPValue()))))
    return std::nullopt;

  SmallVector<VPValue *> Worklist;
  Worklist.push_back(UncountableCondition);
  while (!Worklist.empty()) {
    VPValue *V = Worklist.pop_back_val();

    // Any value defined outside the loop does not need to be copied.
    if (V->isDefinedOutsideLoopRegions())
      continue;

    // FIXME: Remove the single-user restriction; it's here because we're
    //        starting with the simplest set of loops we can, and multiple
    //        users means needing to add PHI nodes in the transform.
    if (V->getNumUsers() > 1)
      return std::nullopt;

    VPValue *Op1, *Op2;
    // Walk back through recipes until we find at least one load from memory.
    if (match(V, m_ICmp(m_VPValue(Op1), m_VPValue(Op2)))) {
      Worklist.push_back(Op1);
      Worklist.push_back(Op2);
      Recipes.push_back(cast<VPSingleDefRecipe>(V->getDefiningRecipe()));
    } else if (match(V, m_VPInstruction<Instruction::Load>(m_VPValue(Op1)))) {
      VPRecipeBase *GepR = Op1->getDefiningRecipe();
      // Only matching base + single offset term for now.
      if (GepR->getNumOperands() != 2)
        return std::nullopt;
      // Matching a GEP with a loop-invariant base ptr.
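      // In the example above this is ir<%uncountable.addr>: a GEP of the
      // live-in base ir<%pred> indexed by the scalar IV.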
      if (!match(GepR, m_VPInstruction<Instruction::GetElementPtr>(
                           m_LiveIn(), m_VPValue())))
        return std::nullopt;

      Recipes.push_back(cast<VPSingleDefRecipe>(V->getDefiningRecipe()));
      Recipes.push_back(cast<VPSingleDefRecipe>(GepR));
      GEPs.push_back(cast<VPInstruction>(GepR));
    } else if (match(V, m_VPInstruction<VPInstruction::MaskedCond>(
                            m_VPValue(Op1)))) {
      Worklist.push_back(Op1);
      Recipes.push_back(cast<VPSingleDefRecipe>(V->getDefiningRecipe()));
    } else
      return std::nullopt;
  }

  // If we couldn't match anything, don't return the condition. It may be
  // defined outside the loop.
  if (Recipes.empty() || GEPs.empty())
    return std::nullopt;

  return UncountableCondition;
}

VPSingleDefRecipe *vputils::findHeaderMask(VPlan &Plan) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  SmallVector<VPValue *> WideCanonicalIVs;
  auto *WideCanonicalIV = vputils::findUserOf<VPWidenCanonicalIVRecipe>(
      LoopRegion->getCanonicalIV());
  assert(count_if(LoopRegion->getCanonicalIV()->users(),
                  IsaPred<VPWidenCanonicalIVRecipe>) <= 1 &&
         "Must have at most one VPWidenCanonicalIVRecipe");
  if (WideCanonicalIV)
    WideCanonicalIVs.push_back(WideCanonicalIV);

  // Also include VPWidenIntOrFpInductionRecipes that represent a widened
  // version of the canonical induction.
  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (WidenOriginalIV && WidenOriginalIV->isCanonical())
      WideCanonicalIVs.push_back(WidenOriginalIV);
  }

  // Walk users of wide canonical IVs and find the single compare of the form
  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
  VPSingleDefRecipe *HeaderMask = nullptr;
  for (auto *Wide : WideCanonicalIVs) {
    for (VPUser *U : Wide->users()) {
      auto *VPI = dyn_cast<VPInstruction>(U);
      if (!VPI || !vputils::isHeaderMask(VPI, Plan))
        continue;

      assert(VPI->getOperand(0) == Wide &&
             "WidenCanonicalIV must be the first operand of the compare");
      assert(!HeaderMask && "Multiple header masks found?");
      HeaderMask = VPI;
    }
  }
  return HeaderMask;
}

SmallVector<VPBasicBlock *>
VPBlockUtils::blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB,
                                                  VPBasicBlock *LastBB) {
  assert(FirstBB->getParent() == LastBB->getParent() &&
         "FirstBB and LastBB from different regions");
#ifndef NDEBUG
  bool InSingleSuccChain = false;
  for (VPBlockBase *Succ = FirstBB; Succ; Succ = Succ->getSingleSuccessor())
    InSingleSuccChain |= (Succ == LastBB);
  assert(InSingleSuccChain &&
         "LastBB unreachable from FirstBB in single-successor chain");
#endif

  auto Blocks = to_vector(
      VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_deep(FirstBB)));
  auto *LastIt = find(Blocks, LastBB);
  assert(LastIt != Blocks.end() &&
         "LastBB unreachable from FirstBB in depth-first traversal");
  Blocks.erase(std::next(LastIt), Blocks.end());
  return Blocks;
}

bool VPBlockUtils::isHeader(const VPBlockBase *VPB,
                            const VPDominatorTree &VPDT) {
  auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
  if (!VPBB)
    return false;

  // If VPBB is in a region R, VPBB is a loop header if R is a loop region with
  // VPBB as its entry, i.e., free of predecessors.
  if (auto *R = VPBB->getParent())
    return !R->isReplicator() && !VPBB->hasPredecessors();

  // A header dominates its second predecessor (the latch), with the other
  // predecessor being the preheader.
  return VPB->getPredecessors().size() == 2 &&
         VPDT.dominates(VPB, VPB->getPredecessors()[1]);
}

bool VPBlockUtils::isLatch(const VPBlockBase *VPB,
                           const VPDominatorTree &VPDT) {
  // A latch has a header as its last successor, with its other successors
  // leaving the loop. A preheader, on the other hand, has a header as its
  // first (and only) successor.
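  // Requiring at least two successors thus distinguishes a latch (header plus
  // at least one exit) from a preheader, whose single successor is the header.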
  return VPB->getNumSuccessors() >= 2 &&
         VPBlockUtils::isHeader(VPB->getSuccessors().back(), VPDT);
}

std::optional<MemoryLocation>
vputils::getMemoryLocation(const VPRecipeBase &R) {
  auto *M = dyn_cast<VPWidenMemoryRecipe>(&R);
  if (!M)
    return std::nullopt;

  MemoryLocation Loc;
  // Populate noalias metadata from VPIRMetadata.
  if (MDNode *NoAliasMD = M->getMetadata(LLVMContext::MD_noalias))
    Loc.AATags.NoAlias = NoAliasMD;
  if (MDNode *AliasScopeMD = M->getMetadata(LLVMContext::MD_alias_scope))
    Loc.AATags.Scope = AliasScopeMD;
  return Loc;
}

VPInstruction *vputils::findCanonicalIVIncrement(VPlan &Plan) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPRegionValue *CanIV = LoopRegion->getCanonicalIV();
  assert(CanIV && "Expected loop region to have a canonical IV");
  VPSymbolicValue &VFxUF = Plan.getVFxUF();

  // Check if \p Step matches the expected increment step, accounting for
  // materialization of VFxUF and UF.
  auto IsIncrementStep = [&](VPValue *Step) -> bool {
    if (!VFxUF.isMaterialized())
      return Step == &VFxUF;

    VPSymbolicValue &UF = Plan.getUF();
    if (!UF.isMaterialized())
      return Step == &UF;

    unsigned ConcreteUF = Plan.getConcreteUF();
    // Fixed VF: the step is just the concrete UF.
    if (match(Step, m_SpecificInt(ConcreteUF)))
      return true;

    // Scalable VF: the step involves VScale.
    if (ConcreteUF == 1)
      return match(Step, m_VPInstruction<VPInstruction::VScale>());
    if (match(Step, m_c_Mul(m_SpecificInt(ConcreteUF),
                            m_VPInstruction<VPInstruction::VScale>())))
      return true;
    // mul(VScale, ConcreteUF) may have been simplified to
    // shl(VScale, log2(ConcreteUF)) when ConcreteUF is a power of 2.
    return isPowerOf2_32(ConcreteUF) &&
           match(Step, m_Binary<Instruction::Shl>(
                           m_VPInstruction<VPInstruction::VScale>(),
                           m_SpecificInt(Log2_32(ConcreteUF))));
  };

  VPInstruction *Increment = nullptr;
  for (VPUser *U : CanIV->users()) {
    VPValue *Step;
    if (match(U, m_c_Add(m_Specific(CanIV), m_VPValue(Step))) &&
        IsIncrementStep(Step)) {
      assert(!Increment && "There must be a unique increment");
      Increment = cast<VPInstruction>(U);
    }
  }
  assert((!VFxUF.isMaterialized() || Increment) &&
         "After materializing VFxUF, an increment must exist");
  assert((!Increment || LoopRegion->hasCanonicalIVNUW() ==
                            Increment->hasNoUnsignedWrap()) &&
         "NUW flag in region and increment must match");
  return Increment;
}

/// Find the ComputeReductionResult recipe for \p PhiR, looking through selects
/// inserted for predicated reductions or tail folding.
VPInstruction *vputils::findComputeReductionResult(VPReductionPHIRecipe *PhiR) {
  VPValue *BackedgeVal = PhiR->getBackedgeValue();
  if (auto *Res = vputils::findUserOf<VPInstruction::ComputeReductionResult>(
          BackedgeVal))
    return Res;

  // Look through selects inserted for tail folding or predicated reductions.
  VPRecipeBase *SelR = vputils::findUserOf(
      BackedgeVal, m_Select(m_VPValue(), m_VPValue(), m_VPValue()));
  if (!SelR)
    return nullptr;
  return vputils::findUserOf<VPInstruction::ComputeReductionResult>(
      cast<VPSingleDefRecipe>(SelR));
}