Add a new cost-based transform that replaces VPWidenCanonicalIVRecipe with a canonical VPWidenIntOrFpInductionPHIRecipe, if it does not increase spills. The main benefit of VPWidenCanonicalIVRecipe is that it has shorter live-ranges than wide IV phis. The new transform introduces a wide IV unless VPWidenCanonicalIVRecipe is cheaper or the wide IV introduces additional spills. This introduces wide IVs in a number of cases where we previously had VPWidenCanonicalIVRecipe, because there was no existing wide canonical IV we could re-use. It should also help avoid somewhat unrelated changes in https://github.com/llvm/llvm-project/pull/190191. PR: https://github.com/llvm/llvm-project/pull/194267
549 lines
26 KiB
C++
//===- VPlanTransforms.h - Utility VPlan to VPlan transforms --------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
///
|
|
/// \file
|
|
/// This file provides utility VPlan to VPlan transformations.
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
|
|
#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
|
|
|
|
#include "VPlan.h"
|
|
#include "VPlanVerifier.h"
|
|
#include "llvm/ADT/STLFunctionalExtras.h"
|
|
#include "llvm/ADT/ScopeExit.h"
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
|
#include "llvm/Support/CommandLine.h"
|
|
#include "llvm/Support/Compiler.h"
|
|
#include "llvm/Support/Regex.h"
|
|
|
|
namespace llvm {
|
|
|
|
class InductionDescriptor;
|
|
class Instruction;
|
|
class Loop;
|
|
class LoopVersioning;
|
|
class OptimizationRemarkEmitter;
|
|
class PHINode;
|
|
class ScalarEvolution;
|
|
class PredicatedScalarEvolution;
|
|
class TargetLibraryInfo;
|
|
class TargetTransformInfo;
|
|
class VPBuilder;
|
|
class VPRecipeBuilder;
|
|
struct VFRange;
|
|
|
|
LLVM_ABI_FOR_TEST extern cl::opt<bool> VerifyEachVPlan;
|
|
LLVM_ABI_FOR_TEST extern cl::opt<bool> EnableWideActiveLaneMask;
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
LLVM_ABI_FOR_TEST extern cl::opt<bool> VPlanPrintAfterAll;
|
|
LLVM_ABI_FOR_TEST extern cl::list<std::string> VPlanPrintAfterPasses;
|
|
LLVM_ABI_FOR_TEST extern cl::opt<bool> VPlanPrintVectorRegionScope;
|
|
#endif
|
|
|
|
/// Collection of VPlan-to-VPlan transformations and related helpers, applied
/// while constructing and optimizing VPlans in the loop vectorizer. All
/// members are static; this struct acts as a namespace with access control.
struct VPlanTransforms {
  /// Helper to run a VPlan pass \p Pass on \p VPlan, forwarding extra arguments
  /// to the pass. Performs verification/printing after each VPlan pass if
  /// requested via command line options. If \p EnableVerify is false,
  /// verification is skipped even when requested on the command line.
  template <bool EnableVerify = true, typename PassTy, typename... ArgsTy>
  static decltype(auto) runPass(StringRef PassName, PassTy &&Pass, VPlan &Plan,
                                ArgsTy &&...Args) {
    // Printing and verification run after the pass returns, via scope_exit.
    scope_exit PostTransformActions{[&]() {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      // Make sure to print before verification, so that output is more useful
      // in case of failures:
      if (VPlanPrintAfterAll ||
          (VPlanPrintAfterPasses.getNumOccurrences() > 0 &&
           any_of(VPlanPrintAfterPasses, [PassName](StringRef Entry) {
             // Each -vplan-print-after entry is treated as a regex matched
             // against the pass name.
             return Regex(Entry).match(PassName);
           }))) {
        dbgs()
            << "VPlan for loop in '"
            << Plan.getScalarHeader()->getIRBasicBlock()->getParent()->getName()
            << "' after " << PassName << '\n';
        if (VPlanPrintVectorRegionScope && Plan.getVectorLoopRegion())
          Plan.getVectorLoopRegion()->print(dbgs());
        else
          dbgs() << Plan << '\n';
      }
#endif
      if (VerifyEachVPlan && EnableVerify) {
        if (!verifyVPlanIsValid(Plan))
          report_fatal_error("Broken VPlan found, compilation aborted!");
      }
    }};

    return std::forward<PassTy>(Pass)(Plan, std::forward<ArgsTy>(Args)...);
  }
// Convenience wrappers around runPass that pass the stringified pass name.
#define RUN_VPLAN_PASS(PASS, ...)                                              \
  llvm::VPlanTransforms::runPass(#PASS, PASS, __VA_ARGS__)
#define RUN_VPLAN_PASS_NO_VERIFY(PASS, ...)                                    \
  llvm::VPlanTransforms::runPass<false>(#PASS, PASS, __VA_ARGS__)

  /// Create a base VPlan0, serving as the common starting point for all later
  /// candidates. It consists of an initial plain CFG loop with loop blocks from
  /// \p TheLoop being directly translated to VPBasicBlocks with VPInstruction
  /// corresponding to the input IR.
  ///
  /// The created loop is wrapped in an initial skeleton to facilitate
  /// vectorization, consisting of a vector pre-header, an exit block for the
  /// main vector loop (middle.block) and a new block as preheader of the scalar
  /// loop (scalar.ph). See below for an illustration. It also adds a canonical
  /// IV and its increment, using \p InductionTy and \p IVDL, and creates a
  /// VPValue expression for the original trip count.
  ///
  ///  [ ]      <-- Plan's entry VPIRBasicBlock, wrapping the original loop's
  ///  / \          old preheader. Will contain iteration number check and SCEV
  /// |   |         expansions.
  /// |   |
  /// /   v
  /// |  [ ]     <-- vector loop bypass (may consist of multiple blocks) will
  /// |  / |         be added later.
  /// | /  v
  /// ||  [ ]    <-- vector pre header.
  /// |/   |
  /// |    v
  /// |   [ ] \  <-- plain CFG loop wrapping original loop to be vectorized.
  /// |   [ ]_|
  /// |    |
  /// |    v
  /// |   [ ]    <--- middle-block with the branch to successors.
  /// |   / |
  /// |  /  |
  /// | |   v
  /// \--->[ ]   <--- scalar preheader (initially a VPBasicBlock, which will be
  ///  |    |         replaced later by a VPIRBasicBlock wrapping the scalar
  ///  |    |         preheader basic block).
  ///  |    |
  ///  |    v    <-- edge from middle to exit iff epilogue is not required.
  ///  |   [ ] \
  ///  |   [ ]_| <-- old scalar loop to handle remainder (scalar epilogue,
  ///  |    |        header wrapped in VPIRBasicBlock).
  ///   \   |
  ///    \  v
  ///     >[ ]   <-- original loop exit block(s), wrapped in VPIRBasicBlocks.
  LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan>
  buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL,
              PredicatedScalarEvolution &PSE, LoopVersioning *LVer = nullptr);

  /// Replace VPPhi recipes in \p Plan's header with corresponding
  /// VPHeaderPHIRecipe subclasses for inductions, reductions, and
  /// fixed-order recurrences. This processes all header phis and creates
  /// the appropriate widened recipe for each one. For fixed-order
  /// recurrences, also creates FirstOrderRecurrenceSplice instructions and
  /// sinks/hoists users as needed. Returns false if any fixed-order
  /// recurrence cannot be handled.
  static bool createHeaderPhiRecipes(
      VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &OrigLoop,
      const MapVector<PHINode *, InductionDescriptor> &Inductions,
      const MapVector<PHINode *, RecurrenceDescriptor> &Reductions,
      const SmallPtrSetImpl<const PHINode *> &FixedOrderRecurrences,
      const SmallPtrSetImpl<PHINode *> &InLoopReductions, bool AllowReordering);

  /// Create VPReductionRecipes for in-loop reductions. This processes chains
  /// of operations contributing to in-loop reductions and creates appropriate
  /// VPReductionRecipe instances.
  static void createInLoopReductionRecipes(
      VPlan &Plan, const DenseSet<BasicBlock *> &BlocksNeedingPredication,
      ElementCount MinVF);

  /// Update \p Plan to account for all early exits. If \p Style is not
  /// NoUncountableExit, handles uncountable early exits and checks that all
  /// loads are dereferenceable. Returns false if a non-dereferenceable load is
  /// found.
  LLVM_ABI_FOR_TEST static bool
  handleEarlyExits(VPlan &Plan, UncountableExitStyle Style, Loop *TheLoop,
                   PredicatedScalarEvolution &PSE, DominatorTree &DT,
                   AssumptionCache *AC);

  /// If a check is needed to guard executing the scalar epilogue loop, it will
  /// be added to the middle block.
  LLVM_ABI_FOR_TEST static void addMiddleCheck(VPlan &Plan, bool TailFolded);

  /// Create a check to \p Plan to see if the vector loop should be executed.
  /// If \p CheckBlock is non-null, the compare and branch are placed there;
  /// ExpandSCEV recipes are always placed in Entry.
  static void addMinimumIterationCheck(
      VPlan &Plan, ElementCount VF, unsigned UF,
      ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue,
      bool TailFolded, Loop *OrigLoop, const uint32_t *MinItersBypassWeights,
      DebugLoc DL, PredicatedScalarEvolution &PSE,
      VPBasicBlock *CheckBlock = nullptr);

  /// Add a new check block before the vector preheader to \p Plan to check if
  /// the main vector loop should be executed (TC >= VF * UF).
  static void
  addIterationCountCheckBlock(VPlan &Plan, ElementCount VF, unsigned UF,
                              bool RequiresScalarEpilogue, Loop *OrigLoop,
                              const uint32_t *MinItersBypassWeights,
                              DebugLoc DL, PredicatedScalarEvolution &PSE);

  /// Add a check to \p Plan to see if the epilogue vector loop should be
  /// executed.
  static void addMinimumVectorEpilogueIterationCheck(
      VPlan &Plan, Value *VectorTripCount, bool RequiresScalarEpilogue,
      ElementCount EpilogueVF, unsigned EpilogueUF, unsigned MainLoopStep,
      unsigned EpilogueLoopStep, ScalarEvolution &SE);

  /// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's
  /// flat CFG into a hierarchical CFG.
  LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan);

  /// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
  /// VPValue and connect the block to \p Plan, using the VPValue as branch
  /// condition.
  static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock,
                               bool AddBranchWeights);

  /// Replaces the VPInstructions in \p Plan with corresponding
  /// widen recipes. Returns false if any VPInstructions could not be converted
  /// to a wide recipe if needed.
  LLVM_ABI_FOR_TEST static bool
  tryToConvertVPInstructionsToVPRecipes(VPlan &Plan,
                                        const TargetLibraryInfo &TLI);

  /// Try to legalize reductions with multiple in-loop uses. Currently only
  /// strict and non-strict min/max reductions used by FindLastIV reductions are
  /// supported, corresponding to computing the first and last argmin/argmax,
  /// respectively. Otherwise return false.
  static bool handleMultiUseReductions(VPlan &Plan,
                                       OptimizationRemarkEmitter *ORE,
                                       Loop *TheLoop);

  /// Check if \p Plan contains any FMaxNum or FMinNum reductions. If they do,
  /// try to update the vector loop to exit early if any input is NaN and resume
  /// executing in the scalar loop to handle the NaNs there. Return false if
  /// this attempt was unsuccessful.
  static bool handleMaxMinNumReductions(VPlan &Plan);

  /// Check if \p Plan contains any FindLast reductions. If it does, try to
  /// update the vector loop to save the appropriate state using selects
  /// for entire vectors for both the latest mask containing at least one active
  /// element and the corresponding data vector. Return false if this attempt
  /// was unsuccessful.
  static bool handleFindLastReductions(VPlan &Plan);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  static void clearReductionWrapFlags(VPlan &Plan);

  /// Explicitly unroll \p Plan by \p UF.
  static void unrollByUF(VPlan &Plan, unsigned UF);

  /// Replace replicating VPReplicateRecipe, VPScalarIVStepsRecipe and
  /// VPInstruction in \p Plan with \p VF single-scalar recipes. Replicate
  /// regions are dissolved by replicating their blocks and their recipes \p VF
  /// times.
  /// TODO: Also dissolve replicate regions with live outs.
  static void replicateByVF(VPlan &Plan, ElementCount VF);

  /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
  /// resulting plan to \p BestVF and \p BestUF.
  static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                 unsigned BestUF,
                                 PredicatedScalarEvolution &PSE);

  /// Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL
  /// is known to be <= VF, replacing them with the AVL directly.
  static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF,
                               PredicatedScalarEvolution &PSE);

  /// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe
  /// optimizations, dead recipe removal, replicate region optimizations and
  /// block merging.
  LLVM_ABI_FOR_TEST static void optimize(VPlan &Plan);

  /// Remove redundant VPBasicBlocks by merging them into their single
  /// predecessor if the latter has a single successor.
  static bool mergeBlocksIntoPredecessors(VPlan &Plan);

  /// Wrap predicated VPReplicateRecipes with a mask operand in an if-then
  /// region block and remove the mask operand. Optimize the created regions by
  /// iteratively sinking scalar operands into the region, followed by merging
  /// regions until no improvements are remaining.
  static void createAndOptimizeReplicateRegions(VPlan &Plan);

  /// Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an
  /// (active-lane-mask recipe, wide canonical IV, trip-count). If \p
  /// UseActiveLaneMaskForControlFlow is true, introduce an
  /// VPActiveLaneMaskPHIRecipe.
  static void addActiveLaneMask(VPlan &Plan,
                                bool UseActiveLaneMaskForControlFlow);

  /// Insert truncates and extends for any truncated recipe. Redundant casts
  /// will be folded later.
  static void
  truncateToMinimalBitwidths(VPlan &Plan,
                             const MapVector<Instruction *, uint64_t> &MinBWs);

  /// Replace symbolic strides from \p StridesMap in \p Plan with constants when
  /// possible.
  static void
  replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE,
                         const DenseMap<Value *, const SCEV *> &StridesMap);

  /// Drop poison flags from recipes that may generate a poison value that is
  /// used after vectorization, even when their operands are not poison. Those
  /// recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  /// Uses \p BlockNeedsPredication to check if a block needs predicating.
  /// TODO: Replace BlockNeedsPredication callback with retrieving info from
  ///       VPlan directly.
  static void dropPoisonGeneratingRecipes(
      VPlan &Plan,
      const std::function<bool(BasicBlock *)> &BlockNeedsPredication);

  /// Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
  /// replaces all uses of the canonical IV except for the canonical IV
  /// increment with a VPCurrentIterationPHIRecipe. The canonical IV is only
  /// used to control the loop after this transformation.
  static void
  addExplicitVectorLength(VPlan &Plan,
                          const std::optional<unsigned> &MaxEVLSafeElements);

  /// Optimize recipes which use an EVL-based header mask to VP intrinsics, for
  /// example:
  ///
  ///   %mask = icmp ult step-vector, EVL
  ///   %load = load %ptr, %mask
  ///   -->
  ///   %load = vp.load %ptr, EVL
  static void optimizeEVLMasks(VPlan &Plan);

  /// For each Interleave Group in \p InterleaveGroups replace the Recipes
  /// widening its memory instructions with a single VPInterleaveRecipe at its
  /// insertion point.
  static void createInterleaveGroups(
      VPlan &Plan,
      const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
          &InterleaveGroups,
      VPRecipeBuilder &RecipeBuilder, const bool &EpilogueAllowed);

  /// Remove dead recipes from \p Plan.
  static void removeDeadRecipes(VPlan &Plan);

  /// Update \p Plan to account for uncountable early exits by introducing
  /// appropriate branching logic in the latch that handles early exits and the
  /// latch exit condition. Multiple exits are handled with a dispatch block
  /// that determines which exit to take based on lane-by-lane semantics.
  static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB,
                                          VPBasicBlock *LatchVPBB,
                                          VPBasicBlock *MiddleVPBB,
                                          UncountableExitStyle Style);

  /// Replaces the exit condition from
  ///   (branch-on-cond eq CanonicalIVInc, VectorTripCount)
  /// to
  ///   (branch-on-cond eq AVLNext, 0)
  static void convertEVLExitCond(VPlan &Plan);

  /// Replace loop regions with explicit CFG.
  static void dissolveLoopRegions(VPlan &Plan);

  /// Expand BranchOnTwoConds instructions into explicit CFG with
  /// BranchOnCond instructions. Should be called after dissolveLoopRegions.
  static void expandBranchOnTwoConds(VPlan &Plan);

  /// Transform loops with variable-length stepping after region
  /// dissolution.
  ///
  /// Once loop regions are replaced with explicit CFG, loops can step with
  /// variable vector lengths instead of fixed lengths. This transformation:
  /// * Makes CurrentIteration-Phi concrete.
  /// * Removes CanonicalIV and increment.
  static void convertToVariableLengthStep(VPlan &Plan);

  /// Lower abstract recipes to concrete ones, that can be codegen'd.
  static void convertToConcreteRecipes(VPlan &Plan);

  /// This function converts initial recipes to the abstract recipes and clamps
  /// \p Range based on cost model for following optimizations and cost
  /// estimations. The converted abstract recipes will lower to concrete
  /// recipes before codegen.
  static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
                                       VFRange &Range);

  /// Perform instcombine-like simplifications on recipes in \p Plan.
  static void simplifyRecipes(VPlan &Plan);

  /// Remove BranchOnCond recipes with true or false conditions together with
  /// removing dead edges to their successors. If \p OnlyLatches is true, only
  /// process loop latches.
  static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches = false);

  /// Perform common-subexpression-elimination on \p Plan.
  static void cse(VPlan &Plan);

  /// If there's a single exit block, optimize its phi recipes that use exiting
  /// IV values by feeding them precomputed end values instead, possibly taken
  /// one step backwards.
  static void optimizeInductionLiveOutUsers(VPlan &Plan,
                                            PredicatedScalarEvolution &PSE,
                                            bool FoldTail);

  /// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's
  /// entry block if they are used as vectors.
  static void materializeBroadcasts(VPlan &Plan);

  /// Hoist single-scalar loads with invariant addresses out of the vector loop
  /// to the preheader, if they are proven not to alias with any stores in the
  /// plan using noalias metadata.
  static void hoistInvariantLoads(VPlan &Plan);

  /// Hoist predicated loads from the same address to the loop entry block, if
  /// they are guaranteed to execute on both paths (i.e., in replicate regions
  /// with complementary masks P and NOT P).
  static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE,
                                   const Loop *L);

  /// Sink predicated stores to the same address with complementary predicates
  /// (P and NOT P) to an unconditional store with select recipes for the
  /// stored values. This eliminates branching overhead when all paths
  /// unconditionally store to the same location.
  static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE,
                                   const Loop *L);

  /// Materialize vector trip counts for constants early if it can simply be
  /// computed as (Original TC / VF * UF) * VF * UF.
  static void
  materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF,
                                     unsigned BestUF,
                                     PredicatedScalarEvolution &PSE);

  /// Materialize vector trip count computations to a set of VPInstructions.
  /// \p Step is used as the step value for the trip count computation.
  /// \p MaxRuntimeStep is the maximum possible runtime value of Step, used to
  /// prove the trip count is divisible by the step for scalable VFs.
  static void materializeVectorTripCount(
      VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
      bool RequiresScalarEpilogue, VPValue *Step,
      std::optional<uint64_t> MaxRuntimeStep = std::nullopt);

  /// Materialize the backedge-taken count to be computed explicitly using
  /// VPInstructions.
  static void materializeBackedgeTakenCount(VPlan &Plan,
                                            VPBasicBlock *VectorPH);

  /// Add explicit Build[Struct]Vector recipes to Pack multiple scalar values
  /// into vectors and Unpack recipes to extract scalars from vectors as
  /// needed.
  static void materializePacksAndUnpacks(VPlan &Plan);

  /// Materialize UF, VF and VFxUF to be computed explicitly using
  /// VPInstructions.
  static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH,
                                 ElementCount VF);

  /// Expand VPExpandSCEVRecipes in \p Plan's entry block. Each
  /// VPExpandSCEVRecipe is replaced with a live-in wrapping the expanded IR
  /// value. A mapping from SCEV expressions to their expanded IR value is
  /// returned.
  static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan,
                                                     ScalarEvolution &SE);

  /// Try to find a single VF among \p Plan's VFs for which all interleave
  /// groups (with known minimum VF elements) can be replaced by wide loads and
  /// stores processing VF elements, if all transformed interleave groups access
  /// the full vector width (checked via the maximum vector register width). If
  /// the transformation can be applied, the original \p Plan will be split in
  /// 2:
  /// 1. The original Plan with the single VF containing the optimized recipes
  ///    using wide loads instead of interleave groups.
  /// 2. A new clone which contains all VFs of Plan except the optimized VF.
  ///
  /// This effectively is a very simple form of loop-aware SLP, where we use
  /// interleave groups to identify candidates.
  static std::unique_ptr<VPlan>
  narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI);

  /// Adapts the vector loop region for tail folding by introducing a header
  /// mask and conditionally executing the content of the region:
  ///
  /// Vector loop region before:
  ///  +-------------------------------------------+
  ///  |%iv = ...                                  |
  ///  |...                                        |
  ///  |%iv.next = add %iv, vfxuf                  |
  ///  |branch-on-count %iv.next, vector-trip-count|
  ///  +-------------------------------------------+
  ///
  /// Vector loop region after:
  ///  +-------------------------------------------+
  ///  |%iv = ...                                  |
  ///  |%wide.iv = widen-canonical-iv ...          |
  ///  |%header-mask = icmp ule %wide.iv, BTC      |
  ///  |branch-on-cond %header-mask                |---+
  ///  +-------------------------------------------+   |
  ///                       |                          |
  ///                       v                          |
  ///  +-------------------------------------------+   |
  ///  | ...                                       |   |
  ///  +-------------------------------------------+   |
  ///                       |                          |
  ///                       v                          |
  ///  +-------------------------------------------+   |
  ///  |<phis> = phi [..., ...], [poison, header]  |   |
  ///  |%iv.next = add %iv, vfxuf                  |<--+
  ///  |branch-on-count %iv.next, vector-trip-count|
  ///  +-------------------------------------------+
  ///
  /// Any VPInstruction::ExtractLastLanes are also updated to extract from the
  /// last active lane of the header mask.
  static void foldTailByMasking(VPlan &Plan);

  /// Predicate and linearize the control-flow in the only loop region of
  /// \p Plan.
  static void introduceMasksAndLinearize(VPlan &Plan);

  /// Replace a VPWidenCanonicalIVRecipe if it is present in \p Plan, with a
  /// VPWidenIntOrFpInductionRecipe, provided it would not cause additional
  /// spills for \p VF at unroll factor \p UF.
  static void replaceWideCanonicalIVWithWideIV(
      VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI,
      TargetTransformInfo::TargetCostKind CostKind, ElementCount VF,
      unsigned UF, const SmallPtrSetImpl<const Value *> &ValuesToIgnore);

  /// Add branch weight metadata, if the \p Plan's middle block is terminated by
  /// a BranchOnCond recipe.
  static void
  addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF,
                                    std::optional<unsigned> VScaleForTuning);

  /// Adjust first-order recurrence users in the middle block: create
  /// penultimate element extracts for LCSSA phi users, and handle penultimate
  /// extracts of the last active lane edge.
  static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan,
                                                    VFRange &Range);

  /// Optimize FindLast reductions selecting IVs (or expressions of IVs) by
  /// converting them to FindIV reductions, if their IV range excludes a
  /// suitable sentinel value. For expressions of IVs, the expression is sunk
  /// to the middle block.
  static void optimizeFindIVReductions(VPlan &Plan,
                                       PredicatedScalarEvolution &PSE, Loop &L);

  /// Detect and create partial reduction recipes for scaled reductions in
  /// \p Plan. Must be called after recipe construction. If partial reductions
  /// are only valid for a subset of VFs in Range, Range.End is updated.
  static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx,
                                      VFRange &Range);

  /// Convert load/store VPInstructions in \p Plan into widened or replicate
  /// recipes. Non load/store input instructions are left unchanged.
  static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
                                         VPRecipeBuilder &RecipeBuilder);
};
|
|
|
|
} // namespace llvm
|
|
|
|
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
|