Add a new cost-based transform that replaces VPWidenCanonicalIVRecipe with a canonical VPWidenIntOrFpInductionPHIRecipe, if it does not increase spills. The main benefit of VPWidenCanonicalIVRecipe is that it has shorter live-ranges than wide IV phis. The new transform introduces a wide IV unless VPWidenCanonicalIVRecipe is cheaper or the wide IV introduces additional spills. This introduces wide IVs in a number of cases where we previously had VPWidenCanonicalIVRecipe, because there was no existing wide canonical IV we could re-use. It should also help avoid somewhat unrelated changes in https://github.com/llvm/llvm-project/pull/190191. PR: https://github.com/llvm/llvm-project/pull/194267
549 lines
26 KiB
C++
//===- VPlanTransforms.h - Utility VPlan to VPlan transforms --------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
///
|
|
/// \file
|
|
/// This file provides utility VPlan to VPlan transformations.
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
|
|
#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
|
|
|
|
#include "VPlan.h"
|
|
#include "VPlanVerifier.h"
|
|
#include "llvm/ADT/STLFunctionalExtras.h"
|
|
#include "llvm/ADT/ScopeExit.h"
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
|
#include "llvm/Support/CommandLine.h"
|
|
#include "llvm/Support/Compiler.h"
|
|
#include "llvm/Support/Regex.h"
|
|
|
|
namespace llvm {
|
|
|
|
class InductionDescriptor;
|
|
class Instruction;
|
|
class Loop;
|
|
class LoopVersioning;
|
|
class OptimizationRemarkEmitter;
|
|
class PHINode;
|
|
class ScalarEvolution;
|
|
class PredicatedScalarEvolution;
|
|
class TargetLibraryInfo;
|
|
class TargetTransformInfo;
|
|
class VPBuilder;
|
|
class VPRecipeBuilder;
|
|
struct VFRange;
|
|
|
|
LLVM_ABI_FOR_TEST extern cl::opt<bool> VerifyEachVPlan;
|
|
LLVM_ABI_FOR_TEST extern cl::opt<bool> EnableWideActiveLaneMask;
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
LLVM_ABI_FOR_TEST extern cl::opt<bool> VPlanPrintAfterAll;
|
|
LLVM_ABI_FOR_TEST extern cl::list<std::string> VPlanPrintAfterPasses;
|
|
LLVM_ABI_FOR_TEST extern cl::opt<bool> VPlanPrintVectorRegionScope;
|
|
#endif
|
|
|
|
/// Collection of VPlan-to-VPlan transformations and related helpers, applied
/// while constructing and optimizing VPlans in the loop vectorizer. All
/// members are static; this struct acts as a namespace with access control.
struct VPlanTransforms {
  /// Helper to run a VPlan pass \p Pass on \p VPlan, forwarding extra arguments
  /// to the pass. Performs verification/printing after each VPlan pass if
  /// requested via command line options. If \p EnableVerify is false,
  /// verification is skipped even when requested on the command line.
  template <bool EnableVerify = true, typename PassTy, typename... ArgsTy>
  static decltype(auto) runPass(StringRef PassName, PassTy &&Pass, VPlan &Plan,
                                ArgsTy &&...Args) {
    // Printing and verification run after the pass returns, via scope_exit.
    scope_exit PostTransformActions{[&]() {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      // Make sure to print before verification, so that output is more useful
      // in case of failures:
      if (VPlanPrintAfterAll ||
          (VPlanPrintAfterPasses.getNumOccurrences() > 0 &&
           any_of(VPlanPrintAfterPasses, [PassName](StringRef Entry) {
             // Each -vplan-print-after entry is treated as a regex matched
             // against the pass name.
             return Regex(Entry).match(PassName);
           }))) {
        dbgs()
            << "VPlan for loop in '"
            << Plan.getScalarHeader()->getIRBasicBlock()->getParent()->getName()
            << "' after " << PassName << '\n';
        if (VPlanPrintVectorRegionScope && Plan.getVectorLoopRegion())
          Plan.getVectorLoopRegion()->print(dbgs());
        else
          dbgs() << Plan << '\n';
      }
#endif
      if (VerifyEachVPlan && EnableVerify) {
        if (!verifyVPlanIsValid(Plan))
          report_fatal_error("Broken VPlan found, compilation aborted!");
      }
    }};

    return std::forward<PassTy>(Pass)(Plan, std::forward<ArgsTy>(Args)...);
  }
// Convenience wrappers around runPass that pass the stringified pass name.
#define RUN_VPLAN_PASS(PASS, ...)                                              \
  llvm::VPlanTransforms::runPass(#PASS, PASS, __VA_ARGS__)
#define RUN_VPLAN_PASS_NO_VERIFY(PASS, ...)                                    \
  llvm::VPlanTransforms::runPass<false>(#PASS, PASS, __VA_ARGS__)

  /// Create a base VPlan0, serving as the common starting point for all later
  /// candidates. It consists of an initial plain CFG loop with loop blocks from
  /// \p TheLoop being directly translated to VPBasicBlocks with VPInstruction
  /// corresponding to the input IR.
  ///
  /// The created loop is wrapped in an initial skeleton to facilitate
  /// vectorization, consisting of a vector pre-header, an exit block for the
  /// main vector loop (middle.block) and a new block as preheader of the scalar
  /// loop (scalar.ph). See below for an illustration. It also adds a canonical
  /// IV and its increment, using \p InductionTy and \p IVDL, and creates a
  /// VPValue expression for the original trip count.
  ///
  ///  [ ]      <-- Plan's entry VPIRBasicBlock, wrapping the original loop's
  ///  / \          old preheader. Will contain iteration number check and SCEV
  /// |   |         expansions.
  /// |   |
  /// /   v
  /// |  [ ]     <-- vector loop bypass (may consist of multiple blocks) will
  /// |  / |         be added later.
  /// | /  v
  /// ||  [ ]    <-- vector pre header.
  /// |/   |
  /// |    v
  /// |   [ ] \  <-- plain CFG loop wrapping original loop to be vectorized.
  /// |   [ ]_|
  /// |    |
  /// |    v
  /// |   [ ]    <--- middle-block with the branch to successors.
  /// |   / |
  /// |  /  |
  /// | |   v
  /// \--->[ ]   <--- scalar preheader (initially a VPBasicBlock, which will be
  ///  |    |         replaced later by a VPIRBasicBlock wrapping the scalar
  ///  |    |         preheader basic block).
  ///  |    |
  ///  |    v    <-- edge from middle to exit iff epilogue is not required.
  ///  |   [ ] \
  ///  |   [ ]_| <-- old scalar loop to handle remainder (scalar epilogue,
  ///  |    |        header wrapped in VPIRBasicBlock).
  ///   \   |
  ///    \  v
  ///     >[ ]   <-- original loop exit block(s), wrapped in VPIRBasicBlocks.
  LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan>
  buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL,
              PredicatedScalarEvolution &PSE, LoopVersioning *LVer = nullptr);

  /// Replace VPPhi recipes in \p Plan's header with corresponding
  /// VPHeaderPHIRecipe subclasses for inductions, reductions, and
  /// fixed-order recurrences. This processes all header phis and creates
  /// the appropriate widened recipe for each one. For fixed-order
  /// recurrences, also creates FirstOrderRecurrenceSplice instructions and
  /// sinks/hoists users as needed. Returns false if any fixed-order
  /// recurrence cannot be handled.
  static bool createHeaderPhiRecipes(
      VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &OrigLoop,
      const MapVector<PHINode *, InductionDescriptor> &Inductions,
      const MapVector<PHINode *, RecurrenceDescriptor> &Reductions,
      const SmallPtrSetImpl<const PHINode *> &FixedOrderRecurrences,
      const SmallPtrSetImpl<PHINode *> &InLoopReductions, bool AllowReordering);

  /// Create VPReductionRecipes for in-loop reductions. This processes chains
  /// of operations contributing to in-loop reductions and creates appropriate
  /// VPReductionRecipe instances.
  static void createInLoopReductionRecipes(
      VPlan &Plan, const DenseSet<BasicBlock *> &BlocksNeedingPredication,
      ElementCount MinVF);

  /// Update \p Plan to account for all early exits. If \p Style is not
  /// NoUncountableExit, handles uncountable early exits and checks that all
  /// loads are dereferenceable. Returns false if a non-dereferenceable load is
  /// found.
  LLVM_ABI_FOR_TEST static bool
  handleEarlyExits(VPlan &Plan, UncountableExitStyle Style, Loop *TheLoop,
                   PredicatedScalarEvolution &PSE, DominatorTree &DT,
                   AssumptionCache *AC);

  /// If a check is needed to guard executing the scalar epilogue loop, it will
  /// be added to the middle block.
  LLVM_ABI_FOR_TEST static void addMiddleCheck(VPlan &Plan, bool TailFolded);

  /// Create a check to \p Plan to see if the vector loop should be executed.
  /// If \p CheckBlock is non-null, the compare and branch are placed there;
  /// ExpandSCEV recipes are always placed in Entry.
  static void addMinimumIterationCheck(
      VPlan &Plan, ElementCount VF, unsigned UF,
      ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue,
      bool TailFolded, Loop *OrigLoop, const uint32_t *MinItersBypassWeights,
      DebugLoc DL, PredicatedScalarEvolution &PSE,
      VPBasicBlock *CheckBlock = nullptr);

  /// Add a new check block before the vector preheader to \p Plan to check if
  /// the main vector loop should be executed (TC >= VF * UF).
  static void
  addIterationCountCheckBlock(VPlan &Plan, ElementCount VF, unsigned UF,
                              bool RequiresScalarEpilogue, Loop *OrigLoop,
                              const uint32_t *MinItersBypassWeights,
                              DebugLoc DL, PredicatedScalarEvolution &PSE);

  /// Add a check to \p Plan to see if the epilogue vector loop should be
  /// executed.
  static void addMinimumVectorEpilogueIterationCheck(
      VPlan &Plan, Value *VectorTripCount, bool RequiresScalarEpilogue,
      ElementCount EpilogueVF, unsigned EpilogueUF, unsigned MainLoopStep,
      unsigned EpilogueLoopStep, ScalarEvolution &SE);

  /// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's
  /// flat CFG into a hierarchical CFG.
  LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan);

  /// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
  /// VPValue and connect the block to \p Plan, using the VPValue as branch
  /// condition.
  static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock,
                               bool AddBranchWeights);

  /// Replaces the VPInstructions in \p Plan with corresponding
  /// widen recipes. Returns false if any VPInstructions could not be converted
  /// to a wide recipe if needed.
  LLVM_ABI_FOR_TEST static bool
  tryToConvertVPInstructionsToVPRecipes(VPlan &Plan,
                                        const TargetLibraryInfo &TLI);

  /// Try to legalize reductions with multiple in-loop uses. Currently only
  /// strict and non-strict min/max reductions used by FindLastIV reductions are
  /// supported, corresponding to computing the first and last argmin/argmax,
  /// respectively. Otherwise return false.
  static bool handleMultiUseReductions(VPlan &Plan,
                                       OptimizationRemarkEmitter *ORE,
                                       Loop *TheLoop);

  /// Check if \p Plan contains any FMaxNum or FMinNum reductions. If they do,
  /// try to update the vector loop to exit early if any input is NaN and resume
  /// executing in the scalar loop to handle the NaNs there. Return false if
  /// this attempt was unsuccessful.
  static bool handleMaxMinNumReductions(VPlan &Plan);

  /// Check if \p Plan contains any FindLast reductions. If it does, try to
  /// update the vector loop to save the appropriate state using selects
  /// for entire vectors for both the latest mask containing at least one active
  /// element and the corresponding data vector. Return false if this attempt
  /// was unsuccessful.
  static bool handleFindLastReductions(VPlan &Plan);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  static void clearReductionWrapFlags(VPlan &Plan);

  /// Explicitly unroll \p Plan by \p UF.
  static void unrollByUF(VPlan &Plan, unsigned UF);

  /// Replace replicating VPReplicateRecipe, VPScalarIVStepsRecipe and
  /// VPInstruction in \p Plan with \p VF single-scalar recipes. Replicate
  /// regions are dissolved by replicating their blocks and their recipes \p VF
  /// times.
  /// TODO: Also dissolve replicate regions with live outs.
  static void replicateByVF(VPlan &Plan, ElementCount VF);

  /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
  /// resulting plan to \p BestVF and \p BestUF.
  static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                 unsigned BestUF,
                                 PredicatedScalarEvolution &PSE);

  /// Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL
  /// is known to be <= VF, replacing them with the AVL directly.
  static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF,
                               PredicatedScalarEvolution &PSE);

  /// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe
  /// optimizations, dead recipe removal, replicate region optimizations and
  /// block merging.
  LLVM_ABI_FOR_TEST static void optimize(VPlan &Plan);

  /// Remove redundant VPBasicBlocks by merging them into their single
  /// predecessor if the latter has a single successor.
  static bool mergeBlocksIntoPredecessors(VPlan &Plan);

  /// Wrap predicated VPReplicateRecipes with a mask operand in an if-then
  /// region block and remove the mask operand. Optimize the created regions by
  /// iteratively sinking scalar operands into the region, followed by merging
  /// regions until no improvements are remaining.
  static void createAndOptimizeReplicateRegions(VPlan &Plan);

  /// Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an
  /// (active-lane-mask recipe, wide canonical IV, trip-count). If \p
  /// UseActiveLaneMaskForControlFlow is true, introduce an
  /// VPActiveLaneMaskPHIRecipe.
  static void addActiveLaneMask(VPlan &Plan,
                                bool UseActiveLaneMaskForControlFlow);

  /// Insert truncates and extends for any truncated recipe. Redundant casts
  /// will be folded later.
  static void
  truncateToMinimalBitwidths(VPlan &Plan,
                             const MapVector<Instruction *, uint64_t> &MinBWs);

  /// Replace symbolic strides from \p StridesMap in \p Plan with constants when
  /// possible.
  static void
  replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE,
                         const DenseMap<Value *, const SCEV *> &StridesMap);

  /// Drop poison flags from recipes that may generate a poison value that is
  /// used after vectorization, even when their operands are not poison. Those
  /// recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  /// Uses \p BlockNeedsPredication to check if a block needs predicating.
  /// TODO: Replace BlockNeedsPredication callback with retrieving info from
  ///       VPlan directly.
  static void dropPoisonGeneratingRecipes(
      VPlan &Plan,
      const std::function<bool(BasicBlock *)> &BlockNeedsPredication);

  /// Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
  /// replaces all uses of the canonical IV except for the canonical IV
  /// increment with a VPCurrentIterationPHIRecipe. The canonical IV is only
  /// used to control the loop after this transformation.
  static void
  addExplicitVectorLength(VPlan &Plan,
                          const std::optional<unsigned> &MaxEVLSafeElements);

  /// Optimize recipes which use an EVL-based header mask to VP intrinsics, for
  /// example:
  ///
  ///   %mask = icmp ult step-vector, EVL
  ///   %load = load %ptr, %mask
  ///   -->
  ///   %load = vp.load %ptr, EVL
  static void optimizeEVLMasks(VPlan &Plan);

  /// For each Interleave Group in \p InterleaveGroups replace the Recipes
  /// widening its memory instructions with a single VPInterleaveRecipe at its
  /// insertion point.
  static void createInterleaveGroups(
      VPlan &Plan,
      const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
          &InterleaveGroups,
      VPRecipeBuilder &RecipeBuilder, const bool &EpilogueAllowed);

  /// Remove dead recipes from \p Plan.
  static void removeDeadRecipes(VPlan &Plan);

  /// Update \p Plan to account for uncountable early exits by introducing
  /// appropriate branching logic in the latch that handles early exits and the
  /// latch exit condition. Multiple exits are handled with a dispatch block
  /// that determines which exit to take based on lane-by-lane semantics.
  static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB,
                                          VPBasicBlock *LatchVPBB,
                                          VPBasicBlock *MiddleVPBB,
                                          UncountableExitStyle Style);

  /// Replaces the exit condition from
  ///   (branch-on-cond eq CanonicalIVInc, VectorTripCount)
  /// to
  ///   (branch-on-cond eq AVLNext, 0)
  static void convertEVLExitCond(VPlan &Plan);

  /// Replace loop regions with explicit CFG.
  static void dissolveLoopRegions(VPlan &Plan);

  /// Expand BranchOnTwoConds instructions into explicit CFG with
  /// BranchOnCond instructions. Should be called after dissolveLoopRegions.
  static void expandBranchOnTwoConds(VPlan &Plan);

  /// Transform loops with variable-length stepping after region
  /// dissolution.
  ///
  /// Once loop regions are replaced with explicit CFG, loops can step with
  /// variable vector lengths instead of fixed lengths. This transformation:
  /// * Makes CurrentIteration-Phi concrete.
  /// * Removes CanonicalIV and increment.
  static void convertToVariableLengthStep(VPlan &Plan);

  /// Lower abstract recipes to concrete ones, that can be codegen'd.
  static void convertToConcreteRecipes(VPlan &Plan);

  /// This function converts initial recipes to the abstract recipes and clamps
  /// \p Range based on cost model for following optimizations and cost
  /// estimations. The converted abstract recipes will lower to concrete
  /// recipes before codegen.
  static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
                                       VFRange &Range);

  /// Perform instcombine-like simplifications on recipes in \p Plan.
  static void simplifyRecipes(VPlan &Plan);

  /// Remove BranchOnCond recipes with true or false conditions together with
  /// removing dead edges to their successors. If \p OnlyLatches is true, only
  /// process loop latches.
  static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches = false);

  /// Perform common-subexpression-elimination on \p Plan.
  static void cse(VPlan &Plan);

  /// If there's a single exit block, optimize its phi recipes that use exiting
  /// IV values by feeding them precomputed end values instead, possibly taken
  /// one step backwards.
  static void optimizeInductionLiveOutUsers(VPlan &Plan,
                                            PredicatedScalarEvolution &PSE,
                                            bool FoldTail);

  /// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's
  /// entry block if they are used as vectors.
  static void materializeBroadcasts(VPlan &Plan);

  /// Hoist single-scalar loads with invariant addresses out of the vector loop
  /// to the preheader, if they are proven not to alias with any stores in the
  /// plan using noalias metadata.
  static void hoistInvariantLoads(VPlan &Plan);

  /// Hoist predicated loads from the same address to the loop entry block, if
  /// they are guaranteed to execute on both paths (i.e., in replicate regions
  /// with complementary masks P and NOT P).
  static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE,
                                   const Loop *L);

  /// Sink predicated stores to the same address with complementary predicates
  /// (P and NOT P) to an unconditional store with select recipes for the
  /// stored values. This eliminates branching overhead when all paths
  /// unconditionally store to the same location.
  static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE,
                                   const Loop *L);

  /// Materialize vector trip counts for constants early if it can simply be
  /// computed as (Original TC / VF * UF) * VF * UF.
  static void
  materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF,
                                     unsigned BestUF,
                                     PredicatedScalarEvolution &PSE);

  /// Materialize vector trip count computations to a set of VPInstructions.
  /// \p Step is used as the step value for the trip count computation.
  /// \p MaxRuntimeStep is the maximum possible runtime value of Step, used to
  /// prove the trip count is divisible by the step for scalable VFs.
  static void materializeVectorTripCount(
      VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
      bool RequiresScalarEpilogue, VPValue *Step,
      std::optional<uint64_t> MaxRuntimeStep = std::nullopt);

  /// Materialize the backedge-taken count to be computed explicitly using
  /// VPInstructions.
  static void materializeBackedgeTakenCount(VPlan &Plan,
                                            VPBasicBlock *VectorPH);

  /// Add explicit Build[Struct]Vector recipes to Pack multiple scalar values
  /// into vectors and Unpack recipes to extract scalars from vectors as
  /// needed.
  static void materializePacksAndUnpacks(VPlan &Plan);

  /// Materialize UF, VF and VFxUF to be computed explicitly using
  /// VPInstructions.
  static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH,
                                 ElementCount VF);

  /// Expand VPExpandSCEVRecipes in \p Plan's entry block. Each
  /// VPExpandSCEVRecipe is replaced with a live-in wrapping the expanded IR
  /// value. A mapping from SCEV expressions to their expanded IR value is
  /// returned.
  static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan,
                                                     ScalarEvolution &SE);

  /// Try to find a single VF among \p Plan's VFs for which all interleave
  /// groups (with known minimum VF elements) can be replaced by wide loads and
  /// stores processing VF elements, if all transformed interleave groups access
  /// the full vector width (checked via the maximum vector register width). If
  /// the transformation can be applied, the original \p Plan will be split in
  /// 2:
  /// 1. The original Plan with the single VF containing the optimized recipes
  ///    using wide loads instead of interleave groups.
  /// 2. A new clone which contains all VFs of Plan except the optimized VF.
  ///
  /// This effectively is a very simple form of loop-aware SLP, where we use
  /// interleave groups to identify candidates.
  static std::unique_ptr<VPlan>
  narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI);

  /// Adapts the vector loop region for tail folding by introducing a header
  /// mask and conditionally executing the content of the region:
  ///
  /// Vector loop region before:
  ///  +-------------------------------------------+
  ///  |%iv = ...                                  |
  ///  |...                                        |
  ///  |%iv.next = add %iv, vfxuf                  |
  ///  |branch-on-count %iv.next, vector-trip-count|
  ///  +-------------------------------------------+
  ///
  /// Vector loop region after:
  ///  +-------------------------------------------+
  ///  |%iv = ...                                  |
  ///  |%wide.iv = widen-canonical-iv ...          |
  ///  |%header-mask = icmp ule %wide.iv, BTC      |
  ///  |branch-on-cond %header-mask                |---+
  ///  +-------------------------------------------+   |
  ///                       |                          |
  ///                       v                          |
  ///  +-------------------------------------------+   |
  ///  | ...                                       |   |
  ///  +-------------------------------------------+   |
  ///                       |                          |
  ///                       v                          |
  ///  +-------------------------------------------+   |
  ///  |<phis> = phi [..., ...], [poison, header]  |   |
  ///  |%iv.next = add %iv, vfxuf                  |<--+
  ///  |branch-on-count %iv.next, vector-trip-count|
  ///  +-------------------------------------------+
  ///
  /// Any VPInstruction::ExtractLastLanes are also updated to extract from the
  /// last active lane of the header mask.
  static void foldTailByMasking(VPlan &Plan);

  /// Predicate and linearize the control-flow in the only loop region of
  /// \p Plan.
  static void introduceMasksAndLinearize(VPlan &Plan);

  /// Replace a VPWidenCanonicalIVRecipe if it is present in \p Plan, with a
  /// VPWidenIntOrFpInductionRecipe, provided it would not cause additional
  /// spills for \p VF at unroll factor \p UF.
  static void replaceWideCanonicalIVWithWideIV(
      VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI,
      TargetTransformInfo::TargetCostKind CostKind, ElementCount VF,
      unsigned UF, const SmallPtrSetImpl<const Value *> &ValuesToIgnore);

  /// Add branch weight metadata, if the \p Plan's middle block is terminated by
  /// a BranchOnCond recipe.
  static void
  addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF,
                                    std::optional<unsigned> VScaleForTuning);

  /// Adjust first-order recurrence users in the middle block: create
  /// penultimate element extracts for LCSSA phi users, and handle penultimate
  /// extracts of the last active lane edge.
  static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan,
                                                    VFRange &Range);

  /// Optimize FindLast reductions selecting IVs (or expressions of IVs) by
  /// converting them to FindIV reductions, if their IV range excludes a
  /// suitable sentinel value. For expressions of IVs, the expression is sunk
  /// to the middle block.
  static void optimizeFindIVReductions(VPlan &Plan,
                                       PredicatedScalarEvolution &PSE, Loop &L);

  /// Detect and create partial reduction recipes for scaled reductions in
  /// \p Plan. Must be called after recipe construction. If partial reductions
  /// are only valid for a subset of VFs in Range, Range.End is updated.
  static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx,
                                      VFRange &Range);

  /// Convert load/store VPInstructions in \p Plan into widened or replicate
  /// recipes. Non load/store input instructions are left unchanged.
  static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
                                         VPRecipeBuilder &RecipeBuilder);
};
|
|
|
|
} // namespace llvm
|
|
|
|
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
|