//===- LoopVectorizationPlanner.cpp - VF selection and planning -----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// /// \file /// This file implements VFSelectionContext methods for loop vectorization /// VF selection, independent of cost-modeling decisions. /// //===----------------------------------------------------------------------===// #include "LoopVectorizationPlanner.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" using namespace llvm; #define DEBUG_TYPE "loop-vectorize" static cl::opt MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop.")); static cl::opt UseWiderVFIfCallVariantsPresent( "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants")); static cl::opt ConsiderRegPressure( "vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden, cl::desc("Discard VFs if their register pressure is too high.")); static cl::opt ForceTargetSupportsScalableVectors( "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc( "Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing.")); cl::opt llvm::PreferInLoopReductions( "prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference.")); /// Note: This currently only applies to `llvm.masked.load` and /// `llvm.masked.store`. TODO: Extend this to cover other operations as needed. static cl::opt ForceTargetSupportsMaskedMemoryOps( "force-target-supports-masked-memory-ops", cl::init(false), cl::Hidden, cl::desc("Assume the target supports masked memory operations (used for " "testing).")); bool VFSelectionContext::isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const { return Legal->isConsecutivePtr(DataType, Ptr) && (ForceTargetSupportsMaskedMemoryOps || TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace)); } bool VFSelectionContext::isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const { return Legal->isConsecutivePtr(DataType, Ptr) && (ForceTargetSupportsMaskedMemoryOps || TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace)); } bool VFSelectionContext::isLegalGatherOrScatter(Value *V, ElementCount VF) const { bool LI = isa(V); bool SI = isa(V); if (!LI && !SI) return false; auto *Ty = getLoadStoreType(V); Align Align = getLoadStoreAlignment(V); if (VF.isVector()) Ty = VectorType::get(Ty, VF); return (LI && TTI.isLegalMaskedGather(Ty, Align)) || (SI && TTI.isLegalMaskedScatter(Ty, Align)); } bool VFSelectionContext::supportsScalableVectors() const { return TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors; } bool VFSelectionContext::useMaxBandwidth(bool IsScalable) const { TargetTransformInfo::RegisterKind RegKind = IsScalable ? TargetTransformInfo::RGK_ScalableVector : TargetTransformInfo::RGK_FixedWidthVector; return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && (TTI.shouldMaximizeVectorBandwidth(RegKind) || (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants()))); } bool VFSelectionContext::shouldConsiderRegPressureForVF(ElementCount VF) const { if (ConsiderRegPressure.getNumOccurrences()) return ConsiderRegPressure; // TODO: We should eventually consider register pressure for all targets. The // TTI hook is temporary whilst target-specific issues are being fixed. if (TTI.shouldConsiderVectorizationRegPressure()) return true; if (!useMaxBandwidth(VF.isScalable())) return false; // Only calculate register pressure for VFs enabled by MaxBandwidth. return ElementCount::isKnownGT( VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF : MaxPermissibleVFWithoutMaxBW.FixedVF); } ElementCount VFSelectionContext::clampVFByMaxTripCount( ElementCount VF, unsigned MaxTripCount, unsigned UserIC, bool FoldTailByMasking, bool RequiresScalarEpilogue) const { unsigned EstimatedVF = VF.getKnownMinValue(); if (VF.isScalable() && F.hasFnAttribute(Attribute::VScaleRange)) { auto Attr = F.getFnAttribute(Attribute::VScaleRange); auto Min = Attr.getVScaleRangeMin(); EstimatedVF *= Min; } // When a scalar epilogue is required, at least one iteration of the scalar // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a // max VF that results in a dead vector loop. if (MaxTripCount > 0 && RequiresScalarEpilogue) MaxTripCount -= 1; // When the user specifies an interleave count, we need to ensure that // VF * UserIC <= MaxTripCount to avoid a dead vector loop. unsigned IC = UserIC > 0 ? UserIC : 1; unsigned EstimatedVFTimesIC = EstimatedVF * IC; if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC && (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) { // If upper bound loop trip count (TC) is known at compile time there is no // point in choosing VF greater than TC / IC (as done in the loop below). // Select maximum power of two which doesn't exceed TC / IC. If VF is // scalable, we only fall back on a fixed VF when the TC is less than or // equal to the known number of lanes. auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount / IC); if (ClampedUpperTripCount == 0) ClampedUpperTripCount = 1; LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " "exceeding the constant trip count" << (UserIC > 0 ? " divided by UserIC" : "") << ": " << ClampedUpperTripCount << "\n"); return ElementCount::get(ClampedUpperTripCount, FoldTailByMasking ? VF.isScalable() : false); } return VF; } ElementCount VFSelectionContext::getMaximizedVFForTarget( unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, ElementCount MaxSafeVF, unsigned UserIC, bool FoldTailByMasking, bool RequiresScalarEpilogue) { bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); const TypeSize WidestRegister = TTI.getRegisterBitWidth( ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector : TargetTransformInfo::RGK_FixedWidthVector); // Convenience function to return the minimum of two ElementCounts. auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { assert((LHS.isScalable() == RHS.isScalable()) && "Scalable flags must match"); return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; }; // Ensure MaxVF is a power of 2; the dependence distance bound may not be. // Note that both WidestRegister and WidestType may not be a powers of 2. auto MaxVectorElementCount = ElementCount::get( llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType), ComputeScalableMaxVF); MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " << (MaxVectorElementCount * WidestType) << " bits.\n"); if (!MaxVectorElementCount) { LLVM_DEBUG(dbgs() << "LV: The target has no " << (ComputeScalableMaxVF ? "scalable" : "fixed") << " vector registers.\n"); return ElementCount::getFixed(1); } ElementCount MaxVF = clampVFByMaxTripCount(MaxVectorElementCount, MaxTripCount, UserIC, FoldTailByMasking, RequiresScalarEpilogue); // If the MaxVF was already clamped, there's no point in trying to pick a // larger one. if (MaxVF != MaxVectorElementCount) return MaxVF; if (MaxVF.isScalable()) MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF; else MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF; if (useMaxBandwidth(ComputeScalableMaxVF)) { auto MaxVectorElementCountMaxBW = ElementCount::get( llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), ComputeScalableMaxVF); MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); if (ElementCount MinVF = TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { if (ElementCount::isKnownLT(MaxVF, MinVF)) { LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF << ") with target's minimum: " << MinVF << '\n'); MaxVF = MinVF; } } MaxVF = clampVFByMaxTripCount(MaxVF, MaxTripCount, UserIC, FoldTailByMasking, RequiresScalarEpilogue); } return MaxVF; } std::optional llvm::getMaxVScale(const Function &F, const TargetTransformInfo &TTI) { if (std::optional MaxVScale = TTI.getMaxVScale()) return MaxVScale; if (F.hasFnAttribute(Attribute::VScaleRange)) return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); return std::nullopt; } bool VFSelectionContext::isScalableVectorizationAllowed() { if (IsScalableVectorizationAllowed) return *IsScalableVectorizationAllowed; IsScalableVectorizationAllowed = false; if (!supportsScalableVectors()) return false; if (Hints->isScalableVectorizationDisabled()) { reportVectorizationInfo("Scalable vectorization is explicitly disabled", "ScalableVectorizationDisabled", ORE, TheLoop); return false; } LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); auto MaxScalableVF = ElementCount::getScalable( std::numeric_limits::max()); // Test that the loop-vectorizer can legalize all operations for this MaxVF. // FIXME: While for scalable vectors this is currently sufficient, this should // be replaced by a more detailed mechanism that filters out specific VFs, // instead of invalidating vectorization for a whole set of VFs based on the // MaxVF. // Disable scalable vectorization if the loop contains unsupported reductions. if (!all_of(Legal->getReductionVars(), [&](const auto &Reduction) -> bool { return TTI.isLegalToVectorizeReduction(Reduction.second, MaxScalableVF); })) { reportVectorizationInfo( "Scalable vectorization not supported for the reduction " "operations found in this loop.", "ScalableVFUnfeasible", ORE, TheLoop); return false; } // Disable scalable vectorization if the loop contains any instructions // with element types not supported for scalable vectors. if (any_of(ElementTypesInLoop, [&](Type *Ty) { return !Ty->isVoidTy() && !TTI.isElementTypeLegalForScalableVector(Ty); })) { reportVectorizationInfo("Scalable vectorization is not supported " "for all element types found in this loop.", "ScalableVFUnfeasible", ORE, TheLoop); return false; } if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F, TTI)) { reportVectorizationInfo("The target does not provide maximum vscale value " "for safe distance analysis.", "ScalableVFUnfeasible", ORE, TheLoop); return false; } IsScalableVectorizationAllowed = true; return true; } ElementCount VFSelectionContext::getMaxLegalScalableVF(unsigned MaxSafeElements) { if (!isScalableVectorizationAllowed()) return ElementCount::getScalable(0); auto MaxScalableVF = ElementCount::getScalable( std::numeric_limits::max()); if (Legal->isSafeForAnyVectorWidth()) return MaxScalableVF; std::optional MaxVScale = getMaxVScale(F, TTI); // Limit MaxScalableVF by the maximum safe dependence distance. MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); if (!MaxScalableVF) reportVectorizationInfo( "Max legal vector width too small, scalable vectorization " "unfeasible.", "ScalableVFUnfeasible", ORE, TheLoop); return MaxScalableVF; } FixedScalableVFPair VFSelectionContext::computeFeasibleMaxVF( unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC, bool FoldTailByMasking, bool RequiresScalarEpilogue) { auto [SmallestType, WidestType] = getSmallestAndWidestTypes(); // Get the maximum safe dependence distance in bits computed by LAA. // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from // the memory accesses that is most restrictive (involved in the smallest // dependence distance). unsigned MaxSafeElementsPowerOf2 = llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); if (!Legal->isSafeForAnyStoreLoadForwardDistances()) { unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits(); MaxSafeElementsPowerOf2 = std::min(MaxSafeElementsPowerOf2, SLDist / WidestType); } auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2); auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2); if (!Legal->isSafeForAnyVectorWidth()) MaxSafeElements = MaxSafeElementsPowerOf2; LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF << ".\n"); LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF << ".\n"); // First analyze the UserVF, fall back if the UserVF should be ignored. if (UserVF) { auto MaxSafeUserVF = UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { // If `VF=vscale x N` is safe, then so is `VF=N` if (UserVF.isScalable()) return FixedScalableVFPair( ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); return UserVF; } assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it // is better to ignore the hint and let the compiler choose a suitable VF. if (!UserVF.isScalable()) { LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF << " is unsafe, clamping to max safe VF=" << MaxSafeFixedVF << ".\n"); ORE->emit([&]() { return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", TheLoop->getStartLoc(), TheLoop->getHeader()) << "User-specified vectorization factor " << ore::NV("UserVectorizationFactor", UserVF) << " is unsafe, clamping to maximum safe vectorization factor " << ore::NV("VectorizationFactor", MaxSafeFixedVF); }); return MaxSafeFixedVF; } if (!supportsScalableVectors()) { LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF << " is ignored because scalable vectors are not " "available.\n"); ORE->emit([&]() { return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", TheLoop->getStartLoc(), TheLoop->getHeader()) << "User-specified vectorization factor " << ore::NV("UserVectorizationFactor", UserVF) << " is ignored because the target does not support scalable " "vectors. The compiler will pick a more suitable value."; }); } else { LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF << " is unsafe. Ignoring scalable UserVF.\n"); ORE->emit([&]() { return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", TheLoop->getStartLoc(), TheLoop->getHeader()) << "User-specified vectorization factor " << ore::NV("UserVectorizationFactor", UserVF) << " is unsafe. Ignoring the hint to let the compiler pick a " "more suitable value."; }); } } LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " << WidestType << " bits.\n"); FixedScalableVFPair Result(ElementCount::getFixed(1), ElementCount::getScalable(0)); if (auto MaxVF = getMaximizedVFForTarget( MaxTripCount, SmallestType, WidestType, MaxSafeFixedVF, UserIC, FoldTailByMasking, RequiresScalarEpilogue)) Result.FixedVF = MaxVF; if (auto MaxVF = getMaximizedVFForTarget( MaxTripCount, SmallestType, WidestType, MaxSafeScalableVF, UserIC, FoldTailByMasking, RequiresScalarEpilogue)) if (MaxVF.isScalable()) { Result.ScalableVF = MaxVF; LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF << "\n"); } return Result; } std::pair VFSelectionContext::getSmallestAndWidestTypes() const { unsigned MinWidth = -1U; unsigned MaxWidth = 8; const DataLayout &DL = F.getDataLayout(); // For in-loop reductions, no element types are added to ElementTypesInLoop // if there are no loads/stores in the loop. In this case, check through the // reduction variables to determine the maximum width. if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { for (const auto &[_, RdxDesc] : Legal->getReductionVars()) { // When finding the min width used by the recurrence we need to account // for casts on the input operands of the recurrence. MinWidth = std::min( MinWidth, std::min(RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), RdxDesc.getRecurrenceType()->getScalarSizeInBits())); MaxWidth = std::max(MaxWidth, RdxDesc.getRecurrenceType()->getScalarSizeInBits()); } } else { for (Type *T : ElementTypesInLoop) { MinWidth = std::min( MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); MaxWidth = std::max( MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); } } return {MinWidth, MaxWidth}; } void VFSelectionContext::collectElementTypesForWidening( const SmallPtrSetImpl *ValuesToIgnore) { ElementTypesInLoop.clear(); // For each block. for (BasicBlock *BB : TheLoop->blocks()) { // For each instruction in the loop. for (Instruction &I : *BB) { Type *T = I.getType(); // Skip ignored values. if (ValuesToIgnore && ValuesToIgnore->contains(&I)) continue; // Only examine Loads, Stores and PHINodes. if (!isa(I)) continue; // Examine PHI nodes that are reduction variables. Update the type to // account for the recurrence type. if (auto *PN = dyn_cast(&I)) { if (!Legal->isReductionVariable(PN)) continue; const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(PN); if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || TTI.preferInLoopReduction(RdxDesc.getRecurrenceKind(), RdxDesc.getRecurrenceType())) continue; T = RdxDesc.getRecurrenceType(); } // Examine the stored values. if (auto *ST = dyn_cast(&I)) T = ST->getValueOperand()->getType(); assert(T->isSized() && "Expected the load/store/recurrence type to be sized"); ElementTypesInLoop.insert(T); } } } void VFSelectionContext::initializeVScaleForTuning() { if (!supportsScalableVectors()) return; if (F.hasFnAttribute(Attribute::VScaleRange)) { auto Attr = F.getFnAttribute(Attribute::VScaleRange); auto Min = Attr.getVScaleRangeMin(); auto Max = Attr.getVScaleRangeMax(); if (Max && Min == Max) { VScaleForTuning = Max; return; } } VScaleForTuning = TTI.getVScaleForTuning(); } bool VFSelectionContext::useOrderedReductions( const RecurrenceDescriptor &RdxDesc) const { return !Hints->allowReordering() && RdxDesc.isOrdered(); } bool VFSelectionContext::runtimeChecksRequired() { LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); Loop *L = const_cast(TheLoop); if (Legal->getRuntimePointerChecking()->Need) { reportVectorizationFailure( "Runtime ptr check is required with -Os/-Oz", "runtime pointer checks needed. Enable vectorization of this " "loop with '#pragma clang loop vectorize(enable)' when " "compiling with -Os/-Oz", "CantVersionLoopWithOptForSize", ORE, L); return true; } if (!PSE.getPredicate().isAlwaysTrue()) { reportVectorizationFailure( "Runtime SCEV check is required with -Os/-Oz", "runtime SCEV checks needed. Enable vectorization of this " "loop with '#pragma clang loop vectorize(enable)' when " "compiling with -Os/-Oz", "CantVersionLoopWithOptForSize", ORE, L); return true; } // FIXME: Avoid specializing for stride==1 instead of bailing out. if (!Legal->getLAI()->getSymbolicStrides().empty()) { reportVectorizationFailure( "Runtime stride check for small trip count", "runtime stride == 1 checks needed. Enable vectorization of " "this loop without such check by compiling with -Os/-Oz", "CantVersionLoopWithOptForSize", ORE, L); return true; } return false; } void VFSelectionContext::computeMinimalBitwidths() { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); } void VFSelectionContext::collectInLoopReductions() { // Avoid duplicating work finding in-loop reductions. if (!InLoopReductions.empty()) return; for (const auto &Reduction : Legal->getReductionVars()) { PHINode *Phi = Reduction.first; const RecurrenceDescriptor &RdxDesc = Reduction.second; // Multi-use reductions (e.g., used in FindLastIV patterns) are handled // separately and should not be considered for in-loop reductions. if (RdxDesc.hasUsesOutsideReductionChain()) continue; // We don't collect reductions that are type promoted (yet). if (RdxDesc.getRecurrenceType() != Phi->getType()) continue; // In-loop AnyOf and FindIV reductions are not yet supported. RecurKind Kind = RdxDesc.getRecurrenceKind(); if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) || RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) || RecurrenceDescriptor::isFindLastRecurrenceKind(Kind)) continue; // If the target would prefer this reduction to happen "in-loop", then we // want to record it as such. if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && !TTI.preferInLoopReduction(Kind, Phi->getType())) continue; // Check that we can correctly put the reductions into the loop, by // finding the chain of operations that leads from the phi to the loop // exit value. SmallVector ReductionOperations = RdxDesc.getReductionOpChain(Phi, const_cast(TheLoop)); bool InLoop = !ReductionOperations.empty(); if (InLoop) { InLoopReductions.insert(Phi); // Add the elements to InLoopReductionImmediateChains for cost modelling. Instruction *LastChain = Phi; for (auto *I : ReductionOperations) { InLoopReductionImmediateChains[I] = LastChain; LastChain = I; } } LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") << " reduction for phi: " << *Phi << "\n"); } }