//===---------------------- AMDGPUNextUseAnalysis.cpp ---------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the AMDGPUNextUseAnalysis pass, a machine-level analysis // that computes the distance from each instruction to the "nearest" next use of // every live virtual register. These distances guide register spilling // decisions by identifying which live values are furthest from their next use // and are therefore the best candidates to spill. // // The analysis is based on the Braun & Hack CC'09 paper "Register Spilling and // Live-Range Splitting for SSA-Form Programs." // // Key concepts: // // NextUseDistance A loop-depth-weighted instruction count representing // how far away a register's next use is. Distances // through deeper loops are scaled by fromLoopDepth() so // that uses inside hot loops appear closer. // // Inter-block Pre-computed shortest weighted distances between all // distances pairs of basic blocks, used to efficiently answer // cross-block next-use queries. Each intermediate block // is weighted by fromLoopDepth() applied once per loop // boundary crossing relative to the destination. // // Configuration flags (see Config struct in the header): // // CountPhis Count PHI instructions toward distance and block size. // ForwardOnly Restrict inter-block distances to forward-reachable // paths. // PreciseUseModeling Model PHI uses at their incoming edge block and filter // uses with intermediate redefinitions. // PromoteToPreheader Route loop-entry and inner-loop uses to the preheader. 
// // This file contains: // // - Command-line options for configuration and debug output // - LiveRegUse / JSON helpers // - AMDGPUNextUseAnalysisImpl (the main analysis implementation) // - Instruction ID assignment and block size computation // - CFG path pre-computation (reachability, loop depth, back-edges) // - Inter-block distance computation // - Per-register next-use distance queries and caching // - AMDGPUNextUseAnalysis (public facade, pimpl) // - Legacy and new pass manager wrappers // //===----------------------------------------------------------------------===// #include "AMDGPUNextUseAnalysis.h" #include "AMDGPU.h" #include "GCNRegPressure.h" #include "GCNSubtarget.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/IR/ModuleSlotTracker.h" #include "llvm/InitializePasses.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/JSON.h" #include "llvm/Support/Timer.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/raw_ostream.h" #include #include #include using namespace llvm; #define DEBUG_TYPE "amdgpu-next-use-analysis" //============================================================================== // Options etc //============================================================================== namespace { cl::opt DistanceCacheEnabled("amdgpu-next-use-analysis-distance-cache", cl::init(true), cl::Hidden, cl::desc("Enable live-reg-use distance cache")); cl::opt DumpNextUseDistanceAsJson("amdgpu-next-use-analysis-dump-distance-as-json", cl::Hidden); cl::opt DumpNextUseDistanceDefToUse( "amdgpu-next-use-analysis-dump-distance-def-to-use", cl::init(false), cl::Hidden); cl::opt DumpNextUseDistanceVerbose("amdgpu-next-use-analysis-dump-distance-verbose", cl::init(false), cl::Hidden); // 'graphics' 
and 'compute' modes arose due to initial competing implementations // of next-use analysis that emphasized different types of workloads. This // implementation is a compromise that combines aspects of both. Over time, the // hope is we will be able to remove some of these differences and settle on a // more unified implementation. cl::opt ConfigPresetOpt("amdgpu-next-use-analysis-config", cl::Hidden, cl::init("graphics"), cl::desc("Config preset: 'graphics' or 'compute'")); cl::opt ConfigCountPhisOpt( "amdgpu-next-use-analysis-count-phis", cl::Hidden, cl::desc("Count PHI instructions toward distance and block size")); cl::opt ConfigForwardOnlyOpt( "amdgpu-next-use-analysis-forward-only", cl::Hidden, cl::desc("Restrict inter-block distances to forward-reachable paths")); cl::opt ConfigPreciseUseModelingOpt( "amdgpu-next-use-analysis-precise-use-modeling", cl::Hidden, cl::desc("Model PHI uses via incoming edge block with loop-aware " "reachability filtering")); cl::opt ConfigPromoteToPreheaderOpt( "amdgpu-next-use-analysis-use-preheader-model", cl::Hidden, cl::desc("Promote loop-entry and inner-loop uses to the loop preheader")); } // namespace //============================================================================== // LiveRegUse - Represents a live register use with its distance. Used for // tracking and sorting register uses by distance. //============================================================================== namespace { using UseDistancePair = AMDGPUNextUseAnalysis::UseDistancePair; struct LiveRegUse : public UseDistancePair { // 'nullptr' indicates an unset/invalid state. 
LiveRegUse() : UseDistancePair(nullptr, 0) {} LiveRegUse(const MachineOperand *Use, NextUseDistance Dist) : UseDistancePair(Use, Dist) {} LiveRegUse(const UseDistancePair &P) : UseDistancePair(P) {} bool isUnset() const { return Use == nullptr; } Register getReg() const { return Use->getReg(); } unsigned getSubReg() const { return Use->getSubReg(); } LaneBitmask getLaneMask(const SIRegisterInfo *TRI) const { return TRI->getSubRegIndexLaneMask(Use->getSubReg()); } bool isCloserThan(const LiveRegUse &X) const { if (Dist < X.Dist) return true; if (Dist > X.Dist) return false; if (Use == X.Use) return false; // Ugh. When !CountPhis, PHIs and the first non-PHI instruction have id // 0. In this case, consider PHIs as less than the first non-PHI // instruction. const MachineInstr *ThisMI = Use->getParent(); const MachineInstr *XMI = X.Use->getParent(); const MachineBasicBlock *ThisMBB = ThisMI->getParent(); if (ThisMBB == XMI->getParent()) { if (ThisMI->isPHI() && !XMI->isPHI() && XMI == &(*ThisMBB->getFirstNonPHI())) return true; } // Ensure deterministic results return X.getReg() < getReg(); } void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr, const MachineRegisterInfo *MRI = nullptr) const { if (isUnset()) { OS << ""; return; } Dist.print(OS); OS << " [" << printReg(getReg(), TRI, getSubReg(), MRI) << "]"; } LLVM_DUMP_METHOD void dump() const { print(dbgs()); dbgs() << '\n'; } }; inline bool updateClosest(LiveRegUse &Closest, const LiveRegUse &X) { if (!Closest.Use || X.isCloserThan(Closest)) { Closest = X; return true; } return false; } inline bool updateFurthest(LiveRegUse &Furthest, const LiveRegUse &X) { if (!Furthest.Use || Furthest.isCloserThan(X)) { Furthest = X; return true; } return false; } } // namespace //============================================================================== // JSON helpers //============================================================================== namespace { template void printStringAttr(json::OStream &J, 
const char *Name, Lambda L) { J.attributeBegin(Name); raw_ostream &OS = J.rawValueBegin(); OS << '"'; L(OS); OS << '"'; J.rawValueEnd(); J.attributeEnd(); } void printStringAttr(json::OStream &J, const char *Name, Printable P) { printStringAttr(J, Name, [&](raw_ostream &OS) { OS << P; }); } void printStringAttr(json::OStream &J, const char *Name, const MachineInstr &MI, ModuleSlotTracker &MST) { printStringAttr(J, Name, [&](raw_ostream &OS) { MI.print(OS, MST, /* IsStandalone */ false, /* SkipOpers */ false, /* SkipDebugLoc */ false, /* AddNewLine ---> */ false, /* TargetInstrInfo */ nullptr); }); } void printMBBNameAttr(json::OStream &J, const char *Name, const MachineBasicBlock &MBB, ModuleSlotTracker &MST) { printStringAttr(J, Name, [&](raw_ostream &OS) { MBB.printName(OS, MachineBasicBlock::PrintNameIr, &MST); }); } template void printAttr(json::OStream &J, NameLambda NL, ValueT V) { std::string Name; raw_string_ostream NameOS(Name); NL(NameOS); J.attribute(NameOS.str(), V); } template void printAttr(json::OStream &J, const Printable &P, ValueT V) { printAttr(J, [&](raw_ostream &OS) { OS << P; }, V); } } // namespace //============================================================================== // AMDGPUNextUseAnalysisImpl //============================================================================== class llvm::AMDGPUNextUseAnalysisImpl { public: struct CacheableNextUseDistance { bool IsInstrRelative; NextUseDistance Distance; }; static constexpr bool InstrRelative = true; static constexpr bool InstrInvariant = false; private: const MachineFunction *MF = nullptr; const SIRegisterInfo *TRI = nullptr; const SIInstrInfo *TII = nullptr; const MachineLoopInfo *MLI = nullptr; const MachineRegisterInfo *MRI = nullptr; using InstrIdTy = unsigned; using InstrToIdMap = DenseMap; InstrToIdMap InstrToId; AMDGPUNextUseAnalysis::Config Cfg; void initializeTables() { for (const MachineBasicBlock &BB : *MF) calcInstrIds(&BB, InstrToId); initializeCfgPaths(); 
initializeInterBlockDistances(); } void clearTables() { InstrToId.clear(); RegUseMap.clear(); Paths.clear(); resetDistanceCache(); } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Instruction Ids //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ private: unsigned sizeOf(const MachineInstr &MI) const { // When !Cfg.CountPhis, PHIs do not contribute to distances/sizes since they // generally don't result in the generation of a machine instruction. // FIXME: Consider using MI.isPseudo() or maybe MI.isMetaInstruction(). return Cfg.CountPhis ? 1 : !MI.isPHI(); } void calcInstrIds(const MachineBasicBlock *BB, InstrToIdMap &MutableInstrToId) const { InstrIdTy Id = 0; for (auto &MI : BB->instrs()) { MutableInstrToId[&MI] = Id; Id += sizeOf(MI); } } /// Returns MI's instruction Id. It renumbers (part of) the BB if MI is not /// found in the map. InstrIdTy getInstrId(const MachineInstr *MI) const { auto It = InstrToId.find(MI); if (It != InstrToId.end()) return It->second; // Renumber the MBB. // TODO: Renumber from MI onwards. auto &MutableInstrToId = const_cast(InstrToId); calcInstrIds(MI->getParent(), MutableInstrToId); return InstrToId.find(MI)->second; } // Length of the segment from MI (inclusive) to the first instruction of the // basic block. InstrIdTy getHeadLen(const MachineInstr *MI) const { const MachineBasicBlock *MBB = MI->getParent(); return getInstrId(MI) + getInstrId(&MBB->instr_front()) + 1; } // Length of the segment from MI (exclusive) to the last instruction of the // basic block. InstrIdTy getTailLen(const MachineInstr *MI) const { const MachineBasicBlock *MBB = MI->getParent(); return getInstrId(&MBB->instr_back()) - getInstrId(MI); } // Length of the segment from 'From' to 'To' (exclusive). Both instructions // must be in the same basic block. 
InstrIdTy getDistance(const MachineInstr *From, const MachineInstr *To) const { assert(From->getParent() == To->getParent()); return getInstrId(To) - getInstrId(From); } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // RegUses - cache of uses by register //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ private: DenseMap> RegUseMap; const SmallVector & getRegisterUses(Register Reg) const { auto I = RegUseMap.find(Reg); if (I != RegUseMap.end()) return I->second; auto *NonConstThis = const_cast(this); SmallVector &Uses = NonConstThis->RegUseMap[Reg]; for (const MachineOperand &UseMO : MRI->use_nodbg_operands(Reg)) { if (!UseMO.isUndef()) Uses.push_back(&UseMO); } return Uses; } bool hasAtLeastOneUse(Register Reg) const { return !getRegisterUses(Reg).empty(); } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Paths //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ private: class Path { using StorageTy = std::pair; StorageTy P; public: constexpr Path() : P(nullptr, nullptr) {} constexpr Path(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) : P(Src, Dst) {} Path(const StorageTy &Pair) : P(Pair) {} constexpr operator const StorageTy &() const { return P; } using DenseMapInfo = llvm::DenseMapInfo; const MachineBasicBlock *src() const { return P.first; } const MachineBasicBlock *dst() const { return P.second; } }; enum class EdgeKind { Back = -1, None = 0, Forward = 1 }; static constexpr StringRef toString(EdgeKind EK) { if (EK == EdgeKind::Back) return "back"; if (EK == EdgeKind::Forward) return "fwd"; return "none"; } struct PathInfo { EdgeKind EK; bool Reachable; int ForwardReachable; unsigned RelativeLoopDepth; std::optional ShortestDistance; std::optional ShortestUnweightedDistance; InstrIdTy Size; PathInfo() : EK(EdgeKind::None), Reachable(false), ForwardReachable(-1), RelativeLoopDepth(0), Size(0) {} bool isBackedge() const { return 
EK == EdgeKind::Back; } bool isForwardReachableSet() const { return 0 <= ForwardReachable; } bool isForwardReachableUnset() const { return ForwardReachable < 0; } bool isForwardReachable() const { return ForwardReachable == 1; } bool isNotForwardReachable() const { return ForwardReachable == 0; } void print(raw_ostream &OS) const { OS << "{ek=" << toString(EK) << " reach=" << Reachable << " fwd-reach=" << ForwardReachable << " loop-depth=" << RelativeLoopDepth << " size=" << Size; if (ShortestDistance) { OS << " shortest-dist="; ShortestDistance->print(OS); } if (ShortestUnweightedDistance) { OS << " shortest-unweighted-dist="; ShortestUnweightedDistance->print(OS); } OS << "}"; } LLVM_DUMP_METHOD void dump() const { print(dbgs()); dbgs() << '\n'; } }; //---------------------------------------------------------------------------- // Path Storage - 'Paths' is lazily populated and some members are lazily // computed. All mutations should go through one of the 'initializePathInfo*' // flavors below. //---------------------------------------------------------------------------- DenseMap Paths; const PathInfo *maybePathInfoFor(const MachineBasicBlock *From, const MachineBasicBlock *To) const { auto I = Paths.find({From, To}); return I == Paths.end() ? nullptr : &I->second; } PathInfo &getOrInitPathInfo(const MachineBasicBlock *From, const MachineBasicBlock *To) const { auto *NonConstThis = const_cast(this); auto &MutablePaths = NonConstThis->Paths; Path P(From, To); auto [I, Inserted] = MutablePaths.try_emplace(P); if (!Inserted) return I->second; bool Reachable = calcIsReachable(P.src(), P.dst()); // Iterator may have been invalidated by calcIsReachable, so get a fresh // reference to the slot. 
return NonConstThis->initializePathInfo(MutablePaths.at(P), P, EdgeKind::None, Reachable); } const PathInfo &pathInfoFor(const MachineBasicBlock *From, const MachineBasicBlock *To) const { return getOrInitPathInfo(From, To); } //---------------------------------------------------------------------------- // initializePathInfo* - various flavors of PathInfo initialization. They // (should) always funnel to the first flavor below. //---------------------------------------------------------------------------- PathInfo &initializePathInfo(PathInfo &Slot, Path P, EdgeKind EK, bool Reachable) { Slot.EK = EK; Slot.Reachable = Reachable; Slot.ForwardReachable = EK == EdgeKind::None ? -1 : EK == EdgeKind::Forward; Slot.RelativeLoopDepth = Slot.Reachable ? calcRelativeLoopDepth(P.src(), P.dst()) : 0; Slot.Size = P.src() == P.dst() ? calcSize(P.src()) : 0; if (EK != EdgeKind::None) Slot.ShortestUnweightedDistance = 0; return Slot; } PathInfo &initializePathInfo(Path P, EdgeKind EK, bool Reachable) const { auto *NonConstThis = const_cast(this); auto &MutablePaths = NonConstThis->Paths; return NonConstThis->initializePathInfo(MutablePaths[P], P, EK, Reachable); } std::pair maybeInitializePathInfo(Path P, EdgeKind EK, bool Reachable) const { auto *NonConstThis = const_cast(this); auto &MutablePaths = NonConstThis->Paths; auto [I, Inserted] = MutablePaths.try_emplace(P); if (Inserted) NonConstThis->initializePathInfo(I->second, P, EK, Reachable); return {&I->second, Inserted}; } bool initializePathInfoForwardReachable(const MachineBasicBlock *From, const MachineBasicBlock *To, bool Value) const { PathInfo &Slot = getOrInitPathInfo(From, To); assert(Slot.isForwardReachableUnset()); Slot.ForwardReachable = Value; return Value; } NextUseDistance initializePathInfoShortestDistance(const MachineBasicBlock *From, const MachineBasicBlock *To, NextUseDistance Value) const { PathInfo &Slot = getOrInitPathInfo(From, To); assert(!Slot.ShortestDistance.has_value()); Slot.ShortestDistance = 
Value; return Value; } NextUseDistance initializePathInfoShortestUnweightedDistance(const MachineBasicBlock *From, const MachineBasicBlock *To, NextUseDistance Value) const { PathInfo &Slot = getOrInitPathInfo(From, To); assert(!Slot.ShortestUnweightedDistance.has_value()); Slot.ShortestUnweightedDistance = Value; return Value; } //---------------------------------------------------------------------------- // initialize*Paths //---------------------------------------------------------------------------- private: void initializePaths(const SmallVector &ReachablePaths, const SmallVector &UnreachablePaths) const { for (const Path &P : ReachablePaths) initializePathInfo(P, EdgeKind::None, true); for (const Path &P : UnreachablePaths) initializePathInfo(P, EdgeKind::None, false); } void initializeForwardOnlyPaths(const SmallVector &ReachablePaths, const SmallVector &UnreachablePaths) const { for (bool R : {true, false}) { const auto &ToInit = R ? ReachablePaths : UnreachablePaths; for (const Path &P : ToInit) { PathInfo &Slot = getOrInitPathInfo(P.src(), P.dst()); assert(Slot.isForwardReachableUnset() || Slot.ForwardReachable == R); Slot.ForwardReachable = R; } } } // Follow the control flow graph starting at the entry block until all blocks // have been visited. Along the way, initialize the PathInfo for each edge // traversed. void initializeCfgPaths() { Paths.clear(); enum VisitState { Undiscovered, Visiting, Finished }; DenseMap State; SmallVector Work{&MF->front()}; State[&MF->front()] = Undiscovered; while (!Work.empty()) { const MachineBasicBlock *Src = Work.back(); VisitState &SrcState = State[Src]; // A block may already be 'Finished' if it is reachable from multiple // predecessors causing it to be pushed more than once while still // 'Undiscovered'. 
if (SrcState == Visiting || SrcState == Finished) { Work.pop_back(); SrcState = Finished; continue; } SrcState = Visiting; for (const MachineBasicBlock *Dst : Src->successors()) { const VisitState DstState = State.lookup(Dst); EdgeKind EK; if (DstState == Undiscovered) { EK = EdgeKind::Forward; Work.push_back(Dst); } else if (DstState == Visiting) { EK = EdgeKind::Back; } else { EK = EdgeKind::Forward; } Path P(Src, Dst); assert(!Paths.contains(P)); initializePathInfo(P, EK, /*Reachable*/ true); } } LLVM_DEBUG(dumpPaths()); } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Loop helpers //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ private: static bool isStandAloneLoop(const MachineLoop *Loop) { return Loop->getSubLoops().empty() && Loop->isOutermost(); } static MachineLoop *findChildLoop(MachineLoop *const Parent, MachineLoop *Descendant) { for (MachineLoop *L = Descendant; L != Parent; L = L->getParentLoop()) { if (L->getParentLoop() == Parent) return L; } return nullptr; } // If loops 'A' and 'B' share a common parent loop, return that loop and the // depth of 'A' relative to it. Otherwise return nullptr and the loop depth of // 'A'. static std::pair findCommonParent(MachineLoop *A, const MachineLoop *B) { unsigned Depth = 0; for (; A != nullptr; A = A->getParentLoop(), ++Depth) { if (A->contains(B)) break; } return {A, Depth}; } static const MachineBasicBlock * getOutermostPreheader(const MachineLoop *Loop) { return Loop ? Loop->getOutermostLoop()->getLoopPreheader() : nullptr; } static MachineBasicBlock *findChildPreheader(MachineLoop *const Parent, MachineLoop *Descendant) { MachineLoop *ChildLoop = findChildLoop(Parent, Descendant); return ChildLoop ? ChildLoop->getLoopPreheader() : nullptr; } static const MachineBasicBlock * getIncomingBlockIfPhiUse(const MachineInstr *MI, const MachineOperand *MO) { return MI->isPHI() ? 
MI->getOperand(MO->getOperandNo() + 1).getMBB() : nullptr; } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Calculate features //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ private: InstrIdTy calcSize(const MachineBasicBlock *BB) const { InstrIdTy Size = BB->size(); if (!Cfg.CountPhis) Size -= std::distance(BB->begin(), BB->getFirstNonPHI()); return Size; } NextUseDistance calcWeightedSize(const MachineBasicBlock *From, const MachineBasicBlock *To) const { return NextUseDistance::fromSize(getSize(From), getRelativeLoopDepth(From, To)); } // Return the loop depth of 'From' relative to 'To'. unsigned calcRelativeLoopDepth(const MachineBasicBlock *From, const MachineBasicBlock *To) const { MachineLoop *LoopFrom = MLI->getLoopFor(From); MachineLoop *LoopTo = MLI->getLoopFor(To); if (!LoopFrom) return 0; if (!LoopTo) return LoopFrom->getLoopDepth(); if (LoopFrom->contains(LoopTo)) // covers LoopFrom == LoopTo return 0; if (LoopTo->contains(LoopFrom)) return LoopFrom->getLoopDepth() - LoopTo->getLoopDepth(); // Loops are siblings of some sort. return findCommonParent(LoopFrom, LoopTo).second; } // Attempt to find a path from 'From' to 'To' using a depth first search. If // 'ForwardOnly' is true, do not follow backedges. As a performance // improvement, this may initialize reachable intermediate paths or paths we // determine are unreachable. bool calcIsReachable(const MachineBasicBlock *From, const MachineBasicBlock *To, bool ForwardOnly = false) const { if (From == To && !MLI->getLoopFor(From)) return false; if (!ForwardOnly && interBlockDistanceExists(From, To)) return true; enum { VisitOp, PopOp }; using MBBOpPair = std::pair; SmallVector Work{{From, VisitOp}}; DenseSet Visited{From}; SmallVector IntermediatePath; SmallVector Unreachable; // Should be run at every function exit point. auto Finally = [&](bool Reachable) { // This is an optimization. 
For intermediate paths we found while // calculating reachability for 'From' --> 'To', remember their // reachability. if (!Reachable) { IntermediatePath.clear(); for (const MachineBasicBlock *MBB : Visited) { if (MBB != From) Unreachable.emplace_back(MBB, To); } } if (ForwardOnly) initializeForwardOnlyPaths(IntermediatePath, Unreachable); else initializePaths(IntermediatePath, Unreachable); return Reachable; }; while (!Work.empty()) { auto [Current, Op] = Work.pop_back_val(); // Backtracking if (Op == PopOp) { IntermediatePath.pop_back(); if (ForwardOnly) Unreachable.emplace_back(Current, To); continue; } if (Current->succ_empty()) continue; if (Current != From) { IntermediatePath.emplace_back(Current, To); Work.emplace_back(Current, PopOp); } for (const MachineBasicBlock *Succ : Current->successors()) { if (ForwardOnly && isBackedge(Current, Succ)) continue; if (Succ == To) return Finally(true); if (auto CachedReachable = isMaybeReachable(Succ, To, ForwardOnly)) { if (CachedReachable.value()) return Finally(true); Visited.insert(Succ); continue; } if (Visited.insert(Succ).second) Work.emplace_back(Succ, VisitOp); } } return Finally(false); } //---------------------------------------------------------------------------- // Inter-block distance - the weighted and unweighted cost (i.e. "distance") // to travel from one MachineBasicBlock to another. // // Values are pre-computed and stored in 'InterBlockDistances' using a // backwards data-flow algorithm similar to the one described in 4.1 of a // "Register Spilling and Live-Range Splitting for SSA-Form Programs" by // Matthias Braun and Sebastian Hack, CC'09. This replaced a prior // implementation based on Dijkstra's shortest path algorithm. 
//---------------------------------------------------------------------------- private: struct InterBlockDistance { NextUseDistance Weighted; NextUseDistance Unweighted; InterBlockDistance() : Weighted(-1), Unweighted(-1) {} InterBlockDistance(NextUseDistance W, NextUseDistance UW) : Weighted(W), Unweighted(UW) {} bool operator==(const InterBlockDistance &Other) const { return Weighted == Other.Weighted && Unweighted == Other.Unweighted; } bool operator!=(const InterBlockDistance &Other) const { return !(*this == Other); } void print(raw_ostream &OS) const { OS << "{W="; Weighted.print(OS); OS << " U="; Unweighted.print(OS); OS << "}"; } LLVM_DUMP_METHOD void dump() const { print(dbgs()); dbgs() << '\n'; } }; using InterBlockDistanceMap = DenseMap>; InterBlockDistanceMap InterBlockDistances; void initializeInterBlockDistances() { InterBlockDistanceMap Distances; bool Changed; do { Changed = false; for (const MachineBasicBlock *MBB : post_order(MF)) { unsigned MBBNum = MBB->getNumber(); // Save previous state for convergence check InterBlockDistanceMap::mapped_type Prev = std::move(Distances[MBBNum]); InterBlockDistanceMap::mapped_type Curr; Curr.reserve(Prev.size()); // Direct successors are distance 0 by definition: no instructions are // executed between exiting MBB and entering Succ. for (const MachineBasicBlock *Succ : MBB->successors()) Curr[Succ->getNumber()] = InterBlockDistance(0, 0); // Propagate further destinations through each successor. for (const MachineBasicBlock *Succ : MBB->successors()) { unsigned SuccNum = Succ->getNumber(); const unsigned UnweightedSize{getSize(Succ)}; for (const auto &[DestBlockNum, DestDist] : Distances[SuccNum]) { // MBB -> MBB is considered unreachable (getInterBlockDistance // asserts From != To). 
if (DestBlockNum == MBBNum) continue; const MachineBasicBlock *DestMBB = MF->getBlockNumbered(DestBlockNum); const NextUseDistance UnweightedDist{UnweightedSize + DestDist.Unweighted}; unsigned SuccToDestLoopDepth = calcRelativeLoopDepth(Succ, DestMBB); const NextUseDistance WeightedDist = DestDist.Weighted + NextUseDistance::fromSize(UnweightedSize, SuccToDestLoopDepth); // Insert or update distances (take minimum) auto [I, First] = Curr.try_emplace(DestBlockNum, WeightedDist, UnweightedDist); if (!First) { InterBlockDistance &Slot = I->second; Slot.Weighted = min(Slot.Weighted, WeightedDist); Slot.Unweighted = min(Slot.Unweighted, UnweightedDist); } } } Changed |= (Prev != Curr); Distances[MBBNum] = std::move(Curr); } } while (Changed); InterBlockDistances = std::move(Distances); LLVM_DEBUG(dumpInterBlockDistances()); } const InterBlockDistance * getInterBlockDistanceMapValue(const MachineBasicBlock *From, const MachineBasicBlock *To) const { auto I = InterBlockDistances.find(From->getNumber()); if (I == InterBlockDistances.end()) return nullptr; const InterBlockDistanceMap::mapped_type &FromSlot = I->second; auto J = FromSlot.find(To->getNumber()); return J == FromSlot.end() ? nullptr : &J->second; } bool interBlockDistanceExists(const MachineBasicBlock *From, const MachineBasicBlock *To) const { return getInterBlockDistanceMapValue(From, To); } NextUseDistance getInterBlockDistance(const MachineBasicBlock *From, const MachineBasicBlock *To, bool Unweighted) const { assert(From != To && "The basic blocks should be different."); if (!From || !To) return NextUseDistance::unreachable(); if (Cfg.ForwardOnly && !isForwardReachable(From, To)) return NextUseDistance::unreachable(); const InterBlockDistance *BD = getInterBlockDistanceMapValue(From, To); if (!BD) return NextUseDistance::unreachable(); return Unweighted ? 
BD->Unweighted : BD->Weighted; } NextUseDistance getWeightedInterBlockDistance(const MachineBasicBlock *From, const MachineBasicBlock *To) const { return getInterBlockDistance(From, To, false); } NextUseDistance getUnweightedInterBlockDistance(const MachineBasicBlock *From, const MachineBasicBlock *To) const { return getInterBlockDistance(From, To, true); } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Feature getters. Use cached results if available. If not calculate. //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ private: InstrIdTy getSize(const MachineBasicBlock *BB) const { return pathInfoFor(BB, BB).Size; } bool isReachable(const MachineBasicBlock *From, const MachineBasicBlock *To) const { return pathInfoFor(From, To).Reachable; } bool isReachableOrSame(const MachineBasicBlock *From, const MachineBasicBlock *To) const { return From == To || pathInfoFor(From, To).Reachable; } bool isForwardReachable(const MachineBasicBlock *From, const MachineBasicBlock *To) const { const PathInfo &PI = pathInfoFor(From, To); if (PI.isForwardReachableSet()) return PI.isForwardReachable(); return initializePathInfoForwardReachable( From, To, PI.Reachable && calcIsReachable(From, To, /*ForwardOnly*/ true)); } // Return true/false if we know that 'To' is reachable or not from // 'From'. Otherwise return 'std::nullopt'. std::optional isMaybeReachable(const MachineBasicBlock *From, const MachineBasicBlock *To, bool ForwardOnly) const { const PathInfo *PI = maybePathInfoFor(From, To); if (!PI) return std::nullopt; if (ForwardOnly) { if (PI->isForwardReachable()) return true; if (PI->isNotForwardReachable()) return false; return std::nullopt; } return PI->Reachable; } bool isBackedge(const MachineBasicBlock *From, const MachineBasicBlock *To) const { return pathInfoFor(From, To).isBackedge(); } // Can be used as a substitute for DT->dominates(A, B) if A and B are in the // same basic block. 
bool instrsAreInOrder(const MachineInstr *A, const MachineInstr *B) const { assert(A->getParent() == B->getParent() && "instructions must be in the same basic block!"); if (A == B || getInstrId(A) < getInstrId(B)) return true; if (!A->isPHI()) return false; if (!B->isPHI()) return true; for (auto &PHI : A->getParent()->phis()) { if (&PHI == A) return true; if (&PHI == B) return false; } return false; } unsigned getRelativeLoopDepth(const MachineBasicBlock *From, const MachineBasicBlock *To) const { return pathInfoFor(From, To).RelativeLoopDepth; } NextUseDistance getShortestPath(const MachineBasicBlock *From, const MachineBasicBlock *To) const { std::optional MaybeD = pathInfoFor(From, To).ShortestDistance; if (MaybeD.has_value()) return MaybeD.value(); NextUseDistance Dist = getWeightedInterBlockDistance(From, To); return initializePathInfoShortestDistance(From, To, Dist); } NextUseDistance getShortestUnweightedPath(const MachineBasicBlock *From, const MachineBasicBlock *To) const { std::optional MaybeD = pathInfoFor(From, To).ShortestUnweightedDistance; if (MaybeD.has_value()) return MaybeD.value(); return initializePathInfoShortestUnweightedDistance( From, To, getUnweightedInterBlockDistance(From, To)); } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// MBBDistPair - Represents the distance to a machine basic block. /// Used for returning both the distance and the target block together. 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ private: struct MBBDistPair { NextUseDistance Distance; const MachineBasicBlock *MBB; MBBDistPair() : Distance(NextUseDistance::unreachable()), MBB(nullptr) {} MBBDistPair(NextUseDistance D, const MachineBasicBlock *B) : Distance(D), MBB(B) {} MBBDistPair operator+(NextUseDistance D) { return {Distance + D, MBB}; } MBBDistPair &operator+=(NextUseDistance D) { Distance += D; return *this; } void print(raw_ostream &OS) const { OS << "{"; Distance.print(OS); if (MBB) OS << " " << printMBBReference(*MBB); else OS << " "; OS << "}"; } LLVM_DUMP_METHOD void dump() const { print(dbgs()); dbgs() << '\n'; } }; //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // CFG Helpers //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ private: // Return the shortest distance to a latch MBBDistPair calcShortestDistanceToLatch(const MachineBasicBlock *CurMBB, const MachineLoop *CurLoop) const { SmallVector Latches; CurLoop->getLoopLatches(Latches); MBBDistPair LD; for (MachineBasicBlock *LMBB : Latches) { if (LMBB == CurMBB) return {0, CurMBB}; NextUseDistance Dst = getShortestPath(CurMBB, LMBB); if (Dst < LD.Distance) { LD.Distance = Dst; LD.MBB = LMBB; } } return LD; } // Return the shortest unweighted distance to a latch MBBDistPair calcShortestUnweightedDistanceToLatch(const MachineBasicBlock *CurMBB, const MachineLoop *CurLoop) const { SmallVector Latches; CurLoop->getLoopLatches(Latches); MBBDistPair LD; for (MachineBasicBlock *LMBB : Latches) { if (LMBB == CurMBB) return {0, CurMBB}; NextUseDistance Dst = getShortestUnweightedPath(CurMBB, LMBB); if (Dst < LD.Distance) { LD.Distance = Dst; LD.MBB = LMBB; } } return LD; } // Return the shortest distance to an exit MBBDistPair calcShortestDistanceToExit(const MachineBasicBlock *CurMBB, const MachineLoop *CurLoop) const { SmallVector> ExitEdges; CurLoop->getExitEdges(ExitEdges); MBBDistPair LD; for 
(auto [Exit, Dest] : ExitEdges) { if (Exit == CurMBB) return {0, CurMBB}; NextUseDistance Dst = getShortestPath(CurMBB, Exit); if (Dst < LD.Distance) { LD.Distance = Dst; LD.MBB = Exit; } } return LD; } // Return the shortest distance through a loop (header to latch) that goes // through CurMBB. MBBDistPair calcShortestDistanceThroughInnermostLoop(const MachineBasicBlock *CurMBB, MachineLoop *CurLoop) const { assert(MLI->getLoopFor(CurMBB) == CurLoop); // This is a hot spot. Check it before doing anything else. if (CurLoop->getNumBlocks() == 1) return {getSize(CurMBB), CurMBB}; MachineBasicBlock *LoopHeader = CurLoop->getHeader(); MBBDistPair LD{0, nullptr}; LD += getSize(LoopHeader); if (CurMBB != LoopHeader) LD += getShortestPath(LoopHeader, CurMBB); if (CurLoop->isLoopExiting(CurMBB)) LD.MBB = CurMBB; else LD = calcShortestDistanceToExit(CurMBB, CurLoop) + LD.Distance; if (CurMBB != LoopHeader && CurMBB != LD.MBB) LD += getSize(CurMBB); if (LD.MBB != LoopHeader) LD += getSize(LD.MBB); return LD; } // Return the shortest distance through a loop (header to latch) that goes // through CurMBB. MBBDistPair calcShortestDistanceThroughLoop(const MachineBasicBlock *CurMBB, MachineLoop *OuterLoop) const { MachineLoop *CurLoop = MLI->getLoopFor(CurMBB); MBBDistPair CurLD = calcShortestDistanceThroughInnermostLoop(CurMBB, CurLoop); MachineBasicBlock *CurHdr = CurLoop->getHeader(); for (;;) { if (OuterLoop == CurLoop) return CurLD; MachineLoop *ParentLoop = CurLoop->getParentLoop(); MachineBasicBlock *ParentHdr = ParentLoop->getHeader(); MBBDistPair LD{0, nullptr}; LD += getSize(ParentHdr); LD += getShortestPath(ParentHdr, CurHdr); LD += CurLD.Distance.applyLoopWeight(); LD = calcShortestDistanceToExit(CurLD.MBB, ParentLoop) + LD.Distance; LD += getSize(LD.MBB); CurLD = LD; CurLoop = ParentLoop; CurHdr = ParentHdr; } llvm_unreachable("CurMBB not contained in OuterLoop"); } // Similar to calcShortestDistanceThroughLoop with LoopWeight applied to the // returned distance. 
MBBDistPair
  calcWeightedDistanceThroughLoopViaMBB(const MachineBasicBlock *CurMBB,
                                        MachineLoop *CurLoop) const {
    MBBDistPair LD = calcShortestDistanceThroughLoop(CurMBB, CurLoop);
    LD.Distance = LD.Distance.applyLoopWeight();
    return LD;
  }

  // Return the weighted, shortest distance through a loop (header to latch).
  // If ParentLoop is provided, use it to adjust the loop depth.
  MBBDistPair calcWeightedDistanceThroughLoop(
      const MachineBasicBlock *CurMBB, MachineLoop *CurLoop,
      const MachineLoop *ParentLoop = nullptr) const {
    // Multi-block loops need the full path computation.
    if (CurLoop->getNumBlocks() != 1)
      return calcWeightedDistanceThroughLoopViaMBB(CurMBB, CurLoop);
    // Single-block loop: the through-distance is just the block size scaled
    // by its (possibly ParentLoop-relative) depth.
    unsigned LoopDepth = MLI->getLoopDepth(CurMBB);
    if (ParentLoop)
      LoopDepth -= ParentLoop->getLoopDepth();
    return {NextUseDistance::fromSize(getSize(CurMBB), LoopDepth),
            CurLoop->getLoopLatch()};
  }

  // Calculate total distance from exit point to use instruction
  NextUseDistance appendDistanceToUse(const MBBDistPair &Exit,
                                      const MachineInstr *UseMI,
                                      const MachineBasicBlock *UseMBB) const {
    // Loop-exit distance + blocks between exit and use + prefix of UseMBB.
    return Exit.Distance + getShortestPath(Exit.MBB, UseMBB) +
           getHeadLen(UseMI);
  }

  // Return the weighted, shortest distance through the CurLoop which is a
  // sub-loop of UseLoop.
  MBBDistPair calcDistanceThroughSubLoopUse(const MachineBasicBlock *CurMBB,
                                            MachineLoop *CurLoop,
                                            MachineLoop *UseLoop) const {
    // All the sub-loops of the UseLoop will be executed before the use.
    // Hence, we should take this into consideration in distance calculation.
    MachineLoop *UseLoopSubLoop = findChildLoop(UseLoop, CurLoop);
    assert(UseLoopSubLoop && "CurLoop should be nested in UseLoop");
    return calcWeightedDistanceThroughLoop(CurMBB, UseLoopSubLoop, UseLoop);
  }

  // Similar to calcDistanceThroughSubLoopUse, adding the distance to 'UseMI'.
  NextUseDistance calcDistanceThroughSubLoopToUseMI(
      const MachineBasicBlock *CurMBB, MachineLoop *CurLoop,
      const MachineInstr *UseMI, const MachineBasicBlock *UseMBB,
      MachineLoop *UseLoop) const {
    return appendDistanceToUse(
        calcDistanceThroughSubLoopUse(CurMBB, CurLoop, UseLoop), UseMI,
        UseMBB);
  }

  // Return the weighted distance through a loop to an outside use loop.
  // Differentiates between uses inside or outside of the current loop nest.
  MBBDistPair calcDistanceThroughLoopToOutsideLoopUse(
      const MachineBasicBlock *CurMBB, MachineLoop *CurLoop,
      const MachineBasicBlock *UseMBB, MachineLoop *UseLoop) const {
    assert(!CurLoop->contains(UseLoop));
    if (isStandAloneLoop(CurLoop))
      return calcWeightedDistanceThroughLoopViaMBB(CurMBB, CurLoop);
    MachineLoop *OutermostLoop = CurLoop->getOutermostLoop();
    if (!OutermostLoop->contains(UseLoop)) {
      // We should take into consideration the whole loop nest in the
      // calculation of the distance because we will reach the use after
      // executing the whole loop nest.
      // ... But make sure that we pick a route that goes through CurMBB
      return calcWeightedDistanceThroughLoopViaMBB(CurMBB, OutermostLoop);
    }
    // At this point we know that CurLoop and UseLoop are independent and they
    // are in the same loop nest.
    if (MLI->getLoopDepth(CurMBB) <= MLI->getLoopDepth(UseMBB))
      return calcWeightedDistanceThroughLoop(CurMBB, CurLoop);
    assert(CurLoop != OutermostLoop && "The loop cannot be the outermost.");
    // CurMBB is deeper than the use: ascend until CurLoop is at the use's
    // depth (or we hit the outermost loop).
    const unsigned UseLoopDepth = MLI->getLoopDepth(UseMBB);
    for (;;) {
      if (CurLoop->getLoopDepth() == UseLoopDepth)
        break;
      CurLoop = CurLoop->getParentLoop();
      if (CurLoop == OutermostLoop)
        break;
    }
    return calcWeightedDistanceThroughLoop(CurMBB, CurLoop);
  }

  // Similar to calcDistanceThroughLoopToOutsideLoopUse but adds the distance to
  // an instruction in the loop.
NextUseDistance calcDistanceThroughLoopToOutsideLoopUseMI( const MachineBasicBlock *CurMBB, MachineLoop *CurLoop, const MachineInstr *UseMI, const MachineBasicBlock *UseMBB, MachineLoop *UseLoop) const { return appendDistanceToUse(calcDistanceThroughLoopToOutsideLoopUse( CurMBB, CurLoop, UseMBB, UseLoop), UseMI, UseMBB); } // Return true if 'MO' is covered by 'LaneMask' bool machineOperandCoveredBy(const MachineOperand &MO, LaneBitmask LaneMask) const { LaneBitmask Mask = TRI->getSubRegIndexLaneMask(MO.getSubReg()); return (Mask & LaneMask) == Mask; } // Returns true iff uses of LiveReg/LiveLaneMask in PHI UseMI are coming from // a backedge when starting at CurMI. bool isIncomingValFromBackedge(Register LiveReg, LaneBitmask LiveLaneMask, const MachineInstr *CurMI, const MachineInstr *UseMI) const { if (!UseMI->isPHI()) return false; MachineLoop *CurLoop = MLI->getLoopFor(CurMI->getParent()); MachineLoop *UseLoop = MLI->getLoopFor(UseMI->getParent()); // Not a backedge if ... // A: not in a loop at all // B: or CurMI is in a loop outside of UseLoop // C: or UseMI is not in the UseLoop header if (/*A:*/ !UseLoop || /*B:*/ (CurLoop && !UseLoop->contains(CurLoop)) || /*C:*/ UseMI->getParent() != UseLoop->getHeader()) return false; SmallVector Latches; UseLoop->getLoopLatches(Latches); const unsigned NumOps = UseMI->getNumOperands(); for (unsigned I = 1; I < NumOps; I += 2) { const MachineOperand &RegMO = UseMI->getOperand(I - 1); const MachineOperand &MBBMO = UseMI->getOperand(I); assert(RegMO.isReg() && "Expected register operand of PHI"); assert(MBBMO.isMBB() && "Expected MBB operand of PHI"); if (RegMO.getReg() == LiveReg && machineOperandCoveredBy(RegMO, LiveLaneMask)) { MachineBasicBlock *IncomingBB = MBBMO.getMBB(); if (llvm::is_contained(Latches, IncomingBB)) return true; } } return false; } // Return the distance from 'CurMI' through a parent loop backedge PHI Use // ('UseMI'). 
CacheableNextUseDistance calcDistanceViaEnclosingBackedge(
      const MachineInstr *CurMI, const MachineBasicBlock *CurMBB,
      MachineLoop *CurLoop, const MachineInstr *UseMI,
      const MachineBasicBlock *UseMBB, MachineLoop *UseLoop) const {
    assert(UseLoop && "There is no backedge.");
    assert(CurLoop && (UseLoop != CurLoop) && UseLoop->contains(CurLoop) &&
           "Unexpected loop configuration");
    InstrIdTy UseHeadLen = getHeadLen(UseMI);
    // Through the inner sub-loop nest first, then on to the enclosing loop's
    // latch, where the backedge delivers the value to the header PHI.
    MBBDistPair InnerLoopLD =
        calcDistanceThroughSubLoopUse(CurMBB, CurLoop, UseLoop);
    MBBDistPair LD = calcShortestDistanceToLatch(InnerLoopLD.MBB, UseLoop);
    // Result does not depend on CurMI's position within its block
    // (InstrInvariant), so it is cacheable across instructions.
    return {InstrInvariant, InnerLoopLD.Distance + LD.Distance +
                                getSize(LD.MBB) + UseHeadLen};
  }

  // Optimized version of calcBackedgeDistance when we already know that CurMI
  // and UseMI are in the same basic block
  NextUseDistance calcBackedgeDistance(const MachineInstr *CurMI,
                                       const MachineBasicBlock *CurMBB,
                                       MachineLoop *CurLoop,
                                       const MachineInstr *UseMI) const {
    // use is in the next loop iteration
    InstrIdTy CurTailLen = getTailLen(CurMI);
    InstrIdTy UseHeadLen = getHeadLen(UseMI);
    // Tail of CurMBB -> latch -> around the backedge -> header -> back to
    // CurMBB -> head up to UseMI. Unweighted: we stay inside the same loop.
    MBBDistPair LD = calcShortestUnweightedDistanceToLatch(CurMBB, CurLoop);
    const MachineBasicBlock *HdrMBB = CurLoop->getHeader();
    // If CurMBB is the header itself, the header leg contributes nothing.
    NextUseDistance Hdr = CurMBB == HdrMBB ? 0 : getSize(HdrMBB);
    NextUseDistance Dst =
        CurMBB == HdrMBB ? 0 : getShortestUnweightedPath(HdrMBB, CurMBB);
    return CurTailLen + LD.Distance + getSize(LD.MBB) + Hdr + Dst + UseHeadLen;
  }

  //----------------------------------------------------------------------------
  // Calculate inter-instruction distances
  //----------------------------------------------------------------------------
private:
  // Calculate the shortest weighted path from MachineInstruction 'FromMI' to
  // 'ToMI'. It is weighted distance in that paths that exit loops are made to
  // look much further away.
NextUseDistance calcShortestDistance(const MachineInstr *FromMI, const MachineInstr *ToMI) const { const MachineBasicBlock *FromMBB = FromMI->getParent(); const MachineBasicBlock *ToMBB = ToMI->getParent(); if (FromMBB == ToMBB) { NextUseDistance RV = getDistance(FromMI, ToMI); assert(RV >= 0 && "unexpected negative distance from getDistance"); return RV; } InstrIdTy FromTailLen = getTailLen(FromMI); InstrIdTy ToHeadLen = getHeadLen(ToMI); NextUseDistance Dst = getShortestPath(FromMBB, ToMBB); assert(Dst.isReachable() && "calcShortestDistance called for instructions in non-reachable" " basic blocks!"); NextUseDistance RV = FromTailLen + Dst + ToHeadLen; assert(RV >= 0 && "unexpected negative distance"); return RV; } // Calculate the shortest unweighted path from MachineInstruction 'FromMI' to // 'ToMI'. In contrast with 'calcShortestDistance', distances are based solely // on basic block instruction counts and traversing a loop exit does not // affect the value. NextUseDistance calcShortestUnweightedDistance(const MachineInstr *FromMI, const MachineInstr *ToMI) const { const MachineBasicBlock *FromMBB = FromMI->getParent(); const MachineBasicBlock *ToMBB = ToMI->getParent(); if (FromMBB == ToMBB) return getDistance(FromMI, ToMI); InstrIdTy FromTailLen = getTailLen(FromMI); InstrIdTy ToHeadLen = getHeadLen(ToMI); NextUseDistance Dst = getShortestUnweightedPath(FromMBB, ToMBB); assert(Dst.isReachable() && "calcShortestUnweightedDistance called for instructions in" " non-reachable basic blocks!"); return FromTailLen + Dst + ToHeadLen; } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // calcDistanceToUse* - various flavors of calculating the distance from an // instruction 'CurMI' to the use of a live [sub]register. //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ private: // Return the distance from 'CurMI' to a live [sub]register use ('UseMI'). 
//
  // Cfg flags controlling behavior:
  //   PreciseUseModeling — rewrite PHI uses to their incoming edge block;
  //                        also selects unweighted cross-block distance
  //   PromoteToPreheader — route loop-entry / inner-loop uses to the preheader
  CacheableNextUseDistance calcDistanceToUse(Register LiveReg,
                                             LaneBitmask LiveLaneMask,
                                             const MachineInstr &CurMI,
                                             const MachineOperand *UseMO) const {
    const MachineInstr *UseMI = UseMO->getParent();
    const MachineBasicBlock *CurMBB = CurMI.getParent();
    const MachineBasicBlock *UseMBB = UseMI->getParent();
    MachineLoop *CurLoop = MLI->getLoopFor(CurMBB);
    MachineLoop *UseLoop = MLI->getLoopFor(UseMBB);
    if (Cfg.PreciseUseModeling) {
      // Map PHI use to the end of its incoming edge block.
      if (auto *PhiUseEdge = getIncomingBlockIfPhiUse(UseMI, UseMO)) {
        UseMI = &PhiUseEdge->back();
        UseMBB = PhiUseEdge;
        UseLoop = MLI->getLoopFor(PhiUseEdge);
      }
    }
    // Classify how CurMI's loop relates to UseMI's loop; also compute, where
    // relevant, the preheader to promote the use to and the common parent
    // loop (used only in the PromoteToPreheader path below).
    enum class LoopConfig {
      NoCur,           // CurMI not in any loop
      Same,            // same block (within a contains() relationship)
      CurContainsUse,  // CurLoop contains UseLoop
      UseContainsCur,  // UseLoop strictly contains CurLoop
      Siblings,        // distinct loops under a common parent
      Unrelated        // no common parent loop
    };
    auto [LpCfg, PreHdr, CommonParent] =
        [&]() -> std::tuple<LoopConfig, MachineBasicBlock *, MachineLoop *> {
      if (!CurLoop) {
        return {LoopConfig::NoCur, getOutermostPreheader(UseLoop), nullptr};
      }
      if (CurLoop->contains(UseLoop)) {
        return {CurMBB == UseMBB ? LoopConfig::Same
                                 : LoopConfig::CurContainsUse,
                findChildPreheader(CurLoop, UseLoop), nullptr};
      }
      if (MachineLoop *P = findCommonParent(UseLoop, CurLoop).first) {
        if (P != UseLoop)
          return {LoopConfig::Siblings, findChildPreheader(P, UseLoop), P};
        return {LoopConfig::UseContainsCur, nullptr, nullptr};
      }
      return {LoopConfig::Unrelated, getOutermostPreheader(UseLoop), nullptr};
    }();

    //--------------------------------------------------------------------------
    // Don't PromoteToPreheader
    //--------------------------------------------------------------------------
    if (!Cfg.PromoteToPreheader) {
      switch (LpCfg) {
      case LoopConfig::NoCur:
      case LoopConfig::Same:
      case LoopConfig::CurContainsUse:
        // Plain weighted distance; depends on CurMI's position in its block.
        return {InstrRelative, calcShortestDistance(&CurMI, UseMI)};
      case LoopConfig::UseContainsCur: {
        // If the PHI value arrives over the backedge, the use happens on the
        // *next* iteration of the enclosing loop.
        if (isIncomingValFromBackedge(LiveReg, LiveLaneMask, &CurMI, UseMI)) {
          return calcDistanceViaEnclosingBackedge(&CurMI, CurMBB, CurLoop,
                                                  UseMI, UseMBB, UseLoop);
        }
        return {InstrInvariant, calcDistanceThroughSubLoopToUseMI(
                                    CurMBB, CurLoop, UseMI, UseMBB, UseLoop)};
      }
      case LoopConfig::Siblings:
      case LoopConfig::Unrelated:
        return {InstrInvariant, calcDistanceThroughLoopToOutsideLoopUseMI(
                                    CurMBB, CurLoop, UseMI, UseMBB, UseLoop)};
      }
      llvm_unreachable("unexpected loop configuration!");
    }

    //--------------------------------------------------------------------------
    // PromoteToPreheader
    //--------------------------------------------------------------------------
    // Redirect the use to the chosen preheader's terminator, so distances are
    // measured up to loop entry rather than into the loop body.
    if (PreHdr) {
      UseMI = &PreHdr->back();
      UseMBB = PreHdr;
      UseLoop = CommonParent;
    }
    switch (LpCfg) {
    case LoopConfig::NoCur:
      // Subtract one when UseMI itself is not counted (sizeOf == 0).
      return {InstrRelative, calcShortestUnweightedDistance(&CurMI, UseMI) -
                                 (sizeOf(*UseMI) ? 0 : 1)};
    case LoopConfig::Same:
    case LoopConfig::CurContainsUse:
      // A same-block use that precedes CurMI is only reached by going around
      // the loop once more.
      if (CurMBB == UseMBB && !instrsAreInOrder(&CurMI, UseMI))
        return {InstrRelative,
                calcBackedgeDistance(&CurMI, CurMBB, CurLoop, UseMI)};
      return {InstrRelative, calcShortestUnweightedDistance(&CurMI, UseMI)};
    case LoopConfig::UseContainsCur:
    case LoopConfig::Siblings:
      return {InstrInvariant, calcDistanceThroughSubLoopToUseMI(
                                  CurMBB, CurLoop, UseMI, UseMBB, UseLoop)};
    case LoopConfig::Unrelated:
      return {InstrInvariant, calcDistanceThroughLoopToOutsideLoopUseMI(
                                  CurMBB, CurLoop, UseMI, UseMBB, UseLoop)};
    }
    llvm_unreachable("unexpected loop configuration!");
  }

  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // getUses helpers (compute mode)
  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
private:
  // Returns true if Use is reachable from MI. Handles backedges and intervening
  // defs.
  bool isUseReachablePrecise(const MachineInstr &MI,
                             const MachineBasicBlock *MBB,
                             const MachineOperand *UseMO,
                             const MachineInstr *UseMI,
                             const MachineBasicBlock *UseMBB) const {
    // Filter out uses that are clearly unreachable
    if (MBB != UseMBB && !isReachable(MBB, UseMBB))
      return false;
    // PHI uses are considered part of the incoming BB. Check for reachability
    // at the edge.
    if (auto *PhiUseEdge = getIncomingBlockIfPhiUse(UseMI, UseMO)) {
      if (!isReachableOrSame(MBB, PhiUseEdge))
        return false;
    }
    // Filter out uses with an intermediate def.
    const MachineInstr *DefMI = MRI->getUniqueVRegDef(UseMO->getReg());
    const MachineBasicBlock *DefMBB = DefMI->getParent();
    if (MBB == UseMBB) {
      if (UseMI->isPHI() && MBB == DefMBB)
        return true;
      if (instrsAreInOrder(&MI, UseMI))
        return true;
      // A Def in the loop means that the value at MI will not survive through
      // to this use.
      MachineLoop *UseLoop = MLI->getLoopFor(UseMBB);
      return UseLoop && !UseLoop->contains(DefMBB);
    }
    if (MBB == DefMBB)
      return instrsAreInOrder(DefMI, &MI);
    MachineLoop *Loop = MLI->getLoopFor(MBB);
    if (!Loop)
      return true;
    // In a loop: the use survives only if no redefinition can intervene on a
    // path that re-enters MBB.
    MachineLoop *TopLoop = Loop->getOutermostLoop();
    return !TopLoop->contains(DefMBB) || !isReachable(MBB, DefMBB) ||
           !isForwardReachable(UseMBB, MBB);
  }

  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // Debug/Developer Helpers
  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
private:
  /// Goes over all MBB pairs in \p MF, calculates the shortest path between
  /// them.
  void populatePathTable() {
    for (const MachineBasicBlock &MBB1 : *MF) {
      for (const MachineBasicBlock &MBB2 : *MF) {
        if (&MBB1 == &MBB2)
          continue;
        getShortestPath(&MBB1, &MBB2); // Computed for the memoization side
                                       // effect; result ignored.
      }
    }
  }

  void printPaths(raw_ostream &OS) const {
    OS << "\n---------------- Paths --------------- {\n";
    for (const auto &[P, PI] : Paths) {
      OS << "  " << printMBBReference(*P.src()) << " -> "
         << printMBBReference(*P.dst()) << ": ";
      PI.print(OS);
      OS << '\n';
    }
    OS << "}\n";
  }
  LLVM_DUMP_METHOD void dumpPaths() const { printPaths(dbgs()); }

  // Legacy alias kept for existing call sites.
void dumpShortestPaths() const {
    for (const auto &P : Paths) {
      const MachineBasicBlock *From = P.first.src();
      const MachineBasicBlock *To = P.first.dst();
      std::optional<NextUseDistance> Dist = P.second.ShortestDistance;
      // -1 marks a pair whose shortest distance was never computed.
      dbgs() << "From: " << printMBBReference(*From)
             << "-> To:" << printMBBReference(*To) << " = "
             << Dist.value_or(-1).fmt() << "\n";
    }
  }

  // Dump all computed inter-block distances, sorted by weighted distance then
  // by (from, to) block number for a stable order.
  void printInterBlockDistances(raw_ostream &OS) const {
    using MBBPair = std::pair<unsigned, unsigned>;
    using Elem = std::pair<NextUseDistance, MBBPair>;
    std::vector<Elem> SortedDistances;
    for (const auto &[FromNum, Dsts] : InterBlockDistances) {
      for (const auto &[ToNum, Dist] : Dsts) {
        SortedDistances.emplace_back(Dist.Weighted, MBBPair(FromNum, ToNum));
      }
    }
    llvm::sort(SortedDistances, [](const auto &A, const auto &B) {
      if (A.first != B.first)
        return A.first < B.first;
      if (A.second.first != B.second.first)
        return A.second.first < B.second.first;
      return A.second.second < B.second.second;
    });
    OS << "\n--------- InterBlockDistances -------- {\n";
    for (const Elem &E : SortedDistances) {
      OS << "  bb." << E.second.first << " -> bb." << E.second.second << ": ";
      E.first.print(OS);
      OS << '\n';
    }
    OS << "}\n";
  }
  LLVM_DUMP_METHOD void dumpInterBlockDistances() const {
    printInterBlockDistances(dbgs());
  }

  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // LiveRegUse Caching - A cache of the distances for the last
  // MachineInstruction. When getting the distances for a MachineInstruction, if
  // it is the same basic block as the cached instruction, we can generally use
  // an offset from the cached values to compute the distances. There are some
  // exceptions - see 'cacheLiveRegUse'.
  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
private:
  // A cached use plus whether its distance depends on the querying
  // instruction's position (MIDependent entries must be offset-adjusted and
  // re-validated before reuse).
  struct LiveRegToUseMapElem {
    LiveRegUse Use;
    bool MIDependent;
    LiveRegToUseMapElem() : Use(), MIDependent(false) {}
    LiveRegToUseMapElem(LiveRegUse U, bool MIDep) : Use(U), MIDependent(MIDep) {}
    void print(raw_ostream &OS) const {
      Use.print(OS);
      OS << (MIDependent ? " [mi-dep]" : " [mi-indep]");
    }
    LLVM_DUMP_METHOD void dump() const {
      print(dbgs());
      dbgs() << '\n';
    }
  };
  // Using std::map because LaneBitmask does not work out-of-the-box as a
  // DenseMap key and I did not see a performance benefit over std::map.
  using LaneBitmaskToUseMap = std::map<LaneBitmask, LiveRegToUseMapElem>;
  using LiveRegToUseMap = DenseMap<Register, LaneBitmaskToUseMap>;

  // Instruction the current cache contents are anchored at, and the cache
  // itself. Pending entries are collected for the in-flight query and swapped
  // in by updateCachedLiveRegUses().
  const MachineInstr *CachedDistancesMI = nullptr;
  LiveRegToUseMap CachedDistances;
  LiveRegToUseMap PendingCachedDistances;
  unsigned DistanceCacheHits = 0;
  unsigned DistanceCacheMisses = 0;

  // Drop the cache and statistics (note: does not touch
  // PendingCachedDistances — presumably empty at reset time; verify).
  void resetDistanceCache() {
    CachedDistancesMI = nullptr;
    CachedDistances.clear();
    DistanceCacheHits = 0;
    DistanceCacheMisses = 0;
  }

  // Invalidate the cache when 'MI' is in a different block than the anchor,
  // or precedes it (offsets would be negative).
  void maybeClearCachedLiveRegUses(const MachineInstr &MI) {
    if (CachedDistancesMI &&
        (CachedDistancesMI->getParent() != MI.getParent() ||
         !instrsAreInOrder(CachedDistancesMI, &MI))) {
      CachedDistancesMI = nullptr;
      CachedDistances.clear();
    }
  }

  // Decide whether a cached element may be reused for a query at 'MI', where
  // 'LastDelta' is the distance from the cache anchor to 'MI'.
  bool okToUseCacheElem(const LiveRegToUseMapElem &CacheElem,
                        const MachineInstr &MI, const InstrIdTy LastDelta) {
    if (!CacheElem.MIDependent)
      return true;
    const LiveRegUse &U = CacheElem.Use;
    // Never okay to produce a negative distance
    if (U.Dist < LastDelta)
      return false;
    const MachineInstr *UseMI = U.Use->getParent();
    // Always okay if use is in another basic block or UseMI is MI
    if (UseMI->getParent() != MI.getParent() || UseMI == &MI)
      return true;
    // If CachedDistancesMI <= Use < MI we could have a problem since we don't
    // know if Use is still reachable.
    return !instrsAreInOrder(CachedDistancesMI, UseMI) ||
           !instrsAreInOrder(UseMI, &MI);
  }

  // Look up (Reg, LaneMask) in the cache. Returns the per-register mask map
  // and the matching element, or {nullptr, nullptr} on a miss or when the
  // element fails validation. Maintains hit/miss counters.
  std::pair<const LaneBitmaskToUseMap *, const LiveRegToUseMapElem *>
  findCachedLiveRegUse(Register Reg, LaneBitmask LaneMask,
                       const MachineInstr &MI, const InstrIdTy LastDelta) {
    if (!DistanceCacheEnabled)
      return {nullptr, nullptr};
    ++DistanceCacheMisses; // Assume miss
    auto I = CachedDistances.find(Reg);
    if (I == CachedDistances.end())
      return {nullptr, nullptr};
    const LaneBitmaskToUseMap &RegSlot = I->second;
    if (RegSlot.empty())
      return {nullptr, nullptr};
    auto J = RegSlot.find(LaneMask);
    if (J == RegSlot.end())
      return {nullptr, nullptr};
    const LiveRegToUseMapElem &MaskSlot = J->second;
    if (!okToUseCacheElem(MaskSlot, MI, LastDelta))
      return {nullptr, nullptr};
    --DistanceCacheMisses; // Undo the pessimistic assumption.
    ++DistanceCacheHits;
    return {&RegSlot, &MaskSlot};
  }

  // Record a computed use in the pending cache for the current query.
  void cacheLiveRegUse(const MachineInstr &MI, Register Reg, LaneBitmask Mask,
                       LiveRegUse U, bool MIDependent) {
    if (!DistanceCacheEnabled)
      return;
    auto I = PendingCachedDistances.try_emplace(Reg).first;
    LaneBitmaskToUseMap &RegSlot = I->second;
    // try_emplace: the first entry per mask wins for this query.
    RegSlot.try_emplace(Mask, U, MIDependent);
  }

  // Publish the pending entries as the live cache, anchored at 'MI'.
  void updateCachedLiveRegUses(const MachineInstr &MI) {
    if (!DistanceCacheEnabled)
      return;
    CachedDistancesMI = &MI;
    CachedDistances = std::move(PendingCachedDistances);
    PendingCachedDistances.clear();
    LLVM_DEBUG(dumpDistanceCache());
  }

  void printDistanceCache(raw_ostream &OS) const {
    OS << "\n----------- Distance Cache ----------- {\n";
    OS << "  CachedAt: ";
    if (CachedDistancesMI)
      OS << *CachedDistancesMI;
    else
      OS << "\n";
    constexpr size_t RegNameWidth = 20;
    for (const auto &[Reg, ByMask] : CachedDistances) {
      const TargetRegisterClass *RC = MRI->getRegClass(Reg);
      LaneBitmask AllLanes = MRI->getMaxLaneMaskForVReg(Reg);
      for (const auto &[Mask, Elem] : ByMask) {
        // Render the key as a register (with sub-register index when the mask
        // maps to exactly one) or fall back to a raw mask dump.
        std::string RegName;
        raw_string_ostream KOS(RegName);
        if (Mask == AllLanes) {
          KOS << printReg(Reg);
        } else {
          SmallVector<unsigned> Indexes;
          TRI->getCoveringSubRegIndexes(RC, Mask, Indexes);
          if (Indexes.size() == 1)
            KOS << printReg(Reg, TRI, Indexes.front(), MRI);
          else
            KOS << printReg(Reg) << " mask=" << Mask.getAsInteger();
        }
        OS << "  " << left_justify(RegName, RegNameWidth) << " : ";
        Elem.print(OS);
        OS << '\n';
      }
    }
    OS << "  (hits=" << DistanceCacheHits << " misses=" << DistanceCacheMisses
       << ")\n";
    OS << "}\n";
  }
  LLVM_DUMP_METHOD void dumpDistanceCache() const {
    printDistanceCache(dbgs());
  }

  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // Processing Live Reg Uses
  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
private:
  // Decompose each use in 'Uses' by sub-reg and store the nearest one in
  // 'UseByMask'. Ignores subregs matching 'LiveRegLaneMask' - these are handled
  // as registers, not sub-regs.
  // Per-class covering sub-register indexes, memoized across calls.
  DenseMap<const TargetRegisterClass *, SmallVector<unsigned>>
      SubRegIndexesForRegClass;
  void collectSubRegUsesByMask(
      const SmallVectorImpl<const MachineOperand *> &Uses,
      const SmallVectorImpl<std::pair<bool, NextUseDistance>> &Distances,
      LaneBitmask LiveRegLaneMask, LaneBitmaskToUseMap &UseByMask) {
    assert(Uses.size());
    assert(Uses.size() == Distances.size());
    const TargetRegisterClass *RC = MRI->getRegClass(Uses.front()->getReg());
    auto [SRI, Inserted] = SubRegIndexesForRegClass.try_emplace(RC);
    if (Inserted)
      TRI->getCoveringSubRegIndexes(RC, LaneBitmask::getAll(), SRI->second);
    const SmallVector<unsigned> &RCSubRegIndexes = SRI->second;
    unsigned OneIndex; // Backing store for 'Indexes' below when 1 index
    for (size_t I = 0; I < Uses.size(); ++I) {
      const MachineOperand *MO = Uses[I];
      auto [SubRegMIDep, Dist] = Distances[I];
      const LiveRegUse LRU{MO, Dist};
      // A sub-register use contributes only its own index; a full-register
      // use contributes to every covering sub-register slot.
      ArrayRef<unsigned> Indexes;
      if (MO->getSubReg()) {
        OneIndex = MO->getSubReg();
        Indexes = ArrayRef(OneIndex);
      } else {
        Indexes = RCSubRegIndexes;
      }
      for (unsigned Idx : Indexes) {
        LaneBitmask Mask = TRI->getSubRegIndexLaneMask(Idx);
        if (Mask.all() || Mask == LiveRegLaneMask)
          continue;
        auto &[SlotU, SlotMIDep] = UseByMask[Mask];
        if (updateClosest(SlotU, LRU))
          SlotMIDep = SubRegMIDep;
      }
    }
  }

  // Similar to 'collectSubRegUsesByMask' above, but uses cached distances.
void collectSubRegUsesByMaskFromCache(const LaneBitmaskToUseMap &CachedMap,
                                        LaneBitmask LiveRegLaneMask,
                                        const MachineInstr *MI,
                                        InstrIdTy LastDelta,
                                        LaneBitmaskToUseMap &UseByMask) {
    for (const auto &KV : CachedMap) {
      LaneBitmask SubregLaneMask = KV.first;
      // Full-register and live-mask entries are handled at register level.
      if (SubregLaneMask.all() || SubregLaneMask == LiveRegLaneMask)
        continue;
      const LiveRegToUseMapElem &SubregE = KV.second;
      if (!okToUseCacheElem(SubregE, *MI, LastDelta))
        continue;
      const bool MIDep = SubregE.MIDependent;
      LiveRegUse U = SubregE.Use;
      // MI-dependent cached distances were anchored at the previous query
      // point; shift them to 'MI'.
      if (MIDep)
        U.Dist -= LastDelta;
      auto &[SlotU, SlotMIDep] = UseByMask[SubregLaneMask];
      if (updateClosest(SlotU, U))
        SlotMIDep = MIDep;
    }
  }

  // Loops through 'UseByMask' finding the furthest sub-register and updating
  // 'FurthestSubreg' accordingly.
  void updateFurthestSubReg(
      const MachineInstr &MI, const LiveRegUse &U,
      const LaneBitmaskToUseMap &UseByMask,
      DenseMap<const MachineOperand *, LiveRegUse> *RelevantUses,
      LiveRegUse &FurthestSubreg) {
    // No sub-register decomposition: the whole-register use stands in.
    if (UseByMask.empty()) {
      updateFurthest(FurthestSubreg, U);
      return;
    }
    for (const auto &KV : UseByMask) {
      const LiveRegUse &SubregU = KV.second.Use;
      const bool SubregMIDep = KV.second.MIDependent;
      if (RelevantUses)
        RelevantUses->try_emplace(SubregU.Use, SubregU);
      // Seed next query's cache with the per-mask result as a side effect.
      cacheLiveRegUse(MI, SubregU.Use->getReg(), KV.first, SubregU,
                      SubregMIDep);
      updateFurthest(FurthestSubreg, SubregU);
    }
  }

  // Used to populate 'MIDefs' to be passed to 'getNextUseDistances'.
  SmallSet<Register, 8> collectDefinedRegisters(const MachineInstr &MI) const {
    SmallSet<Register, 8> MIDefs;
    for (const MachineOperand &MO : MI.all_defs()) {
      // Only defs that actually have uses matter for next-use queries.
      if (MO.isReg() && MO.getReg().isValid() && hasAtLeastOneUse(MO.getReg()))
        MIDefs.insert(MO.getReg());
    }
    return MIDefs;
  }

  // Computes distances from 'MI' to each registers in 'LiveRegs'. Returns the
  // furthest register and (optionally) sub-register in 'Furthest' and
  // 'FurthestSubreg' respectively.
public:
  void getNextUseDistances(
      const GCNRPTracker::LiveRegSet &LiveRegs, const MachineInstr &MI,
      LiveRegUse &Furthest, LiveRegUse *FurthestSubreg = nullptr,
      DenseMap<const MachineOperand *, LiveRegUse> *RelevantUses = nullptr) {
    const SmallSet<Register, 8> MIDefs(collectDefinedRegisters(MI));
    SmallVector<const MachineOperand *> Uses;
    SmallVector<std::pair<bool, NextUseDistance>> Distances;
    LaneBitmaskToUseMap UseByMask;
    maybeClearCachedLiveRegUses(MI);
    // Offset from the cache anchor to 'MI'; 0 when no anchor is valid.
    const InstrIdTy LastDelta =
        CachedDistancesMI ? getDistance(CachedDistancesMI, &MI) : 0;
    for (auto &KV : LiveRegs) {
      const Register Reg = KV.first;
      const LaneBitmask LaneMask = KV.second;
      // Registers (re)defined by MI itself have no surviving live value here.
      if (MIDefs.contains(Reg))
        continue;
      Uses.clear();
      UseByMask.clear();
      LiveRegUse U;
      bool MIDependent = false;
      auto [CacheMap, CacheElem] =
          findCachedLiveRegUse(Reg, LaneMask, MI, LastDelta);
      if (CacheMap && CacheElem) {
        // Cache hit: adjust MI-dependent distances by the anchor offset.
        MIDependent = CacheElem->MIDependent;
        U = CacheElem->Use;
        if (MIDependent)
          U.Dist -= LastDelta;
      } else {
        // Cache miss: compute from scratch.
        getReachableUses(Reg, LaneMask, MI, Uses);
        if (Uses.empty())
          continue;
        const MachineOperand *NextUse = nullptr;
        NextUseDistance Dist = getShortestDistance(
            Reg, LaneMask, MI, Uses, &NextUse, &MIDependent, &Distances);
        U = LiveRegUse{NextUse, Dist};
      }
      if (RelevantUses)
        RelevantUses->try_emplace(U.Use, U);
      cacheLiveRegUse(MI, Reg, LaneMask, U, MIDependent);
      updateFurthest(Furthest, U);
      // Sub-register decomposition only when the caller asked for it.
      if (!FurthestSubreg)
        continue;
      if (CacheMap) {
        collectSubRegUsesByMaskFromCache(*CacheMap, LaneMask, &MI, LastDelta,
                                         UseByMask);
      } else {
        collectSubRegUsesByMask(Uses, Distances, LaneMask, UseByMask);
      }
      updateFurthestSubReg(MI, U, UseByMask, RelevantUses, *FurthestSubreg);
    }
    // Publish this query's results as the cache for the next query.
    updateCachedLiveRegUses(MI);
  }

  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // Helper methods for printAsJson
  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
private:
  static format_object<unsigned> Fmt(unsigned Id) { return format("%u", Id); }

public:
  void printVerboseInstrFields(json::OStream &J, const MachineInstr &MI) const {
    J.attribute("id", getInstrId(&MI));
    J.attribute("head-len", getHeadLen(&MI));
    J.attribute("tail-len", getTailLen(&MI));
  }

  void printPaths(json::OStream &J, ModuleSlotTracker &MST) const {
    J.attributeBegin("paths");
    J.arrayBegin();
    for (const auto &KV : Paths) {
      const Path &P = KV.first;
      const PathInfo &PI = KV.second;
      J.objectBegin();
      printMBBNameAttr(J, "src", *P.src(), MST);
      printMBBNameAttr(J, "dst", *P.dst(), MST);
      // Distances may be lazily-uncomputed; emit null in that case.
      if (PI.ShortestDistance.has_value()) {
        J.attribute("shortest-distance",
                    PI.ShortestDistance.value().toJsonValue());
      } else {
        J.attribute("shortest-distance", nullptr);
      }
      if (PI.ShortestUnweightedDistance.has_value()) {
        J.attribute("shortest-unweighted-distance",
                    PI.ShortestUnweightedDistance.value().toJsonValue());
      } else {
        J.attribute("shortest-unweighted-distance", nullptr);
      }
      J.attribute("edge-kind", static_cast<unsigned>(PI.EK));
      J.attribute("reachable", PI.Reachable);
      J.attribute("forward-reachable", PI.ForwardReachable);
      J.objectEnd();
    }
    J.arrayEnd();
    J.attributeEnd();
  }

public:
  AMDGPUNextUseAnalysisImpl(const MachineFunction *, const MachineLoopInfo *);
  ~AMDGPUNextUseAnalysisImpl() { clearTables(); }

  AMDGPUNextUseAnalysis::Config getConfig() const { return Cfg; }
  // Changing the configuration invalidates all derived tables.
  void setConfig(AMDGPUNextUseAnalysis::Config NewCfg) {
    Cfg = NewCfg;
    clearTables();
    initializeTables();
  }

  unsigned getDistanceCacheHits() const { return DistanceCacheHits; }
  unsigned getDistanceCacheMisses() const { return DistanceCacheMisses; }

  void getReachableUses(Register LiveReg, LaneBitmask LaneMask,
                        const MachineInstr &MI,
                        SmallVector<const MachineOperand *> &Uses) const;

  /// \Returns the shortest next-use distance for \p LiveReg.
NextUseDistance getShortestDistance(Register LiveReg, LaneBitmask LaneMask,
                                      const MachineInstr &FromMI,
                                      const SmallVector &Uses,
                                      const MachineOperand **ShortestUseOut,
                                      bool *MIDependent,
                                      SmallVector *Distances) const;

  /// Convenience overload: queries all lanes and discards the optional
  /// outputs (shortest use, MI-dependence, per-use distances).
  NextUseDistance getShortestDistance(Register LiveReg,
                                      const MachineInstr &FromMI,
                                      const SmallVector &Uses) const {
    return getShortestDistance(LiveReg, LaneBitmask::getAll(), FromMI, Uses,
                               nullptr, nullptr, nullptr);
  }
};

/// Captures target hooks from the subtarget, resolves the effective Config
/// (preset plus any per-flag command-line overrides), and builds the tables.
/// NOTE(review): template-argument lists appear stripped in this chunk
/// (e.g. MF->getSubtarget() below presumably names GCNSubtarget) — confirm
/// against the original file.
AMDGPUNextUseAnalysisImpl::AMDGPUNextUseAnalysisImpl(
    const MachineFunction *MF, const MachineLoopInfo *ML) {
  this->MF = MF;
  this->MLI = ML;
  const GCNSubtarget &ST = MF->getSubtarget();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF->getRegInfo();
  // FIXME: Hopefully we will soon converge on a single way of calculating
  // next-use distance and remove these presets.
  if (ConfigPresetOpt == "compute")
    Cfg = AMDGPUNextUseAnalysis::Config::Compute();
  else
    Cfg = AMDGPUNextUseAnalysis::Config::Graphics();
  // Individual cl::opt flags, when given explicitly, override the preset.
  if (ConfigCountPhisOpt.getNumOccurrences())
    Cfg.CountPhis = ConfigCountPhisOpt;
  if (ConfigForwardOnlyOpt.getNumOccurrences())
    Cfg.ForwardOnly = ConfigForwardOnlyOpt;
  if (ConfigPreciseUseModelingOpt.getNumOccurrences())
    Cfg.PreciseUseModeling = ConfigPreciseUseModelingOpt;
  if (ConfigPromoteToPreheaderOpt.getNumOccurrences())
    Cfg.PromoteToPreheader = ConfigPromoteToPreheaderOpt;
  initializeTables();
}

/// Scans \p Uses, computing the distance from \p CurMI to each use via
/// calcDistanceToUse, and returns the minimum. Optionally reports the use
/// realizing the minimum (\p ShortestUseOut), whether that distance depends
/// on CurMI's own position (\p CurMIDependentOut), and all per-use
/// (dependence, distance) pairs (\p Distances, parallel to \p Uses).
/// Precondition: at least one use in \p Uses is reachable (asserted below).
NextUseDistance AMDGPUNextUseAnalysisImpl::getShortestDistance(
    Register LiveReg, LaneBitmask LaneMask, const MachineInstr &CurMI,
    const SmallVector &Uses, const MachineOperand **ShortestUseOut,
    bool *CurMIDependentOut, SmallVector *Distances) const {
  assert(!LiveReg.isPhysical() && !TRI->isAGPR(*MRI, LiveReg) &&
         "Next-use distance is calculated for SGPRs and VGPRs");
  const MachineOperand *NextUse = nullptr;
  // Start from "unreachable" so any real distance compares smaller.
  auto NextUseDist = NextUseDistance::unreachable();
  bool CurMIDependent = false;
  if (Distances) {
    Distances->clear();
    Distances->reserve(Uses.size());
  }
  for (auto *UseMO : Uses) {
    auto [Dep, D] = calcDistanceToUse(LiveReg, LaneMask, CurMI, UseMO);
    if (D < NextUseDist) {
      NextUseDist = D;
      NextUse = UseMO;
      CurMIDependent = Dep;
    }
    if (Distances)
      Distances->push_back({Dep, D});
  }
  if (ShortestUseOut)
    *ShortestUseOut = NextUse;
  if (CurMIDependentOut)
    *CurMIDependentOut = CurMIDependent;
  assert(NextUseDist.isReachable() &&
         "getShortestDistance called with no reachable uses");
  return NextUseDist;
}

/// Filters the register's use list down to the uses reachable from \p MI.
/// Lane filtering is skipped when the mask trivially covers the register.
/// Reachability is answered three ways: precise modeling when configured,
/// program order within the same block, otherwise forward CFG reachability.
void AMDGPUNextUseAnalysisImpl::getReachableUses(
    Register Reg, LaneBitmask LaneMask, const MachineInstr &MI,
    SmallVector &Uses) const {
  const bool CheckMask = LaneMask != LaneBitmask::getAll() &&
                         LaneMask != MRI->getMaxLaneMaskForVReg(Reg);
  const MachineBasicBlock *MBB = MI.getParent();
  for (const MachineOperand *UseMO : getRegisterUses(Reg)) {
    const MachineInstr *UseMI = UseMO->getParent();
    const MachineBasicBlock *UseMBB = UseMI->getParent();
    if (CheckMask && !machineOperandCoveredBy(*UseMO, LaneMask))
      continue;
    bool Reachable;
    if (Cfg.PreciseUseModeling)
      Reachable = isUseReachablePrecise(MI, MBB, UseMO, UseMI, UseMBB);
    else if (MBB == UseMBB)
      Reachable = instrsAreInOrder(&MI, UseMI);
    else
      Reachable = isForwardReachable(MBB, UseMBB);
    if (Reachable)
      Uses.push_back(UseMO);
  }
}

//==============================================================================
// AMDGPUNextUseAnalysis
//==============================================================================

// Public facade: thin pimpl forwarding to AMDGPUNextUseAnalysisImpl.
AMDGPUNextUseAnalysis::AMDGPUNextUseAnalysis(const MachineFunction *MF,
                                             const MachineLoopInfo *MLI) {
  Impl = std::make_unique(MF, MLI);
}

AMDGPUNextUseAnalysis::AMDGPUNextUseAnalysis(AMDGPUNextUseAnalysis &&Other)
    : Impl(std::move(Other.Impl)) {}

AMDGPUNextUseAnalysis::~AMDGPUNextUseAnalysis() {}

AMDGPUNextUseAnalysis &
AMDGPUNextUseAnalysis::operator=(AMDGPUNextUseAnalysis &&Other) {
  if (this != &Other)
    Impl = std::move(Other.Impl);
  return *this;
}

AMDGPUNextUseAnalysis::Config AMDGPUNextUseAnalysis::getConfig() const {
  return Impl->getConfig();
}

void AMDGPUNextUseAnalysis::setConfig(Config Cfg) { Impl->setConfig(Cfg); }

/// \Returns the
/// next-use distance for \p LiveReg.
/// Facade wrapper: queries all lanes; per-use (MIDependent, Distance) pairs
/// are gathered locally and only the distances are copied to \p DistancesOut.
/// NOTE(review): template-argument lists appear stripped in this chunk
/// (extraction artifact), e.g. getAnalysis() / getResult() / addRequired()
/// calls below lack their pass types — confirm against the original file.
NextUseDistance AMDGPUNextUseAnalysis::getShortestDistance(
    Register LiveReg, const MachineInstr &FromMI, const SmallVector &Uses,
    const MachineOperand **ShortestUseOut,
    SmallVector *DistancesOut) const {
  SmallVector Distances;
  auto Dist = Impl->getShortestDistance(LiveReg, LaneBitmask::getAll(), FromMI,
                                        Uses, ShortestUseOut, nullptr,
                                        DistancesOut ? &Distances : nullptr);
  if (DistancesOut) {
    // Drop the MI-dependence flag; callers of this overload only want D.
    for (auto [MIDep, D] : Distances)
      DistancesOut->push_back(D);
  }
  return Dist;
}

/// Computes, over the live set \p LiveRegs at \p MI, the furthest next use
/// (and optionally the furthest considering subregister lanes) plus,
/// optionally, the per-use distance map \p RelevantUses.
void AMDGPUNextUseAnalysis::getNextUseDistances(
    const DenseMap &LiveRegs, const MachineInstr &MI,
    UseDistancePair &FurthestOut, UseDistancePair *FurthestSubregOut,
    DenseMap *RelevantUses) const {
  LiveRegUse Furthest;
  LiveRegUse FurthestSubreg;
  Impl->getNextUseDistances(LiveRegs, MI, Furthest,
                            FurthestSubregOut ? &FurthestSubreg : nullptr,
                            RelevantUses);
  FurthestOut = Furthest;
  if (FurthestSubregOut)
    *FurthestSubregOut = FurthestSubreg;
}

/// Forwards to the impl; see AMDGPUNextUseAnalysisImpl::getReachableUses.
void AMDGPUNextUseAnalysis::getReachableUses(
    Register LiveReg, LaneBitmask LaneMask, const MachineInstr &MI,
    SmallVector &Uses) const {
  return Impl->getReachableUses(LiveReg, LaneMask, MI, Uses);
}

//==============================================================================
// AMDGPUNextUseAnalysisLegacyPass
//==============================================================================

//------------------------------------------------------------------------------
// Legacy Analysis Pass
//------------------------------------------------------------------------------

AMDGPUNextUseAnalysisLegacyPass::AMDGPUNextUseAnalysisLegacyPass()
    : MachineFunctionPass(ID) {}

StringRef AMDGPUNextUseAnalysisLegacyPass::getPassName() const {
  return "Next Use Analysis";
}

// Builds and stores the analysis result; never modifies the function.
bool AMDGPUNextUseAnalysisLegacyPass::runOnMachineFunction(
    MachineFunction &MF) {
  const MachineLoopInfo *MLI = &getAnalysis().getLI();
  NUA.reset(new AMDGPUNextUseAnalysis(&MF, MLI));
  return false;
}

// Requires loop info (for loop-depth weighting); preserves everything.
void AMDGPUNextUseAnalysisLegacyPass::getAnalysisUsage(
    AnalysisUsage &AU) const {
  AU.addRequired();
  AU.setPreservesAll();
  MachineFunctionPass::getAnalysisUsage(AU);
}

char AMDGPUNextUseAnalysisLegacyPass::ID = 0;

char &llvm::AMDGPUNextUseAnalysisLegacyID = AMDGPUNextUseAnalysisLegacyPass::ID;

INITIALIZE_PASS_BEGIN(AMDGPUNextUseAnalysisLegacyPass, DEBUG_TYPE,
                      "Next Use Analysis", false, true)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUNextUseAnalysisLegacyPass, DEBUG_TYPE,
                    "Next Use Analysis", false, true)

FunctionPass *llvm::createAMDGPUNextUseAnalysisLegacyPass() {
  return new AMDGPUNextUseAnalysisLegacyPass();
}

//------------------------------------------------------------------------------
// New Pass Manager Analysis Pass
//------------------------------------------------------------------------------

AnalysisKey AMDGPUNextUseAnalysisPass::Key;

AMDGPUNextUseAnalysisPass::Result
AMDGPUNextUseAnalysisPass::run(MachineFunction &MF,
                               MachineFunctionAnalysisManager &MFAM) {
  const MachineLoopInfo &MLI = MFAM.getResult(MF);
  return AMDGPUNextUseAnalysis(&MF, &MLI);
}

//==============================================================================
// AMDGPUNextUseAnalysisPrinterLegacyPass
//==============================================================================

namespace {

// Emits the "instr" attribute, plus the verbose per-instruction fields when
// -amdgpu-next-use-analysis-dump-distance-verbose is set.
void printInstrMember(json::OStream &J, ModuleSlotTracker &MST,
                      const MachineInstr &MI,
                      const AMDGPUNextUseAnalysisImpl &NUA) {
  printStringAttr(J, "instr", MI, MST);
  if (DumpNextUseDistanceVerbose)
    NUA.printVerboseInstrFields(J, MI);
}

// Verbose-only: dumps the per-use distance map as a JSON "distances" object,
// keyed by printed register name.
void printDistances(
    json::OStream &J, const MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
    ModuleSlotTracker &MST, const DenseMap &Uses) {
  if (!DumpNextUseDistanceVerbose)
    return;
  // Sorting isn't necessary for the purposes of JSON, but it reduces
  // FileCheck differences.
  SmallVector Keys;
  for (const MachineOperand *K : Uses.keys())
    Keys.push_back(K);
  llvm::sort(Keys, [](const auto &A, const auto &B) {
    return A->getReg() < B->getReg() ||
           (A->getReg() == B->getReg() && A->getSubReg() < B->getSubReg());
  });
  J.attributeBegin("distances");
  J.objectBegin();
  for (const MachineOperand *K : Keys) {
    const LiveRegUse U = Uses.at(K);
    printAttr(J, printReg(U.getReg(), &TRI, U.getSubReg(), &MRI),
              U.Dist.toJsonValue());
  }
  J.objectEnd();
  J.attributeEnd();
}

// Emits a "furthest" (or "furthest-subreg") object; empty when F has no use.
void printFurthestUse(json::OStream &J, const MachineRegisterInfo &MRI,
                      const SIRegisterInfo &TRI, ModuleSlotTracker &MST,
                      const LiveRegUse F, bool Subreg = false) {
  J.attributeBegin(Subreg ? "furthest-subreg" : "furthest");
  J.objectBegin();
  if (F.Use) {
    printStringAttr(
        J, "register",
        printReg(F.getReg(), &TRI, Subreg ? F.getSubReg() : 0, &MRI));
    if (DumpNextUseDistanceVerbose) {
      printStringAttr(J, "use", [&](raw_ostream &OS) { OS << (*F.Use); });
      printStringAttr(J, "use-mi", *F.Use->getParent(), MST);
    }
    J.attribute("distance", F.Dist.toJsonValue());
  }
  J.objectEnd();
  J.attributeEnd();
}

// For every virtual def in the function, prints the distance from its
// (first) defining instruction to its closest reachable use, or unreachable
// when no use is reachable.
void printDistanceFromDefToUse(json::OStream &J, const MachineFunction &MF,
                               const AMDGPUNextUseAnalysis &NUA,
                               const SIRegisterInfo &TRI,
                               const MachineRegisterInfo &MRI) {
  auto getRegNextUseDistance = [&](Register DefReg) {
    const MachineInstr &DefMI = *MRI.def_instr_begin(DefReg);
    SmallVector Uses;
    NUA.getReachableUses(DefReg, LaneBitmask::getAll(), DefMI, Uses);
    if (Uses.empty())
      return NextUseDistance::unreachable();
    return NUA.getShortestDistance(DefReg, DefMI, Uses);
  };
  J.attributeBegin("distance-from-def-to-closest-use");
  J.objectBegin();
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      for (const MachineOperand &MO : MI.all_defs()) {
        Register Reg = MO.getReg();
        if (Reg.isPhysical())
          continue;
        NextUseDistance D = getRegNextUseDistance(Reg);
        printAttr(J, printReg(Reg, &TRI, 0, &MRI), D.toJsonValue());
      }
    }
  }
  J.objectEnd();
  J.attributeEnd();
}

// Main JSON body: per-block, per-instruction furthest distances; then the
// optional def-to-use section, path table, and distance-cache metrics.
void printNextUseDistancesAsJson(json::OStream &J, const MachineFunction &MF,
                                 const AMDGPUNextUseAnalysis &NUA,
                                 const AMDGPUNextUseAnalysisImpl &NUAImpl,
                                 const LiveIntervals &LIS) {
  using UseDistancePair = AMDGPUNextUseAnalysis::UseDistancePair;
  const Function &F = MF.getFunction();
  const Module *M = F.getParent();
  const GCNSubtarget &ST = MF.getSubtarget();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // We don't actually care about register pressure here - just using
  // GCNDownwardRPTracker as a convenient way of getting the set of live
  // registers at a given instruction.
  GCNDownwardRPTracker RPTracker(LIS);
  ModuleSlotTracker MST(M);
  MST.incorporateFunction(F);
  DenseMap RelevantUses;
  J.attributeBegin("furthest-distances");
  J.objectBegin();
  for (const MachineBasicBlock &MBB : MF) {
    std::string BBName;
    raw_string_ostream BBOS(BBName);
    MBB.printName(BBOS, MachineBasicBlock::PrintNameIr, &MST);
    J.attributeBegin(BBOS.str());
    J.arrayBegin();
    const MachineInstr *PrevMI = nullptr;
    for (const MachineInstr &MI : MBB) {
      // Update register pressure tracker. Reset at block entry and after a
      // run of PHIs, otherwise just advance past the previous instruction.
      if (!PrevMI || PrevMI->getOpcode() == AMDGPU::PHI)
        RPTracker.reset(MI);
      RPTracker.advance();
      UseDistancePair Furthest;
      UseDistancePair FurthestSubreg;
      RelevantUses.clear();
      NUA.getNextUseDistances(RPTracker.getLiveRegs(), MI, Furthest,
                              &FurthestSubreg, &RelevantUses);
      J.objectBegin();
      printInstrMember(J, MST, MI, NUAImpl);
      printDistances(J, MRI, TRI, MST, RelevantUses);
      printFurthestUse(J, MRI, TRI, MST, Furthest);
      printFurthestUse(J, MRI, TRI, MST, FurthestSubreg, /*Subreg*/ true);
      J.objectEnd();
      PrevMI = &MI;
    }
    J.arrayEnd();
    J.attributeEnd();
  }
  J.objectEnd();
  J.attributeEnd();
  if (DumpNextUseDistanceVerbose || DumpNextUseDistanceDefToUse)
    printDistanceFromDefToUse(J, MF, NUA, TRI, MRI);
  if (DumpNextUseDistanceVerbose)
    NUAImpl.printPaths(J, MST);
  if (DistanceCacheEnabled) {
    J.attributeBegin("metrics");
    J.objectBegin();
    {
      J.attributeBegin("distance-cache");
      J.objectBegin();
      {
        J.attribute("hits", NUAImpl.getDistanceCacheHits());
        J.attribute("misses", NUAImpl.getDistanceCacheMisses());
      }
      J.objectEnd();
      J.attributeEnd(); // distance-cache
    }
    J.objectEnd();
    J.attributeEnd(); // metrics
  }
}

// Routes the JSON dump to the right stream: the caller's fallback stream
// when the flag was not given, stdout for "" or "-", else the named file.
// Also stops the timer and appends timing values inside the top object.
void printAsJson(raw_ostream &FallbackOS, TimerGroup &JsonTimerGroup,
                 Timer &JsonTimer, const MachineFunction &MF,
                 const AMDGPUNextUseAnalysis &NUA,
                 const AMDGPUNextUseAnalysisImpl &NUAImpl,
                 const LiveIntervals &LIS) {
  std::string FN = DumpNextUseDistanceAsJson;
  auto dump = [&](raw_ostream &OS) {
    json::OStream J(OS, 2);
    J.objectBegin();
    J.attributeBegin("next-use-analysis");
    J.objectBegin();
    printNextUseDistancesAsJson(J, MF, NUA, NUAImpl, LIS);
    J.objectEnd();
    J.attributeEnd();
    // Timing is emitted inside the outer object, after the analysis dump.
    JsonTimer.stopTimer();
    JsonTimerGroup.printJSONValues(OS, ",\n");
    J.objectEnd();
  };
  if (!DumpNextUseDistanceAsJson.getNumOccurrences()) {
    dump(FallbackOS);
  } else if (FN.empty() || FN == "-") {
    dump(outs());
  } else {
    std::error_code EC;
    ToolOutputFile OutF(FN, EC, sys::fs::OF_None);
    dump(OutF.os());
    OutF.keep();
  }
}

} // namespace

//------------------------------------------------------------------------------
// Legacy Printer Pass
//------------------------------------------------------------------------------

AMDGPUNextUseAnalysisPrinterLegacyPass::AMDGPUNextUseAnalysisPrinterLegacyPass()
    : MachineFunctionPass(ID) {}

StringRef AMDGPUNextUseAnalysisPrinterLegacyPass::getPassName() const {
  return "AMDGPU Next Use Analysis Printer";
}

// Prints to errs() by default (no output-file flag handling here; that is
// done in printAsJson). Never modifies the function.
bool AMDGPUNextUseAnalysisPrinterLegacyPass::runOnMachineFunction(
    MachineFunction &MF) {
  TimerGroup JsonTimerGroup("amdgpu-next-use-analysis-json",
                            "AMDGPU Next Use Analysis JSON Printer", false);
  Timer JsonTimer("json", "Total time spent generating json", JsonTimerGroup);
  JsonTimer.startTimer();
  const LiveIntervals &LIS = getAnalysis().getLIS();
  const AMDGPUNextUseAnalysis &NUA = getAnalysis().getNextUseAnalysis();
  printAsJson(errs(), JsonTimerGroup, JsonTimer, MF, NUA, *NUA.Impl, LIS);
  return false;
}

void AMDGPUNextUseAnalysisPrinterLegacyPass::getAnalysisUsage(
    AnalysisUsage &AU) const {
  AU.addRequired();
  AU.addRequired();
  AU.addRequired();
  AU.setPreservesAll();
  MachineFunctionPass::getAnalysisUsage(AU);
}

char AMDGPUNextUseAnalysisPrinterLegacyPass::ID = 0;

char &AMDGPUNextUseAnalysisPrinterLegacyID =
    AMDGPUNextUseAnalysisPrinterLegacyPass::ID;

INITIALIZE_PASS_BEGIN(AMDGPUNextUseAnalysisPrinterLegacyPass,
                      "amdgpu-next-use-printer",
                      "AMDGPU Next Use Analysis Printer", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(AMDGPUNextUseAnalysisPrinterLegacyPass,
                    "amdgpu-next-use-printer",
                    "AMDGPU Next Use Analysis Printer", false, false)

FunctionPass *llvm::createAMDGPUNextUseAnalysisPrinterLegacyPass() {
  return new AMDGPUNextUseAnalysisPrinterLegacyPass();
}

//------------------------------------------------------------------------------
// New Pass Manager Printer Pass
//------------------------------------------------------------------------------

PreservedAnalyses
AMDGPUNextUseAnalysisPrinterPass::run(MachineFunction &MF,
                                      MachineFunctionAnalysisManager &MFAM) {
  TimerGroup JsonTimerGroup("amdgpu-next-use-analysis-json",
                            "AMDGPU Next Use Analysis JSON Printer", false);
  Timer JsonTimer("json", "Total time spent generating json", JsonTimerGroup);
  JsonTimer.startTimer();
  const LiveIntervals &LIS = MFAM.getResult(MF);
  const AMDGPUNextUseAnalysis &NUA = MFAM.getResult(MF);
  printAsJson(OS, JsonTimerGroup, JsonTimer, MF, NUA, *NUA.Impl, LIS);
  return PreservedAnalyses::all();
}