Reland https://github.com/llvm/llvm-project/pull/184929 after fixing some issues in the NDEBUG builds. 3a640ee is unchanged from the previously approved PR; the unreviewed portion of this PR is 9cabd8d.
335 lines
12 KiB
C++
335 lines
12 KiB
C++
//===- AMDGPUCoExecSchedStrategy.h - CoExec Scheduling Strategy -*- C++ -*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
/// \file
|
|
/// Coexecution-focused scheduling strategy for AMDGPU.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
|
|
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
|
|
|
|
#include "GCNSchedStrategy.h"
|
|
#include "llvm/CodeGen/MachineScheduler.h"
|
|
|
|
namespace llvm {
|
|
|
|
namespace AMDGPU {
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// Instruction Flavor Classification
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// Coarse classification of an instruction. NUM_FLAVORS is a count sentinel,
/// not a real flavor; it must stay last because it is used as an exclusive
/// upper bound when iterating over or validating flavor values.
enum class InstructionFlavor : uint8_t {
  /// WMMA/MFMA matrix operations.
  WMMA,
  /// Single-cycle VALU (not TRANS32, not multi-cycle CVT).
  SingleCycleVALU,
  /// Transcendental ops (v_exp, v_log, etc.).
  TRANS,
  /// VALU instructions with repeat rate > 1.
  MultiCycleVALU,
  /// FLAT/GLOBAL memory operations.
  VMEM,
  /// LDS/GDS operations.
  DS,
  /// Scalar ALU.
  SALU,
  /// Tensor DMA operations.
  DMA,
  /// Fences and waits.
  Fence,
  /// Everything else.
  Other,
  /// Number of real flavors above; keep as the final enumerator.
  NUM_FLAVORS
};
|
|
|
|
/// \returns the human-readable name of flavor \p F. The NUM_FLAVORS sentinel
/// maps to "???". The switch intentionally has no default label so that the
/// compiler can warn when a new enumerator is added but not handled here.
inline StringRef getFlavorName(InstructionFlavor F) {
  using Flavor = InstructionFlavor;
  switch (F) {
  case Flavor::WMMA:
    return "WMMA";
  case Flavor::SingleCycleVALU:
    return "VALU(1c)";
  case Flavor::TRANS:
    return "TRANS";
  case Flavor::MultiCycleVALU:
    return "VALU(Nc)";
  case Flavor::VMEM:
    return "VMEM";
  case Flavor::DS:
    return "DS";
  case Flavor::SALU:
    return "SALU";
  case Flavor::DMA:
    return "DMA";
  case Flavor::Fence:
    return "Fence";
  case Flavor::Other:
    return "Other";
  case Flavor::NUM_FLAVORS:
    return "???";
  }
  // Only reachable if F holds a value outside the enumerator list.
  llvm_unreachable("Unknown InstructionFlavor");
}
|
|
|
|
/// \returns a one-character tag for flavor \p F. The NUM_FLAVORS sentinel maps
/// to "?". As with getFlavorName, the switch has no default label so that an
/// unhandled enumerator is diagnosed at compile time.
inline StringRef getFlavorShortName(InstructionFlavor F) {
  using Flavor = InstructionFlavor;
  switch (F) {
  case Flavor::WMMA:
    return "W";
  case Flavor::SingleCycleVALU:
    return "V";
  case Flavor::TRANS:
    return "T";
  case Flavor::MultiCycleVALU:
    return "C";
  case Flavor::VMEM:
    return "M";
  case Flavor::DS:
    return "D";
  case Flavor::SALU:
    return "S";
  case Flavor::DMA:
    return "X";
  case Flavor::Fence:
    return "F";
  case Flavor::Other:
    return "O";
  case Flavor::NUM_FLAVORS:
    return "?";
  }
  // Only reachable if F holds a value outside the enumerator list.
  llvm_unreachable("Unknown InstructionFlavor");
}
|
|
|
|
/// Map \p MI to an InstructionFlavor, using \p SII for target instruction
/// queries. Only declared here; the definition is not part of this header.
InstructionFlavor classifyFlavor(const MachineInstr &MI,
                                 const SIInstrInfo &SII);
|
|
|
|
/// A small list of flavors that are treated together. The inline capacity of
/// four covers the fixed groups built in FlavorGroups (at most three entries);
/// larger groups spill to the heap.
using FlavorGroup = SmallVector<InstructionFlavor, 4>;
|
|
|
|
namespace FlavorGroups {
|
|
inline FlavorGroup allVALU() {
|
|
return {InstructionFlavor::SingleCycleVALU, InstructionFlavor::TRANS,
|
|
InstructionFlavor::MultiCycleVALU};
|
|
}
|
|
inline FlavorGroup allMem() {
|
|
return {InstructionFlavor::VMEM, InstructionFlavor::DS,
|
|
InstructionFlavor::DMA};
|
|
}
|
|
inline FlavorGroup individual(InstructionFlavor F) { return {F}; }
|
|
inline FlavorGroup all() {
|
|
FlavorGroup G;
|
|
for (unsigned I = 0;
|
|
I < static_cast<unsigned>(InstructionFlavor::NUM_FLAVORS); ++I)
|
|
G.push_back(static_cast<InstructionFlavor>(I));
|
|
return G;
|
|
}
|
|
} // namespace FlavorGroups
|
|
|
|
/// AMDGPU-specific scheduling decision reasons. These provide more granularity
/// than the generic CandReason enum for debugging purposes. NUM_REASONS is a
/// count sentinel and must remain the last enumerator.
enum class AMDGPUSchedReason : uint8_t {
  /// No AMDGPU-specific heuristic made the decision.
  None,
  /// tryCriticalResource chose based on resource pressure.
  CritResourceBalance,
  /// tryCriticalResourceDependency chose based on enabling an instruction that
  /// uses a prioritized HardwareUnit.
  CritResourceDep,
  /// Number of real reasons above; keep as the final enumerator.
  NUM_REASONS
};
|
|
|
|
/// \returns the printable name of reason \p R. Note that CritResourceBalance
/// prints as "CritResource", and the NUM_REASONS sentinel prints as "???". The
/// switch has no default label so an unhandled enumerator is diagnosed at
/// compile time.
inline StringRef getReasonName(AMDGPUSchedReason R) {
  using Reason = AMDGPUSchedReason;
  switch (R) {
  case Reason::None:
    return "None";
  case Reason::CritResourceBalance:
    return "CritResource";
  case Reason::CritResourceDep:
    return "CritResourceDep";
  case Reason::NUM_REASONS:
    return "???";
  }
  // Only reachable if R holds a value outside the enumerator list.
  llvm_unreachable("Unknown AMDGPUSchedReason");
}
|
|
|
|
} // End namespace AMDGPU
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// Hardware Unit Information
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// HardwareUnitInfo is a wrapper class which maps to some real hardware
|
|
/// resource. This is used to model hardware resource pressure per region, and
|
|
/// guide scheduling heuristics.
|
|
class HardwareUnitInfo {
|
|
private:
|
|
/// PrioritySUs maintains a list of the SUs we want to prioritize scheduling
|
|
/// for this HardwareUnit. This is used for agreement between
|
|
/// tryCriticalResourceDependency and tryCriticalResource: we schedule the
|
|
/// dependencies for a SU on critical resource, then schedule that same SU on
|
|
/// the critical resource. This agreement results in shorter live ranges and
|
|
/// more regular HardwareUnit access patterns. SUs are prioritized based on
|
|
/// depth for top-down scheduling.
|
|
SmallSetVector<SUnit *, 16> PrioritySUs;
|
|
/// All the SUs in the region that consume this resource
|
|
SmallSetVector<SUnit *, 16> AllSUs;
|
|
/// The total number of busy cycles for this HardwareUnit for a given region.
|
|
unsigned TotalCycles = 0;
|
|
// InstructionFlavor mapping
|
|
AMDGPU::InstructionFlavor Type;
|
|
// Whether or not instructions on this HardwareUnit may produce a window in
|
|
// which instructions in other HardwareUnits can coexecute. For example, WMMA
|
|
// / MFMA instructions may take multiple cycles, which may be overlapped with
|
|
// instructions on other HardwareUnits
|
|
bool ProducesCoexecWindow = false;
|
|
|
|
public:
|
|
HardwareUnitInfo() {}
|
|
|
|
unsigned size() { return AllSUs.size(); }
|
|
|
|
unsigned getTotalCycles() { return TotalCycles; }
|
|
|
|
void setType(unsigned TheType) {
|
|
assert(TheType < (unsigned)AMDGPU::InstructionFlavor::NUM_FLAVORS);
|
|
Type = (AMDGPU::InstructionFlavor)(TheType);
|
|
}
|
|
|
|
AMDGPU::InstructionFlavor getType() const { return Type; }
|
|
|
|
bool producesCoexecWindow() const { return ProducesCoexecWindow; }
|
|
|
|
void setProducesCoexecWindow(bool Val) { ProducesCoexecWindow = Val; }
|
|
|
|
bool contains(SUnit *SU) const { return AllSUs.contains(SU); }
|
|
|
|
/// \returns true if there is a difference in priority between \p SU and \p
|
|
/// Other. If so, \returns the SUnit with higher priority. This
|
|
/// method looks through the PrioritySUs to determine if one SU is more
|
|
/// prioritized than the other. If neither are in the PrioritySUs list, then
|
|
/// neither have priority over each other.
|
|
SUnit *getHigherPriority(SUnit *SU, SUnit *Other) const {
|
|
for (auto *SUOrder : PrioritySUs) {
|
|
if (SUOrder == SU)
|
|
return SU;
|
|
|
|
if (SUOrder == Other)
|
|
return Other;
|
|
}
|
|
return nullptr;
|
|
}
|
|
|
|
void reset() {
|
|
AllSUs.clear();
|
|
PrioritySUs.clear();
|
|
TotalCycles = 0;
|
|
Type = AMDGPU::InstructionFlavor::Other;
|
|
ProducesCoexecWindow = false;
|
|
}
|
|
|
|
/// \returns the next SU in PrioritySUs that is not ready. If \p LookDeep is
|
|
/// set, we will look beyond the PrioritySUs (if all the PrioritySUs are
|
|
/// ready) to AllSUs to attempt to find a target SU. When looking through
|
|
/// AllSUs we sort pick the target SU by minimal depth for top-down
|
|
/// scheduling. getNextTargetSU is useful for determining which SU on this
|
|
/// HardwareUnit we are trying to schedule - this info helps us determine
|
|
/// which dependencies to schedule. LookDeep is useful if the dependencies are
|
|
/// long latency (e.g. memory instructions). If we have many long latency
|
|
/// dependencies, it is beneficial to enable SUs multiple levels ahead.
|
|
SUnit *getNextTargetSU(bool LookDeep = false) const;
|
|
/// Insert the \p SU into the AllSUs and account its \p BlockingCycles into
|
|
/// the TotalCycles. This maintains the list of PrioritySUs.
|
|
void insert(SUnit *SU, unsigned BlockingCycles);
|
|
/// Update the state for \p SU being scheduled by removing it from the AllSus
|
|
/// and reducing its \p BlockingCycles from the TotalCycles. This maintains
|
|
/// the list of PrioritySUS.
|
|
void markScheduled(SUnit *SU, unsigned BlockingCycles);
|
|
};
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// Candidate Heuristics
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// CandidateHeuristics contains state and implementations to facilitate making
|
|
/// per instruction scheduling decisions; it contains methods used in
|
|
/// tryCandidate to decide which instruction to schedule next.
|
|
class CandidateHeuristics {
|
|
protected:
|
|
ScheduleDAGMI *DAG;
|
|
const SIInstrInfo *SII;
|
|
const SIRegisterInfo *SRI;
|
|
const TargetSchedModel *SchedModel;
|
|
SmallVector<HardwareUnitInfo, 8> HWUInfo;
|
|
|
|
/// Walk over the region and collect total usage per HardwareUnit
|
|
void collectHWUIPressure();
|
|
|
|
/// Compute the blocking cycles for the appropriate HardwareUnit given an \p
|
|
/// SU
|
|
unsigned getHWUICyclesForInst(SUnit *SU);
|
|
|
|
/// Given a \p Flavor , find the corresponding HardwareUnit. \returns the
|
|
/// mapped HardwareUnit.
|
|
HardwareUnitInfo *getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor);
|
|
|
|
public:
|
|
CandidateHeuristics() = default;
|
|
|
|
void initialize(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel,
|
|
const TargetRegisterInfo *TRI);
|
|
|
|
/// Update the state to reflect that \p SU is going to be scheduled.
|
|
void updateForScheduling(SUnit *SU);
|
|
|
|
/// Sort the HWUInfo vector. After sorting, the HardwareUnits that are highest
|
|
/// priority are first. Priority is determined by maximizing coexecution and
|
|
/// keeping the critical HardwareUnit busy.
|
|
void sortHWUIResources();
|
|
|
|
/// Check for critical resource consumption. Prefer the candidate that uses
|
|
/// the most prioritized HardwareUnit. If both candidates use the same
|
|
/// HarwareUnit, prefer the candidate with higher priority on that
|
|
/// HardwareUnit.
|
|
bool tryCriticalResource(GenericSchedulerBase::SchedCandidate &TryCand,
|
|
GenericSchedulerBase::SchedCandidate &Cand,
|
|
SchedBoundary *Zone) const;
|
|
|
|
/// Check for dependencies of instructions that use prioritized HardwareUnits.
|
|
/// Prefer the candidate that is a dependency of an instruction that uses the
|
|
/// most prioritized HardwareUnit. If both candidates enable the same
|
|
/// HardwareUnit, prefer the candidate that enables the higher priority
|
|
/// instruction on that HardwareUnit.
|
|
bool
|
|
tryCriticalResourceDependency(GenericSchedulerBase::SchedCandidate &TryCand,
|
|
GenericSchedulerBase::SchedCandidate &Cand,
|
|
SchedBoundary *Zone) const;
|
|
|
|
void dumpRegionSummary();
|
|
};
|
|
|
|
class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
|
|
protected:
|
|
bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand,
|
|
SchedBoundary &Zone) const;
|
|
AMDGPU::AMDGPUSchedReason LastAMDGPUReason = AMDGPU::AMDGPUSchedReason::None;
|
|
CandidateHeuristics Heurs;
|
|
|
|
#ifndef NDEBUG
|
|
void dumpPickSummary(SUnit *SU, bool IsTopNode, SchedCandidate &Cand);
|
|
#endif
|
|
|
|
bool tryCandidateCoexec(SchedCandidate &Cand, SchedCandidate &TryCand,
|
|
SchedBoundary *Zone);
|
|
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
|
|
const RegPressureTracker &RPTracker,
|
|
SchedCandidate &Cand, bool &PickedPending,
|
|
bool IsBottomUp);
|
|
|
|
public:
|
|
AMDGPUCoExecSchedStrategy(const MachineSchedContext *C);
|
|
|
|
void initPolicy(MachineBasicBlock::iterator Begin,
|
|
MachineBasicBlock::iterator End,
|
|
unsigned NumRegionInstrs) override;
|
|
void initialize(ScheduleDAGMI *DAG) override;
|
|
SUnit *pickNode(bool &IsTopNode) override;
|
|
void schedNode(SUnit *SU, bool IsTopNode) override;
|
|
};
|
|
|
|
/// Factory for the coexecution machine scheduler built from context \p C.
/// Only declared here. NOTE(review): presumably installs
/// AMDGPUCoExecSchedStrategy - confirm against the definition.
ScheduleDAGInstrs *createGCNCoExecMachineScheduler(MachineSchedContext *C);
|
|
/// Factory for a post-RA machine scheduler built from context \p C. Only
/// declared here. NOTE(review): the name suggests it leaves instruction order
/// unchanged - confirm against the definition.
ScheduleDAGInstrs *createGCNNoopPostMachineScheduler(MachineSchedContext *C);
|
|
|
|
} // End namespace llvm
|
|
|
|
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
|