//===- AMDGPUCoExecSchedStrategy.cpp - CoExec Scheduling Strategy ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Coexecution-focused scheduling strategy for AMDGPU.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUCoExecSchedStrategy.h"
#include "llvm/Support/Debug.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "machine-scheduler"

namespace {

// Used to disable post-RA scheduling with function-level granularity.
class GCNNoopPostScheduleDAG final : public ScheduleDAGInstrs {
public:
  explicit GCNNoopPostScheduleDAG(MachineSchedContext *C)
      : ScheduleDAGInstrs(*C->MF, C->MLI, /*RemoveKillFlags=*/true) {}

  // Do nothing.
  void schedule() override {}
};

} // namespace

static SUnit *pickOnlyChoice(SchedBoundary &Zone) {
  // pickOnlyChoice() releases pending instructions and checks for new hazards.
  SUnit *OnlyChoice = Zone.pickOnlyChoice();
  if (!Zone.Pending.empty())
    return nullptr;

  return OnlyChoice;
}

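/// Classify \p MI by the hardware unit flavor it occupies. Checks are ordered
/// from most to least specific and the first match wins: explicit fence/wait
/// opcodes take precedence over LDS DMA, which takes precedence over matrix
/// (WMMA), transcendental, and plain VALU/SALU classifications.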
InstructionFlavor llvm::AMDGPU::classifyFlavor(const MachineInstr &MI,
                                               const SIInstrInfo &SII) {
  if (MI.isDebugInstr())
    return InstructionFlavor::Other;

  unsigned Opc = MI.getOpcode();

  // Check for specific opcodes first.
  if (Opc == AMDGPU::ATOMIC_FENCE || Opc == AMDGPU::S_WAIT_ASYNCCNT ||
      Opc == AMDGPU::S_WAIT_TENSORCNT || Opc == AMDGPU::S_BARRIER_WAIT ||
      Opc == AMDGPU::S_BARRIER_SIGNAL_IMM)
    return InstructionFlavor::Fence;

  if (SII.isLDSDMA(MI))
    return InstructionFlavor::DMA;

  if (SII.isMFMAorWMMA(MI))
    return InstructionFlavor::WMMA;

  if (SII.isTRANS(MI))
    return InstructionFlavor::TRANS;

  if (SII.isVALU(MI))
    return InstructionFlavor::SingleCycleVALU;

  if (SII.isDS(MI))
    return InstructionFlavor::DS;

  if (SII.isFLAT(MI) || SII.isFLATGlobal(MI) || SII.isFLATScratch(MI))
    return InstructionFlavor::VMEM;

  if (SII.isSALU(MI))
    return InstructionFlavor::SALU;

  return InstructionFlavor::Other;
}

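/// Return the next SU the scheduler should try to enable for this unit: the
/// first priority SU that is not yet top-ready, or, when \p LookDeep is set,
/// the shallowest unscheduled SU still blocked by its predecessors.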
SUnit *HardwareUnitInfo::getNextTargetSU(bool LookDeep) const {
  for (auto *PrioritySU : PrioritySUs) {
    if (!PrioritySU->isTopReady())
      return PrioritySU;
  }

  if (!LookDeep)
    return nullptr;

  unsigned MinDepth = std::numeric_limits<unsigned>::max();
  SUnit *TargetSU = nullptr;
  for (auto *SU : AllSUs) {
    if (SU->isScheduled)
      continue;

    if (SU->isTopReady())
      continue;

    if (SU->getDepth() < MinDepth) {
      MinDepth = SU->getDepth();
      TargetSU = SU;
    }
  }
  return TargetSU;
}

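/// Record \p SU against this unit. PrioritySUs maintains the invariant that
/// it holds exactly the unscheduled SUs at the minimum depth seen so far:
/// shallower SUs evict the set, equal-depth SUs join it.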
void HardwareUnitInfo::insert(SUnit *SU, unsigned BlockingCycles) {
#ifndef NDEBUG
  bool Inserted = AllSUs.insert(SU);
  assert(Inserted);
#else
  AllSUs.insert(SU);
#endif

  TotalCycles += BlockingCycles;

  if (PrioritySUs.empty()) {
    PrioritySUs.insert(SU);
    return;
  }
  unsigned SUDepth = SU->getDepth();
  unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
  if (SUDepth > CurrDepth)
    return;

  if (SUDepth == CurrDepth) {
    PrioritySUs.insert(SU);
    return;
  }

  // SU is lower depth and should be prioritized.
  PrioritySUs.clear();
  PrioritySUs.insert(SU);
}

void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
  // Some HWUIs (e.g. InstructionFlavor::Other) are intentionally ignored by
  // clearing them, yet instructions may still map to them. Don't bother
  // managing state for a cleared HWUI.
  if (TotalCycles == 0)
    return;

  AllSUs.remove(SU);
  PrioritySUs.remove(SU);

  TotalCycles -= BlockingCycles;

  if (AllSUs.empty())
    return;
  if (PrioritySUs.empty()) {
    // Rebuild the priority set from the remaining SUs, keeping only those at
    // the minimum depth.
    for (auto *RemainingSU : AllSUs) {
      if (PrioritySUs.empty()) {
        PrioritySUs.insert(RemainingSU);
        continue;
      }
      unsigned SUDepth = RemainingSU->getDepth();
      unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
      if (SUDepth > CurrDepth)
        continue;

      if (SUDepth == CurrDepth) {
        PrioritySUs.insert(RemainingSU);
        continue;
      }

      // RemainingSU is lower depth and should be prioritized.
      PrioritySUs.clear();
      PrioritySUs.insert(RemainingSU);
    }
  }
}

HardwareUnitInfo *
CandidateHeuristics::getHWUIFromFlavor(InstructionFlavor Flavor) {
  // HWUInfo may have been reordered by sortHWUIResources(), so search by type
  // rather than indexing by flavor.
  for (auto &HWUICand : HWUInfo) {
    if (HWUICand.getType() == Flavor)
      return &HWUICand;
  }
  return nullptr;
}

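/// Return the number of cycles \p SU blocks its hardware unit, taken as the
/// maximum ReleaseAtCycle over all processor resources written by the
/// instruction's scheduling class.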
unsigned CandidateHeuristics::getHWUICyclesForInst(SUnit *SU) {
  assert(SchedModel && SchedModel->hasInstrSchedModel());
  unsigned ReleaseAtCycle = 0;
  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
  for (TargetSchedModel::ProcResIter PI = SchedModel->getWriteProcResBegin(SC),
                                     PE = SchedModel->getWriteProcResEnd(SC);
       PI != PE; ++PI) {
    ReleaseAtCycle = std::max(ReleaseAtCycle, (unsigned)PI->ReleaseAtCycle);
  }
  return ReleaseAtCycle;
}

void CandidateHeuristics::updateForScheduling(SUnit *SU) {
  HardwareUnitInfo *HWUI =
      getHWUIFromFlavor(classifyFlavor(*SU->getInstr(), *SII));
  assert(HWUI);
  HWUI->markScheduled(SU, getHWUICyclesForInst(SU));
}

void CandidateHeuristics::initialize(ScheduleDAGMI *SchedDAG,
                                     const TargetSchedModel *TargetSchedModel,
                                     const TargetRegisterInfo *TRI) {
  DAG = SchedDAG;
  SchedModel = TargetSchedModel;
  assert(SchedModel && SchedModel->hasInstrSchedModel());

  SRI = static_cast<const SIRegisterInfo *>(TRI);
  SII = static_cast<const SIInstrInfo *>(DAG->TII);

  HWUInfo.resize((int)InstructionFlavor::NUM_FLAVORS);

  for (unsigned I = 0; I < HWUInfo.size(); I++) {
    HWUInfo[I].reset();
    HWUInfo[I].setType(I);
  }

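  // Long-running flavors (matrix, multi-cycle VALU, transcendental) occupy
  // their unit for many cycles, opening a window in which instructions for
  // other units can co-execute.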
  HWUInfo[(int)InstructionFlavor::WMMA].setProducesCoexecWindow(true);
  HWUInfo[(int)InstructionFlavor::MultiCycleVALU].setProducesCoexecWindow(true);
  HWUInfo[(int)InstructionFlavor::TRANS].setProducesCoexecWindow(true);

  collectHWUIPressure();
}

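// Seed per-unit pressure: classify every SU in the region and charge its
// blocking cycles to the matching HWUI.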
void CandidateHeuristics::collectHWUIPressure() {
  if (!SchedModel || !SchedModel->hasInstrSchedModel())
    return;

  for (auto &SU : DAG->SUnits) {
    const InstructionFlavor Flavor = classifyFlavor(*SU.getInstr(), *SII);
    HWUInfo[(int)Flavor].insert(&SU, getHWUICyclesForInst(&SU));
  }

  LLVM_DEBUG(dumpRegionSummary());
}

void CandidateHeuristics::dumpRegionSummary() {
  MachineBasicBlock *BB = DAG->begin()->getParent();
  dbgs() << "\n=== Region: " << DAG->MF.getName() << " BB" << BB->getNumber()
         << " (" << DAG->SUnits.size() << " SUs) ===\n";

  dbgs() << "\nHWUI Resource Pressure:\n";
  for (auto &HWUI : HWUInfo) {
    if (HWUI.getTotalCycles() == 0)
      continue;

    StringRef Name = getFlavorName(HWUI.getType());
    dbgs() << "  " << Name << ": " << HWUI.getTotalCycles() << " cycles, "
           << HWUI.size() << " instrs\n";
  }
  dbgs() << "\n";
}

void CandidateHeuristics::sortHWUIResources() {
  // Highest priority should be first.
  llvm::sort(HWUInfo, [](HardwareUnitInfo &A, HardwareUnitInfo &B) {
    // Prefer coexec-window producers.
    if (A.producesCoexecWindow() != B.producesCoexecWindow())
      return A.producesCoexecWindow();

    // Prefer more demanded resources.
    if (A.getTotalCycles() != B.getTotalCycles())
      return A.getTotalCycles() > B.getTotalCycles();

    // In ties, prefer the resource with fewer (i.e. longer-running)
    // instructions.
    if (A.size() != B.size())
      return A.size() < B.size();

    // Default to Flavor order.
    return (unsigned)A.getType() < (unsigned)B.getType();
  });
}

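/// Break a tie by "enablement": walk the resources in priority order and
/// prefer the candidate that is a transitive predecessor of a critical
/// resource's next target SU, i.e. scheduling it makes progress toward
/// unblocking that resource.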
bool CandidateHeuristics::tryCriticalResourceDependency(
    GenericSchedulerBase::SchedCandidate &TryCand,
    GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {

  auto HasPrioritySU = [this, &Cand, &TryCand](unsigned ResourceIdx) {
    const HardwareUnitInfo &HWUI = HWUInfo[ResourceIdx];

    auto CandFlavor = classifyFlavor(*Cand.SU->getInstr(), *SII);
    auto TryCandFlavor = classifyFlavor(*TryCand.SU->getInstr(), *SII);
    bool LookDeep = (CandFlavor == InstructionFlavor::DS ||
                     TryCandFlavor == InstructionFlavor::DS) &&
                    HWUI.getType() == InstructionFlavor::WMMA;

    // If we do not have a TargetSU for this resource, then it is not critical.
    return HWUI.getNextTargetSU(LookDeep) != nullptr;
  };

  auto TryEnablesResource = [&Cand, &TryCand, this](unsigned ResourceIdx) {
    const HardwareUnitInfo &HWUI = HWUInfo[ResourceIdx];
    auto CandFlavor = classifyFlavor(*Cand.SU->getInstr(), *SII);

    // We want to ensure our DS order matches WMMA order.
    bool LookDeep = CandFlavor == InstructionFlavor::DS &&
                    HWUI.getType() == InstructionFlavor::WMMA;
    auto *TargetSU = HWUI.getNextTargetSU(LookDeep);

    // LookDeep is narrower here than in HasPrioritySU, so the deep search may
    // have found a target there but not here; guard against the null case.
    if (!TargetSU)
      return false;

    bool CandEnables =
        TargetSU != Cand.SU && DAG->IsReachable(TargetSU, Cand.SU);
    bool TryCandEnables =
        TargetSU != TryCand.SU && DAG->IsReachable(TargetSU, TryCand.SU);

    if (!CandEnables && !TryCandEnables)
      return false;

    if (CandEnables && !TryCandEnables) {
      if (Cand.Reason > GenericSchedulerBase::RegCritical)
        Cand.Reason = GenericSchedulerBase::RegCritical;

      return true;
    }

    if (!CandEnables && TryCandEnables) {
      TryCand.Reason = GenericSchedulerBase::RegCritical;
      return true;
    }

    // Both enable, prefer the critical path.
    unsigned CandHeight = Cand.SU->getHeight();
    unsigned TryCandHeight = TryCand.SU->getHeight();

    if (CandHeight > TryCandHeight) {
      if (Cand.Reason > GenericSchedulerBase::RegCritical)
        Cand.Reason = GenericSchedulerBase::RegCritical;

      return true;
    }

    if (CandHeight < TryCandHeight) {
      TryCand.Reason = GenericSchedulerBase::RegCritical;
      return true;
    }

    // Same critical path, just prefer the original candidate.
    if (Cand.Reason > GenericSchedulerBase::RegCritical)
      Cand.Reason = GenericSchedulerBase::RegCritical;

    return true;
  };

  for (unsigned I = 0; I < HWUInfo.size(); I++) {
    // A resource without a pending target SU is not critical; move on to the
    // next one.
    if (!HasPrioritySU(I))
      continue;

    // If either candidate enables this resource, we are done; otherwise keep
    // looking at lower-priority resources.
    if (TryEnablesResource(I))
      return true;
  }
  return false;
}

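/// Prefer the candidate that occupies the most critical hardware unit. When
/// both candidates use the same critical unit, fall back to dependency
/// enablement (for DS) and then to the unit's own SU priorities.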
bool CandidateHeuristics::tryCriticalResource(
    GenericSchedulerBase::SchedCandidate &TryCand,
    GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {
  for (unsigned I = 0; I < HWUInfo.size(); I++) {
    const HardwareUnitInfo &HWUI = HWUInfo[I];

    bool CandUsesCrit = HWUI.contains(Cand.SU);
    bool TryCandUsesCrit = HWUI.contains(TryCand.SU);

    if (!CandUsesCrit && !TryCandUsesCrit)
      continue;

    if (CandUsesCrit != TryCandUsesCrit) {
      if (CandUsesCrit) {
        if (Cand.Reason > GenericSchedulerBase::RegCritical)
          Cand.Reason = GenericSchedulerBase::RegCritical;
        return true;
      }
      TryCand.Reason = GenericSchedulerBase::RegCritical;
      return true;
    }

    // Otherwise, both use the critical resource. For longer-latency
    // InstructionFlavors, prioritize first by their enablement of critical
    // resources.
    if (HWUI.getType() == InstructionFlavor::DS) {
      if (tryCriticalResourceDependency(TryCand, Cand, Zone))
        return true;
    }

    // Prioritize based on HWUI priorities.
    SUnit *Match = HWUI.getHigherPriority(Cand.SU, TryCand.SU);
    if (Match) {
      if (Match == Cand.SU) {
        if (Cand.Reason > GenericSchedulerBase::RegCritical)
          Cand.Reason = GenericSchedulerBase::RegCritical;
        return true;
      }
      TryCand.Reason = GenericSchedulerBase::RegCritical;
      return true;
    }
  }

  return false;
}

AMDGPUCoExecSchedStrategy::AMDGPUCoExecSchedStrategy(
    const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
  SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
  // Use more accurate GCN pressure trackers.
  UseGCNTrackers = true;
}

void AMDGPUCoExecSchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
                                           MachineBasicBlock::iterator End,
                                           unsigned NumRegionInstrs) {
  GCNSchedStrategy::initPolicy(Begin, End, NumRegionInstrs);
  assert((PreRADirection == MISched::Unspecified ||
          PreRADirection == MISched::TopDown) &&
         "coexec scheduler only supports top-down scheduling");
  RegionPolicy.OnlyTopDown = true;
  RegionPolicy.OnlyBottomUp = false;
  RegionPolicy.ShouldTrackLaneMasks = true;
}

void AMDGPUCoExecSchedStrategy::initialize(ScheduleDAGMI *DAG) {
  // Coexecution scheduling strategy is only done top-down to support new
  // resource balancing heuristics.
  RegionPolicy.OnlyTopDown = true;
  RegionPolicy.OnlyBottomUp = false;

  GCNSchedStrategy::initialize(DAG);
  Heurs.initialize(DAG, SchedModel, TRI);
}

void AMDGPUCoExecSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  Heurs.updateForScheduling(SU);
  GCNSchedStrategy::schedNode(SU, IsTopNode);
}

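/// Top-down only node selection. Unlike the generic picker, this also
/// considers the Pending queue; when the winner comes from Pending, the
/// current cycle is advanced until the pick becomes legal.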
SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) {
  assert(RegionPolicy.OnlyTopDown && !RegionPolicy.OnlyBottomUp &&
         "coexec scheduler only supports top-down scheduling");

  if (DAG->top() == DAG->bottom()) {
    assert(Top.Available.empty() && Top.Pending.empty() &&
           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
    return nullptr;
  }

  bool PickedPending = false;
  SUnit *SU = nullptr;
#ifndef NDEBUG
  SchedCandidate *PickedCand = nullptr;
#endif
  do {
    PickedPending = false;
    SU = pickOnlyChoice(Top);
    if (!SU) {
      CandPolicy NoPolicy;
      TopCand.reset(NoPolicy);
      pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
                        PickedPending, /*IsBottomUp=*/false);
      assert(TopCand.Reason != NoCand && "failed to find a candidate");
      SU = TopCand.SU;
#ifndef NDEBUG
      PickedCand = &TopCand;
#endif
    }
    IsTopNode = true;
  } while (SU->isScheduled);

  LLVM_DEBUG(if (PickedCand) dumpPickSummary(SU, IsTopNode, *PickedCand));

  if (PickedPending) {
    unsigned ReadyCycle = SU->TopReadyCycle;
    unsigned CurrentCycle = Top.getCurrCycle();
    if (ReadyCycle > CurrentCycle)
      Top.bumpCycle(ReadyCycle);

    // checkHazard() does not expose the exact cycle where the hazard clears.
    while (Top.checkHazard(SU))
      Top.bumpCycle(Top.getCurrCycle() + 1);

    Top.releasePending();
  }

  if (SU->isTopReady())
    Top.removeReady(SU);
  if (SU->isBottomReady())
    Bot.removeReady(SU);

  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                    << *SU->getInstr());

  assert(IsTopNode && "coexec scheduler must only schedule from top boundary");
  return SU;
}

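/// Evaluate both the Available and Pending queues with tryCandidateCoexec(),
/// recording in \p PickedPending whether the best candidate was found in the
/// Pending queue.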
void AMDGPUCoExecSchedStrategy::pickNodeFromQueue(
    SchedBoundary &Zone, const CandPolicy &ZonePolicy,
    const RegPressureTracker &RPTracker, SchedCandidate &Cand,
    bool &PickedPending, bool IsBottomUp) {
  assert(Zone.isTop() && "coexec scheduler only supports top boundary");
  assert(!IsBottomUp && "coexec scheduler only supports top-down scheduling");

  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
  unsigned SGPRPressure = 0;
  unsigned VGPRPressure = 0;
  PickedPending = false;
  if (DAG->isTrackingPressure()) {
    if (!useGCNTrackers()) {
      SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
      VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
    } else {
      SGPRPressure = DownwardTracker.getPressure().getSGPRNum();
      VGPRPressure = DownwardTracker.getPressure().getArchVGPRNum();
    }
  }

  auto EvaluateQueue = [&](ReadyQueue &Q, bool FromPending) {
    for (SUnit *SU : Q) {
      SchedCandidate TryCand(ZonePolicy);
      initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
                    VGPRPressure, IsBottomUp);
      SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
      tryCandidateCoexec(Cand, TryCand, ZoneArg);
      if (TryCand.Reason != NoCand) {
        if (TryCand.ResDelta == SchedResourceDelta())
          TryCand.initResourceDelta(Zone.DAG, SchedModel);
        LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
        PickedPending = FromPending;
        Cand.setBest(TryCand);
      } else {
        LLVM_DEBUG(printCandidateDecision(TryCand, Cand));
      }
    }
  };

  LLVM_DEBUG(dbgs() << "Available Q:\n");
  EvaluateQueue(Zone.Available, /*FromPending=*/false);

  LLVM_DEBUG(dbgs() << "Pending Q:\n");
  EvaluateQueue(Zone.Pending, /*FromPending=*/true);
}

#ifndef NDEBUG
void AMDGPUCoExecSchedStrategy::dumpPickSummary(SUnit *SU, bool IsTopNode,
                                                SchedCandidate &Cand) {
  const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
  unsigned Cycle = IsTopNode ? Top.getCurrCycle() : Bot.getCurrCycle();

  dbgs() << "=== Pick @ Cycle " << Cycle << " ===\n";

  const InstructionFlavor Flavor = classifyFlavor(*SU->getInstr(), *SII);
  dbgs() << "Picked: SU(" << SU->NodeNum << ") ";
  SU->getInstr()->print(dbgs(), /*IsStandalone=*/true, /*SkipOpers=*/false,
                        /*SkipDebugLoc=*/true);
  dbgs() << " [" << getFlavorName(Flavor) << "]\n";

  dbgs() << "  Reason: ";
  if (LastAMDGPUReason != AMDGPUSchedReason::None)
    dbgs() << getReasonName(LastAMDGPUReason);
  else if (Cand.Reason != NoCand)
    dbgs() << GenericSchedulerBase::getReasonStr(Cand.Reason);
  else
    dbgs() << "Unknown";
  dbgs() << "\n\n";

  LastAMDGPUReason = AMDGPUSchedReason::None;
}
#endif

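/// Coexec candidate comparison. Heuristics apply in order: first valid,
/// physreg bias, excess pressure, effective stall, critical-resource balance,
/// critical-resource dependency, clustering, weak edges, max pressure,
/// latency, and finally original node order.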
bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
                                                   SchedCandidate &TryCand,
                                                   SchedBoundary *Zone) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = FirstValid;
    return true;
  }

  // Bias PhysReg Defs and copies to their uses and defined respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
    return TryCand.Reason != NoCand;

  // Avoid exceeding the target's limit.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
                  RegExcess, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // We only compare a subset of features when comparing nodes between the
  // Top and Bottom boundary. Some properties are simply incomparable; in many
  // other instances we should only override the other boundary if something
  // is a clear good pick on one boundary. Skip heuristics that are more
  // "tie-breaking" in nature.
  bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    // Compare candidates by the stall they would introduce if
    // scheduled in the current cycle.
    if (tryEffectiveStall(Cand, TryCand, *Zone))
      return TryCand.Reason != NoCand;

    Heurs.sortHWUIResources();
    if (Heurs.tryCriticalResource(TryCand, Cand, Zone)) {
      LastAMDGPUReason = AMDGPUSchedReason::CritResourceBalance;
      return TryCand.Reason != NoCand;
    }

    if (Heurs.tryCriticalResourceDependency(TryCand, Cand, Zone)) {
      LastAMDGPUReason = AMDGPUSchedReason::CritResourceDep;
      return TryCand.Reason != NoCand;
    }
  }

  // Keep clustered nodes together to encourage downstream peephole
  // optimizations which may reduce resource requirements.
  //
  // This is a best effort to set things up for a post-RA pass. Optimizations
  // like generating loads of multiple registers should ideally be done within
  // the scheduler pass by combining the loads during DAG postprocessing.
  unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
  unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
  bool CandIsClusterSucc =
      isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
  bool TryCandIsClusterSucc =
      isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);

  if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
                 Cluster))
    return TryCand.Reason != NoCand;

  if (SameBoundary) {
    // Weak edges are for clustering and other constraints.
    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
      return TryCand.Reason != NoCand;
  }

  // Avoid increasing the max pressure of the entire region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
                  Cand, RegMax, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  if (SameBoundary) {
    // Avoid serializing long latency dependence chains.
    // For acyclic path limited loops, latency was already checked above.
    if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
        !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
      return TryCand.Reason != NoCand;

    // Fall through to original instruction order.
    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
        (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
      TryCand.Reason = NodeOrder;
      return true;
    }
  }

  return false;
}

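/// Compare candidates by their effective stall: the worst of the ready-cycle
/// gap, structural hazard stall, and latency stall for the current cycle.
/// E.g. an SU that is ready now but whose operands arrive in 4 cycles costs
/// Effective = 4.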
bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
                                                  SchedCandidate &TryCand,
                                                  SchedBoundary &Zone) const {
  // Treat structural and latency stalls as a single scheduling cost for the
  // current cycle.
  struct StallCosts {
    unsigned Ready = 0;
    unsigned Structural = 0;
    unsigned Latency = 0;
    unsigned Effective = 0;
  };

  unsigned CurrCycle = Zone.getCurrCycle();
  auto GetStallCosts = [&](SUnit *SU) {
    unsigned ReadyCycle = Zone.isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
    StallCosts Costs;
    Costs.Ready = ReadyCycle > CurrCycle ? ReadyCycle - CurrCycle : 0;
    Costs.Structural = getStructuralStallCycles(Zone, SU);
    Costs.Latency = Zone.getLatencyStallCycles(SU);
    Costs.Effective = std::max({Costs.Ready, Costs.Structural, Costs.Latency});
    return Costs;
  };

  StallCosts TryCosts = GetStallCosts(TryCand.SU);
  StallCosts CandCosts = GetStallCosts(Cand.SU);

  LLVM_DEBUG(if (TryCosts.Effective || CandCosts.Effective) {
    dbgs() << "Effective stalls: try=" << TryCosts.Effective
           << " (ready=" << TryCosts.Ready << ", struct=" << TryCosts.Structural
           << ", lat=" << TryCosts.Latency << ") cand=" << CandCosts.Effective
           << " (ready=" << CandCosts.Ready
           << ", struct=" << CandCosts.Structural
           << ", lat=" << CandCosts.Latency << ")\n";
  });

  return tryLess(TryCosts.Effective, CandCosts.Effective, TryCand, Cand, Stall);
}

ScheduleDAGInstrs *
llvm::createGCNCoExecMachineScheduler(MachineSchedContext *C) {
  LLVM_DEBUG(dbgs() << "AMDGPU coexec preRA scheduler selected for "
                    << C->MF->getName() << '\n');
  return new GCNScheduleDAGMILive(
      C, std::make_unique<AMDGPUCoExecSchedStrategy>(C));
}

ScheduleDAGInstrs *
llvm::createGCNNoopPostMachineScheduler(MachineSchedContext *C) {
  LLVM_DEBUG(dbgs() << "AMDGPU noop postRA scheduler selected for "
                    << C->MF->getName() << '\n');
  return new GCNNoopPostScheduleDAG(C);
}
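
// A minimal sketch of how these factories could be exposed as named
// schedulers via MachineSchedRegistry (the actual registration, if any, lives
// elsewhere in the target; the registry names below are assumptions):
//
//   static MachineSchedRegistry
//       CoExecSchedRegistry("gcn-coexec", "Run the AMDGPU coexec scheduler",
//                           createGCNCoExecMachineScheduler);
//   static MachineSchedRegistry
//       NoopPostSchedRegistry("gcn-noop-post", "Disable post-RA scheduling",
//                             createGCNNoopPostMachineScheduler);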