This is a relatively simple strategy, as it omits any heuristics for liveness and register pressure reduction. This works well because the SystemZ ISel scheduler uses Sched::RegPressure, which gives a good input order to begin with. It tries harder than GenericScheduler at biasing phys-regs, as it also considers other instructions, such as immediate loads directly into phys-regs produced by the register coalescer. This can hopefully be refactored into MachineScheduler.cpp. It has a latency heuristic that is slightly different from the one in GenericScheduler: it is activated for a specific type of region that has many "data sequences", consisting of SUs connected only by a single data edge that are next to each other in the input order. This applies to only 3% of all scheduling regions, but when activated it is applied to all candidates (not just once per cycle). At the same time it is a bit more careful, checking not only the SU Height against the scheduled latency but also its Depth against the remaining latency. It reuses the GenericScheduler handling of weak edges to help copy coalescing. It also helps with compare-with-zero elimination, as it tries to place a CC-defining instruction that produces the compare source value above the compare, before any other instruction clobbering CC or the value. This work was started after observing heavy spilling in Cactus, which was actually *caused* by GenericScheduler - disabling it (no pre-RA scheduling) remedied it and gave a 7% performance improvement on that benchmark. Many different versions have been tried, which have evolved into this initial simplistic MachineSchedStrategy that does relatively little and yet achieves double-digit improvements on Cactus and Imagick compared to GenericSched (which is OTOH 3% better on Blender). There will hopefully be more improvements added later on, as there seems to be potential for it.
It would be very interesting to have other OOO targets try this as well, and perhaps to make this available in MachineScheduler.cpp. (A first attempt at improving the pre-RA scheduling was made with #90181, which however did not materialize into anything actually useful.)
190 lines
6.3 KiB
C++
190 lines
6.3 KiB
C++
//==- SystemZMachineScheduler.h - SystemZ Scheduler Interface ----*- C++ -*-==//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// -------------------------- Pre RA scheduling ----------------------------- //
|
|
//
|
|
// SystemZPreRASchedStrategy performs latency scheduling in certain types of
|
|
// regions where this is beneficial, and also helps copy coalescing and
|
|
// comparison elimination.
|
|
//
|
|
// -------------------------- Post RA scheduling ---------------------------- //
|
|
//
|
|
// SystemZPostRASchedStrategy is a scheduling strategy which is plugged into
|
|
// the MachineScheduler. It has a sorted Available set of SUs and a pickNode()
|
|
// implementation that looks to optimize decoder grouping and balance the
|
|
// usage of processor resources. Scheduler states are saved for the end
|
|
// region of each MBB, so that a successor block can learn from it.
|
|
//
|
|
//----------------------------------------------------------------------------//
|
|
|
|
#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H
|
|
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H
|
|
|
|
#include "SystemZHazardRecognizer.h"
|
|
#include "llvm/CodeGen/MachineScheduler.h"
|
|
#include "llvm/CodeGen/ScheduleDAG.h"
|
|
#include <set>
|
|
|
|
namespace llvm {
|
|
|
|
/// A MachineSchedStrategy implementation for SystemZ pre RA scheduling.
|
|
class SystemZPreRASchedStrategy : public GenericScheduler {
|
|
void initializeLatencyReduction();
|
|
|
|
Register Cmp0SrcReg;
|
|
// Return true if MI defines the Cmp0SrcReg that is used by a scheduled
|
|
// compare with 0. If CCDef is true MI must also have an implicit def of CC.
|
|
bool definesCmp0Src(const MachineInstr *MI, bool CCDef = true) const;
|
|
|
|
// True if the region has many instructions in def-use sequences and would
|
|
// likely benefit from latency reduction.
|
|
bool HasDataSequences;
|
|
|
|
protected:
|
|
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
|
|
SchedBoundary *Zone) const override;
|
|
|
|
public:
|
|
SystemZPreRASchedStrategy(const MachineSchedContext *C)
|
|
: GenericScheduler(C) {}
|
|
|
|
void initPolicy(MachineBasicBlock::iterator Begin,
|
|
MachineBasicBlock::iterator End,
|
|
unsigned NumRegionInstrs) override;
|
|
void initialize(ScheduleDAGMI *dag) override;
|
|
void schedNode(SUnit *SU, bool IsTopNode) override;
|
|
};
|
|
|
|
/// A MachineSchedStrategy implementation for SystemZ post RA scheduling.
|
|
class SystemZPostRASchedStrategy : public MachineSchedStrategy {
|
|
|
|
const MachineLoopInfo *MLI;
|
|
const SystemZInstrInfo *TII;
|
|
|
|
// A SchedModel is needed before any DAG is built while advancing past
|
|
// non-scheduled instructions, so it would not always be possible to call
|
|
// DAG->getSchedClass(SU).
|
|
TargetSchedModel SchedModel;
|
|
|
|
/// A candidate during instruction evaluation.
|
|
struct Candidate {
|
|
SUnit *SU = nullptr;
|
|
|
|
/// The decoding cost.
|
|
int GroupingCost = 0;
|
|
|
|
/// The processor resources cost.
|
|
int ResourcesCost = 0;
|
|
|
|
Candidate() = default;
|
|
Candidate(SUnit *SU_, SystemZHazardRecognizer &HazardRec);
|
|
|
|
// Compare two candidates.
|
|
bool operator<(const Candidate &other);
|
|
|
|
// Check if this node is free of cost ("as good as any").
|
|
bool noCost() const {
|
|
return (GroupingCost <= 0 && !ResourcesCost);
|
|
}
|
|
|
|
#ifndef NDEBUG
|
|
void dumpCosts() {
|
|
if (GroupingCost != 0)
|
|
dbgs() << " Grouping cost:" << GroupingCost;
|
|
if (ResourcesCost != 0)
|
|
dbgs() << " Resource cost:" << ResourcesCost;
|
|
}
|
|
#endif
|
|
};
|
|
|
|
// A sorter for the Available set that makes sure that SUs are considered
|
|
// in the best order.
|
|
struct SUSorter {
|
|
bool operator() (SUnit *lhs, SUnit *rhs) const {
|
|
if (lhs->isScheduleHigh && !rhs->isScheduleHigh)
|
|
return true;
|
|
if (!lhs->isScheduleHigh && rhs->isScheduleHigh)
|
|
return false;
|
|
|
|
if (lhs->getHeight() > rhs->getHeight())
|
|
return true;
|
|
else if (lhs->getHeight() < rhs->getHeight())
|
|
return false;
|
|
|
|
return (lhs->NodeNum < rhs->NodeNum);
|
|
}
|
|
};
|
|
// A set of SUs with a sorter and dump method.
|
|
struct SUSet : std::set<SUnit*, SUSorter> {
|
|
#ifndef NDEBUG
|
|
void dump(SystemZHazardRecognizer &HazardRec) const;
|
|
#endif
|
|
};
|
|
|
|
/// The set of available SUs to schedule next.
|
|
SUSet Available;
|
|
|
|
/// Current MBB
|
|
MachineBasicBlock *MBB;
|
|
|
|
/// Maintain hazard recognizers for all blocks, so that the scheduler state
|
|
/// can be maintained past BB boundaries when appropariate.
|
|
typedef std::map<MachineBasicBlock*, SystemZHazardRecognizer*> MBB2HazRec;
|
|
MBB2HazRec SchedStates;
|
|
|
|
/// Pointer to the HazardRecognizer that tracks the scheduler state for
|
|
/// the current region.
|
|
SystemZHazardRecognizer *HazardRec;
|
|
|
|
/// Update the scheduler state by emitting (non-scheduled) instructions
|
|
/// up to, but not including, NextBegin.
|
|
void advanceTo(MachineBasicBlock::iterator NextBegin);
|
|
|
|
public:
|
|
SystemZPostRASchedStrategy(const MachineSchedContext *C);
|
|
~SystemZPostRASchedStrategy() override;
|
|
|
|
/// Called for a region before scheduling.
|
|
void initPolicy(MachineBasicBlock::iterator Begin,
|
|
MachineBasicBlock::iterator End,
|
|
unsigned NumRegionInstrs) override;
|
|
|
|
/// PostRA scheduling does not track pressure.
|
|
bool shouldTrackPressure() const override { return false; }
|
|
|
|
// Process scheduling regions top-down so that scheduler states can be
|
|
// transferrred over scheduling boundaries.
|
|
bool doMBBSchedRegionsTopDown() const override { return true; }
|
|
|
|
void initialize(ScheduleDAGMI *dag) override;
|
|
|
|
/// Tell the strategy that MBB is about to be processed.
|
|
void enterMBB(MachineBasicBlock *NextMBB) override;
|
|
|
|
/// Tell the strategy that current MBB is done.
|
|
void leaveMBB() override;
|
|
|
|
/// Pick the next node to schedule, or return NULL.
|
|
SUnit *pickNode(bool &IsTopNode) override;
|
|
|
|
/// ScheduleDAGMI has scheduled an instruction - tell HazardRec
|
|
/// about it.
|
|
void schedNode(SUnit *SU, bool IsTopNode) override;
|
|
|
|
/// SU has had all predecessor dependencies resolved. Put it into
|
|
/// Available.
|
|
void releaseTopNode(SUnit *SU) override;
|
|
|
|
/// Currently only scheduling top-down, so this method is empty.
|
|
void releaseBottomNode(SUnit *SU) override {};
|
|
};
|
|
|
|
} // end namespace llvm
|
|
|
|
#endif // LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H
|