On GFX1250, V_NOPs inserted for WMMA coexecution hazards are placed at
the use-site. When the hazard-consuming instruction is inside a loop and
the WMMA is outside, these NOPs execute every iteration even though the
hazard only needs to be covered once.
This patch hoists the V_NOPs to the loop preheader, reducing executions
from N iterations to 1.
```
Example (assuming a hazard requiring K V_NOPs):
Before:
bb.0 (preheader): WMMA writes vgpr0
bb.1 (loop): V_NOP xK, VALU reads vgpr0, branch bb.1
-> K NOPs executed per iteration
After:
bb.0 (preheader): WMMA writes vgpr0, V_NOP xK
bb.1 (loop): VALU reads vgpr0, branch bb.1
-> K NOPs executed once
```
For nested loops, V_NOPs are hoisted to the preheader of the outermost
enclosing loop within which no WMMA hazard exists.
Hoisting is restricted to strict preheaders (not any single predecessor)
to avoid introducing V_NOPs on unrelated control flow paths.
The optimization is controlled by `-amdgpu-wmma-vnop-hoisting` (default:
on).
Fixes: SWDEV-573407
121 lines
4.4 KiB
C++
121 lines
4.4 KiB
C++
//===----- PostRAHazardRecognizer.cpp - hazard recognizer -----------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
/// \file
|
|
/// This runs the hazard recognizer and emits noops when necessary. This
|
|
/// gives targets a way to run the hazard recognizer without running one of
|
|
/// the schedulers. Example use cases for this pass would be:
|
|
///
|
|
/// - Targets that need the hazard recognizer to be run at -O0.
|
|
/// - Targets that want to guarantee that hazards at the beginning of
|
|
/// scheduling regions are handled correctly. The post-RA scheduler is
|
|
/// a top-down scheduler, but when there are multiple scheduling regions
|
|
/// in a basic block, it visits the regions in bottom-up order. This
|
|
/// makes it impossible for the scheduler to guarantee it can correctly
|
|
/// handle hazards at the beginning of scheduling regions.
|
|
///
|
|
/// This pass traverses all the instructions in a program in top-down order.
|
|
/// In contrast to the instruction scheduling passes, this pass never resets
|
|
/// the hazard recognizer to ensure it can correctly handle noop hazards at
|
|
/// the beginning of blocks.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "llvm/CodeGen/PostRAHazardRecognizer.h"
|
|
#include "llvm/ADT/Statistic.h"
|
|
#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
#include "llvm/CodeGen/MachineLoopInfo.h"
|
|
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
|
|
#include "llvm/CodeGen/TargetInstrInfo.h"
|
|
#include "llvm/CodeGen/TargetSubtargetInfo.h"
|
|
#include "llvm/InitializePasses.h"
|
|
#include "llvm/Pass.h"
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "post-RA-hazard-rec"
|
|
|
|
STATISTIC(NumNoops, "Number of noops inserted");
|
|
|
|
namespace {
|
|
struct PostRAHazardRecognizer {
|
|
bool run(MachineFunction &MF, MachineLoopInfo *MLI);
|
|
};
|
|
|
|
class PostRAHazardRecognizerLegacy : public MachineFunctionPass {
|
|
|
|
public:
|
|
static char ID;
|
|
PostRAHazardRecognizerLegacy() : MachineFunctionPass(ID) {}
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.setPreservesCFG();
|
|
AU.addRequired<MachineLoopInfoWrapperPass>();
|
|
MachineFunctionPass::getAnalysisUsage(AU);
|
|
}
|
|
|
|
bool runOnMachineFunction(MachineFunction &Fn) override {
|
|
MachineLoopInfo &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
|
|
return PostRAHazardRecognizer().run(Fn, &MLI);
|
|
}
|
|
};
|
|
char PostRAHazardRecognizerLegacy::ID = 0;
|
|
|
|
} // namespace
|
|
|
|
char &llvm::PostRAHazardRecognizerID = PostRAHazardRecognizerLegacy::ID;
|
|
|
|
INITIALIZE_PASS_BEGIN(PostRAHazardRecognizerLegacy, DEBUG_TYPE,
|
|
"Post RA hazard recognizer", false, false)
|
|
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
|
|
INITIALIZE_PASS_END(PostRAHazardRecognizerLegacy, DEBUG_TYPE,
|
|
"Post RA hazard recognizer", false, false)
|
|
|
|
/// New pass manager entry point. Looks up the machine loop analysis result for
/// \p MF, runs the shared implementation, and reports which analyses survive:
/// everything when no noop was inserted, otherwise the default machine
/// function pass set plus all CFG analyses.
PreservedAnalyses
llvm::PostRAHazardRecognizerPass::run(MachineFunction &MF,
                                      MachineFunctionAnalysisManager &MFAM) {
  MachineLoopInfo &Loops = MFAM.getResult<MachineLoopAnalysis>(MF);
  const bool Modified = PostRAHazardRecognizer().run(MF, &Loops);

  if (Modified) {
    // Instructions were added, but no block or edge was touched.
    auto Preserved = getMachineFunctionPassPreservedAnalyses();
    Preserved.preserveSet<CFGAnalyses>();
    return Preserved;
  }

  return PreservedAnalyses::all();
}
|
|
|
|
/// Walk every instruction of \p Fn in layout order, asking the target's
/// post-RA hazard recognizer how many noops must precede each one and
/// inserting them. \p MLI is forwarded to the target when the recognizer is
/// created.
///
/// \returns true if at least one noop was inserted; false if nothing changed
/// or the target provides no hazard recognizer.
bool PostRAHazardRecognizer::run(MachineFunction &Fn, MachineLoopInfo *MLI) {
  const TargetInstrInfo *InstrInfo = Fn.getSubtarget().getInstrInfo();
  std::unique_ptr<ScheduleHazardRecognizer> Recognizer(
      InstrInfo->CreateTargetPostRAHazardRecognizer(Fn, MLI));

  // Nothing to do when the target has no hazard recognizer.
  if (!Recognizer)
    return false;

  bool InsertedAny = false;
  for (MachineBasicBlock &Block : Fn) {
    // The recognizer is intentionally not reset between blocks, so hazards
    // that require noops at the very start of a block are still seen.
    for (MachineInstr &Instr : Block) {
      // Ask how many noops must precede this instruction, tell the recognizer
      // about them, and then materialize them in front of the instruction.
      const unsigned NoopCount = Recognizer->PreEmitNoops(&Instr);
      Recognizer->EmitNoops(NoopCount);
      InstrInfo->insertNoops(Block, MachineBasicBlock::iterator(Instr),
                             NoopCount);

      NumNoops += NoopCount;
      InsertedAny |= (NoopCount != 0);

      // Account for the instruction itself and move to the next cycle once
      // the recognizer reports that the issue limit was reached.
      Recognizer->EmitInstruction(&Instr);
      if (Recognizer->atIssueLimit())
        Recognizer->AdvanceCycle();
    }
  }

  return InsertedAny;
}
|