692 lines
24 KiB
C++
692 lines
24 KiB
C++
//===- AMDGPULowerVGPREncoding.cpp - lower VGPRs above v255 ---------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
/// \file
|
|
/// Lower VGPRs above first 256 on gfx1250.
|
|
///
|
|
/// The pass scans used VGPRs and inserts S_SET_VGPR_MSB instructions to switch
|
|
/// VGPR addressing mode. The mode change is effective until the next change.
|
|
/// This instruction provides high bits of a VGPR address for four of the
|
|
/// operands: vdst, src0, src1, and src2, or other 4 operands depending on the
|
|
/// instruction encoding. If bits are set they are added as MSB to the
|
|
/// corresponding operand VGPR number.
|
|
///
|
|
/// There is no need to replace actual register operands because encoding of the
|
|
/// high and low VGPRs is the same. I.e. v0 has the encoding 0x100, so does
|
|
/// v256. v1 has the encoding 0x101 and v257 has the same encoding. So high
|
|
/// VGPRs will survive until actual encoding and will result in the same actual
|
|
/// bit encoding.
|
|
///
|
|
/// As a result the pass only inserts S_SET_VGPR_MSB to provide an actual offset
|
|
/// to a VGPR address of the subsequent instructions. The InstPrinter will take
|
|
/// care of printing a low VGPR instead of a high one. In principle this
|
|
/// shall be viable to print actual high VGPR numbers, but that would disagree
|
|
/// with a disasm printing and create a situation where asm text is not
|
|
/// deterministic.
|
|
///
|
|
/// This pass creates a convention where non-fall through basic blocks shall
|
|
/// start with all 4 MSBs zero. Otherwise a disassembly would not be readable.
|
|
/// An optimization here is possible but deemed not desirable because of the
|
|
/// readability concerns.
|
|
///
|
|
/// Consequently the ABI is set to expect all 4 MSBs to be zero on entry.
|
|
/// The pass must run very late in the pipeline to make sure no changes to VGPR
|
|
/// operands will be made after it.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "AMDGPULowerVGPREncoding.h"
|
|
#include "AMDGPU.h"
|
|
#include "GCNSubtarget.h"
|
|
#include "SIDefines.h"
|
|
#include "SIInstrInfo.h"
|
|
#include "llvm/ADT/bit.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include "llvm/Support/MathExtras.h"
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "amdgpu-lower-vgpr-encoding"
|
|
|
|
namespace {
|
|
|
|
class AMDGPULowerVGPREncoding {
|
|
static constexpr unsigned OpNum = 4;
|
|
static constexpr unsigned BitsPerField = 2;
|
|
static constexpr unsigned NumFields = 4;
|
|
static constexpr unsigned ModeWidth = NumFields * BitsPerField;
|
|
static constexpr unsigned ModeMask = (1 << ModeWidth) - 1;
|
|
static constexpr unsigned VGPRMSBShift =
|
|
llvm::countr_zero_constexpr<unsigned>(AMDGPU::Hwreg::DST_VGPR_MSB);
|
|
|
|
struct OpMode {
|
|
// No MSBs set means they are not required to be of a particular value.
|
|
std::optional<unsigned> MSBits;
|
|
|
|
bool update(const OpMode &New, bool &Rewritten) {
|
|
bool Updated = false;
|
|
if (New.MSBits) {
|
|
if (*New.MSBits != MSBits.value_or(0)) {
|
|
Updated = true;
|
|
Rewritten |= MSBits.has_value();
|
|
}
|
|
MSBits = New.MSBits;
|
|
}
|
|
return Updated;
|
|
}
|
|
};
|
|
|
|
struct ModeTy {
|
|
OpMode Ops[OpNum];
|
|
|
|
bool update(const ModeTy &New, bool &Rewritten) {
|
|
bool Updated = false;
|
|
for (unsigned I : seq(OpNum))
|
|
Updated |= Ops[I].update(New.Ops[I], Rewritten);
|
|
return Updated;
|
|
}
|
|
|
|
unsigned encode() const {
|
|
// Layout: [src0 msb, src1 msb, src2 msb, dst msb].
|
|
unsigned V = 0;
|
|
for (const auto &[I, Op] : enumerate(Ops))
|
|
V |= Op.MSBits.value_or(0) << (I * 2);
|
|
return V;
|
|
}
|
|
|
|
void print(raw_ostream &OS) const {
|
|
static const char *FieldNames[] = {"src0", "src1", "src2", "dst"};
|
|
OS << '{';
|
|
for (const auto &[I, Op] : enumerate(Ops)) {
|
|
if (I)
|
|
OS << ", ";
|
|
OS << FieldNames[I] << '=';
|
|
if (Op.MSBits)
|
|
OS << *Op.MSBits;
|
|
else
|
|
OS << '?';
|
|
}
|
|
OS << '}';
|
|
}
|
|
|
|
// Check if this mode is compatible with required \p NewMode without
|
|
// modification.
|
|
bool isCompatible(const ModeTy NewMode) const {
|
|
for (unsigned I : seq(OpNum)) {
|
|
if (!NewMode.Ops[I].MSBits.has_value())
|
|
continue;
|
|
if (Ops[I].MSBits.value_or(0) != NewMode.Ops[I].MSBits.value_or(0))
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
};
|
|
|
|
public:
|
|
bool run(MachineFunction &MF);
|
|
|
|
private:
|
|
const SIInstrInfo *TII;
|
|
const SIRegisterInfo *TRI;
|
|
|
|
// Current basic block.
|
|
MachineBasicBlock *MBB;
|
|
|
|
/// Most recent s_set_* instruction.
|
|
MachineInstr *MostRecentModeSet;
|
|
|
|
/// Current mode bits.
|
|
ModeTy CurrentMode;
|
|
|
|
/// Number of current hard clause instructions.
|
|
unsigned ClauseLen;
|
|
|
|
/// Number of hard clause instructions remaining.
|
|
unsigned ClauseRemaining;
|
|
|
|
/// Clause group breaks.
|
|
unsigned ClauseBreaks;
|
|
|
|
/// Last hard clause instruction.
|
|
MachineInstr *Clause;
|
|
|
|
// Remember whether XCNT is known to be zero because of an S_SET_VGPR_MSB
|
|
// instruction that we inserted, which implicitly waits for XCNT==0.
|
|
bool XCntIsZero;
|
|
|
|
/// Insert mode change before \p I. \returns true if mode was changed.
|
|
bool setMode(ModeTy NewMode, MachineBasicBlock::instr_iterator I);
|
|
|
|
/// Reset mode to default.
|
|
void resetMode(MachineBasicBlock::instr_iterator I) {
|
|
ModeTy Mode;
|
|
for (OpMode &Op : Mode.Ops)
|
|
Op.MSBits = 0;
|
|
setMode(Mode, I);
|
|
}
|
|
|
|
/// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
|
|
std::optional<unsigned> getMSBs(const MachineOperand &MO) const;
|
|
|
|
/// Handle single \p MI. \return true if changed.
|
|
bool runOnMachineInstr(MachineInstr &MI);
|
|
|
|
/// Compute the mode for a single \p MI given \p Ops operands
|
|
/// bit mapping. Optionally takes second array \p Ops2 for VOPD.
|
|
/// If provided and an operand from \p Ops is not a VGPR, then \p Ops2
|
|
/// is checked.
|
|
void computeMode(ModeTy &NewMode, const MachineInstr &MI,
|
|
const AMDGPU::OpName Ops[OpNum],
|
|
const AMDGPU::OpName *Ops2 = nullptr);
|
|
|
|
/// Check if an instruction \p I is within a clause and returns a suitable
|
|
/// iterator to insert mode change. It may also modify the S_CLAUSE
|
|
/// instruction to extend it or drop the clause if it cannot be adjusted.
|
|
MachineBasicBlock::instr_iterator
|
|
handleClause(MachineBasicBlock::instr_iterator I);
|
|
|
|
/// Check if an instruction \p I is immediately after another program state
|
|
/// instruction which it cannot coissue with. If so, insert before that
|
|
/// instruction to encourage more coissuing.
|
|
MachineBasicBlock::instr_iterator
|
|
handleCoissue(MachineBasicBlock::instr_iterator I);
|
|
|
|
/// S_SET_VGPR_MSB immediately after S_SETREG_IMM32_B32 targeting MODE is
|
|
/// silently dropped on GFX1250. When set, the next S_SET_VGPR_MSB insertion
|
|
/// must be preceded by S_NOP to avoid the hazard.
|
|
bool needNopBeforeSetVGPRMSB(MachineBasicBlock::instr_iterator I);
|
|
|
|
/// Handle S_SETREG_IMM32_B32 targeting MODE register. On certain hardware,
|
|
/// this instruction clobbers VGPR MSB bits[12:19], so we need to restore
|
|
/// the current mode. \returns true if the instruction was modified or a
|
|
/// new one was inserted.
|
|
bool handleSetregMode(MachineInstr &MI);
|
|
|
|
/// Update bits[12:19] of the imm operand in S_SETREG_IMM32_B32 to contain
|
|
/// the VGPR MSB mode value. \returns true if the immediate was changed.
|
|
bool updateSetregModeImm(MachineInstr &MI, int64_t ModeValue);
|
|
};
|
|
|
|
/// Make \p NewMode effective for the instruction at \p I.
///
/// If the merged mode does not differ from CurrentMode nothing is emitted.
/// Otherwise the most recent mode-setting instruction is patched in place when
/// none of the values it already provides had to be replaced; failing that, a
/// new S_SET_VGPR_MSB is inserted (possibly earlier than \p I, see
/// handleClause and handleCoissue).
///
/// \returns true if an instruction was inserted or patched.
bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode,
                                      MachineBasicBlock::instr_iterator I) {
  LLVM_DEBUG({
    dbgs() << " setMode: NewMode=";
    NewMode.print(dbgs());
    dbgs() << " CurrentMode=";
    CurrentMode.print(dbgs());
    dbgs() << " MostRecentModeSet=" << (MostRecentModeSet ? "yes" : "null");
    if (I != MBB->instr_end())
      dbgs() << " before: " << *I;
    else
      dbgs() << " at end\n";
  });

  // Record previous mode into high 8 bits of the immediate. This must be
  // captured before CurrentMode is merged with NewMode below.
  const int64_t PrevModeBits = CurrentMode.encode() << ModeWidth;

  bool Rewritten = false;
  if (!CurrentMode.update(NewMode, Rewritten)) {
    LLVM_DEBUG(dbgs() << " -> no change needed\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << " Rewritten=" << Rewritten << " after update\n");

  if (MostRecentModeSet && !Rewritten) {
    // Update MostRecentModeSet with the new mode. It can be either
    // S_SET_VGPR_MSB or S_SETREG_IMM32_B32 (with Size <= 12).
    if (MostRecentModeSet->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
      MachineOperand &Op = MostRecentModeSet->getOperand(0);
      // Carry old mode bits from the existing instruction.
      const int64_t CarriedModeBits = Op.getImm() & (ModeMask << ModeWidth);
      Op.setImm(CurrentMode.encode() | CarriedModeBits);
      LLVM_DEBUG(dbgs() << " -> piggybacked onto S_SET_VGPR_MSB: "
                        << *MostRecentModeSet);
    } else {
      assert(MostRecentModeSet->getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
             "unexpected MostRecentModeSet opcode");
      updateSetregModeImm(*MostRecentModeSet, CurrentMode.encode());
      LLVM_DEBUG(dbgs() << " -> piggybacked onto S_SETREG_IMM32_B32: "
                        << *MostRecentModeSet);
    }

    return true;
  }

  MachineBasicBlock::instr_iterator InsertPt = handleClause(I);
  InsertPt = handleCoissue(InsertPt);

  // S_SET_VGPR_MSB placed right after S_SETREG_IMM32_B32(MODE) is silently
  // dropped, so separate the two with an S_NOP. The check has to be made at
  // the real insertion point: handleCoissue may have hoisted it above
  // waitcnt/delay instructions so that it ends up directly behind a setreg
  // even though \p I itself does not follow one.
  if (needNopBeforeSetVGPRMSB(InsertPt))
    BuildMI(*MBB, InsertPt, {}, TII->get(AMDGPU::S_NOP)).addImm(0);

  MostRecentModeSet =
      BuildMI(*MBB, InsertPt, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
          .addImm(NewMode.encode() | PrevModeBits);
  LLVM_DEBUG(dbgs() << " -> inserted new S_SET_VGPR_MSB: "
                    << *MostRecentModeSet);

  // If we inserted S_SET_VGPR_MSB early then XCNT should remain zero from the
  // insertion point to the current instruction. Remove any redundant
  // S_WAIT_XCNT instructions in that range.
  for (MachineInstr &MI : make_early_inc_range(make_range(InsertPt, I))) {
    assert(!SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isSMRD(MI));
    if (MI.getOpcode() == AMDGPU::S_WAIT_XCNT)
      MI.eraseFromBundle();
  }
  XCntIsZero = true;

  // The new instruction only establishes what NewMode asked for; slots it left
  // unspecified become "don't care" again.
  CurrentMode = NewMode;
  return true;
}
|
|
|
|
std::optional<unsigned>
|
|
AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const {
|
|
if (!MO.isReg())
|
|
return std::nullopt;
|
|
|
|
MCRegister Reg = MO.getReg();
|
|
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
|
|
if (!RC || !TRI->isVGPRClass(RC))
|
|
return std::nullopt;
|
|
|
|
unsigned Idx = TRI->getHWRegIndex(Reg);
|
|
return Idx >> 8;
|
|
}
|
|
|
|
/// Fill \p NewMode with the MSBs that \p MI requires for each tracked slot.
///
/// \p Ops names the operand for every slot. \p Ops2, when given, names an
/// alternative operand per slot which is consulted if the operand from \p Ops
/// is absent or not a VGPR. Slots for which no VGPR operand is found are left
/// without a requirement.
void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode,
                                          const MachineInstr &MI,
                                          const AMDGPU::OpName Ops[OpNum],
                                          const AMDGPU::OpName *Ops2) {
  NewMode = {};

  for (unsigned Slot = 0; Slot != OpNum; ++Slot) {
    const MachineOperand *Op = TII->getNamedOperand(MI, Ops[Slot]);
    std::optional<unsigned> MSBits = Op ? getMSBs(*Op) : std::nullopt;

#if !defined(NDEBUG)
    // Both operands of a slot share one mode field, so when both are VGPRs
    // they must agree on the MSBs.
    if (MSBits.has_value() && Ops2) {
      if (const MachineOperand *Op2 = TII->getNamedOperand(MI, Ops2[Slot])) {
        std::optional<unsigned> MSBits2 = getMSBs(*Op2);
        if (MSBits2.has_value() && MSBits != MSBits2)
          llvm_unreachable("Invalid VOPD pair was created");
      }
    }
#endif

    // Fall back to the alternative operand table.
    if (!MSBits.has_value() && Ops2) {
      Op = TII->getNamedOperand(MI, Ops2[Slot]);
      if (Op)
        MSBits = getMSBs(*Op);
    }

    // Nothing in this slot refers to a VGPR.
    if (!MSBits.has_value())
      continue;

    // Skip tied uses of src2 of VOP2, these will be handled along with defs and
    // only vdst bit affects these operands. The same applies to VOP3
    // instructions that also have a 32-bit VALU encoding; other tied VOP3 uses
    // are real even if they must match the vdst, so they are not skipped.
    const bool IsTiedSrc2Use =
        Ops[Slot] == AMDGPU::OpName::src2 && !Op->isDef() && Op->isTied();
    if (IsTiedSrc2Use) {
      if (SIInstrInfo::isVOP2(MI))
        continue;
      if (SIInstrInfo::isVOP3(MI) && TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;
    }

    NewMode.Ops[Slot].MSBits = MSBits.value();
  }
}
|
|
|
|
/// Establish the mode required by \p MI.
///
/// Instructions without an operand table are left alone. For the others the
/// required mode is computed; if it conflicts with CurrentMode and \p MI can
/// be commuted into a compatible form, the commuted form is kept, otherwise
/// setMode is asked to switch modes in front of \p MI.
///
/// \returns true if \p MI or the surrounding code was changed.
bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
  auto Ops = AMDGPU::getVGPRLoweringOperandTables(MI.getDesc());
  if (!Ops.first) {
    assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());
    return false;
  }

  ModeTy NewMode;
  computeMode(NewMode, MI, Ops.first, Ops.second);
  LLVM_DEBUG({
    dbgs() << " runOnMachineInstr: ";
    MI.print(dbgs());
    dbgs() << " computed NewMode=";
    NewMode.print(dbgs());
    dbgs() << " compatible=" << CurrentMode.isCompatible(NewMode) << '\n';
  });

  const bool NeedsSwitch = !CurrentMode.isCompatible(NewMode);
  if (NeedsSwitch && MI.isCommutable() && TII->commuteInstruction(MI)) {
    // MI is now in its commuted form; see whether that form fits.
    ModeTy NewModeCommuted;
    computeMode(NewModeCommuted, MI, Ops.first, Ops.second);
    LLVM_DEBUG({
      dbgs() << " commuted NewMode=";
      NewModeCommuted.print(dbgs());
      dbgs() << " compatible=" << CurrentMode.isCompatible(NewModeCommuted)
             << '\n';
    });

    if (CurrentMode.isCompatible(NewModeCommuted)) {
      // Update CurrentMode with mode bits the commuted instruction relies on.
      // This prevents later instructions from piggybacking and corrupting
      // those bits (e.g., a nullopt src treated as 0 could be overwritten).
      bool Unused = false;
      CurrentMode.update(NewModeCommuted, Unused);
      // MI was modified by the commute above.
      return true;
    }

    // Commute back.
    if (!TII->commuteInstruction(MI))
      llvm_unreachable("Failed to restore commuted instruction.");
  }

  return setMode(NewMode, MI.getIterator());
}
|
|
|
|
/// Pick the insertion point for a mode change needed at \p I with respect to
/// the hard clause currently being tracked, adjusting or removing the S_CLAUSE
/// instruction where necessary.
MachineBasicBlock::instr_iterator
AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) {
  // Not inside a clause: nothing to adjust.
  if (ClauseRemaining == 0)
    return I;

  // A clause cannot start with a special instruction, place it right before
  // the clause.
  if (ClauseRemaining == ClauseLen) {
    MachineBasicBlock::instr_iterator BeforeClause =
        Clause->getPrevNode()->getIterator();
    assert(BeforeClause->isBundle());
    return BeforeClause;
  }

  // If a clause defines breaks each group cannot start with a mode change.
  // just drop the clause.
  if (ClauseBreaks != 0) {
    Clause->eraseFromBundle();
    ClauseRemaining = 0;
    return I;
  }

  // Otherwise adjust a number of instructions in the clause if it fits.
  // If it does not clause will just become shorter. Since the length
  // recorded in the clause is one less, increment the length after the
  // update. Note that SIMM16[5:0] must be 1-62, not 0 or 63.
  if (ClauseLen < 63) {
    MachineOperand &LenOp = Clause->getOperand(0);
    LenOp.setImm(ClauseLen | (ClauseBreaks << 8));
  }
  ++ClauseLen;

  return I;
}
|
|
|
|
/// Move the insertion point \p I backwards over the run of program state
/// instructions that immediately precedes it. \returns the first position
/// whose predecessor is not such an instruction, or the start of the block.
MachineBasicBlock::instr_iterator
AMDGPULowerVGPREncoding::handleCoissue(MachineBasicBlock::instr_iterator I) {
  // "Program State instructions" are instructions which are used to control
  // operation of the GPU rather than performing arithmetic. Such instructions
  // have different coissuing rules w.r.t s_set_vgpr_msb.
  const auto IsProgramState = [this](const MachineInstr &Candidate) {
    const unsigned Opc = Candidate.getOpcode();
    if (TII->isBarrier(Opc))
      return true;
    if (TII->isWaitcnt(Opc))
      return true;
    return Opc == AMDGPU::S_DELAY_ALU;
  };

  for (; I != MBB->begin(); --I) {
    if (!IsProgramState(*std::prev(I)))
      break;
  }

  return I;
}
|
|
|
|
/// Scan backwards from \p I within the current block, looking through meta
/// instructions only. \returns true if the closest preceding non-meta
/// instruction is an S_SETREG_IMM32_B32 that targets the MODE register.
bool AMDGPULowerVGPREncoding::needNopBeforeSetVGPRMSB(
    MachineBasicBlock::instr_iterator I) {
  while (I != MBB->begin()) {
    --I;
    MachineInstr &Prev = *I;

    if (Prev.getOpcode() == AMDGPU::S_SETREG_IMM32_B32) {
      MachineOperand *SIMM16Op =
          TII->getNamedOperand(Prev, AMDGPU::OpName::simm16);
      auto [HwRegId, Offset, Size] =
          AMDGPU::Hwreg::HwregEncoding::decode(SIMM16Op->getImm());
      if (HwRegId == AMDGPU::Hwreg::ID_MODE)
        return true;
    }

    // Any other real instruction separates the two; meta instructions emit no
    // code, so keep looking past them.
    if (!Prev.isMetaInstruction())
      return false;
  }

  // FIXME: Return true if the previous MBB falls through and ends with
  // S_SETREG_IMM32_B32.
  return false;
}
|
|
|
|
/// Convert mode value from S_SET_VGPR_MSB format to MODE register format.
|
|
/// S_SET_VGPR_MSB uses: (src0[0-1], src1[2-3], src2[4-5], dst[6-7])
|
|
/// MODE register uses: (dst[0-1], src0[2-3], src1[4-5], src2[6-7])
|
|
/// This is a left rotation by 2 bits on an 8-bit value.
|
|
static int64_t convertModeToSetregFormat(int64_t Mode) {
|
|
assert(isUInt<8>(Mode) && "Mode expected to be 8-bit");
|
|
return llvm::rotl<uint8_t>(static_cast<uint8_t>(Mode), /*R=*/2);
|
|
}
|
|
|
|
/// Replace the VGPR MSB field of the imm operand of the S_SETREG_IMM32_B32
/// \p MI with \p ModeValue, which is given in S_SET_VGPR_MSB format.
/// \returns true if the immediate changed.
bool AMDGPULowerVGPREncoding::updateSetregModeImm(MachineInstr &MI,
                                                  int64_t ModeValue) {
  assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32);

  // Convert from S_SET_VGPR_MSB format to MODE register format and move the
  // value into position. Note that Offset is ignored for mode bits here.
  const int64_t MSBField = convertModeToSetregFormat(ModeValue) << VGPRMSBShift;

  MachineOperand *ImmOp = TII->getNamedOperand(MI, AMDGPU::OpName::imm);
  const int64_t PrevImm = ImmOp->getImm();
  const int64_t UpdatedImm =
      (PrevImm & ~AMDGPU::Hwreg::VGPR_MSB_MASK) | MSBField;
  ImmOp->setImm(UpdatedImm);

  return UpdatedImm != PrevImm;
}
|
|
|
|
/// Keep the tracked VGPR MSB mode consistent across the S_SETREG_IMM32_B32
/// \p MI. Instructions that do not target the MODE register are ignored.
/// Otherwise one of three things happens:
///   1. imm32[12:19] is free for our use: the instruction becomes the new
///      piggyback target and tracking restarts from an empty mode.
///   2. imm32[12:19] is in use and already equals the current mode: nothing is
///      changed, but piggybacking onto earlier instructions is disabled.
///   3. imm32[12:19] is in use and differs: S_NOP + S_SET_VGPR_MSB are
///      inserted after \p MI to restore the current mode.
/// \returns true if \p MI was modified or instructions were inserted.
bool AMDGPULowerVGPREncoding::handleSetregMode(MachineInstr &MI) {
  using namespace AMDGPU::Hwreg;

  assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
         "only S_SETREG_IMM32_B32 needs to be handled");

  LLVM_DEBUG(dbgs() << " handleSetregMode: " << MI);

  MachineOperand *SIMM16Op = TII->getNamedOperand(MI, AMDGPU::OpName::simm16);
  assert(SIMM16Op && "SIMM16Op must be present");

  auto [HwRegId, Offset, Size] = HwregEncoding::decode(SIMM16Op->getImm());
  LLVM_DEBUG(dbgs() << " HwRegId=" << HwRegId << " Offset=" << Offset
                    << " Size=" << Size << '\n');
  if (HwRegId != ID_MODE) {
    LLVM_DEBUG(dbgs() << " -> not ID_MODE, skipping\n");
    return false;
  }

  const int64_t ModeValue = CurrentMode.encode();
  LLVM_DEBUG({
    dbgs() << " CurrentMode=";
    CurrentMode.print(dbgs());
    dbgs() << " encoded=0x" << Twine::utohexstr(ModeValue)
           << " VGPRMSBShift=" << VGPRMSBShift << '\n';
  });

  // Case 1: Size <= 12 - the original instruction uses imm32[0:Size-1], so
  // imm32[12:19] is unused, or Offset is zero and it is safe to set
  // imm32[12:19] to the correct VGPR MSBs.
  const bool MSBFieldIsOurs = !Offset || Size <= VGPRMSBShift;
  if (MSBFieldIsOurs) {
    LLVM_DEBUG(dbgs() << " Case 1: Size(" << Size << ") <= VGPRMSBShift("
                      << VGPRMSBShift
                      << "), treating as mode scope boundary\n");
    // This instruction is at the boundary of the old mode's control range.
    // Reset CurrentMode so that the next setMode call can freely piggyback
    // the required mode into bits[12:19] without triggering Rewritten.
    MostRecentModeSet = &MI;
    CurrentMode = {};
    const bool ImmChanged = updateSetregModeImm(MI, 0);
    LLVM_DEBUG(dbgs() << " -> reset CurrentMode, cleared bits[12:19]: "
                      << MI);
    return ImmChanged;
  }

  // Case 2: Size > 12 - the original instruction uses bits beyond 11, so we
  // cannot arbitrarily modify imm32[12:19]. Check if it already matches VGPR
  // MSBs. Note: imm32[12:19] is in MODE register format, while ModeValue is
  // in S_SET_VGPR_MSB format, so we need to convert before comparing.
  MachineOperand *ImmOp = TII->getNamedOperand(MI, AMDGPU::OpName::imm);
  assert(ImmOp && "ImmOp must be present");
  const int64_t ImmBits12To19 =
      (ImmOp->getImm() & VGPR_MSB_MASK) >> VGPRMSBShift;
  const int64_t SetregModeValue = convertModeToSetregFormat(ModeValue);
  LLVM_DEBUG(dbgs() << " Case 2: Size(" << Size << ") > VGPRMSBShift, "
                    << "ImmBits12To19=0x" << Twine::utohexstr(ImmBits12To19)
                    << " SetregModeValue=0x"
                    << Twine::utohexstr(SetregModeValue) << '\n');

  if (ImmBits12To19 == SetregModeValue) {
    // Already correct, but we must invalidate MostRecentModeSet because this
    // instruction will overwrite mode[12:19]. We can't update this instruction
    // via piggybacking (bits[12:19] are meaningful), so if CurrentMode changes,
    // a new s_set_vgpr_msb will be inserted after this instruction.
    MostRecentModeSet = nullptr;
    LLVM_DEBUG(dbgs() << " -> bits[12:19] already correct, "
                         "invalidated MostRecentModeSet\n");
    return false;
  }

  // imm32[12:19] doesn't match VGPR MSBs - insert s_set_vgpr_msb after
  // the original instruction to restore the correct value. Insert S_NOP
  // to avoid the GFX1250 hazard where S_SET_VGPR_MSB immediately after
  // S_SETREG_IMM32_B32(MODE) is silently dropped.
  MachineBasicBlock::iterator InsertPt = std::next(MI.getIterator());
  const DebugLoc &DL = MI.getDebugLoc();
  BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::S_NOP)).addImm(0);
  // The current mode goes into both the new-mode and the previous-mode byte.
  const int64_t RestoreImm = ModeValue | (ModeValue << ModeWidth);
  MostRecentModeSet =
      BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::S_SET_VGPR_MSB))
          .addImm(RestoreImm);
  LLVM_DEBUG(dbgs() << " -> inserted S_SET_VGPR_MSB after setreg: "
                    << *MostRecentModeSet);
  return true;
}
|
|
|
|
/// Walk every instruction of \p MF and make sure the VGPR MSB mode in effect
/// matches what the instruction needs. Does nothing on subtargets without
/// 1024 addressable VGPRs. \returns true if \p MF was modified.
bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.has1024AddressableVGPRs())
    return false;

  TII = ST.getInstrInfo();
  TRI = ST.getRegisterInfo();

  LLVM_DEBUG(dbgs() << "*** AMDGPULowerVGPREncoding on " << MF.getName()
                    << " ***\n");

  bool Changed = false;
  ClauseLen = ClauseRemaining = 0;
  CurrentMode = {};

  for (MachineBasicBlock &Block : MF) {
    // Piggybacking and XCNT knowledge never carry across block boundaries.
    MostRecentModeSet = nullptr;
    XCntIsZero = false;
    MBB = &Block;

    LLVM_DEBUG(dbgs() << "BB#" << Block.getNumber() << ' ' << Block.getName()
                      << ":\n");

    for (MachineInstr &MI : llvm::make_early_inc_range(Block.instrs())) {
      if (MI.isMetaInstruction())
        continue;

      const unsigned Opc = MI.getOpcode();

      if (MI.isTerminator() || MI.isCall()) {
        LLVM_DEBUG(dbgs() << " terminator/call: " << MI);
        // At program end only the tracked state is dropped; everywhere else
        // control may leave the block, so all MSBs are set back to zero.
        const bool IsEndPgm =
            Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED;
        if (IsEndPgm)
          CurrentMode = {};
        else
          resetMode(MI.getIterator());
        continue;
      }

      if (MI.isInlineAsm()) {
        LLVM_DEBUG(dbgs() << " inline asm: " << MI);
        if (TII->hasVGPRUses(MI))
          resetMode(MI.getIterator());
        continue;
      }

      if (Opc == AMDGPU::S_CLAUSE) {
        assert(!ClauseRemaining && "Nested clauses are not supported");
        // The immediate holds the break count in bits [11:8] and the clause
        // length minus one in bits [5:0].
        const unsigned ClauseImm = MI.getOperand(0).getImm();
        ClauseBreaks = (ClauseImm >> 8) & 15;
        ClauseLen = ClauseRemaining = (ClauseImm & 63) + 1;
        Clause = &MI;
        LLVM_DEBUG(dbgs() << " clause: len=" << ClauseLen
                          << " breaks=" << ClauseBreaks << '\n');
        continue;
      }

      if (Opc == AMDGPU::S_SETREG_IMM32_B32 && ST.hasSetregVGPRMSBFixup()) {
        Changed |= handleSetregMode(MI);
        continue;
      }

      // If XCNT is known to be zero then any S_WAIT_XCNT instruction is
      // redundant and can be removed.
      if (Opc == AMDGPU::S_WAIT_XCNT && XCntIsZero) {
        MI.eraseFromBundle();
        Changed = true;
        continue;
      }

      Changed |= runOnMachineInstr(MI);

      // Any VMEM or SMEM instruction may increment XCNT.
      if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSMRD(MI))
        XCntIsZero = false;

      if (ClauseRemaining != 0)
        --ClauseRemaining;
    }

    // Reset the mode if we are falling through.
    LLVM_DEBUG(dbgs() << " end of BB, resetting mode\n");
    resetMode(Block.instr_end());
  }

  return Changed;
}
|
|
|
|
class AMDGPULowerVGPREncodingLegacy : public MachineFunctionPass {
|
|
public:
|
|
static char ID;
|
|
|
|
AMDGPULowerVGPREncodingLegacy() : MachineFunctionPass(ID) {}
|
|
|
|
bool runOnMachineFunction(MachineFunction &MF) override {
|
|
return AMDGPULowerVGPREncoding().run(MF);
|
|
}
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.setPreservesCFG();
|
|
MachineFunctionPass::getAnalysisUsage(AU);
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
char AMDGPULowerVGPREncodingLegacy::ID = 0;
|
|
|
|
char &llvm::AMDGPULowerVGPREncodingLegacyID = AMDGPULowerVGPREncodingLegacy::ID;
|
|
|
|
INITIALIZE_PASS(AMDGPULowerVGPREncodingLegacy, DEBUG_TYPE,
|
|
"AMDGPU Lower VGPR Encoding", false, false)
|
|
|
|
/// New pass manager entry point. Reports all analyses as preserved when
/// nothing changed; otherwise the default machine-function-pass set plus the
/// CFG analyses.
PreservedAnalyses
AMDGPULowerVGPREncodingPass::run(MachineFunction &MF,
                                 MachineFunctionAnalysisManager &MFAM) {
  const bool Changed = AMDGPULowerVGPREncoding().run(MF);
  if (Changed)
    return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();

  return PreservedAnalyses::all();
}
|