The test used to look all good, but only by accident. A WeakVH simply nulls itself once the value it points to is replaced and erased, so VarIndex became null and a zero index was silently used; the test's checks still appeared to pass. Only a WeakTrackingVH is updated to track the replacement value. Change the test slightly so that using a zero index is wrong.
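
For illustration only (not part of the change itself), a minimal sketch of the two value-handle behaviors, assuming the usual llvm::WeakVH / llvm::WeakTrackingVH semantics; V and NewV are hypothetical instructions:

    #include "llvm/IR/ValueHandle.h"

    // V is an existing llvm::Instruction *; NewV is its replacement.
    llvm::WeakVH Weak(V);          // nulls itself once V is destroyed
    llvm::WeakTrackingVH Track(V); // additionally follows RAUW

    V->replaceAllUsesWith(NewV);
    V->eraseFromParent();

    // Weak is now null, while Track now points at NewV. A nulled VarIndex
    // falls back to a zero index in this pass, which is how the test kept
    // passing by accident.
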
//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Eliminates allocas by either converting them into vectors or by migrating
// them to local address space.
//
// Two passes are exposed by this file:
//    - "promote-alloca-to-vector", which runs early in the pipeline and only
//      promotes to vector. Promotion to vector is almost always profitable
//      except when the alloca is too big and the promotion would result in
//      very high register pressure.
//    - "promote-alloca", which does both promotion to vector and LDS and runs
//      much later in the pipeline. This runs after SROA because promoting to
//      LDS is of course less profitable than getting rid of the alloca or
//      vectorizing it, thus we only want to do it when the only alternative is
//      lowering the alloca to stack.
//
// Note that both of them exist for the old and new PMs. The new PM passes are
// declared in AMDGPU.h and the legacy PM ones are declared here.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
|
|
#include "GCNSubtarget.h"
|
|
#include "Utils/AMDGPUBaseInfo.h"
|
|
#include "llvm/ADT/STLExtras.h"
|
|
#include "llvm/Analysis/CaptureTracking.h"
|
|
#include "llvm/Analysis/InstSimplifyFolder.h"
|
|
#include "llvm/Analysis/InstructionSimplify.h"
|
|
#include "llvm/Analysis/LoopInfo.h"
|
|
#include "llvm/Analysis/ValueTracking.h"
|
|
#include "llvm/CodeGen/TargetPassConfig.h"
|
|
#include "llvm/IR/IRBuilder.h"
|
|
#include "llvm/IR/IntrinsicInst.h"
|
|
#include "llvm/IR/IntrinsicsAMDGPU.h"
|
|
#include "llvm/IR/IntrinsicsR600.h"
|
|
#include "llvm/IR/PatternMatch.h"
|
|
#include "llvm/InitializePasses.h"
|
|
#include "llvm/Pass.h"
|
|
#include "llvm/Support/MathExtras.h"
|
|
#include "llvm/Target/TargetMachine.h"
|
|
#include "llvm/Transforms/Utils/SSAUpdater.h"
|
|
|
|
#define DEBUG_TYPE "amdgpu-promote-alloca"
|
|
|
|
using namespace llvm;
|
|
|
|
namespace {
|
|
|
|
static cl::opt<bool>
    DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
                                 cl::desc("Disable promote alloca to vector"),
                                 cl::init(false));

static cl::opt<bool>
    DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
                              cl::desc("Disable promote alloca to LDS"),
                              cl::init(false));

static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"),
    cl::init(0));

static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
    "amdgpu-promote-alloca-to-vector-max-regs",
    cl::desc(
        "Maximum vector size (in 32b registers) to use when promoting alloca"),
    cl::init(32));

// Use up to 1/4 of available register budget for vectorization.
// FIXME: Increase the limit for whole function budgets? Perhaps x2?
static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
    "amdgpu-promote-alloca-to-vector-vgpr-ratio",
    cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
    cl::init(4));

static cl::opt<unsigned>
    LoopUserWeight("promote-alloca-vector-loop-user-weight",
                   cl::desc("The bonus weight of users of allocas within loop "
                            "when sorting profitable allocas"),
                   cl::init(4));

// We support vector indices of the form ((A * stride) >> shift) + B.
// VarIndex is A, VarMul is stride, VarShift is shift and ConstIndex is B. All
// parts are optional.
struct GEPToVectorIndex {
  WeakTrackingVH VarIndex = nullptr; // defaults to 0
  ConstantInt *VarMul = nullptr;     // defaults to 1
  ConstantInt *VarShift = nullptr;   // defaults to 0
  ConstantInt *ConstIndex = nullptr; // defaults to 0
  Value *Full = nullptr;
};

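// Illustrative examples (assuming a vector of i32, i.e. 4-byte elements):
//   - byte offset %i * 8 + 8 -> VarIndex = %i, VarMul = 2, ConstIndex = 2
//   - byte offset %i * 2 + 4 -> VarIndex = %i, VarShift = 1, ConstIndex = 1
//     (valid only when %i is known even; see computeGEPToVectorIndex).
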
struct MemTransferInfo {
  ConstantInt *SrcIndex = nullptr;
  ConstantInt *DestIndex = nullptr;
};

// Analysis for planning the different strategies of alloca promotion.
struct AllocaAnalysis {
  AllocaInst *Alloca = nullptr;
  DenseSet<Value *> Pointers;
  SmallVector<Use *> Uses;
  unsigned Score = 0;
  bool HaveSelectOrPHI = false;
  struct {
    FixedVectorType *Ty = nullptr;
    SmallVector<Instruction *> Worklist;
    SmallVector<Instruction *> UsersToRemove;
    MapVector<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx;
    MapVector<MemTransferInst *, MemTransferInfo> TransferInfo;
  } Vector;
  struct {
    bool Enable = false;
    SmallVector<User *> Worklist;
  } LDS;

  explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
};

// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
private:
  const TargetMachine &TM;
  LoopInfo &LI;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  // FIXME: This should be per-kernel.
  uint32_t LocalMemLimit = 0;
  uint32_t CurrentLocalMemUsage = 0;
  unsigned MaxVGPRs;
  unsigned VGPRBudgetRatio;
  unsigned MaxVectorRegs;

  bool IsAMDGCN = false;
  bool IsAMDHSA = false;

  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

  bool collectAllocaUses(AllocaAnalysis &AA) const;

  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
  /// Returns true if both operands are derived from the same alloca. Val
  /// should be the same value as one of the input operands of UseInst.
  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *UseInst, int OpIdx0,
                                       int OpIdx1) const;

  /// Check whether we have enough local memory for promotion.
  bool hasSufficientLocalMem(const Function &F);

  FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
  void analyzePromoteToVector(AllocaAnalysis &AA) const;
  void promoteAllocaToVector(AllocaAnalysis &AA);
  void analyzePromoteToLDS(AllocaAnalysis &AA) const;
  bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
                             SetVector<IntrinsicInst *> &DeferredIntrs);
  void
  finishDeferredAllocaToLDSPromotion(SetVector<IntrinsicInst *> &DeferredIntrs);

  void scoreAlloca(AllocaAnalysis &AA) const;

  void setFunctionLimits(const Function &F);

public:
  AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {

    const Triple &TT = TM.getTargetTriple();
    IsAMDGCN = TT.isAMDGCN();
    IsAMDHSA = TT.getOS() == Triple::AMDHSA;
  }

  bool run(Function &F, bool PromoteToLDS);
};

// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
public:
  static char ID;

  AMDGPUPromoteAlloca() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;
    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
      return AMDGPUPromoteAllocaImpl(
                 TPC->getTM<TargetMachine>(),
                 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
          .run(F, /*PromoteToLDS*/ true);
    return false;
  }

  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<LoopInfoWrapperPass>();
    FunctionPass::getAnalysisUsage(AU);
  }
};

static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
                            const Function &F) {
  if (!TM.getTargetTriple().isAMDGCN())
    return 128;

  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is removed.
  if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
    DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();

  unsigned MaxVGPRs = ST.getMaxNumVGPRs(
      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
      DynamicVGPRBlockSize);

  // A non-entry function has only 32 caller preserved registers.
  // Do not promote alloca which will force spilling unless we know the
  // function will be inlined.
  if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
      !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    MaxVGPRs = std::min(MaxVGPRs, 32u);
  return MaxVGPRs;
}

} // end anonymous namespace

char AMDGPUPromoteAlloca::ID = 0;

INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promote alloca for an
// accurate estimate of the available LDS.
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                    "AMDGPU promote alloca to vector or LDS", false, false)

char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;

PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
  if (Changed) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
  return PreservedAnalyses::all();
}

PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
  if (Changed) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
  return PreservedAnalyses::all();
}

FunctionPass *llvm::createAMDGPUPromoteAlloca() {
  return new AMDGPUPromoteAlloca();
}

bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
  const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
    LLVM_DEBUG(dbgs() << "  Cannot promote alloca: " << Msg << "\n"
                      << "    " << *Inst << "\n");
    return false;
  };

  SmallVector<Instruction *, 4> WorkList({AA.Alloca});
  while (!WorkList.empty()) {
    auto *Cur = WorkList.pop_back_val();
    if (find(AA.Pointers, Cur) != AA.Pointers.end())
      continue;
    AA.Pointers.insert(Cur);
    for (auto &U : Cur->uses()) {
      auto *Inst = cast<Instruction>(U.getUser());
      if (isa<StoreInst>(Inst)) {
        if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
          return RejectUser(Inst, "pointer escapes via store");
        }
      }
      AA.Uses.push_back(&U);

      if (isa<GetElementPtrInst>(U.getUser())) {
        WorkList.push_back(Inst);
      } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
        // Only promote a select if we know that the other select operand is
        // from another pointer that will also be promoted.
        if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, SI, 1, 2))
          return RejectUser(Inst, "select from mixed objects");
        WorkList.push_back(Inst);
        AA.HaveSelectOrPHI = true;
      } else if (auto *Phi = dyn_cast<PHINode>(Inst)) {
        // Repeat for phis.

        // TODO: Handle more complex cases. We should be able to replace loops
        // over arrays.
        switch (Phi->getNumIncomingValues()) {
        case 1:
          break;
        case 2:
          if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, Phi, 0, 1))
            return RejectUser(Inst, "phi from mixed objects");
          break;
        default:
          return RejectUser(Inst, "phi with too many operands");
        }

        WorkList.push_back(Inst);
        AA.HaveSelectOrPHI = true;
      }
    }
  }
  return true;
}

void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
  LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n");
  unsigned Score = 0;
  // Increment score by one for each user + a bonus for users within loops.
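  // For example (illustrative): with the default LoopUserWeight of 4, a load
  // at loop depth 2 scores 1 + 4 * 2 = 9, while one outside any loop scores 1.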
  for (auto *U : AA.Uses) {
    Instruction *Inst = cast<Instruction>(U->getUser());
    if (isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
        isa<PHINode>(Inst))
      continue;
    unsigned UserScore =
        1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
    LLVM_DEBUG(dbgs() << "  [+" << UserScore << "]:\t" << *Inst << "\n");
    Score += UserScore;
  }
  LLVM_DEBUG(dbgs() << "  => Final Score:" << Score << "\n");
  AA.Score = Score;
}

void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
  // Load per function limits, overriding with global options where
  // appropriate. R600 register tuples/aliasing are fragile with large vector
  // promotions so apply an architecture-specific limit here.
  const int R600MaxVectorRegs = 16;
  MaxVectorRegs = F.getFnAttributeAsParsedInteger(
      "amdgpu-promote-alloca-to-vector-max-regs",
      IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
  if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
    MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
  VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
      "amdgpu-promote-alloca-to-vector-vgpr-ratio",
      PromoteAllocaToVectorVGPRRatio);
  if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
    VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
}

bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
  if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector)
    return false;

  Mod = F.getParent();
  DL = &Mod->getDataLayout();

  bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
  MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
  setFunctionLimits(F);

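  // Illustrative arithmetic: with MaxVGPRs = 64 and the default VGPR ratio of
  // 4, the budget below is (64 * 32) / 4 = 512 bits, so a 64-byte alloca such
  // as <16 x i32> consumes it exactly (AllocaCost is measured in bits).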
  unsigned VectorizationBudget =
      (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                  : (MaxVGPRs * 32)) /
      VGPRBudgetRatio;

  std::vector<AllocaAnalysis> Allocas;
  for (Instruction &I : F.getEntryBlock()) {
    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
      // Array allocations are probably not worth handling, since an allocation
      // of the array type is the canonical form.
      if (!AI->isStaticAlloca() || AI->isArrayAllocation())
        continue;

      LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');

      AllocaAnalysis AA{AI};
      if (collectAllocaUses(AA)) {
        analyzePromoteToVector(AA);
        if (PromoteToLDS)
          analyzePromoteToLDS(AA);
        if (AA.Vector.Ty || AA.LDS.Enable) {
          scoreAlloca(AA);
          Allocas.push_back(std::move(AA));
        }
      }
    }
  }

  stable_sort(Allocas,
              [](const auto &A, const auto &B) { return A.Score > B.Score; });

  // clang-format off
  LLVM_DEBUG(
    dbgs() << "Sorted Worklist:\n";
    for (const auto &AA : Allocas)
      dbgs() << "  " << *AA.Alloca << "\n";
  );
  // clang-format on

  bool Changed = false;
  SetVector<IntrinsicInst *> DeferredIntrs;
  for (AllocaAnalysis &AA : Allocas) {
    if (AA.Vector.Ty) {
      std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(*DL);
      assert(Size); // Expected to succeed on non-array alloca.
      const unsigned AllocaCost = Size->getFixedValue() * 8;
      // First, check if we have enough budget to vectorize this alloca.
      if (AllocaCost <= VectorizationBudget) {
        promoteAllocaToVector(AA);
        Changed = true;
        assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
               "Underflow!");
        VectorizationBudget -= AllocaCost;
        LLVM_DEBUG(dbgs() << "  Remaining vectorization budget:"
                          << VectorizationBudget << "\n");
        continue;
      } else {
        LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
                          << AllocaCost << ", budget:" << VectorizationBudget
                          << "): " << *AA.Alloca << "\n");
      }
    }

    if (AA.LDS.Enable &&
        tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
      Changed = true;
  }
  finishDeferredAllocaToLDSPromotion(DeferredIntrs);

  // NOTE: promoteAllocaToVector removes the alloca, so Allocas contains
  // dangling pointers. If we want to reuse it past this point, the loop above
  // would need to be updated to remove successfully promoted allocas.

  return Changed;
}

// Checks if the instruction I is a memset user of the alloca AI that we can
// deal with. Currently, only non-volatile memsets that affect the whole alloca
// are handled.
static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
                              const DataLayout &DL) {
  using namespace PatternMatch;
  // For now we only care about non-volatile memsets that affect the whole type
  // (start at index 0 and fill the whole alloca).
  //
  // TODO: Now that we moved to PromoteAlloca we could handle any memsets
  // (except maybe volatile ones?) - we just need to use shufflevector if it
  // only affects a subset of the vector.
  const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
  return I->getOperand(0) == AI &&
         match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
}

static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) {
  IRBuilder<> B(Ptr->getContext());

  Ptr = Ptr->stripPointerCasts();
  if (Ptr == AA.Alloca)
    return B.getInt32(0);

  auto *GEP = cast<GetElementPtrInst>(Ptr);
  auto I = AA.Vector.GEPVectorIdx.find(GEP);
  assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");

  if (!I->second.Full) {
    Value *Result = nullptr;
    B.SetInsertPoint(GEP);

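    // Materialize ((VarIndex * VarMul) >> VarShift) + ConstIndex, skipping
    // whichever parts are absent; the result is cached in Full and reused.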
    if (I->second.VarIndex) {
      Result = I->second.VarIndex;
      Result = B.CreateSExtOrTrunc(Result, B.getInt32Ty());

      if (I->second.VarMul)
        Result = B.CreateMul(Result, I->second.VarMul);

      if (I->second.VarShift)
        Result = B.CreateAShr(Result, I->second.VarShift, "", /*isExact*/ true);
    }

    if (I->second.ConstIndex) {
      if (Result)
        Result = B.CreateAdd(Result, I->second.ConstIndex);
      else
        Result = I->second.ConstIndex;
    }

    if (!Result)
      Result = B.getInt32(0);

    I->second.Full = Result;
  }

  return I->second.Full;
}

static std::optional<GEPToVectorIndex>
computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
                        Type *VecElemTy, const DataLayout &DL) {
  // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
  // helper.
  LLVMContext &Ctx = GEP->getContext();
  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
  SmallMapVector<Value *, APInt, 4> VarOffsets;
  APInt ConstOffset(BW, 0);

  // Walk backwards through nested GEPs to collect both constant and variable
  // offsets, so that nested vector GEP chains can be lowered in one step.
  //
  // Given this IR fragment as input:
  //
  //   %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
  //   %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
  //   %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
  //   %3 = load i32, ptr addrspace(5) %2, align 4
  //
  // Combine both GEP operations in a single pass, producing:
  //   BasePtr     = %0
  //   ConstOffset = 4
  //   VarOffsets  = { %j -> element_size(<2 x i32>) }
  //
  // That lets us emit a single buffer_load directly into a VGPR, without ever
  // allocating scratch memory for the intermediate pointer.
  Value *CurPtr = GEP;
  while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
    if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
      return {};

    // Move to the next outer pointer.
    CurPtr = CurGEP->getPointerOperand();
  }

  assert(CurPtr == Alloca && "GEP not based on alloca");

  int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy);
  if (VarOffsets.size() > 1)
    return {};

  // We support vector indices of the form ((VarIndex * stride) >> shift) + B.
  // IndexQuot represents B. Check that the constant offset is a multiple of
  // the vector element size.
  if (ConstOffset.srem(VecElemSize) != 0)
    return {};
  APInt IndexQuot = ConstOffset.sdiv(VecElemSize);

  GEPToVectorIndex Result;

  if (!ConstOffset.isZero())
    Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));

  // If there are no variable offsets, only a constant offset, then we're done.
  if (VarOffsets.empty())
    return Result;

  // Scale is the stride in the (A * stride) part. Check that there is only one
  // variable offset and extract the scale factor.
  const auto &VarOffset = VarOffsets.front();
  auto ScaleOpt = VarOffset.second.tryZExtValue();
  if (!ScaleOpt || *ScaleOpt == 0)
    return {};

  uint64_t Scale = *ScaleOpt;
  Result.VarIndex = VarOffset.first;
  auto *OffsetType = dyn_cast<IntegerType>(Result.VarIndex->getType());
  if (!OffsetType)
    return {};

  // The vector index for the variable part is: VarIndex * Scale / VecElemSize.
  if (Scale >= (uint64_t)VecElemSize) {
    if (Scale % VecElemSize != 0)
      return {};

    // Scale is a multiple of VecElemSize, so the index is just: VarIndex *
    // (Scale / VecElemSize).
    uint64_t VarMul = Scale / VecElemSize;
    // Only the multiplier is needed.
    if (VarMul != 1)
      Result.VarMul = ConstantInt::get(Ctx, APInt(BW, VarMul));
  } else {
    if ((uint64_t)VecElemSize % Scale != 0)
      return {};

    // VecElemSize is a multiple of Scale, so the index is just: VarIndex /
    // (VecElemSize / Scale).
    uint64_t Divisor = VecElemSize / Scale;
    // The divisor must be a power of 2 so we can use a right shift.
    if (!isPowerOf2_64(Divisor))
      return {};

    // VarIndex must be known to be divisible by that divisor.
    KnownBits KB = computeKnownBits(VarOffset.first, DL);
    if (KB.countMinTrailingZeros() < Log2_64(Divisor))
      return {};

    Result.VarShift = ConstantInt::get(Ctx, APInt(BW, Log2_64(Divisor)));
  }

  return Result;
}

/// Promotes a single user of the alloca to a vector form.
///
/// \param Inst Instruction to be promoted.
/// \param DL Module Data Layout.
/// \param AA Alloca Analysis.
/// \param VecStoreSize Size of \p VectorTy in bytes.
/// \param ElementSize Size of \p VectorTy element type in bytes.
/// \param GetCurVal Callback returning the current value of the vector (e.g.
///        the last stored value), queried only when promoting \p Inst
///        actually requires it.
/// \return the stored value if \p Inst would have written to the alloca, or
///         nullptr otherwise.
static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
                                        AllocaAnalysis &AA,
                                        unsigned VecStoreSize,
                                        unsigned ElementSize,
                                        function_ref<Value *()> GetCurVal) {
  // Note: we use InstSimplifyFolder because it can leverage the DataLayout
  // to do more folding, especially in the case of vector splats.
  IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
                                        InstSimplifyFolder(DL));
  Builder.SetInsertPoint(Inst);

  Type *VecEltTy = AA.Vector.Ty->getElementType();

  switch (Inst->getOpcode()) {
  case Instruction::Load: {
    Value *CurVal = GetCurVal();
    Value *Index =
        calculateVectorIndex(cast<LoadInst>(Inst)->getPointerOperand(), AA);

    // We're loading the full vector.
    Type *AccessTy = Inst->getType();
    TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
    if (Constant *CI = dyn_cast<Constant>(Index)) {
      if (CI->isNullValue() && AccessSize == VecStoreSize) {
        Inst->replaceAllUsesWith(
            Builder.CreateBitPreservingCastChain(DL, CurVal, AccessTy));
        return nullptr;
      }
    }

    // Loading a subvector.
    if (isa<FixedVectorType>(AccessTy)) {
      assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
      const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
      auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
      assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));

      // If the index is dynamic, sandwich the load with bitcasts, i.e.:
      //   VectorTy     SubVecTy        AccessTy
      //   <64 x i8> -> <16 x i8>       <8 x i16>
      //   <64 x i8> -> <4 x i128> ->   i128 -> <8 x i16>
      // Extracting a subvector with a dynamic index has a very large expansion
      // in the amdgpu backend. Limit to pow2.
      FixedVectorType *VectorTy = AA.Vector.Ty;
      TypeSize NumBits = DL.getTypeStoreSize(SubVecTy) * 8u;
      uint64_t LoadAlign = cast<LoadInst>(Inst)->getAlign().value();
      bool IsAlignedLoad = NumBits <= (LoadAlign * 8u);
      unsigned TotalNumElts = VectorTy->getNumElements();
      bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == 0;
      if (!isa<ConstantInt>(Index) &&
          llvm::isPowerOf2_32(SubVecTy->getNumElements()) &&
          IsProperlyDivisible && IsAlignedLoad) {
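        // Worked example (illustrative): for VectorTy = <64 x i8> and
        // SubVecTy = <16 x i8>, NumBits = 128, so BitCastTy = <4 x i128>,
        // LShrAmt = 4, and NewIdx = Index >> 4 picks the matching i128 lane.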
        IntegerType *NewElemTy = Builder.getIntNTy(NumBits);
        const unsigned NewNumElts =
            DL.getTypeStoreSize(VectorTy) * 8u / NumBits;
        const unsigned LShrAmt = llvm::Log2_32(SubVecTy->getNumElements());
        FixedVectorType *BitCastTy =
            FixedVectorType::get(NewElemTy, NewNumElts);
        Value *BCVal = Builder.CreateBitCast(CurVal, BitCastTy);
        Value *NewIdx = Builder.CreateLShr(
            Index, ConstantInt::get(Index->getType(), LShrAmt));
        Value *ExtVal = Builder.CreateExtractElement(BCVal, NewIdx);
        Value *BCOut = Builder.CreateBitCast(ExtVal, AccessTy);
        Inst->replaceAllUsesWith(BCOut);
        return nullptr;
      }

      Value *SubVec = PoisonValue::get(SubVecTy);
      for (unsigned K = 0; K < NumLoadedElts; ++K) {
        Value *CurIdx =
            Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
        SubVec = Builder.CreateInsertElement(
            SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
      }

      Inst->replaceAllUsesWith(
          Builder.CreateBitPreservingCastChain(DL, SubVec, AccessTy));
      return nullptr;
    }

    // We're loading one element.
    Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
    if (AccessTy != VecEltTy)
      ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);

    Inst->replaceAllUsesWith(ExtractElement);
    return nullptr;
  }
  case Instruction::Store: {
    // For stores, it's a bit trickier and it depends on whether we're storing
    // the full vector or not. If we're storing the full vector, we don't need
    // to know the current value. If this is a store of a single element, we
    // need to know the value.
    StoreInst *SI = cast<StoreInst>(Inst);
    Value *Index = calculateVectorIndex(SI->getPointerOperand(), AA);
    Value *Val = SI->getValueOperand();

    // If we're storing the full vector, we can handle this without knowing
    // the current value.
    Type *AccessTy = Val->getType();
    TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
    if (Constant *CI = dyn_cast<Constant>(Index))
      if (CI->isNullValue() && AccessSize == VecStoreSize)
        return Builder.CreateBitPreservingCastChain(DL, Val, AA.Vector.Ty);

    // Storing a subvector.
    if (isa<FixedVectorType>(AccessTy)) {
      assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
      const unsigned NumWrittenElts =
          AccessSize / DL.getTypeStoreSize(VecEltTy);
      const unsigned NumVecElts = AA.Vector.Ty->getNumElements();
      auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
      assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));

      Val = Builder.CreateBitPreservingCastChain(DL, Val, SubVecTy);
      Value *CurVec = GetCurVal();
      for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
           K < NumElts; ++K) {
        Value *CurIdx =
            Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
        CurVec = Builder.CreateInsertElement(
            CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
      }
      return CurVec;
    }

    if (Val->getType() != VecEltTy)
      Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
    return Builder.CreateInsertElement(GetCurVal(), Val, Index);
  }
  case Instruction::Call: {
    if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
      // For memcpy, we need to know the current value.
      ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
      unsigned NumCopied = Length->getZExtValue() / ElementSize;
      MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI];
      unsigned SrcBegin = TI->SrcIndex->getZExtValue();
      unsigned DestBegin = TI->DestIndex->getZExtValue();

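      // Illustrative example: for an 8-element vector, copying 2 elements
      // from index 1 to index 4 yields Mask = {0, 1, 2, 3, 1, 2, 6, 7}:
      // destination lanes 4-5 take source lanes 1-2, and every other lane
      // keeps its own value.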
      SmallVector<int> Mask;
      for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) {
        if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
          Mask.push_back(SrcBegin < AA.Vector.Ty->getNumElements()
                             ? SrcBegin++
                             : PoisonMaskElem);
        } else {
          Mask.push_back(Idx);
        }
      }

      return Builder.CreateShuffleVector(GetCurVal(), Mask);
    }

    if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
      // For memset, we don't need to know the previous value because we
      // currently only allow memsets that cover the whole alloca.
      Value *Elt = MSI->getOperand(1);
      const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
      if (BytesPerElt > 1) {
        Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);

        // If the element type of the vector is a pointer, we need to first
        // cast to an integer, then use an inttoptr cast.
        if (VecEltTy->isPointerTy()) {
          Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8);
          Elt = Builder.CreateBitCast(EltBytes, PtrInt);
          Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
        } else
          Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
      }

      return Builder.CreateVectorSplat(AA.Vector.Ty->getElementCount(), Elt);
    }

    if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
      if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
        Intr->replaceAllUsesWith(
            Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
                            DL.getTypeAllocSize(AA.Vector.Ty)));
        return nullptr;
      }
    }

    llvm_unreachable("Unsupported call when promoting alloca to vector");
  }

  default:
    llvm_unreachable("Inconsistency in instructions promotable to vector");
  }

  llvm_unreachable("Did not return after promoting instruction!");
}

static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
                                  const DataLayout &DL) {
  // Access as a vector type can work if the size of the access vector is a
  // multiple of the size of the alloca's vector element type.
  //
  // Examples:
  //    - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
  //    - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
  //    - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
  //        - 3*32 is not a multiple of 64
  //
  // We could handle more complicated cases, but it'd make things a lot more
  // complicated.
  if (isa<FixedVectorType>(AccessTy)) {
    TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
    // If the type size and the store size don't match, we would need to do
    // more than just bitcast to translate between extracted/insertable
    // subvectors and the accessed value.
    if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
      return false;
    TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
    return AccTS.isKnownMultipleOf(VecTS);
  }

  return CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(), AccessTy,
                                              DL);
}

/// Iterates over an instruction worklist that may contain multiple
/// instructions from the same basic block, but in a different order.
template <typename InstContainer>
static void forEachWorkListItem(const InstContainer &WorkList,
                                std::function<void(Instruction *)> Fn) {
  // Bucket up uses of the alloca by the block they occur in.
  // This is important because we have to handle multiple defs/uses in a block
  // ourselves: SSAUpdater is purely for cross-block references.
  DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
  for (Instruction *User : WorkList)
    UsesByBlock[User->getParent()].insert(User);

  for (Instruction *User : WorkList) {
    BasicBlock *BB = User->getParent();
    auto &BlockUses = UsesByBlock[BB];

    // Already processed, skip.
    if (BlockUses.empty())
      continue;

    // Only user in the block, directly process it.
    if (BlockUses.size() == 1) {
      Fn(User);
      continue;
    }

    // Multiple users in the block, do a linear scan to see users in order.
    for (Instruction &Inst : *BB) {
      if (!BlockUses.contains(&Inst))
        continue;

      Fn(&Inst);
    }

    // Clear the block so we know it's been processed.
    BlockUses.clear();
  }
}

/// Find an insert point after an alloca, after all other allocas clustered at
/// the start of the block.
static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
                                                    BasicBlock::iterator I) {
  for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(*I); ++I)
    ;
  return I;
}

FixedVectorType *
AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
  if (DisablePromoteAllocaToVector) {
    LLVM_DEBUG(dbgs() << "  Promote alloca to vectors is disabled\n");
    return nullptr;
  }

  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
    uint64_t NumElems = 1;
    Type *ElemTy;
    do {
      NumElems *= ArrayTy->getNumElements();
      ElemTy = ArrayTy->getElementType();
    } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));

    // Check for an array of vectors.
    auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
    if (InnerVectorTy) {
      NumElems *= InnerVectorTy->getNumElements();
      ElemTy = InnerVectorTy->getElementType();
    }

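    // E.g. [4 x [2 x i32]] flattens to NumElems = 8 with ElemTy = i32, and
    // [10 x <2 x i32>] flattens to NumElems = 20 with ElemTy = i32.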
    if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
      unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
      if (ElementSize > 0) {
        unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
        // Expand the vector if required to match the padding of the inner
        // type, i.e. odd-size subvectors. The storage size of the new vector
        // must match that of the alloca for byte offsets and GEP computation
        // to behave correctly.
        if (NumElems * ElementSize != AllocaSize)
          NumElems = AllocaSize / ElementSize;
        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
          VectorTy = FixedVectorType::get(ElemTy, NumElems);
      }
    }
  }
  if (!VectorTy) {
    LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return nullptr;
  }

  const unsigned MaxElements =
      (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());

  if (VectorTy->getNumElements() > MaxElements ||
      VectorTy->getNumElements() < 2) {
    LLVM_DEBUG(dbgs() << "  " << *VectorTy
                      << " has an unsupported number of elements\n");
    return nullptr;
  }

  Type *VecEltTy = VectorTy->getElementType();
  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
                         "does not match the type's size\n");
    return nullptr;
  }

  return VectorTy;
}

void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const {
  if (AA.HaveSelectOrPHI) {
    LLVM_DEBUG(dbgs() << "  Cannot convert to vector due to select or phi\n");
    return;
  }

  Type *AllocaTy = AA.Alloca->getAllocatedType();
  AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
  if (!AA.Vector.Ty)
    return;

  const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
    LLVM_DEBUG(dbgs() << "  Cannot promote alloca to vector: " << Msg << "\n"
                      << "    " << *Inst << "\n");
    AA.Vector.Ty = nullptr;
  };

  Type *VecEltTy = AA.Vector.Ty->getElementType();
  unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
  assert(ElementSize > 0);
  for (auto *U : AA.Uses) {
    Instruction *Inst = cast<Instruction>(U->getUser());

    if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
      assert(!isa<StoreInst>(Inst) ||
             U->getOperandNo() == StoreInst::getPointerOperandIndex());

      Type *AccessTy = getLoadStoreType(Inst);
      if (AccessTy->isAggregateType())
        return RejectUser(Inst, "unsupported load/store as aggregate");
      assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());

      // Check that this is a simple access of a vector element.
      bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
                                          : cast<StoreInst>(Inst)->isSimple();
      if (!IsSimple)
        return RejectUser(Inst, "not a simple load or store");

      Ptr = Ptr->stripPointerCasts();

      // Alloca already accessed as vector.
      if (Ptr == AA.Alloca &&
          DL->getTypeStoreSize(AA.Alloca->getAllocatedType()) ==
              DL->getTypeStoreSize(AccessTy)) {
        AA.Vector.Worklist.push_back(Inst);
        continue;
      }

      if (!isSupportedAccessType(AA.Vector.Ty, AccessTy, *DL))
        return RejectUser(Inst, "not a supported access type");

      AA.Vector.Worklist.push_back(Inst);
      continue;
    }

    if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
      // If we can't compute a vector index from this GEP, then we can't
      // promote this alloca to vector.
      auto Index = computeGEPToVectorIndex(GEP, AA.Alloca, VecEltTy, *DL);
      if (!Index)
        return RejectUser(Inst, "cannot compute vector index for GEP");

      AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value());
      AA.Vector.UsersToRemove.push_back(Inst);
      continue;
    }

    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
        MSI && isSupportedMemset(MSI, AA.Alloca, *DL)) {
      AA.Vector.Worklist.push_back(Inst);
      continue;
    }

    if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
      if (TransferInst->isVolatile())
        return RejectUser(Inst, "mem transfer inst is volatile");

      ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
      if (!Len || (Len->getZExtValue() % ElementSize))
        return RejectUser(Inst, "mem transfer inst length is non-constant or "
                                "not a multiple of the vector element size");

      auto getConstIndexIntoAlloca = [&](Value *Ptr) -> ConstantInt * {
        if (Ptr == AA.Alloca)
          return ConstantInt::get(Ptr->getContext(), APInt(32, 0));

        GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
        const auto &GEPI = AA.Vector.GEPVectorIdx.find(GEP)->second;
        if (GEPI.VarIndex)
          return nullptr;
        if (GEPI.ConstIndex)
          return GEPI.ConstIndex;
        return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
      };

      MemTransferInfo *TI =
          &AA.Vector.TransferInfo.try_emplace(TransferInst).first->second;
      unsigned OpNum = U->getOperandNo();
      if (OpNum == 0) {
        Value *Dest = TransferInst->getDest();
        ConstantInt *Index = getConstIndexIntoAlloca(Dest);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant dest index");
        TI->DestIndex = Index;
      } else {
        assert(OpNum == 1);
        Value *Src = TransferInst->getSource();
        ConstantInt *Index = getConstIndexIntoAlloca(Src);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant src index");
        TI->SrcIndex = Index;
      }
      continue;
    }

    if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
      if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
        AA.Vector.Worklist.push_back(Inst);
        continue;
      }
    }

    // Ignore assume-like intrinsics and comparisons used in assumes.
    if (isAssumeLikeIntrinsic(Inst)) {
      if (!Inst->use_empty())
        return RejectUser(Inst, "assume-like intrinsic cannot have any users");
      AA.Vector.UsersToRemove.push_back(Inst);
      continue;
    }

    if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
          return isAssumeLikeIntrinsic(cast<Instruction>(U));
        })) {
      AA.Vector.UsersToRemove.push_back(Inst);
      continue;
    }

    return RejectUser(Inst, "unhandled alloca user");
  }

  // Follow-up check to ensure we've seen both sides of all transfer insts.
  for (const auto &Entry : AA.Vector.TransferInfo) {
    const MemTransferInfo &TI = Entry.second;
    if (!TI.SrcIndex || !TI.DestIndex)
      return RejectUser(Entry.first,
                        "mem transfer inst between different objects");
    AA.Vector.Worklist.push_back(Entry.first);
  }
}

void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
  LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n');
  LLVM_DEBUG(dbgs() << "  type conversion: " << *AA.Alloca->getAllocatedType()
                    << " -> " << *AA.Vector.Ty << '\n');
  const unsigned VecStoreSize = DL->getTypeStoreSize(AA.Vector.Ty);

  Type *VecEltTy = AA.Vector.Ty->getElementType();
  const unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;

  // Alloca is uninitialized memory. Imitate that by making the first value
  // undef.
  SSAUpdater Updater;
  Updater.Initialize(AA.Vector.Ty, "promotealloca");

  BasicBlock *EntryBB = AA.Alloca->getParent();
  BasicBlock::iterator InitInsertPos =
      skipToNonAllocaInsertPt(*EntryBB, AA.Alloca->getIterator());
  IRBuilder<> Builder(&*InitInsertPos);
  Value *AllocaInitValue = Builder.CreateFreeze(PoisonValue::get(AA.Vector.Ty));
  AllocaInitValue->takeName(AA.Alloca);

  Updater.AddAvailableValue(AA.Alloca->getParent(), AllocaInitValue);

  // First handle the initial worklist, in basic block order.
  //
  // Insert a placeholder whenever we need the vector value at the top of a
  // basic block.
  SmallSetVector<Instruction *, 8> Placeholders;
  forEachWorkListItem(AA.Vector.Worklist, [&](Instruction *I) {
    BasicBlock *BB = I->getParent();
    auto GetCurVal = [&]() -> Value * {
      if (Value *CurVal = Updater.FindValueForBlock(BB))
        return CurVal;

      if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
        return Placeholders.back();

      // If the current value in the basic block is not yet known, insert a
      // placeholder that we will replace later.
      IRBuilder<> Builder(I);
      auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
          PoisonValue::get(AA.Vector.Ty), "promotealloca.placeholder"));
      Placeholders.insert(Placeholder);
      return Placeholders.back();
    };

    Value *Result = promoteAllocaUserToVector(I, *DL, AA, VecStoreSize,
                                              ElementSize, GetCurVal);
    // If the returned result is a placeholder, the instruction did not really
    // modify the alloca, so there is no need to register it as an available
    // value with SSAUpdater. This also keeps the placeholder from being cached
    // in SSAUpdater, where a stale pointer could be referenced during
    // placeholder replacement.
    if (Result && (!isa<Instruction>(Result) ||
                   !Placeholders.contains(cast<Instruction>(Result))))
      Updater.AddAvailableValue(BB, Result);
  });

  // Now fix up the placeholders.
  for (Instruction *Placeholder : Placeholders) {
    Placeholder->replaceAllUsesWith(
        Updater.GetValueInMiddleOfBlock(Placeholder->getParent()));
    Placeholder->eraseFromParent();
  }

  // Delete all instructions.
  for (Instruction *I : AA.Vector.Worklist) {
    assert(I->use_empty());
    I->eraseFromParent();
  }

  // Delete all the users that are known to be removable.
  for (Instruction *I : reverse(AA.Vector.UsersToRemove)) {
    I->dropDroppableUses();
    assert(I->use_empty());
    I->eraseFromParent();
  }

  // The alloca should now be dead too.
  assert(AA.Alloca->use_empty());
  AA.Alloca->eraseFromParent();
}

std::pair<Value *, Value *>
AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
  Function &F = *Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  if (!IsAMDHSA) {
    CallInst *LocalSizeY =
        Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_y, {});
    CallInst *LocalSizeZ =
        Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_z, {});

    ST.makeLIDRangeMetadata(LocalSizeY);
    ST.makeLIDRangeMetadata(LocalSizeZ);

    return std::pair(LocalSizeY, LocalSizeZ);
  }

  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  //   typedef struct hsa_kernel_dispatch_packet_s {
  //     uint16_t header;
  //     uint16_t setup;
  //     uint16_t workgroup_size_x;
  //     uint16_t workgroup_size_y;
  //     uint16_t workgroup_size_z;
  //     uint16_t reserved0;
  //     uint32_t grid_size_x;
  //     uint32_t grid_size_y;
  //     uint32_t grid_size_z;
  //
  //     uint32_t private_segment_size;
  //     uint32_t group_segment_size;
  //     uint64_t kernel_object;
  //
  // #ifdef HSA_LARGE_MODEL
  //     void *kernarg_address;
  // #elif defined HSA_LITTLE_ENDIAN
  //     void *kernarg_address;
  //     uint32_t reserved1;
  // #else
  //     uint32_t reserved1;
  //     void *kernarg_address;
  // #endif
  //     uint64_t reserved2;
  //     hsa_signal_t completion_signal; // uint64_t wrapper
  //   } hsa_kernel_dispatch_packet_t
  //
  CallInst *DispatchPtr =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {});
  DispatchPtr->addRetAttr(Attribute::NoAlias);
  DispatchPtr->addRetAttr(Attribute::NonNull);
  F.removeFnAttr("amdgpu-no-dispatch-ptr");

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableRetAttr(64);

  Type *I32Ty = Type::getInt32Ty(Mod->getContext());

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit and extract sequence is already present, and it is probably easier
  // to CSE this. The loads should be mergeable later anyway.
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));

  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));

  MDNode *MD = MDNode::get(Mod->getContext(), {});
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  ST.makeLIDRangeMetadata(LoadZU);

  // Extract the y component. The upper half of LoadZU should be zero already.
  Value *Y = Builder.CreateLShr(LoadXY, 16);

  return std::pair(Y, LoadZU);
}

Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
                                              unsigned N) {
  Function *F = Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
  StringRef AttrName;

  switch (N) {
  case 0:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
    AttrName = "amdgpu-no-workitem-id-x";
    break;
  case 1:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
    AttrName = "amdgpu-no-workitem-id-y";
    break;

  case 2:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
    AttrName = "amdgpu-no-workitem-id-z";
    break;
  default:
    llvm_unreachable("invalid dimension");
  }

  Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID);
  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
  ST.makeLIDRangeMetadata(CI);
  F->removeFnAttr(AttrName);

  return CI;
}

static bool isCallPromotable(CallInst *CI) {
  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
  if (!II)
    return false;

  switch (II->getIntrinsicID()) {
  case Intrinsic::memcpy:
  case Intrinsic::memmove:
  case Intrinsic::memset:
  case Intrinsic::lifetime_start:
  case Intrinsic::lifetime_end:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group:
  case Intrinsic::objectsize:
    return true;
  default:
    return false;
  }
}

bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
    Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
    int OpIdx1) const {
  // Figure out which operand is the one we might not be promoting.
  Value *OtherOp = Inst->getOperand(OpIdx0);
  if (Val == OtherOp)
    OtherOp = Inst->getOperand(OpIdx1);

  if (isa<ConstantPointerNull, ConstantAggregateZero>(OtherOp))
    return true;

  // TODO: getUnderlyingObject will not work on a vector getelementptr
  Value *OtherObj = getUnderlyingObject(OtherOp);
  if (!isa<AllocaInst>(OtherObj))
    return false;

  // TODO: We should be able to replace undefs with the right pointer type.

  // TODO: If we know the other base object is another promotable
  // alloca, not necessarily this alloca, we can do this. The
  // important part is both must have the same address space at
  // the end.
  if (OtherObj != BaseAlloca) {
    LLVM_DEBUG(
        dbgs() << "Found a binary instruction with another alloca object\n");
    return false;
  }

  return true;
}

void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
  if (DisablePromoteAllocaToLDS) {
    LLVM_DEBUG(dbgs() << "  Promote alloca to LDS is disabled\n");
    return;
  }

  // Don't promote the alloca to LDS for shader calling conventions as the work
  // item ID intrinsics are not supported for these calling conventions.
  // Furthermore not all LDS is available for some of the stages.
  const Function &ContainingFunction = *AA.Alloca->getFunction();
  CallingConv::ID CC = ContainingFunction.getCallingConv();

  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    break;
  default:
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
    return;
  }

  for (Use *Use : AA.Uses) {
    auto *User = Use->getUser();

    if (CallInst *CI = dyn_cast<CallInst>(User)) {
      if (!isCallPromotable(CI))
        return;

      if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
        AA.LDS.Worklist.push_back(User);
      continue;
    }

    Instruction *UseInst = cast<Instruction>(User);
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return;

    if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
      if (LI->isVolatile())
        return;
      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
      if (SI->isVolatile())
        return;
      continue;
    }

    if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
      if (RMW->isVolatile())
        return;
      continue;
    }

    if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
      if (CAS->isVolatile())
        return;
      continue;
    }

    // Only promote a select if we know that the other select operand
    // is from another pointer that will also be promoted.
    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Use->get(), ICmp, 0, 1))
        return;

      // May need to rewrite constant operands.
      if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
        AA.LDS.Worklist.push_back(ICmp);
      continue;
    }

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
      // Be conservative if an address could be computed outside the bounds of
      // the alloca.
      if (!GEP->isInBounds())
        return;
    } else if (!isa<ExtractElementInst, SelectInst, PHINode>(User)) {
      // Do not promote vector/aggregate type instructions. It is hard to track
      // their users.

      // Do not promote addrspacecast.
      //
      // TODO: If we know the address is only observed through flat pointers,
      // we could still promote.
      return;
    }

    if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
      AA.LDS.Worklist.push_back(User);
  }

  AA.LDS.Enable = true;
}

bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {

  FunctionType *FTy = F.getFunctionType();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
      return false;
    }
  }

  LocalMemLimit = ST.getAddressableLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;

  SmallVector<const Constant *, 16> Stack;
  SmallPtrSet<const Constant *, 8> VisitedConstants;
  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;

  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
    for (const User *U : Val->users()) {
      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
        if (Use->getFunction() == &F)
          return true;
      } else {
        const Constant *C = cast<Constant>(U);
        if (VisitedConstants.insert(C).second)
          Stack.push_back(C);
      }
    }

    return false;
  };

  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    if (visitUsers(&GV, &GV)) {
      UsedLDS.insert(&GV);
      Stack.clear();
      continue;
    }

    // For any ConstantExpr uses, we need to recursively search the users until
    // we see a function.
    while (!Stack.empty()) {
      const Constant *C = Stack.pop_back_val();
      if (visitUsers(&GV, C)) {
        UsedLDS.insert(&GV);
        Stack.clear();
        break;
      }
    }
  }

  const DataLayout &DL = Mod->getDataLayout();
  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
  AllocatedSizes.reserve(UsedLDS.size());

  for (const GlobalVariable *GV : UsedLDS) {
    Align Alignment =
        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
    uint64_t AllocSize = GV->getGlobalSize(DL);

    // HIP uses an extern unsized array in local address space for dynamically
    // allocated shared memory. In that case, we have to disable the promotion.
    if (GV->hasExternalLinkage() && AllocSize == 0) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
                           "local memory. Promoting to local memory "
                           "disabled.\n");
      return false;
    }

    AllocatedSizes.emplace_back(AllocSize, Alignment);
  }

  // Sort to try to estimate the worst case alignment padding.
  //
  // FIXME: We should really do something to fix the addresses to a more
  // optimal value instead.
  llvm::sort(AllocatedSizes, llvm::less_second());

  // Check how much local memory is being used by global objects.
  CurrentLocalMemUsage = 0;

  // FIXME: Try to account for padding here. The real padding and address is
  // currently determined from the inverse order of uses in the function when
  // legalizing, which could also potentially change. We try to estimate the
  // worst case here, but we probably should fix the addresses earlier.
  for (auto Alloc : AllocatedSizes) {
    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
    CurrentLocalMemUsage += Alloc.first;
  }

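  // Illustrative example: with usage at 10 bytes, a following global of
  // alignment 8 and size 6 first rounds usage up to 16, then adds 6 -> 22.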
unsigned MaxOccupancy =
|
|
ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
|
|
.second;
|
|
|
|
// Round up to the next tier of usage.
|
|
unsigned MaxSizeWithWaveCount =
|
|
ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
|
|
|
|
// Program may already use more LDS than is usable at maximum occupancy.
|
|
if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
|
|
return false;
|
|
|
|
LocalMemLimit = MaxSizeWithWaveCount;
|
|
|
|
LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
|
|
<< " bytes of LDS\n"
|
|
<< " Rounding size to " << MaxSizeWithWaveCount
|
|
<< " with a maximum occupancy of " << MaxOccupancy << '\n'
|
|
<< " and " << (LocalMemLimit - CurrentLocalMemUsage)
|
|
<< " available for promotion\n");
|
|
|
|
return true;
|
|
}

// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
    AllocaAnalysis &AA, bool SufficientLDS,
    SetVector<IntrinsicInst *> &DeferredIntrs) {
  LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');

  // Not likely to have sufficient local memory for promotion.
  if (!SufficientLDS)
    return false;

  const DataLayout &DL = Mod->getDataLayout();
  IRBuilder<> Builder(AA.Alloca);

  const Function &ContainingFunction = *AA.Alloca->getParent()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

  Align Alignment = AA.Alloca->getAlign();

  // FIXME: This computed padding is likely wrong since it depends on inverse
  // usage order.
  //
  // FIXME: It is also possible that if we're allowed to use all of the memory,
  // we could end up using more than the maximum due to alignment padding.

  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
  std::optional<TypeSize> ElemSize = AA.Alloca->getAllocationSize(DL);
  if (!ElemSize || ElemSize->isScalable())
    return false;
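
  // The alloca is replicated once per workitem, so the total LDS footprint is
  // the per-thread size scaled by the maximum flat workgroup size.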
  TypeSize AllocSize = WorkGroupSize * *ElemSize;
  NewSize += AllocSize.getFixedValue();

  if (NewSize > LocalMemLimit) {
    LLVM_DEBUG(dbgs() << " " << AllocSize
                      << " bytes of local memory not available to promote\n");
    return false;
  }

  CurrentLocalMemUsage = NewSize;

  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");

  Function *F = AA.Alloca->getFunction();
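
  // Create a single LDS array with one slot per workitem in the workgroup.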
  Type *GVTy = ArrayType::get(AA.Alloca->getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
      Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr,
      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
  GV->setAlignment(AA.Alloca->getAlign());

  Value *TCntY, *TCntZ;

  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);
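
  // Linearize the 3D workitem ID so each workitem selects a unique slot:
  //   TID = TIdX * TCntY * TCntZ + TIdY * TCntZ + TIdZ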
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  LLVMContext &Context = Mod->getContext();
  Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};
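
  // Replace every use of the alloca with this workitem's slot in the LDS
  // array. The alloca's type is mutated first so replaceAllUsesWith sees
  // matching pointer types.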
  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  AA.Alloca->mutateType(Offset->getType());
  AA.Alloca->replaceAllUsesWith(Offset);
  AA.Alloca->eraseFromParent();
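
  // Rewrite the users recorded in the worklist: plain pointer values are
  // retyped to the LDS address space, while intrinsic calls are handled case
  // by case below.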
  PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);

  for (Value *V : AA.LDS.Worklist) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
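      // A comparison against a null pointer constant must be rewritten so the
      // constant matches the new LDS pointer type.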
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
        Value *LHS = CI->getOperand(0);
        Value *RHS = CI->getOperand(1);

        Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
        if (isa<ConstantPointerNull, ConstantAggregateZero>(LHS))
          CI->setOperand(0, Constant::getNullValue(NewTy));

        if (isa<ConstantPointerNull, ConstantAggregateZero>(RHS))
          CI->setOperand(1, Constant::getNullValue(NewTy));

        continue;
      }

      // The operand's value should be corrected on its own and we don't want
      // to touch the users.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      assert(V->getType()->isPtrOrPtrVectorTy());

      Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
      V->mutateType(NewTy);

      // Adjust the types of any constant operands.
      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
        if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(1)))
          SI->setOperand(1, Constant::getNullValue(NewTy));

        if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(2)))
          SI->setOperand(2, Constant::getNullValue(NewTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          if (isa<ConstantPointerNull, ConstantAggregateZero>(
                  Phi->getIncomingValue(I)))
            Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
        }
      }

      continue;
    }

    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only.
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      // These have two pointer operands. In case the second pointer also
      // needs to be replaced, we defer processing of these intrinsics until
      // all other values are processed.
      DeferredIntrs.insert(Intr);
      continue;
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getDestAlign(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group: {
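      // Rebuild the intrinsic around the LDS pointer: invariant_start keeps
      // its size operand, invariant_end keeps its token and size operands,
      // and launder/strip take only the pointer itself.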
      SmallVector<Value *> Args;
      if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
        Args.emplace_back(Intr->getArgOperand(0));
      } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
        Args.emplace_back(Intr->getArgOperand(0));
        Args.emplace_back(Intr->getArgOperand(1));
      }
      Args.emplace_back(Offset);
      Function *F = Intrinsic::getOrInsertDeclaration(
          Intr->getModule(), Intr->getIntrinsicID(), Offset->getType());
      CallInst *NewIntr =
          CallInst::Create(F, Args, Intr->getName(), Intr->getIterator());
      Intr->mutateType(NewIntr->getType());
      Intr->replaceAllUsesWith(NewIntr);
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::objectsize: {
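      // Recreate llvm.objectsize with the LDS pointer overload so the
      // intrinsic's signature matches the promoted pointer.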
      Value *Src = Intr->getOperand(0);

      CallInst *NewCall = Builder.CreateIntrinsic(
          Intrinsic::objectsize,
          {Intr->getType(), PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)},
          {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
      Intr->replaceAllUsesWith(NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->print(errs());
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }

  return true;
}

void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
    SetVector<IntrinsicInst *> &DeferredIntrs) {

  for (IntrinsicInst *Intr : DeferredIntrs) {
    IRBuilder<> Builder(Intr);
    Intrinsic::ID ID = Intr->getIntrinsicID();
    assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);

    MemTransferInst *MI = cast<MemTransferInst>(Intr);
    auto *B = Builder.CreateMemTransferInst(
        ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
        MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
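
    // Carry the dereferenceable byte counts on the dest (0) and src (1)
    // pointer arguments over to the rebuilt intrinsic.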
    for (unsigned I = 0; I != 2; ++I) {
      if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
        B->addDereferenceableParamAttr(I, Bytes);
      }
    }

    Intr->eraseFromParent();
  }
}