//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

using namespace llvm;

static cl::opt<unsigned> IndirectCallSpecializationThreshold(
    "amdgpu-indirect-call-specialization-threshold",
    cl::desc(
        "A threshold controls whether an indirect call will be specialized"),
    cl::init(3));

#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
#include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  UNKNOWN_INTRINSIC = 0,
#include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1,
  NOT_IMPLICIT_INPUT
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
    ImplicitAttrs[] = {
#include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID,
                    unsigned CodeObjectVersion) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_cluster_id_x:
    NonKernelOnly = true;
    return CLUSTER_ID_X;
  case Intrinsic::amdgcn_cluster_id_y:
    return CLUSTER_ID_Y;
  case Intrinsic::amdgcn_cluster_id_z:
    return CLUSTER_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to
  // access queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. For pre-V5, however, we need to access them through
    // queue_ptr + offsets.
    return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
                                                    : QUEUE_PTR;
  case Intrinsic::amdgcn_wwm:
  case Intrinsic::amdgcn_strict_wwm:
    return WHOLE_WAVE_MODE;
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
  case Intrinsic::ubsantrap:
    if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
      return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
                                                      : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  default:
    return UNKNOWN_INTRINSIC;
  }
}

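/// Returns true if an addrspacecast from address space \p SrcAS to flat may
/// require the queue pointer, i.e. the source is private or local.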
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

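/// Returns true if \p C is a global value in the LDS (local) or region
/// address space.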
static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if sanitizer attributes are present on a function.
static bool hasSanitizerAttributes(const Function &F) {
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
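/// Information cache for this pass, extended with AMDGPU-specific queries
/// (subtarget limits, code object version, constant address-space analysis)
/// shared by the abstract attributes below.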
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
        CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}

  TargetMachine &TM;

  enum ConstantStatus : uint8_t {
    NONE = 0,
    DS_GLOBAL = 1 << 0,
    ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
    ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
    ADDR_SPACE_CAST_BOTH_TO_FLAT =
        ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
  };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::optional<std::pair<unsigned, unsigned>>
  getFlatWorkGroupSizeAttr(const Function &F) const {
    auto R = AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
    if (!R)
      return std::nullopt;
    return std::make_pair(R->first, *(R->second));
  }

  std::pair<unsigned, unsigned>
  getDefaultFlatWorkGroupSize(const Function &F) const {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getDefaultFlatWorkGroupSize(F.getCallingConv());
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxNumWorkGroups(F);
  }

  /// Get code object version.
  unsigned getCodeObjectVersion() const { return CodeObjectVersion; }

  std::optional<std::pair<unsigned, unsigned>>
  getWavesPerEUAttr(const Function &F) {
    auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu",
                                               /*OnlyFirstRequired=*/true);
    if (!Val)
      return std::nullopt;
    if (!Val->second) {
      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
      Val->second = ST.getMaxWavesPerEU();
    }
    return std::make_pair(Val->first, *(Val->second));
  }

  unsigned getMaxWavesPerEU(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxWavesPerEU();
  }

  unsigned getMaxAddrSpace() const override {
    return AMDGPUAS::MAX_AMDGPU_ADDRESS;
  }

private:
  /// Check if the ConstantExpr \p CE uses an addrspacecast from private or
  /// local to flat. These casts may require the queue pointer.
  static uint8_t visitConstExpr(const ConstantExpr *CE) {
    uint8_t Status = NONE;

    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
        Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
      else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
        Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
    }

    return Status;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C,
                            SmallPtrSetImpl<const Constant *> &Visited) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      Result |= visitConstExpr(CE);

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC || !Visited.insert(OpC).second)
        continue;

      Result |= getConstantAccess(OpC, Visited);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
  }

  bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);
    return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
  const unsigned CodeObjectVersion;
};

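/// Abstract attribute used to deduce the AMDGPU implicit-argument attributes
/// of a function.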
struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  StringRef getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

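/// Abstract attribute for the "uniform-work-group-size" function attribute.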
struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  StringRef getName() const override { return "AAUniformWorkGroupSize"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = F->hasFnAttribute("uniform-work-group-size");

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    if (!getAssumed())
      return ChangeStatus::UNCHANGED;

    LLVMContext &Ctx = getAssociatedFunction()->getContext();
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, "uniform-work-group-size")},
                           /*ForceReplace=*/true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr(Attributor *) const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

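/// Function-position implementation of AAAMDAttributes.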
struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    // Flat scratch initialization is needed because `asan_malloc_impl`
    // calls introduced later in the pipeline will have flat scratch accesses.
    // FIXME: FLAT_SCRATCH_INIT will not be required here if the device-libs
    // implementation of `asan_malloc_impl` is updated.
    const bool HasSanitizerAttrs = hasSanitizerAttributes(*F);
    if (HasSanitizerAttrs) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
      removeAssumedBits(FLAT_SCRATCH_INIT);
    }

    for (auto Attr : ImplicitAttrs) {
      if (HasSanitizerAttrs &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR ||
           Attr.first == FLAT_SCRATCH_INIT))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions, these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (!AAEdges || !AAEdges->isValidState() ||
        AAEdges->hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
    unsigned COV = InfoCache.getCodeObjectVersion();

    for (Function *Callee : AAEdges->getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        if (!AAAMD || !AAAMD->isValidState())
          return indicatePessimisticFixpoint();
        *this &= *AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID, COV);

      if (AttrMask == UNKNOWN_INTRINSIC) {
        // Assume not-nocallback intrinsics may invoke a function which
        // accesses implicit arguments.
        //
        // FIXME: This isn't really the correct check. We want to ensure it
        // isn't calling any function that may use implicit arguments
        // regardless of whether it's internal to the module or not.
        //
        // TODO: Ignoring callsite attributes.
        if (!Callee->hasFnAttribute(Attribute::NoCallback))
          return indicatePessimisticFixpoint();
        continue;
      }

      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if ((IsNonEntryFunc || !NonKernelOnly))
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (COV >= 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
      removeAssumedBits(DEFAULT_QUEUE);

    if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
      removeAssumedBits(COMPLETION_ACTION);

    if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
      removeAssumedBits(FLAT_SCRATCH_INIT);

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      if (isAssumed(Attr.first))
        OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than going through all
    // instructions, try it first.

    // The queue pointer is not needed if aperture regs are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

  bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr builtin and it
    // is used to retrieve the hostcall pointer. The implicit arg for
    // hostcall is not used only if every use of the implicitarg_ptr
    // is a load that clearly does not retrieve any byte of the
    // hostcall pointer. We check this by tracing all the uses of the
    // initial call to the implicitarg_ptr intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
      if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
        return false;

      return PointerInfoAA->forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }

  // Returns true if FlatScratchInit is needed, i.e., the no-flat-scratch-init
  // attribute should not be set.
  bool needFlatScratchInit(Attributor &A) {
    assert(isAssumed(FLAT_SCRATCH_INIT)); // Only called if the bit is still set.

    // Check all AddrSpaceCast instructions. FlatScratchInit is needed if
    // there is a cast from PRIVATE_ADDRESS.
    auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
      return cast<AddrSpaceCastInst>(I).getSrcAddressSpace() !=
             AMDGPUAS::PRIVATE_ADDRESS;
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this,
                                   {Instruction::AddrSpaceCast},
                                   UsedAssumedInformation))
      return true;

    // Check for addrspacecast from PRIVATE_ADDRESS in constant expressions.
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    Function *F = getAssociatedFunction();
    for (Instruction &I : instructions(F)) {
      for (const Use &U : I.operands()) {
        if (const auto *C = dyn_cast<Constant>(U)) {
          if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
            return true;
        }
      }
    }

    // Finally check callees.

    // This is called for each call site; returning false means
    // FlatScratchInit is needed (no-flat-scratch-init must not be set).
    auto CheckForNoFlatScratchInit = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Function *Callee = CB.getCalledFunction();

      // Callee == 0 for inline asm or indirect call with known callees.
      // In the latter case, updateImpl() already checked the callees and we
      // know their FLAT_SCRATCH_INIT bit is set.
      // If the function has an indirect call with unknown callees, the bit is
      // already removed in updateImpl() and execution won't reach here.
      if (!Callee)
        return true;

      return Callee->getIntrinsicID() !=
             Intrinsic::amdgcn_addrspacecast_nonnull;
    };

    UsedAssumedInformation = false;
    // If any callee is false (i.e. needs FlatScratchInit),
    // checkForAllCallLikeInstructions returns false, in which case this
    // function returns true.
    return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Base class to derive different size ranges.
struct AAAMDSizeRangeAttribute
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;

  StringRef AttrName;

  AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
                          StringRef AttrName)
      : Base(IRP, 32), AttrName(AttrName) {}

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  template <class AttributeImpl> ChangeStatus updateImplImpl(Attributor &A) {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AttributeImpl>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Clamp the assumed range to the default value ([Min, Max]) and emit the
  /// attribute if it is not the same as the default.
  ChangeStatus
  emitAttributeIfNotDefaultAfterClamp(Attributor &A,
                                      std::pair<unsigned, unsigned> Default) {
    auto [Min, Max] = Default;
    unsigned Lower = getAssumed().getLower().getZExtValue();
    unsigned Upper = getAssumed().getUpper().getZExtValue();

    // Clamp the range to the default value.
    if (Lower < Min)
      Lower = Min;
    if (Upper > Max + 1)
      Upper = Max + 1;

    // No manifest if the value is invalid or same as default after clamp.
    if ((Lower == Min && Upper == Max + 1) || (Upper < Lower))
      return ChangeStatus::UNCHANGED;

    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << Lower << ',' << Upper - 1;
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, AttrName, OS.str())},
                           /*ForceReplace=*/true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << getName() << '[';
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }
};

/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool HasAttr = false;
    auto Range = InfoCache.getDefaultFlatWorkGroupSize(*F);
    auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(*F);

    if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) {
      // We only consider an attribute that is not the max range because the
      // front end always emits the attribute, unfortunately, and sometimes it
      // emits the max range.
      if (*Attr != MaxRange) {
        Range = *Attr;
        HasAttr = true;
      }
    }

    // We don't want to directly clamp the state if it's the max range because
    // that is basically the worst state.
    if (Range == MaxRange)
      return;

    auto [Min, Max] = Range;
    ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
    IntegerRangeState IRS(CR);
    clampStateAndIndicateChange(this->getState(), IRS);

    if (HasAttr || AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicateOptimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    return emitAttributeIfNotDefaultAfterClamp(
        A, InfoCache.getMaximumFlatWorkGroupRange(*F));
  }

  /// See AbstractAttribute::getName()
  StringRef getName() const override { return "AAAMDFlatWorkGroupSize"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

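/// Abstract state holding one decreasing integer state per dimension (X, Y,
/// Z).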
struct TupleDecIntegerRangeState : public AbstractState {
  DecIntegerState<uint32_t> X, Y, Z;

  bool isValidState() const override {
    return X.isValidState() && Y.isValidState() && Z.isValidState();
  }

  bool isAtFixpoint() const override {
    return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
  }

  ChangeStatus indicateOptimisticFixpoint() override {
    return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
           Z.indicateOptimisticFixpoint();
  }

  ChangeStatus indicatePessimisticFixpoint() override {
    return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
           Z.indicatePessimisticFixpoint();
  }

  TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
    X ^= Other.X;
    Y ^= Other.Y;
    Z ^= Other.Z;
    return *this;
  }

  bool operator==(const TupleDecIntegerRangeState &Other) const {
    return X == Other.X && Y == Other.Y && Z == Other.Z;
  }

  TupleDecIntegerRangeState &getAssumed() { return *this; }
  const TupleDecIntegerRangeState &getAssumed() const { return *this; }
};

using AAAMDMaxNumWorkgroupsState =
    StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;

/// Propagate amdgpu-max-num-workgroups attribute.
struct AAAMDMaxNumWorkgroups
    : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
  using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;

  AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F);

    X.takeKnownMinimum(MaxNumWorkgroups[0]);
    Y.takeKnownMinimum(MaxNumWorkgroups[1]);
    Z.takeKnownMinimum(MaxNumWorkgroups[2]);

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
                                                  Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<32> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();

    // TODO: Should annotate loads of the group size for this to do anything
    // useful.
    return A.manifestAttrs(
        getIRPosition(),
        {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())},
        /* ForceReplace= */ true);
  }

  StringRef getName() const override { return "AAAMDMaxNumWorkgroups"; }

  const std::string getAsStr(Attributor *) const override {
    std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
    raw_string_ostream OS(Buffer);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
       << ']';
    return OS.str();
  }

  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDMaxNumWorkgroups
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  void trackStatistics() const override {}

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDMaxNumWorkgroups::ID = 0;

AAAMDMaxNumWorkgroups &
AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
  llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
}

/// Propagate amdgpu-waves-per-eu attribute.
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
  AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    // If the attribute exists, we will honor it if it is not the default.
    if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) {
      std::pair<unsigned, unsigned> MaxWavesPerEURange{
          1U, InfoCache.getMaxWavesPerEU(*F)};
      if (*Attr != MaxWavesPerEURange) {
        auto [Min, Max] = *Attr;
        ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
        IntegerRangeState RangeState(Range);
        this->getState() = RangeState;
        indicateOptimisticFixpoint();
        return;
      }
    }

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      Function *Func = getAssociatedFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << Func->getName() << '\n');
      (void)Func;

      const auto *CallerAA = A.getAAFor<AAAMDWavesPerEU>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerAA || !CallerAA->isValidState())
        return false;

      ConstantRange Assumed = getAssumed();
      unsigned Min = std::max(Assumed.getLower().getZExtValue(),
                              CallerAA->getAssumed().getLower().getZExtValue());
      unsigned Max = std::max(Assumed.getUpper().getZExtValue(),
                              CallerAA->getAssumed().getUpper().getZExtValue());
      ConstantRange Range(APInt(32, Min), APInt(32, Max));
      IntegerRangeState RangeState(Range);
      getState() = RangeState;
      Change |= getState() == Assumed ? ChangeStatus::UNCHANGED
                                      : ChangeStatus::CHANGED;

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    return emitAttributeIfNotDefaultAfterClamp(
        A, {1U, InfoCache.getMaxWavesPerEU(*F)});
  }

  /// See AbstractAttribute::getName()
  StringRef getName() const override { return "AAAMDWavesPerEU"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDWavesPerEU
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDWavesPerEU::ID = 0;

AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
  llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}

/// Compute the minimum number of AGPRs required to allocate the inline asm.
static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA,
                                             const CallBase &Call) {
  unsigned ArgNo = 0;
  unsigned ResNo = 0;
  unsigned AGPRDefCount = 0;
  unsigned AGPRUseCount = 0;
  unsigned MaxPhysReg = 0;
  const DataLayout &DL = Call.getFunction()->getParent()->getDataLayout();

  // TODO: Overestimates due to not accounting for tied operands
  for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
    Type *Ty = nullptr;
    switch (CI.Type) {
    case InlineAsm::isOutput: {
      Ty = Call.getType();
      if (auto *STy = dyn_cast<StructType>(Ty))
        Ty = STy->getElementType(ResNo);
      ++ResNo;
      break;
    }
    case InlineAsm::isInput: {
      Ty = Call.getArgOperand(ArgNo++)->getType();
      break;
    }
    case InlineAsm::isLabel:
      continue;
    case InlineAsm::isClobber:
      // Parse the physical register reference.
      break;
    }

    for (StringRef Code : CI.Codes) {
      unsigned RegCount = 0;
      if (Code.starts_with("a")) {
        // Virtual register, compute number of registers based on the type.
        //
        // We ought to be going through TargetLowering to get the number of
        // registers, but we should avoid the dependence on CodeGen here.
        RegCount = divideCeil(DL.getTypeSizeInBits(Ty), 32);
      } else {
        // Physical register reference
        auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Code);
        if (Kind == 'a') {
          RegCount = NumRegs;
          MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u));
        }

        continue;
      }

      if (CI.Type == InlineAsm::isOutput) {
        // Apply tuple alignment requirement
        //
        // TODO: This is more conservative than necessary.
        AGPRDefCount = alignTo(AGPRDefCount, RegCount);

        AGPRDefCount += RegCount;
        if (CI.isEarlyClobber) {
          AGPRUseCount = alignTo(AGPRUseCount, RegCount);
          AGPRUseCount += RegCount;
        }
      } else {
        AGPRUseCount = alignTo(AGPRUseCount, RegCount);
        AGPRUseCount += RegCount;
      }
    }
  }

  unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount);

  // TODO: This is overly conservative. If there are any physical registers,
  // allocate any virtual registers after them so we don't have to solve
  // optimal packing.
  return std::min(MaxVirtReg + MaxPhysReg, 256u);
}

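/// Deduce the minimum number of AGPRs a function may need to have allocatable,
/// manifested as the "amdgpu-agpr-alloc" attribute.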
struct AAAMDGPUMinAGPRAlloc
    : public StateWrapper<DecIntegerState<>, AbstractAttribute> {
  using Base = StateWrapper<DecIntegerState<>, AbstractAttribute>;
  AAAMDGPUMinAGPRAlloc(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  static AAAMDGPUMinAGPRAlloc &createForPosition(const IRPosition &IRP,
                                                 Attributor &A) {
    if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
      return *new (A.Allocator) AAAMDGPUMinAGPRAlloc(IRP, A);
    llvm_unreachable(
        "AAAMDGPUMinAGPRAlloc is only valid for function position");
  }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto [MinNumAGPR, MaxNumAGPR] =
        AMDGPU::getIntegerPairAttribute(*F, "amdgpu-agpr-alloc", {~0u, ~0u},
                                        /*OnlyFirstRequired=*/true);
    if (MinNumAGPR == 0)
      indicateOptimisticFixpoint();
  }

  const std::string getAsStr(Attributor *A) const override {
    std::string Str = "amdgpu-agpr-alloc=";
    raw_string_ostream OS(Str);
    OS << getAssumed();
    return OS.str();
  }

  void trackStatistics() const override {}

  ChangeStatus updateImpl(Attributor &A) override {
    DecIntegerState<> Maximum;

    // Check for cases which require allocation of AGPRs. The only cases where
    // AGPRs are required are direct references to AGPRs, i.e. inline assembly
    // and special intrinsics.
    auto CheckForMinAGPRAllocs = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Value *CalleeOp = CB.getCalledOperand();

      if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp)) {
        // Technically, the inline asm could be invoking a call to an unknown
        // external function that requires AGPRs, but ignore that.
        unsigned NumRegs = inlineAsmGetNumRequiredAGPRs(IA, CB);
        Maximum.takeAssumedMaximum(NumRegs);
        return true;
      }
      switch (CB.getIntrinsicID()) {
      case Intrinsic::not_intrinsic:
        break;
      case Intrinsic::write_register:
      case Intrinsic::read_register:
      case Intrinsic::read_volatile_register: {
        const MDString *RegName = cast<MDString>(
            cast<MDNode>(
                cast<MetadataAsValue>(CB.getArgOperand(0))->getMetadata())
                ->getOperand(0));
        auto [Kind, RegIdx, NumRegs] =
            AMDGPU::parseAsmPhysRegName(RegName->getString());
        if (Kind == 'a')
          Maximum.takeAssumedMaximum(std::min(RegIdx + NumRegs, 256u));

        return true;
      }
      // Trap-like intrinsics such as llvm.trap and llvm.debugtrap do not have
      // the nocallback attribute, so the AMDGPU attributor can conservatively
      // drop all implicitly-known inputs and AGPR allocation information. Make
      // sure we still infer that no implicit inputs are required and that the
      // AGPR allocation stays at zero. Trap-like intrinsics may invoke a
      // function which requires AGPRs, so we need to check if the called
      // function has the "trap-func-name" attribute.
      case Intrinsic::trap:
      case Intrinsic::debugtrap:
      case Intrinsic::ubsantrap:
        return CB.hasFnAttr(Attribute::NoCallback) ||
               !CB.hasFnAttr("trap-func-name");
      default:
        // Some intrinsics may use AGPRs, but if we have a choice, we are not
        // required to use AGPRs.
        // Assume !nocallback intrinsics may call a function which requires
        // AGPRs.
        return CB.hasFnAttr(Attribute::NoCallback);
      }

      // TODO: Handle callsite attributes
      auto *CBEdges = A.getAAFor<AACallEdges>(
          *this, IRPosition::callsite_function(CB), DepClassTy::REQUIRED);
      if (!CBEdges || CBEdges->hasUnknownCallee()) {
        Maximum.indicatePessimisticFixpoint();
        return false;
      }

      for (const Function *PossibleCallee : CBEdges->getOptimisticEdges()) {
        const auto *CalleeInfo = A.getAAFor<AAAMDGPUMinAGPRAlloc>(
            *this, IRPosition::function(*PossibleCallee), DepClassTy::REQUIRED);
        if (!CalleeInfo || !CalleeInfo->isValidState()) {
          Maximum.indicatePessimisticFixpoint();
          return false;
        }

        Maximum.takeAssumedMaximum(CalleeInfo->getAssumed());
      }

      return true;
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllCallLikeInstructions(CheckForMinAGPRAllocs, *this,
                                           UsedAssumedInformation))
      return indicatePessimisticFixpoint();

    return clampStateAndIndicateChange(getState(), Maximum);
  }

  ChangeStatus manifest(Attributor &A) override {
    LLVMContext &Ctx = getAssociatedFunction()->getContext();
    SmallString<4> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << getAssumed();

    return A.manifestAttrs(
        getIRPosition(), {Attribute::get(Ctx, "amdgpu-agpr-alloc", OS.str())});
  }

  StringRef getName() const override { return "AAAMDGPUMinAGPRAlloc"; }
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUMinAGPRAlloc
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  static const char ID;
};

const char AAAMDGPUMinAGPRAlloc::ID = 0;

/// An abstract attribute to propagate the function attribute
/// "amdgpu-cluster-dims" from kernel entry functions to device functions.
struct AAAMDGPUClusterDims
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAAMDGPUClusterDims(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDGPUClusterDims &createForPosition(const IRPosition &IRP,
                                                Attributor &A);

  /// See AbstractAttribute::getName().
  StringRef getName() const override { return "AAAMDGPUClusterDims"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUClusterDims.
  static bool classof(const AbstractAttribute *AA) {
    return AA->getIdAddr() == &ID;
  }

  virtual const AMDGPU::ClusterDimsAttr &getClusterDims() const = 0;

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDGPUClusterDims::ID = 0;

struct AAAMDGPUClusterDimsFunction : public AAAMDGPUClusterDims {
  AAAMDGPUClusterDimsFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDGPUClusterDims(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    assert(F && "empty associated function");

    Attr = AMDGPU::ClusterDimsAttr::get(*F);

    // Whatever a kernel entry function has is final.
    if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
      if (Attr.isUnknown())
        indicatePessimisticFixpoint();
      else
        indicateOptimisticFixpoint();
    }
  }

  const std::string getAsStr(Attributor *A) const override {
    if (!getAssumed() || Attr.isUnknown())
      return "unknown";
    if (Attr.isNoCluster())
      return "no";
    if (Attr.isVariableDims())
      return "variable";
    return Attr.to_string();
  }

  void trackStatistics() const override {}

  ChangeStatus updateImpl(Attributor &A) override {
    auto OldState = Attr;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      const auto *CallerAA = A.getAAFor<AAAMDGPUClusterDims>(
          *this, IRPosition::function(*CS.getInstruction()->getFunction()),
          DepClassTy::REQUIRED);
      if (!CallerAA || !CallerAA->isValidState())
        return false;

      return merge(CallerAA->getClusterDims());
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                UsedAssumedInformation))
      return indicatePessimisticFixpoint();

    return OldState == Attr ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    if (Attr.isUnknown())
      return ChangeStatus::UNCHANGED;
    return A.manifestAttrs(
        getIRPosition(),
        {Attribute::get(getAssociatedFunction()->getContext(), AttrName,
                        Attr.to_string())},
        /*ForceReplace=*/true);
  }

  const AMDGPU::ClusterDimsAttr &getClusterDims() const override {
    return Attr;
  }

private:
  bool merge(const AMDGPU::ClusterDimsAttr &Other) {
    // Case 1: Both of us are still unknown; do nothing and continue to wait
    // for propagation.
    if (Attr.isUnknown() && Other.isUnknown())
      return true;

    // Case 2: The other is determined, but we are still unknown; simply take
    // the other's value.
    if (Attr.isUnknown()) {
      Attr = Other;
      return true;
    }

    // Case 3: We are determined but the other is still unknown; simply keep
    // everything unchanged.
    if (Other.isUnknown())
      return true;

    // After this point, both are determined.

    // Case 4: If they are the same, we do nothing.
    if (Attr == Other)
      return true;

    // Now they are not the same.

    // Case 5: Exactly one of us uses clusters (if neither did, case 4 would
    // have held), so it is unknown whether clusters will be used, and the
    // state is final, unlike case 1.
    if (Attr.isNoCluster() || Other.isNoCluster()) {
      Attr.setUnknown();
      return false;
    }

    // Case 6: Both of us use clusters, but the dims differ, so the result is
    // that clusters are used, but without fixed dims.
    Attr.setVariableDims();
    return true;
  }

  AMDGPU::ClusterDimsAttr Attr;

  static constexpr char AttrName[] = "amdgpu-cluster-dims";
};

AAAMDGPUClusterDims &
AAAMDGPUClusterDims::createForPosition(const IRPosition &IRP, Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDGPUClusterDimsFunction(IRP, A);
  llvm_unreachable("AAAMDGPUClusterDims is only valid for function position");
}

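/// Shared driver for the module and CGSCC passes: configures the Attributor
/// with the allowed AMDGPU abstract attributes and seeds them for every
/// function in \p Functions.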
static bool runImpl(SetVector<Function *> &Functions, bool IsModulePass,
                    bool DeleteFns, Module &M, AnalysisGetter &AG,
                    TargetMachine &TM, AMDGPUAttributorOptions Options,
                    ThinOrFullLTOPhase LTOPhase) {

  CallGraphUpdater CGUpdater;
  BumpPtrAllocator Allocator;
  AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
  DenseSet<const char *> Allowed(
      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
       &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
       &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID,
       &AAAMDGPUMinAGPRAlloc::ID, &AACallEdges::ID, &AAPointerInfo::ID,
       &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID,
       &AANoAliasAddrSpace::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
       &AAAMDGPUClusterDims::ID, &AAAlign::ID});

  AttributorConfig AC(CGUpdater);
  AC.IsClosedWorldModule = Options.IsClosedWorld;
  AC.Allowed = &Allowed;
  AC.IsModulePass = IsModulePass;
  AC.DeleteFns = DeleteFns;
  AC.DefaultInitializeLiveInternals = false;
  AC.IndirectCalleeSpecializationCallback =
      [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
         Function &Callee, unsigned NumAssumedCallees) {
        return !AMDGPU::isEntryFunctionCC(Callee.getCallingConv()) &&
               (NumAssumedCallees <= IndirectCallSpecializationThreshold);
      };
  AC.IPOAmendableCB = [](const Function &F) {
    return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
  };

  Attributor A(Functions, InfoCache, AC);

  LLVM_DEBUG({
    StringRef LTOPhaseStr = to_string(LTOPhase);
    dbgs() << "[AMDGPUAttributor] Running at phase " << LTOPhaseStr << '\n'
           << "[AMDGPUAttributor] Module " << M.getName() << " is "
           << (AC.IsClosedWorldModule ? "" : "not ")
           << "assumed to be a closed world.\n";
  });

  for (auto *F : Functions) {
    A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
    CallingConv::ID CC = F->getCallingConv();
    if (!AMDGPU::isEntryFunctionCC(CC)) {
      A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
      A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
    }

    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
    if (!F->isDeclaration() && ST.hasClusters())
      A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRPosition::function(*F));

    if (ST.hasGFX90AInsts())
      A.getOrCreateAAFor<AAAMDGPUMinAGPRAlloc>(IRPosition::function(*F));

    for (auto &I : instructions(F)) {
      Value *Ptr = nullptr;
      if (auto *LI = dyn_cast<LoadInst>(&I))
        Ptr = LI->getPointerOperand();
      else if (auto *SI = dyn_cast<StoreInst>(&I))
        Ptr = SI->getPointerOperand();
      else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
        Ptr = RMW->getPointerOperand();
      else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
        Ptr = CmpX->getPointerOperand();

      if (Ptr) {
        A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr));
        A.getOrCreateAAFor<AANoAliasAddrSpace>(IRPosition::value(*Ptr));
        if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Ptr)) {
          if (II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc)
            A.getOrCreateAAFor<AAAlign>(IRPosition::value(*Ptr));
        }
      }
    }
  }

  return A.run() == ChangeStatus::CHANGED;
}
} // namespace

PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
                                                  ModuleAnalysisManager &AM) {

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  AnalysisGetter AG(FAM);

  SetVector<Function *> Functions;
  for (Function &F : M) {
    if (!F.isDeclaration())
      Functions.insert(&F);
  }

  // TODO: Probably preserves CFG
  return runImpl(Functions, /*IsModulePass=*/true, /*DeleteFns=*/true, M, AG,
                 TM, Options, LTOPhase)
             ? PreservedAnalyses::none()
             : PreservedAnalyses::all();
}

PreservedAnalyses llvm::AMDGPUAttributorCGSCCPass::run(LazyCallGraph::SCC &C,
                                                       CGSCCAnalysisManager &AM,
                                                       LazyCallGraph &CG,
                                                       CGSCCUpdateResult &UR) {

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
  AnalysisGetter AG(FAM);

  SetVector<Function *> Functions;
  for (LazyCallGraph::Node &N : C) {
    Function *F = &N.getFunction();
    if (!F->isIntrinsic())
      Functions.insert(F);
  }

  AMDGPUAttributorOptions Options;
  Module *M = C.begin()->getFunction().getParent();
  // In the CGSCC pipeline, avoid untracked call graph modifications by
  // disabling function deletion, mirroring the generic AttributorCGSCCPass.
  return runImpl(Functions, /*IsModulePass=*/false, /*DeleteFns=*/false, *M, AG,
                 TM, Options, ThinOrFullLTOPhase::None)
             ? PreservedAnalyses::none()
             : PreservedAnalyses::all();
}