//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

using namespace llvm;

static cl::opt<unsigned> IndirectCallSpecializationThreshold(
    "amdgpu-indirect-call-specialization-threshold",
    cl::desc(
        "A threshold controls whether an indirect call will be specialized"),
    cl::init(3));

#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
#include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  UNKNOWN_INTRINSIC = 0,
#include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1,
  NOT_IMPLICIT_INPUT
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
    ImplicitAttrs[] = {
#include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID,
                    unsigned CodeObjectVersion) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_cluster_id_x:
    NonKernelOnly = true;
    return CLUSTER_ID_X;
  case Intrinsic::amdgcn_cluster_id_y:
    return CLUSTER_ID_Y;
  case Intrinsic::amdgcn_cluster_id_z:
    return CLUSTER_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to
  // access queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. For pre-V5, however, we need to access them through
    // queue_ptr + offsets.
    return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
                                                    : QUEUE_PTR;
  case Intrinsic::amdgcn_wwm:
  case Intrinsic::amdgcn_strict_wwm:
    return WHOLE_WAVE_MODE;
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
  case Intrinsic::ubsantrap:
    if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
      return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
                                                      : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  default:
    return UNKNOWN_INTRINSIC;
  }
}

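/// Returns true if an addrspacecast from address space \p SrcAS to flat may
/// require the queue pointer, i.e. the source is private or local.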
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

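/// Returns true if \p C is a global value in the LDS (local) or region
/// address space.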
static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if sanitizer attributes are present on a function.
static bool hasSanitizerAttributes(const Function &F) {
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
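/// Information cache for this pass, extended with AMDGPU-specific queries
/// (subtarget limits, code object version, constant address-space analysis)
/// shared by the abstract attributes below.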
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
        CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}

  TargetMachine &TM;

  enum ConstantStatus : uint8_t {
    NONE = 0,
    DS_GLOBAL = 1 << 0,
    ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
    ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
    ADDR_SPACE_CAST_BOTH_TO_FLAT =
        ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
  };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::optional<std::pair<unsigned, unsigned>>
  getFlatWorkGroupSizeAttr(const Function &F) const {
    auto R = AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
    if (!R)
      return std::nullopt;
    return std::make_pair(R->first, *(R->second));
  }

  std::pair<unsigned, unsigned>
  getDefaultFlatWorkGroupSize(const Function &F) const {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getDefaultFlatWorkGroupSize(F.getCallingConv());
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxNumWorkGroups(F);
  }

  /// Get code object version.
  unsigned getCodeObjectVersion() const { return CodeObjectVersion; }

  std::optional<std::pair<unsigned, unsigned>>
  getWavesPerEUAttr(const Function &F) {
    auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu",
                                               /*OnlyFirstRequired=*/true);
    if (!Val)
      return std::nullopt;
    if (!Val->second) {
      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
      Val->second = ST.getMaxWavesPerEU();
    }
    return std::make_pair(Val->first, *(Val->second));
  }

  unsigned getMaxWavesPerEU(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxWavesPerEU();
  }

  unsigned getMaxAddrSpace() const override {
    return AMDGPUAS::MAX_AMDGPU_ADDRESS;
  }

private:
  /// Check if the ConstantExpr \p CE uses an addrspacecast from private or
  /// local to flat. These casts may require the queue pointer.
  static uint8_t visitConstExpr(const ConstantExpr *CE) {
    uint8_t Status = NONE;

    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
        Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
      else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
        Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
    }

    return Status;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C,
                            SmallPtrSetImpl<const Constant *> &Visited) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      Result |= visitConstExpr(CE);

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC || !Visited.insert(OpC).second)
        continue;

      Result |= getConstantAccess(OpC, Visited);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
  }

  bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);
    return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
  const unsigned CodeObjectVersion;
};

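/// Abstract attribute used to deduce the AMDGPU implicit-argument attributes
/// of a function.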
struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  StringRef getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

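/// Abstract attribute for the "uniform-work-group-size" function attribute.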
struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  StringRef getName() const override { return "AAUniformWorkGroupSize"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = F->hasFnAttribute("uniform-work-group-size");

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    if (!getAssumed())
      return ChangeStatus::UNCHANGED;

    LLVMContext &Ctx = getAssociatedFunction()->getContext();
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, "uniform-work-group-size")},
                           /*ForceReplace=*/true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr(Attributor *) const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

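/// Function-position implementation of AAAMDAttributes.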
struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    // Flat scratch initialization is needed because `asan_malloc_impl`
    // calls introduced later in the pipeline will have flat scratch accesses.
    // FIXME: FLAT_SCRATCH_INIT will not be required here if the device-libs
    // implementation of `asan_malloc_impl` is updated.
    const bool HasSanitizerAttrs = hasSanitizerAttributes(*F);
    if (HasSanitizerAttrs) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
      removeAssumedBits(FLAT_SCRATCH_INIT);
    }

    for (auto Attr : ImplicitAttrs) {
      if (HasSanitizerAttrs &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR ||
           Attr.first == FLAT_SCRATCH_INIT))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions, these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (!AAEdges || !AAEdges->isValidState() ||
        AAEdges->hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
    unsigned COV = InfoCache.getCodeObjectVersion();

    for (Function *Callee : AAEdges->getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        if (!AAAMD || !AAAMD->isValidState())
          return indicatePessimisticFixpoint();
        *this &= *AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID, COV);

      if (AttrMask == UNKNOWN_INTRINSIC) {
        // Assume not-nocallback intrinsics may invoke a function which
        // accesses implicit arguments.
        //
        // FIXME: This isn't really the correct check. We want to ensure it
        // isn't calling any function that may use implicit arguments
        // regardless of whether it's internal to the module or not.
        //
        // TODO: Ignoring callsite attributes.
        if (!Callee->hasFnAttribute(Attribute::NoCallback))
          return indicatePessimisticFixpoint();
        continue;
      }

      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if ((IsNonEntryFunc || !NonKernelOnly))
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (COV >= 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
      removeAssumedBits(DEFAULT_QUEUE);

    if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
      removeAssumedBits(COMPLETION_ACTION);

    if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
      removeAssumedBits(FLAT_SCRATCH_INIT);

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      if (isAssumed(Attr.first))
        OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than going through all
    // instructions, try it first.

    // The queue pointer is not needed if aperture regs are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

  bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr builtin and it
    // is used to retrieve the hostcall pointer. The implicit arg for
    // hostcall is not used only if every use of the implicitarg_ptr
    // is a load that clearly does not retrieve any byte of the
    // hostcall pointer. We check this by tracing all the uses of the
    // initial call to the implicitarg_ptr intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
      if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
        return false;

      return PointerInfoAA->forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }

  // Returns true if FlatScratchInit is needed, i.e., the no-flat-scratch-init
  // attribute should not be set.
  bool needFlatScratchInit(Attributor &A) {
    assert(isAssumed(FLAT_SCRATCH_INIT)); // Only called if the bit is still set.

    // Check all AddrSpaceCast instructions. FlatScratchInit is needed if
    // there is a cast from PRIVATE_ADDRESS.
    auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
      return cast<AddrSpaceCastInst>(I).getSrcAddressSpace() !=
             AMDGPUAS::PRIVATE_ADDRESS;
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this,
                                   {Instruction::AddrSpaceCast},
                                   UsedAssumedInformation))
      return true;

    // Check for addrspacecast from PRIVATE_ADDRESS in constant expressions.
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    Function *F = getAssociatedFunction();
    for (Instruction &I : instructions(F)) {
      for (const Use &U : I.operands()) {
        if (const auto *C = dyn_cast<Constant>(U)) {
          if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
            return true;
        }
      }
    }

    // Finally check callees.

    // This is called for each call site; returning false means
    // FlatScratchInit is needed (no-flat-scratch-init must not be set).
    auto CheckForNoFlatScratchInit = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Function *Callee = CB.getCalledFunction();

      // Callee == 0 for inline asm or indirect call with known callees.
      // In the latter case, updateImpl() already checked the callees and we
      // know their FLAT_SCRATCH_INIT bit is set.
      // If the function has an indirect call with unknown callees, the bit is
      // already removed in updateImpl() and execution won't reach here.
      if (!Callee)
        return true;

      return Callee->getIntrinsicID() !=
             Intrinsic::amdgcn_addrspacecast_nonnull;
    };

    UsedAssumedInformation = false;
    // If any callee is false (i.e. needs FlatScratchInit),
    // checkForAllCallLikeInstructions returns false, in which case this
    // function returns true.
    return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Base class to derive different size ranges.
struct AAAMDSizeRangeAttribute
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;

  StringRef AttrName;

  AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
                          StringRef AttrName)
      : Base(IRP, 32), AttrName(AttrName) {}

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  template <class AttributeImpl> ChangeStatus updateImplImpl(Attributor &A) {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AttributeImpl>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Clamp the assumed range to the default value ([Min, Max]) and emit the
  /// attribute if it is not the same as the default.
  ChangeStatus
  emitAttributeIfNotDefaultAfterClamp(Attributor &A,
                                      std::pair<unsigned, unsigned> Default) {
    auto [Min, Max] = Default;
    unsigned Lower = getAssumed().getLower().getZExtValue();
    unsigned Upper = getAssumed().getUpper().getZExtValue();

    // Clamp the range to the default value.
    if (Lower < Min)
      Lower = Min;
    if (Upper > Max + 1)
      Upper = Max + 1;

    // No manifest if the value is invalid or same as default after clamp.
    if ((Lower == Min && Upper == Max + 1) || (Upper < Lower))
      return ChangeStatus::UNCHANGED;

    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << Lower << ',' << Upper - 1;
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, AttrName, OS.str())},
                           /*ForceReplace=*/true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << getName() << '[';
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }
};

/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool HasAttr = false;
    auto Range = InfoCache.getDefaultFlatWorkGroupSize(*F);
    auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(*F);

    if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) {
      // We only consider an attribute that is not the max range because the
      // front end always emits the attribute, unfortunately, and sometimes it
      // emits the max range.
      if (*Attr != MaxRange) {
        Range = *Attr;
        HasAttr = true;
      }
    }

    // We don't want to directly clamp the state if it's the max range because
    // that is basically the worst state.
    if (Range == MaxRange)
      return;

    auto [Min, Max] = Range;
    ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
    IntegerRangeState IRS(CR);
    clampStateAndIndicateChange(this->getState(), IRS);

    if (HasAttr || AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicateOptimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    return emitAttributeIfNotDefaultAfterClamp(
        A, InfoCache.getMaximumFlatWorkGroupRange(*F));
  }

  /// See AbstractAttribute::getName()
  StringRef getName() const override { return "AAAMDFlatWorkGroupSize"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

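/// Abstract state holding one decreasing integer state per dimension (X, Y,
/// Z).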
struct TupleDecIntegerRangeState : public AbstractState {
  DecIntegerState<uint32_t> X, Y, Z;

  bool isValidState() const override {
    return X.isValidState() && Y.isValidState() && Z.isValidState();
  }

  bool isAtFixpoint() const override {
    return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
  }

  ChangeStatus indicateOptimisticFixpoint() override {
    return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
           Z.indicateOptimisticFixpoint();
  }

  ChangeStatus indicatePessimisticFixpoint() override {
    return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
           Z.indicatePessimisticFixpoint();
  }

  TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
    X ^= Other.X;
    Y ^= Other.Y;
    Z ^= Other.Z;
    return *this;
  }

  bool operator==(const TupleDecIntegerRangeState &Other) const {
    return X == Other.X && Y == Other.Y && Z == Other.Z;
  }

  TupleDecIntegerRangeState &getAssumed() { return *this; }
  const TupleDecIntegerRangeState &getAssumed() const { return *this; }
};

using AAAMDMaxNumWorkgroupsState =
    StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;

/// Propagate amdgpu-max-num-workgroups attribute.
struct AAAMDMaxNumWorkgroups
    : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
  using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;

  AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F);

    X.takeKnownMinimum(MaxNumWorkgroups[0]);
    Y.takeKnownMinimum(MaxNumWorkgroups[1]);
    Z.takeKnownMinimum(MaxNumWorkgroups[2]);

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
                                                  Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<32> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();

    // TODO: Should annotate loads of the group size for this to do anything
    // useful.
    return A.manifestAttrs(
        getIRPosition(),
        {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())},
        /* ForceReplace= */ true);
  }

  StringRef getName() const override { return "AAAMDMaxNumWorkgroups"; }

  const std::string getAsStr(Attributor *) const override {
    std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
    raw_string_ostream OS(Buffer);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
       << ']';
    return OS.str();
  }

  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDMaxNumWorkgroups
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  void trackStatistics() const override {}

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDMaxNumWorkgroups::ID = 0;

AAAMDMaxNumWorkgroups &
AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
  llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
}

/// Propagate amdgpu-waves-per-eu attribute.
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
  AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    // If the attribute exists, we will honor it if it is not the default.
    if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) {
      std::pair<unsigned, unsigned> MaxWavesPerEURange{
          1U, InfoCache.getMaxWavesPerEU(*F)};
      if (*Attr != MaxWavesPerEURange) {
        auto [Min, Max] = *Attr;
        ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
        IntegerRangeState RangeState(Range);
        this->getState() = RangeState;
        indicateOptimisticFixpoint();
        return;
      }
    }

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      Function *Func = getAssociatedFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << Func->getName() << '\n');
      (void)Func;

      const auto *CallerAA = A.getAAFor<AAAMDWavesPerEU>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerAA || !CallerAA->isValidState())
        return false;

      ConstantRange Assumed = getAssumed();
      unsigned Min = std::max(Assumed.getLower().getZExtValue(),
                              CallerAA->getAssumed().getLower().getZExtValue());
      unsigned Max = std::max(Assumed.getUpper().getZExtValue(),
                              CallerAA->getAssumed().getUpper().getZExtValue());
      ConstantRange Range(APInt(32, Min), APInt(32, Max));
      IntegerRangeState RangeState(Range);
      getState() = RangeState;
      Change |= getState() == Assumed ? ChangeStatus::UNCHANGED
                                      : ChangeStatus::CHANGED;

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    return emitAttributeIfNotDefaultAfterClamp(
        A, {1U, InfoCache.getMaxWavesPerEU(*F)});
  }

  /// See AbstractAttribute::getName()
  StringRef getName() const override { return "AAAMDWavesPerEU"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDWavesPerEU
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDWavesPerEU::ID = 0;

AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
  llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}

/// Compute the minimum number of AGPRs required to allocate the inline asm.
static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA,
                                             const CallBase &Call) {
  unsigned ArgNo = 0;
  unsigned ResNo = 0;
  unsigned AGPRDefCount = 0;
  unsigned AGPRUseCount = 0;
  unsigned MaxPhysReg = 0;
  const DataLayout &DL = Call.getFunction()->getParent()->getDataLayout();

  // TODO: Overestimates due to not accounting for tied operands
  for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
    Type *Ty = nullptr;
    switch (CI.Type) {
    case InlineAsm::isOutput: {
      Ty = Call.getType();
      if (auto *STy = dyn_cast<StructType>(Ty))
        Ty = STy->getElementType(ResNo);
      ++ResNo;
      break;
    }
    case InlineAsm::isInput: {
      Ty = Call.getArgOperand(ArgNo++)->getType();
      break;
    }
    case InlineAsm::isLabel:
      continue;
    case InlineAsm::isClobber:
      // Parse the physical register reference.
      break;
    }

    for (StringRef Code : CI.Codes) {
      unsigned RegCount = 0;
      if (Code.starts_with("a")) {
        // Virtual register, compute number of registers based on the type.
        //
        // We ought to be going through TargetLowering to get the number of
        // registers, but we should avoid the dependence on CodeGen here.
        RegCount = divideCeil(DL.getTypeSizeInBits(Ty), 32);
      } else {
        // Physical register reference
        auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Code);
        if (Kind == 'a') {
          RegCount = NumRegs;
          MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u));
        }

        continue;
      }

      if (CI.Type == InlineAsm::isOutput) {
        // Apply tuple alignment requirement
        //
        // TODO: This is more conservative than necessary.
        AGPRDefCount = alignTo(AGPRDefCount, RegCount);

        AGPRDefCount += RegCount;
        if (CI.isEarlyClobber) {
          AGPRUseCount = alignTo(AGPRUseCount, RegCount);
          AGPRUseCount += RegCount;
        }
      } else {
        AGPRUseCount = alignTo(AGPRUseCount, RegCount);
        AGPRUseCount += RegCount;
      }
    }
  }

  unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount);

  // TODO: This is overly conservative. If there are any physical registers,
  // allocate any virtual registers after them so we don't have to solve
  // optimal packing.
  return std::min(MaxVirtReg + MaxPhysReg, 256u);
}

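/// Deduce the minimum number of AGPRs a function may need to have allocatable,
/// manifested as the "amdgpu-agpr-alloc" attribute.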
struct AAAMDGPUMinAGPRAlloc
    : public StateWrapper<DecIntegerState<>, AbstractAttribute> {
  using Base = StateWrapper<DecIntegerState<>, AbstractAttribute>;
  AAAMDGPUMinAGPRAlloc(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  static AAAMDGPUMinAGPRAlloc &createForPosition(const IRPosition &IRP,
                                                 Attributor &A) {
    if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
      return *new (A.Allocator) AAAMDGPUMinAGPRAlloc(IRP, A);
    llvm_unreachable(
        "AAAMDGPUMinAGPRAlloc is only valid for function position");
  }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto [MinNumAGPR, MaxNumAGPR] =
        AMDGPU::getIntegerPairAttribute(*F, "amdgpu-agpr-alloc", {~0u, ~0u},
                                        /*OnlyFirstRequired=*/true);
    if (MinNumAGPR == 0)
      indicateOptimisticFixpoint();
  }

  const std::string getAsStr(Attributor *A) const override {
    std::string Str = "amdgpu-agpr-alloc=";
    raw_string_ostream OS(Str);
    OS << getAssumed();
    return OS.str();
  }

  void trackStatistics() const override {}

  ChangeStatus updateImpl(Attributor &A) override {
    DecIntegerState<> Maximum;

    // Check for cases which require allocation of AGPRs. The only cases where
    // AGPRs are required are direct references to AGPRs, i.e. inline assembly
    // and special intrinsics.
    auto CheckForMinAGPRAllocs = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Value *CalleeOp = CB.getCalledOperand();

      if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp)) {
        // Technically, the inline asm could be invoking a call to an unknown
        // external function that requires AGPRs, but ignore that.
        unsigned NumRegs = inlineAsmGetNumRequiredAGPRs(IA, CB);
        Maximum.takeAssumedMaximum(NumRegs);
        return true;
      }
      switch (CB.getIntrinsicID()) {
      case Intrinsic::not_intrinsic:
        break;
      case Intrinsic::write_register:
      case Intrinsic::read_register:
      case Intrinsic::read_volatile_register: {
        const MDString *RegName = cast<MDString>(
            cast<MDNode>(
                cast<MetadataAsValue>(CB.getArgOperand(0))->getMetadata())
                ->getOperand(0));
        auto [Kind, RegIdx, NumRegs] =
            AMDGPU::parseAsmPhysRegName(RegName->getString());
        if (Kind == 'a')
          Maximum.takeAssumedMaximum(std::min(RegIdx + NumRegs, 256u));

        return true;
      }
      // Trap-like intrinsics such as llvm.trap and llvm.debugtrap do not have
      // the nocallback attribute, so the AMDGPU attributor can conservatively
      // drop all implicitly-known inputs and AGPR allocation information. Make
      // sure we still infer that no implicit inputs are required and that the
      // AGPR allocation stays at zero. Trap-like intrinsics may invoke a
      // function which requires AGPRs, so we need to check if the called
      // function has the "trap-func-name" attribute.
      case Intrinsic::trap:
      case Intrinsic::debugtrap:
      case Intrinsic::ubsantrap:
        return CB.hasFnAttr(Attribute::NoCallback) ||
               !CB.hasFnAttr("trap-func-name");
      default:
        // Some intrinsics may use AGPRs, but if we have a choice, we are not
        // required to use AGPRs.
        // Assume !nocallback intrinsics may call a function which requires
        // AGPRs.
        return CB.hasFnAttr(Attribute::NoCallback);
      }

      // TODO: Handle callsite attributes
      auto *CBEdges = A.getAAFor<AACallEdges>(
          *this, IRPosition::callsite_function(CB), DepClassTy::REQUIRED);
      if (!CBEdges || CBEdges->hasUnknownCallee()) {
        Maximum.indicatePessimisticFixpoint();
        return false;
      }

      for (const Function *PossibleCallee : CBEdges->getOptimisticEdges()) {
        const auto *CalleeInfo = A.getAAFor<AAAMDGPUMinAGPRAlloc>(
            *this, IRPosition::function(*PossibleCallee), DepClassTy::REQUIRED);
        if (!CalleeInfo || !CalleeInfo->isValidState()) {
          Maximum.indicatePessimisticFixpoint();
          return false;
        }

        Maximum.takeAssumedMaximum(CalleeInfo->getAssumed());
      }

      return true;
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllCallLikeInstructions(CheckForMinAGPRAllocs, *this,
                                           UsedAssumedInformation))
      return indicatePessimisticFixpoint();

    return clampStateAndIndicateChange(getState(), Maximum);
  }

  ChangeStatus manifest(Attributor &A) override {
    LLVMContext &Ctx = getAssociatedFunction()->getContext();
    SmallString<4> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << getAssumed();

    return A.manifestAttrs(
        getIRPosition(), {Attribute::get(Ctx, "amdgpu-agpr-alloc", OS.str())});
  }

  StringRef getName() const override { return "AAAMDGPUMinAGPRAlloc"; }
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUMinAGPRAlloc
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  static const char ID;
};

const char AAAMDGPUMinAGPRAlloc::ID = 0;

/// An abstract attribute to propagate the function attribute
/// "amdgpu-cluster-dims" from kernel entry functions to device functions.
struct AAAMDGPUClusterDims
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAAMDGPUClusterDims(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDGPUClusterDims &createForPosition(const IRPosition &IRP,
                                                Attributor &A);

  /// See AbstractAttribute::getName().
  StringRef getName() const override { return "AAAMDGPUClusterDims"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUClusterDims.
  static bool classof(const AbstractAttribute *AA) {
    return AA->getIdAddr() == &ID;
  }

  virtual const AMDGPU::ClusterDimsAttr &getClusterDims() const = 0;

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDGPUClusterDims::ID = 0;

struct AAAMDGPUClusterDimsFunction : public AAAMDGPUClusterDims {
  AAAMDGPUClusterDimsFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDGPUClusterDims(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    assert(F && "empty associated function");

    Attr = AMDGPU::ClusterDimsAttr::get(*F);

    // Whatever a kernel entry function has is final.
    if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
      if (Attr.isUnknown())
        indicatePessimisticFixpoint();
      else
        indicateOptimisticFixpoint();
    }
  }

  const std::string getAsStr(Attributor *A) const override {
    if (!getAssumed() || Attr.isUnknown())
      return "unknown";
    if (Attr.isNoCluster())
      return "no";
    if (Attr.isVariableDims())
      return "variable";
    return Attr.to_string();
  }

  void trackStatistics() const override {}

  ChangeStatus updateImpl(Attributor &A) override {
    auto OldState = Attr;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      const auto *CallerAA = A.getAAFor<AAAMDGPUClusterDims>(
          *this, IRPosition::function(*CS.getInstruction()->getFunction()),
          DepClassTy::REQUIRED);
      if (!CallerAA || !CallerAA->isValidState())
        return false;

      return merge(CallerAA->getClusterDims());
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                UsedAssumedInformation))
      return indicatePessimisticFixpoint();

    return OldState == Attr ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    if (Attr.isUnknown())
      return ChangeStatus::UNCHANGED;
    return A.manifestAttrs(
        getIRPosition(),
        {Attribute::get(getAssociatedFunction()->getContext(), AttrName,
                        Attr.to_string())},
        /*ForceReplace=*/true);
  }

  const AMDGPU::ClusterDimsAttr &getClusterDims() const override {
    return Attr;
  }

private:
  bool merge(const AMDGPU::ClusterDimsAttr &Other) {
    // Case 1: Both of us are still unknown; do nothing and continue to wait
    // for propagation.
    if (Attr.isUnknown() && Other.isUnknown())
      return true;

    // Case 2: The other is determined, but we are still unknown; simply take
    // the other's value.
    if (Attr.isUnknown()) {
      Attr = Other;
      return true;
    }

    // Case 3: We are determined but the other is still unknown; simply keep
    // everything unchanged.
    if (Other.isUnknown())
      return true;

    // After this point, both are determined.

    // Case 4: If they are the same, we do nothing.
    if (Attr == Other)
      return true;

    // Now they are not the same.

    // Case 5: Exactly one of us uses clusters (if neither did, case 4 would
    // have held), so it is unknown whether clusters will be used, and the
    // state is final, unlike case 1.
    if (Attr.isNoCluster() || Other.isNoCluster()) {
      Attr.setUnknown();
      return false;
    }

    // Case 6: Both of us use clusters, but the dims differ, so the result is
    // that clusters are used, but without fixed dims.
    Attr.setVariableDims();
    return true;
  }

  AMDGPU::ClusterDimsAttr Attr;

  static constexpr char AttrName[] = "amdgpu-cluster-dims";
};

AAAMDGPUClusterDims &
AAAMDGPUClusterDims::createForPosition(const IRPosition &IRP, Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDGPUClusterDimsFunction(IRP, A);
  llvm_unreachable("AAAMDGPUClusterDims is only valid for function position");
}

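/// Shared driver for the module and CGSCC passes: configures the Attributor
/// with the allowed AMDGPU abstract attributes and seeds them for every
/// function in \p Functions.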
static bool runImpl(SetVector<Function *> &Functions, bool IsModulePass,
                    bool DeleteFns, Module &M, AnalysisGetter &AG,
                    TargetMachine &TM, AMDGPUAttributorOptions Options,
                    ThinOrFullLTOPhase LTOPhase) {

  CallGraphUpdater CGUpdater;
  BumpPtrAllocator Allocator;
  AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
  DenseSet<const char *> Allowed(
      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
       &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
       &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID,
       &AAAMDGPUMinAGPRAlloc::ID, &AACallEdges::ID, &AAPointerInfo::ID,
       &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID,
       &AANoAliasAddrSpace::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
       &AAAMDGPUClusterDims::ID, &AAAlign::ID});

  AttributorConfig AC(CGUpdater);
  AC.IsClosedWorldModule = Options.IsClosedWorld;
  AC.Allowed = &Allowed;
  AC.IsModulePass = IsModulePass;
  AC.DeleteFns = DeleteFns;
  AC.DefaultInitializeLiveInternals = false;
  AC.IndirectCalleeSpecializationCallback =
      [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
         Function &Callee, unsigned NumAssumedCallees) {
        return !AMDGPU::isEntryFunctionCC(Callee.getCallingConv()) &&
               (NumAssumedCallees <= IndirectCallSpecializationThreshold);
      };
  AC.IPOAmendableCB = [](const Function &F) {
    return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
  };

  Attributor A(Functions, InfoCache, AC);

  LLVM_DEBUG({
    StringRef LTOPhaseStr = to_string(LTOPhase);
    dbgs() << "[AMDGPUAttributor] Running at phase " << LTOPhaseStr << '\n'
           << "[AMDGPUAttributor] Module " << M.getName() << " is "
           << (AC.IsClosedWorldModule ? "" : "not ")
           << "assumed to be a closed world.\n";
  });

  for (auto *F : Functions) {
    A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
    CallingConv::ID CC = F->getCallingConv();
    if (!AMDGPU::isEntryFunctionCC(CC)) {
      A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
      A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
    }

    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
    if (!F->isDeclaration() && ST.hasClusters())
      A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRPosition::function(*F));

    if (ST.hasGFX90AInsts())
      A.getOrCreateAAFor<AAAMDGPUMinAGPRAlloc>(IRPosition::function(*F));

    for (auto &I : instructions(F)) {
      Value *Ptr = nullptr;
      if (auto *LI = dyn_cast<LoadInst>(&I))
        Ptr = LI->getPointerOperand();
      else if (auto *SI = dyn_cast<StoreInst>(&I))
        Ptr = SI->getPointerOperand();
      else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
        Ptr = RMW->getPointerOperand();
      else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
        Ptr = CmpX->getPointerOperand();

      if (Ptr) {
        A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr));
        A.getOrCreateAAFor<AANoAliasAddrSpace>(IRPosition::value(*Ptr));
        if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Ptr)) {
          if (II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc)
            A.getOrCreateAAFor<AAAlign>(IRPosition::value(*Ptr));
        }
      }
    }
  }

  return A.run() == ChangeStatus::CHANGED;
}
} // namespace

PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
                                                  ModuleAnalysisManager &AM) {

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  AnalysisGetter AG(FAM);

  SetVector<Function *> Functions;
  for (Function &F : M) {
    if (!F.isDeclaration())
      Functions.insert(&F);
  }

  // TODO: Probably preserves CFG
  return runImpl(Functions, /*IsModulePass=*/true, /*DeleteFns=*/true, M, AG,
                 TM, Options, LTOPhase)
             ? PreservedAnalyses::none()
             : PreservedAnalyses::all();
}

PreservedAnalyses llvm::AMDGPUAttributorCGSCCPass::run(LazyCallGraph::SCC &C,
                                                       CGSCCAnalysisManager &AM,
                                                       LazyCallGraph &CG,
                                                       CGSCCUpdateResult &UR) {

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
  AnalysisGetter AG(FAM);

  SetVector<Function *> Functions;
  for (LazyCallGraph::Node &N : C) {
    Function *F = &N.getFunction();
    if (!F->isIntrinsic())
      Functions.insert(F);
  }

  AMDGPUAttributorOptions Options;
  Module *M = C.begin()->getFunction().getParent();
  // In the CGSCC pipeline, avoid untracked call graph modifications by
  // disabling function deletion, mirroring the generic AttributorCGSCCPass.
  return runImpl(Functions, /*IsModulePass=*/false, /*DeleteFns=*/false, *M, AG,
                 TM, Options, ThinOrFullLTOPhase::None)
             ? PreservedAnalyses::none()
             : PreservedAnalyses::all();
}