//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Eliminates allocas by either converting them into vectors or by migrating
// them to local address space.
//
// Two passes are exposed by this file:
//    - "promote-alloca-to-vector", which runs early in the pipeline and only
//      promotes to vector. Promotion to vector is almost always profitable
//      except when the alloca is too big and the promotion would result in
//      very high register pressure.
//    - "promote-alloca", which does both promotion to vector and LDS and runs
//      much later in the pipeline. This runs after SROA because promoting to
//      LDS is of course less profitable than getting rid of the alloca or
//      vectorizing it, thus we only want to do it when the only alternative is
//      lowering the alloca to stack.
//
// Note that both of them exist for the old and new PMs. The new PM passes are
// declared in AMDGPU.h and the legacy PM ones are declared here.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"

#define DEBUG_TYPE "amdgpu-promote-alloca"

using namespace llvm;

namespace {

static cl::opt<bool>
    DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
                                 cl::desc("Disable promote alloca to vector"),
                                 cl::init(false));

static cl::opt<bool>
    DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
                              cl::desc("Disable promote alloca to LDS"),
                              cl::init(false));

static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"),
    cl::init(0));

static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
    "amdgpu-promote-alloca-to-vector-max-regs",
    cl::desc(
        "Maximum vector size (in 32b registers) to use when promoting alloca"),
    cl::init(32));

// Use up to 1/4 of available register budget for vectorization.
// FIXME: Increase the limit for whole function budgets? Perhaps x2?
static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
    "amdgpu-promote-alloca-to-vector-vgpr-ratio",
    cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
    cl::init(4));

static cl::opt<unsigned>
    LoopUserWeight("promote-alloca-vector-loop-user-weight",
                   cl::desc("The bonus weight of users of allocas within loop "
                            "when sorting profitable allocas"),
                   cl::init(4));

// We support vector indices of the form ((A * stride) >> shift) + B.
// VarIndex is A, VarMul is stride, VarShift is shift and ConstIndex is B. All
// parts are optional.
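//
// As an illustrative example (assuming a 4-byte, i.e. i32, vector element
// type): a GEP whose byte offset into the alloca is (%i * 8) + 12 is
// decomposed into VarIndex = %i, VarMul = 2 and ConstIndex = 3, while a byte
// offset of (%j * 2) with %j known to be even is decomposed into
// VarIndex = %j and VarShift = 1. See computeGEPToVectorIndex below.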
struct GEPToVectorIndex {
  WeakTrackingVH VarIndex = nullptr;   // defaults to 0
  ConstantInt *VarMul = nullptr;       // defaults to 1
  ConstantInt *VarShift = nullptr;     // defaults to 0
  ConstantInt *ConstIndex = nullptr;   // defaults to 0
  Value *Full = nullptr;
};

struct MemTransferInfo {
  ConstantInt *SrcIndex = nullptr;
  ConstantInt *DestIndex = nullptr;
};

// Analysis for planning the different strategies of alloca promotion.
struct AllocaAnalysis {
  AllocaInst *Alloca = nullptr;
  DenseSet<Value *> Pointers;
  SmallVector<Use *> Uses;
  unsigned Score = 0;
  bool HaveSelectOrPHI = false;

  struct {
    FixedVectorType *Ty = nullptr;
    SmallVector<Instruction *> Worklist;
    SmallVector<Instruction *> UsersToRemove;
    MapVector<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx;
    MapVector<MemTransferInst *, MemTransferInfo> TransferInfo;
  } Vector;

  struct {
    bool Enable = false;
    SmallVector<User *> Worklist;
  } LDS;

  explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
};

// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
private:
  const TargetMachine &TM;
  LoopInfo &LI;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  // FIXME: This should be per-kernel.
  uint32_t LocalMemLimit = 0;
  uint32_t CurrentLocalMemUsage = 0;
  unsigned MaxVGPRs;
  unsigned VGPRBudgetRatio;
  unsigned MaxVectorRegs;

  bool IsAMDGCN = false;
  bool IsAMDHSA = false;

  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

  bool collectAllocaUses(AllocaAnalysis &AA) const;

  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
  /// Returns true if both operands are derived from the same alloca. Val
  /// should be the same value as one of the input operands of UseInst.
  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *UseInst, int OpIdx0,
                                       int OpIdx1) const;

  /// Check whether we have enough local memory for promotion.
  bool hasSufficientLocalMem(const Function &F);

  FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
  void analyzePromoteToVector(AllocaAnalysis &AA) const;
  void promoteAllocaToVector(AllocaAnalysis &AA);
  void analyzePromoteToLDS(AllocaAnalysis &AA) const;
  bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
                             SetVector<IntrinsicInst *> &DeferredIntrs);
  void finishDeferredAllocaToLDSPromotion(
      SetVector<IntrinsicInst *> &DeferredIntrs);
  void scoreAlloca(AllocaAnalysis &AA) const;
  void setFunctionLimits(const Function &F);

public:
  AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
    const Triple &TT = TM.getTargetTriple();
    IsAMDGCN = TT.isAMDGCN();
    IsAMDHSA = TT.getOS() == Triple::AMDHSA;
  }

  bool run(Function &F, bool PromoteToLDS);
};

// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
public:
  static char ID;

  AMDGPUPromoteAlloca() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;
    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
      return AMDGPUPromoteAllocaImpl(
                 TPC->getTM<TargetMachine>(),
                 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
          .run(F, /*PromoteToLDS*/ true);
    return false;
  }

  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<LoopInfoWrapperPass>();
    FunctionPass::getAnalysisUsage(AU);
  }
};

static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
                            const Function &F) {
  if (!TM.getTargetTriple().isAMDGCN())
    return 128;

  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is removed.
  if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
    DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();

  unsigned MaxVGPRs = ST.getMaxNumVGPRs(
      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
      DynamicVGPRBlockSize);

  // A non-entry function has only 32 caller preserved registers.
  // Do not promote alloca which will force spilling unless we know the
  // function will be inlined.
  if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
      !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    MaxVGPRs = std::min(MaxVGPRs, 32u);
  return MaxVGPRs;
}

} // end anonymous namespace

char AMDGPUPromoteAlloca::ID = 0;

INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promote alloca for accurate
// estimation of LDS available
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                    "AMDGPU promote alloca to vector or LDS", false, false)

char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;

PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
  if (Changed) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
  return PreservedAnalyses::all();
}

PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
  if (Changed) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
  return PreservedAnalyses::all();
}

FunctionPass *llvm::createAMDGPUPromoteAlloca() {
  return new AMDGPUPromoteAlloca();
}

bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
  const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
    LLVM_DEBUG(dbgs() << " Cannot promote alloca: " << Msg << "\n"
                      << " " << *Inst << "\n");
    return false;
  };

  SmallVector<Instruction *> WorkList({AA.Alloca});
  while (!WorkList.empty()) {
    auto *Cur = WorkList.pop_back_val();
    if (find(AA.Pointers, Cur) != AA.Pointers.end())
      continue;
    AA.Pointers.insert(Cur);

    for (auto &U : Cur->uses()) {
      auto *Inst = cast<Instruction>(U.getUser());
      if (isa<StoreInst>(Inst)) {
        if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
          return RejectUser(Inst, "pointer escapes via store");
        }
      }
      AA.Uses.push_back(&U);
      if (isa<GetElementPtrInst>(U.getUser())) {
        WorkList.push_back(Inst);
      } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
        // Only promote a select if we know that the other select operand is
        // from
another pointer that will also be promoted. if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, SI, 1, 2)) return RejectUser(Inst, "select from mixed objects"); WorkList.push_back(Inst); AA.HaveSelectOrPHI = true; } else if (auto *Phi = dyn_cast(Inst)) { // Repeat for phis. // TODO: Handle more complex cases. We should be able to replace loops // over arrays. switch (Phi->getNumIncomingValues()) { case 1: break; case 2: if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, Phi, 0, 1)) return RejectUser(Inst, "phi from mixed objects"); break; default: return RejectUser(Inst, "phi with too many operands"); } WorkList.push_back(Inst); AA.HaveSelectOrPHI = true; } } } return true; } void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const { LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n"); unsigned Score = 0; // Increment score by one for each user + a bonus for users within loops. for (auto *U : AA.Uses) { Instruction *Inst = cast(U->getUser()); if (isa(Inst) || isa(Inst) || isa(Inst)) continue; unsigned UserScore = 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent())); LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n"); Score += UserScore; } LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n"); AA.Score = Score; } void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) { // Load per function limits, overriding with global options where appropriate. // R600 register tuples/aliasing are fragile with large vector promotions so // apply architecture specific limit here. const int R600MaxVectorRegs = 16; MaxVectorRegs = F.getFnAttributeAsParsedInteger( "amdgpu-promote-alloca-to-vector-max-regs", IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs); if (PromoteAllocaToVectorMaxRegs.getNumOccurrences()) MaxVectorRegs = PromoteAllocaToVectorMaxRegs; VGPRBudgetRatio = F.getFnAttributeAsParsedInteger( "amdgpu-promote-alloca-to-vector-vgpr-ratio", PromoteAllocaToVectorVGPRRatio); if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences()) VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio; } bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector) return false; Mod = F.getParent(); DL = &Mod->getDataLayout(); bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F); MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F); setFunctionLimits(F); unsigned VectorizationBudget = (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8 : (MaxVGPRs * 32)) / VGPRBudgetRatio; std::vector Allocas; for (Instruction &I : F.getEntryBlock()) { if (AllocaInst *AI = dyn_cast(&I)) { // Array allocations are probably not worth handling, since an allocation // of the array type is the canonical form. if (!AI->isStaticAlloca() || AI->isArrayAllocation()) continue; LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n'); AllocaAnalysis AA{AI}; if (collectAllocaUses(AA)) { analyzePromoteToVector(AA); if (PromoteToLDS) analyzePromoteToLDS(AA); if (AA.Vector.Ty || AA.LDS.Enable) { scoreAlloca(AA); Allocas.push_back(std::move(AA)); } } } } stable_sort(Allocas, [](const auto &A, const auto &B) { return A.Score > B.Score; }); // clang-format off LLVM_DEBUG( dbgs() << "Sorted Worklist:\n"; for (const auto &AA : Allocas) dbgs() << " " << *AA.Alloca << "\n"; ); // clang-format on bool Changed = false; SetVector DeferredIntrs; for (AllocaAnalysis &AA : Allocas) { if (AA.Vector.Ty) { std::optional Size = AA.Alloca->getAllocationSize(*DL); assert(Size); // Expected to succeed on non-array alloca. 
const unsigned AllocaCost = Size->getFixedValue() * 8; // First, check if we have enough budget to vectorize this alloca. if (AllocaCost <= VectorizationBudget) { promoteAllocaToVector(AA); Changed = true; assert((VectorizationBudget - AllocaCost) < VectorizationBudget && "Underflow!"); VectorizationBudget -= AllocaCost; LLVM_DEBUG(dbgs() << " Remaining vectorization budget:" << VectorizationBudget << "\n"); continue; } else { LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" << AllocaCost << ", budget:" << VectorizationBudget << "): " << *AA.Alloca << "\n"); } } if (AA.LDS.Enable && tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs)) Changed = true; } finishDeferredAllocaToLDSPromotion(DeferredIntrs); // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains // dangling pointers. If we want to reuse it past this point, the loop above // would need to be updated to remove successfully promoted allocas. return Changed; } // Checks if the instruction I is a memset user of the alloca AI that we can // deal with. Currently, only non-volatile memsets that affect the whole alloca // are handled. static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI, const DataLayout &DL) { using namespace PatternMatch; // For now we only care about non-volatile memsets that affect the whole type // (start at index 0 and fill the whole alloca). // // TODO: Now that we moved to PromoteAlloca we could handle any memsets // (except maybe volatile ones?) - we just need to use shufflevector if it // only affects a subset of the vector. const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType()); return I->getOperand(0) == AI && match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile(); } static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) { IRBuilder<> B(Ptr->getContext()); Ptr = Ptr->stripPointerCasts(); if (Ptr == AA.Alloca) return B.getInt32(0); auto *GEP = cast(Ptr); auto I = AA.Vector.GEPVectorIdx.find(GEP); assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!"); if (!I->second.Full) { Value *Result = nullptr; B.SetInsertPoint(GEP); if (I->second.VarIndex) { Result = I->second.VarIndex; Result = B.CreateSExtOrTrunc(Result, B.getInt32Ty()); if (I->second.VarMul) Result = B.CreateMul(Result, I->second.VarMul); if (I->second.VarShift) Result = B.CreateAShr(Result, I->second.VarShift, "", /*isExact*/ true); } if (I->second.ConstIndex) { if (Result) Result = B.CreateAdd(Result, I->second.ConstIndex); else Result = I->second.ConstIndex; } if (!Result) Result = B.getInt32(0); I->second.Full = Result; } return I->second.Full; } static std::optional computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, Type *VecElemTy, const DataLayout &DL) { // TODO: Extracting a "multiple of X" from a GEP might be a useful generic // helper. LLVMContext &Ctx = GEP->getContext(); unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType()); SmallMapVector VarOffsets; APInt ConstOffset(BW, 0); // Walk backwards through nested GEPs to collect both constant and variable // offsets, so that nested vector GEP chains can be lowered in one step. 
// // Given this IR fragment as input: // // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5) // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4 // %3 = load i32, ptr addrspace(5) %2, align 4 // // Combine both GEP operations in a single pass, producing: // BasePtr = %0 // ConstOffset = 4 // VarOffsets = { %j -> element_size(<2 x i32>) } // // That lets us emit a single buffer_load directly into a VGPR, without ever // allocating scratch memory for the intermediate pointer. Value *CurPtr = GEP; while (auto *CurGEP = dyn_cast(CurPtr)) { if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset)) return {}; // Move to the next outer pointer. CurPtr = CurGEP->getPointerOperand(); } assert(CurPtr == Alloca && "GEP not based on alloca"); int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy); if (VarOffsets.size() > 1) return {}; // We support vector indices of the form ((VarIndex * stride) >> shift) + B. // IndexQuot represents B. Check that the constant offset is a multiple // of the vector element size. if (ConstOffset.srem(VecElemSize) != 0) return {}; APInt IndexQuot = ConstOffset.sdiv(VecElemSize); GEPToVectorIndex Result; if (!ConstOffset.isZero()) Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW)); // If there are no variable offsets, only a constant offset, then we're done. if (VarOffsets.empty()) return Result; // Scale is the stride in the (A * stride) part. Check that there is only one // variable offset and extract the scale factor. const auto &VarOffset = VarOffsets.front(); auto ScaleOpt = VarOffset.second.tryZExtValue(); if (!ScaleOpt || *ScaleOpt == 0) return {}; uint64_t Scale = *ScaleOpt; Result.VarIndex = VarOffset.first; auto *OffsetType = dyn_cast(Result.VarIndex->getType()); if (!OffsetType) return {}; // The vector index for the variable part is: VarIndex * Scale / VecElemSize. if (Scale >= (uint64_t)VecElemSize) { if (Scale % VecElemSize != 0) return {}; // Scale is a multiple of VecElemSize, so the index is just: VarIndex * // (Scale / VecElemSize). uint64_t VarMul = Scale / VecElemSize; // Only the multiplier is needed. if (VarMul != 1) Result.VarMul = ConstantInt::get(Ctx, APInt(BW, VarMul)); } else { if ((uint64_t)VecElemSize % Scale != 0) return {}; // VecElemSize is a multiple of Scale, so the index is just: VarIndex / // (VecElemSize / Scale). uint64_t Divisor = VecElemSize / Scale; // The divisor must be a power of 2 so we can use a right shift. if (!isPowerOf2_64(Divisor)) return {}; // VarIndex must be known to be divisible by that divisor. KnownBits KB = computeKnownBits(VarOffset.first, DL); if (KB.countMinTrailingZeros() < Log2_64(Divisor)) return {}; Result.VarShift = ConstantInt::get(Ctx, APInt(BW, Log2_64(Divisor))); } return Result; } /// Promotes a single user of the alloca to a vector form. /// /// \param Inst Instruction to be promoted. /// \param DL Module Data Layout. /// \param AA Alloca Analysis. /// \param VecStoreSize Size of \p VectorTy in bytes. /// \param ElementSize Size of \p VectorTy element type in bytes. /// \param CurVal Current value of the vector (e.g. last stored value) /// \param[out] DeferredLoads \p Inst is added to this vector if it can't /// be promoted now. This happens when promoting requires \p /// CurVal, but \p CurVal is nullptr. /// \return the stored value if \p Inst would have written to the alloca, or /// nullptr otherwise. 
static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL, AllocaAnalysis &AA, unsigned VecStoreSize, unsigned ElementSize, function_ref GetCurVal) { // Note: we use InstSimplifyFolder because it can leverage the DataLayout // to do more folding, especially in the case of vector splats. IRBuilder Builder(Inst->getContext(), InstSimplifyFolder(DL)); Builder.SetInsertPoint(Inst); Type *VecEltTy = AA.Vector.Ty->getElementType(); switch (Inst->getOpcode()) { case Instruction::Load: { Value *CurVal = GetCurVal(); Value *Index = calculateVectorIndex(cast(Inst)->getPointerOperand(), AA); // We're loading the full vector. Type *AccessTy = Inst->getType(); TypeSize AccessSize = DL.getTypeStoreSize(AccessTy); if (Constant *CI = dyn_cast(Index)) { if (CI->isNullValue() && AccessSize == VecStoreSize) { Inst->replaceAllUsesWith( Builder.CreateBitPreservingCastChain(DL, CurVal, AccessTy)); return nullptr; } } // Loading a subvector. if (isa(AccessTy)) { assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy); auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts); assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); // If idx is dynamic, then sandwich load with bitcasts. // ie. VectorTy SubVecTy AccessTy // <64 x i8> -> <16 x i8> <8 x i16> // <64 x i8> -> <4 x i128> -> i128 -> <8 x i16> // Extracting subvector with dynamic index has very large expansion in // the amdgpu backend. Limit to pow2. FixedVectorType *VectorTy = AA.Vector.Ty; TypeSize NumBits = DL.getTypeStoreSize(SubVecTy) * 8u; uint64_t LoadAlign = cast(Inst)->getAlign().value(); bool IsAlignedLoad = NumBits <= (LoadAlign * 8u); unsigned TotalNumElts = VectorTy->getNumElements(); bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == 0; if (!isa(Index) && llvm::isPowerOf2_32(SubVecTy->getNumElements()) && IsProperlyDivisible && IsAlignedLoad) { IntegerType *NewElemTy = Builder.getIntNTy(NumBits); const unsigned NewNumElts = DL.getTypeStoreSize(VectorTy) * 8u / NumBits; const unsigned LShrAmt = llvm::Log2_32(SubVecTy->getNumElements()); FixedVectorType *BitCastTy = FixedVectorType::get(NewElemTy, NewNumElts); Value *BCVal = Builder.CreateBitCast(CurVal, BitCastTy); Value *NewIdx = Builder.CreateLShr( Index, ConstantInt::get(Index->getType(), LShrAmt)); Value *ExtVal = Builder.CreateExtractElement(BCVal, NewIdx); Value *BCOut = Builder.CreateBitCast(ExtVal, AccessTy); Inst->replaceAllUsesWith(BCOut); return nullptr; } Value *SubVec = PoisonValue::get(SubVecTy); for (unsigned K = 0; K < NumLoadedElts; ++K) { Value *CurIdx = Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K)); SubVec = Builder.CreateInsertElement( SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K); } Inst->replaceAllUsesWith( Builder.CreateBitPreservingCastChain(DL, SubVec, AccessTy)); return nullptr; } // We're loading one element. Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index); if (AccessTy != VecEltTy) ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy); Inst->replaceAllUsesWith(ExtractElement); return nullptr; } case Instruction::Store: { // For stores, it's a bit trickier and it depends on whether we're storing // the full vector or not. If we're storing the full vector, we don't need // to know the current value. If this is a store of a single element, we // need to know the value. 
StoreInst *SI = cast(Inst); Value *Index = calculateVectorIndex(SI->getPointerOperand(), AA); Value *Val = SI->getValueOperand(); // We're storing the full vector, we can handle this without knowing CurVal. Type *AccessTy = Val->getType(); TypeSize AccessSize = DL.getTypeStoreSize(AccessTy); if (Constant *CI = dyn_cast(Index)) if (CI->isNullValue() && AccessSize == VecStoreSize) return Builder.CreateBitPreservingCastChain(DL, Val, AA.Vector.Ty); // Storing a subvector. if (isa(AccessTy)) { assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); const unsigned NumWrittenElts = AccessSize / DL.getTypeStoreSize(VecEltTy); const unsigned NumVecElts = AA.Vector.Ty->getNumElements(); auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts); assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); Val = Builder.CreateBitPreservingCastChain(DL, Val, SubVecTy); Value *CurVec = GetCurVal(); for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts); K < NumElts; ++K) { Value *CurIdx = Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K)); CurVec = Builder.CreateInsertElement( CurVec, Builder.CreateExtractElement(Val, K), CurIdx); } return CurVec; } if (Val->getType() != VecEltTy) Val = Builder.CreateBitOrPointerCast(Val, VecEltTy); return Builder.CreateInsertElement(GetCurVal(), Val, Index); } case Instruction::Call: { if (auto *MTI = dyn_cast(Inst)) { // For memcpy, we need to know curval. ConstantInt *Length = cast(MTI->getLength()); unsigned NumCopied = Length->getZExtValue() / ElementSize; MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI]; unsigned SrcBegin = TI->SrcIndex->getZExtValue(); unsigned DestBegin = TI->DestIndex->getZExtValue(); SmallVector Mask; for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) { if (Idx >= DestBegin && Idx < DestBegin + NumCopied) { Mask.push_back(SrcBegin < AA.Vector.Ty->getNumElements() ? SrcBegin++ : PoisonMaskElem); } else { Mask.push_back(Idx); } } return Builder.CreateShuffleVector(GetCurVal(), Mask); } if (auto *MSI = dyn_cast(Inst)) { // For memset, we don't need to know the previous value because we // currently only allow memsets that cover the whole alloca. Value *Elt = MSI->getOperand(1); const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy); if (BytesPerElt > 1) { Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt); // If the element type of the vector is a pointer, we need to first cast // to an integer, then use a PtrCast. if (VecEltTy->isPointerTy()) { Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8); Elt = Builder.CreateBitCast(EltBytes, PtrInt); Elt = Builder.CreateIntToPtr(Elt, VecEltTy); } else Elt = Builder.CreateBitCast(EltBytes, VecEltTy); } return Builder.CreateVectorSplat(AA.Vector.Ty->getElementCount(), Elt); } if (auto *Intr = dyn_cast(Inst)) { if (Intr->getIntrinsicID() == Intrinsic::objectsize) { Intr->replaceAllUsesWith( Builder.getIntN(Intr->getType()->getIntegerBitWidth(), DL.getTypeAllocSize(AA.Vector.Ty))); return nullptr; } } llvm_unreachable("Unsupported call when promoting alloca to vector"); } default: llvm_unreachable("Inconsistency in instructions promotable to vector"); } llvm_unreachable("Did not return after promoting instruction!"); } static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy, const DataLayout &DL) { // Access as a vector type can work if the size of the access vector is a // multiple of the size of the alloca's vector element type. 
// // Examples: // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK // - 3*32 is not a multiple of 64 // // We could handle more complicated cases, but it'd make things a lot more // complicated. if (isa(AccessTy)) { TypeSize AccTS = DL.getTypeStoreSize(AccessTy); // If the type size and the store size don't match, we would need to do more // than just bitcast to translate between an extracted/insertable subvectors // and the accessed value. if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy)) return false; TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType()); return AccTS.isKnownMultipleOf(VecTS); } return CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(), AccessTy, DL); } /// Iterates over an instruction worklist that may contain multiple instructions /// from the same basic block, but in a different order. template static void forEachWorkListItem(const InstContainer &WorkList, std::function Fn) { // Bucket up uses of the alloca by the block they occur in. // This is important because we have to handle multiple defs/uses in a block // ourselves: SSAUpdater is purely for cross-block references. DenseMap> UsesByBlock; for (Instruction *User : WorkList) UsesByBlock[User->getParent()].insert(User); for (Instruction *User : WorkList) { BasicBlock *BB = User->getParent(); auto &BlockUses = UsesByBlock[BB]; // Already processed, skip. if (BlockUses.empty()) continue; // Only user in the block, directly process it. if (BlockUses.size() == 1) { Fn(User); continue; } // Multiple users in the block, do a linear scan to see users in order. for (Instruction &Inst : *BB) { if (!BlockUses.contains(&Inst)) continue; Fn(&Inst); } // Clear the block so we know it's been processed. BlockUses.clear(); } } /// Find an insert point after an alloca, after all other allocas clustered at /// the start of the block. static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB, BasicBlock::iterator I) { for (BasicBlock::iterator E = BB.end(); I != E && isa(*I); ++I) ; return I; } FixedVectorType * AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const { if (DisablePromoteAllocaToVector) { LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n"); return nullptr; } auto *VectorTy = dyn_cast(AllocaTy); if (auto *ArrayTy = dyn_cast(AllocaTy)) { uint64_t NumElems = 1; Type *ElemTy; do { NumElems *= ArrayTy->getNumElements(); ElemTy = ArrayTy->getElementType(); } while ((ArrayTy = dyn_cast(ElemTy))); // Check for array of vectors auto *InnerVectorTy = dyn_cast(ElemTy); if (InnerVectorTy) { NumElems *= InnerVectorTy->getNumElements(); ElemTy = InnerVectorTy->getElementType(); } if (VectorType::isValidElementType(ElemTy) && NumElems > 0) { unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8; if (ElementSize > 0) { unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy); // Expand vector if required to match padding of inner type, // i.e. odd size subvectors. // Storage size of new vector must match that of alloca for correct // behaviour of byte offsets and GEP computation. 
if (NumElems * ElementSize != AllocaSize) NumElems = AllocaSize / ElementSize; if (NumElems > 0 && (AllocaSize % ElementSize) == 0) VectorTy = FixedVectorType::get(ElemTy, NumElems); } } } if (!VectorTy) { LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n"); return nullptr; } const unsigned MaxElements = (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType()); if (VectorTy->getNumElements() > MaxElements || VectorTy->getNumElements() < 2) { LLVM_DEBUG(dbgs() << " " << *VectorTy << " has an unsupported number of elements\n"); return nullptr; } Type *VecEltTy = VectorTy->getElementType(); unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy); if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) { LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size " "does not match the type's size\n"); return nullptr; } return VectorTy; } void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const { if (AA.HaveSelectOrPHI) { LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n"); return; } Type *AllocaTy = AA.Alloca->getAllocatedType(); AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy); if (!AA.Vector.Ty) return; const auto RejectUser = [&](Instruction *Inst, Twine Msg) { LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n" << " " << *Inst << "\n"); AA.Vector.Ty = nullptr; }; Type *VecEltTy = AA.Vector.Ty->getElementType(); unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8; assert(ElementSize > 0); for (auto *U : AA.Uses) { Instruction *Inst = cast(U->getUser()); if (Value *Ptr = getLoadStorePointerOperand(Inst)) { assert(!isa(Inst) || U->getOperandNo() == StoreInst::getPointerOperandIndex()); Type *AccessTy = getLoadStoreType(Inst); if (AccessTy->isAggregateType()) return RejectUser(Inst, "unsupported load/store as aggregate"); assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy()); // Check that this is a simple access of a vector element. bool IsSimple = isa(Inst) ? cast(Inst)->isSimple() : cast(Inst)->isSimple(); if (!IsSimple) return RejectUser(Inst, "not a simple load or store"); Ptr = Ptr->stripPointerCasts(); // Alloca already accessed as vector. if (Ptr == AA.Alloca && DL->getTypeStoreSize(AA.Alloca->getAllocatedType()) == DL->getTypeStoreSize(AccessTy)) { AA.Vector.Worklist.push_back(Inst); continue; } if (!isSupportedAccessType(AA.Vector.Ty, AccessTy, *DL)) return RejectUser(Inst, "not a supported access type"); AA.Vector.Worklist.push_back(Inst); continue; } if (auto *GEP = dyn_cast(Inst)) { // If we can't compute a vector index from this GEP, then we can't // promote this alloca to vector. 
auto Index = computeGEPToVectorIndex(GEP, AA.Alloca, VecEltTy, *DL); if (!Index) return RejectUser(Inst, "cannot compute vector index for GEP"); AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value()); AA.Vector.UsersToRemove.push_back(Inst); continue; } if (MemSetInst *MSI = dyn_cast(Inst); MSI && isSupportedMemset(MSI, AA.Alloca, *DL)) { AA.Vector.Worklist.push_back(Inst); continue; } if (MemTransferInst *TransferInst = dyn_cast(Inst)) { if (TransferInst->isVolatile()) return RejectUser(Inst, "mem transfer inst is volatile"); ConstantInt *Len = dyn_cast(TransferInst->getLength()); if (!Len || (Len->getZExtValue() % ElementSize)) return RejectUser(Inst, "mem transfer inst length is non-constant or " "not a multiple of the vector element size"); auto getConstIndexIntoAlloca = [&](Value *Ptr) -> ConstantInt * { if (Ptr == AA.Alloca) return ConstantInt::get(Ptr->getContext(), APInt(32, 0)); GetElementPtrInst *GEP = cast(Ptr); const auto &GEPI = AA.Vector.GEPVectorIdx.find(GEP)->second; if (GEPI.VarIndex) return nullptr; if (GEPI.ConstIndex) return GEPI.ConstIndex; return ConstantInt::get(Ptr->getContext(), APInt(32, 0)); }; MemTransferInfo *TI = &AA.Vector.TransferInfo.try_emplace(TransferInst).first->second; unsigned OpNum = U->getOperandNo(); if (OpNum == 0) { Value *Dest = TransferInst->getDest(); ConstantInt *Index = getConstIndexIntoAlloca(Dest); if (!Index) return RejectUser(Inst, "could not calculate constant dest index"); TI->DestIndex = Index; } else { assert(OpNum == 1); Value *Src = TransferInst->getSource(); ConstantInt *Index = getConstIndexIntoAlloca(Src); if (!Index) return RejectUser(Inst, "could not calculate constant src index"); TI->SrcIndex = Index; } continue; } if (auto *Intr = dyn_cast(Inst)) { if (Intr->getIntrinsicID() == Intrinsic::objectsize) { AA.Vector.Worklist.push_back(Inst); continue; } } // Ignore assume-like intrinsics and comparisons used in assumes. if (isAssumeLikeIntrinsic(Inst)) { if (!Inst->use_empty()) return RejectUser(Inst, "assume-like intrinsic cannot have any users"); AA.Vector.UsersToRemove.push_back(Inst); continue; } if (isa(Inst) && all_of(Inst->users(), [](User *U) { return isAssumeLikeIntrinsic(cast(U)); })) { AA.Vector.UsersToRemove.push_back(Inst); continue; } return RejectUser(Inst, "unhandled alloca user"); } // Follow-up check to ensure we've seen both sides of all transfer insts. for (const auto &Entry : AA.Vector.TransferInfo) { const MemTransferInfo &TI = Entry.second; if (!TI.SrcIndex || !TI.DestIndex) return RejectUser(Entry.first, "mem transfer inst between different objects"); AA.Vector.Worklist.push_back(Entry.first); } } void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) { LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n'); LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType() << " -> " << *AA.Vector.Ty << '\n'); const unsigned VecStoreSize = DL->getTypeStoreSize(AA.Vector.Ty); Type *VecEltTy = AA.Vector.Ty->getElementType(); const unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8; // Alloca is uninitialized memory. Imitate that by making the first value // undef. 
SSAUpdater Updater; Updater.Initialize(AA.Vector.Ty, "promotealloca"); BasicBlock *EntryBB = AA.Alloca->getParent(); BasicBlock::iterator InitInsertPos = skipToNonAllocaInsertPt(*EntryBB, AA.Alloca->getIterator()); IRBuilder<> Builder(&*InitInsertPos); Value *AllocaInitValue = Builder.CreateFreeze(PoisonValue::get(AA.Vector.Ty)); AllocaInitValue->takeName(AA.Alloca); Updater.AddAvailableValue(AA.Alloca->getParent(), AllocaInitValue); // First handle the initial worklist, in basic block order. // // Insert a placeholder whenever we need the vector value at the top of a // basic block. SmallSetVector Placeholders; forEachWorkListItem(AA.Vector.Worklist, [&](Instruction *I) { BasicBlock *BB = I->getParent(); auto GetCurVal = [&]() -> Value * { if (Value *CurVal = Updater.FindValueForBlock(BB)) return CurVal; if (!Placeholders.empty() && Placeholders.back()->getParent() == BB) return Placeholders.back(); // If the current value in the basic block is not yet known, insert a // placeholder that we will replace later. IRBuilder<> Builder(I); auto *Placeholder = cast(Builder.CreateFreeze( PoisonValue::get(AA.Vector.Ty), "promotealloca.placeholder")); Placeholders.insert(Placeholder); return Placeholders.back(); }; Value *Result = promoteAllocaUserToVector(I, *DL, AA, VecStoreSize, ElementSize, GetCurVal); // If the returned result is a placeholder, it means the instruction does // not really modify the alloca. So no need to make it being available value // to SSAUpdater. // This will stop placeholder being cached in SSAUpdater. The cached // placeholder may cause stale pointer being referenced when doing // placeholder replacement. if (Result && (!isa(Result) || !Placeholders.contains(cast(Result)))) Updater.AddAvailableValue(BB, Result); }); // Now fixup the placeholders. for (Instruction *Placeholder : Placeholders) { Placeholder->replaceAllUsesWith( Updater.GetValueInMiddleOfBlock(Placeholder->getParent())); Placeholder->eraseFromParent(); } // Delete all instructions. for (Instruction *I : AA.Vector.Worklist) { assert(I->use_empty()); I->eraseFromParent(); } // Delete all the users that are known to be removeable. for (Instruction *I : reverse(AA.Vector.UsersToRemove)) { I->dropDroppableUses(); assert(I->use_empty()); I->eraseFromParent(); } // Alloca should now be dead too. assert(AA.Alloca->use_empty()); AA.Alloca->eraseFromParent(); } std::pair AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { Function &F = *Builder.GetInsertBlock()->getParent(); const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); if (!IsAMDHSA) { CallInst *LocalSizeY = Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_y, {}); CallInst *LocalSizeZ = Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_z, {}); ST.makeLIDRangeMetadata(LocalSizeY); ST.makeLIDRangeMetadata(LocalSizeZ); return std::pair(LocalSizeY, LocalSizeZ); } // We must read the size out of the dispatch pointer. assert(IsAMDGCN); // We are indexing into this struct, and want to extract the workgroup_size_* // fields. 
// // typedef struct hsa_kernel_dispatch_packet_s { // uint16_t header; // uint16_t setup; // uint16_t workgroup_size_x ; // uint16_t workgroup_size_y; // uint16_t workgroup_size_z; // uint16_t reserved0; // uint32_t grid_size_x ; // uint32_t grid_size_y ; // uint32_t grid_size_z; // // uint32_t private_segment_size; // uint32_t group_segment_size; // uint64_t kernel_object; // // #ifdef HSA_LARGE_MODEL // void *kernarg_address; // #elif defined HSA_LITTLE_ENDIAN // void *kernarg_address; // uint32_t reserved1; // #else // uint32_t reserved1; // void *kernarg_address; // #endif // uint64_t reserved2; // hsa_signal_t completion_signal; // uint64_t wrapper // } hsa_kernel_dispatch_packet_t // CallInst *DispatchPtr = Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {}); DispatchPtr->addRetAttr(Attribute::NoAlias); DispatchPtr->addRetAttr(Attribute::NonNull); F.removeFnAttr("amdgpu-no-dispatch-ptr"); // Size of the dispatch packet struct. DispatchPtr->addDereferenceableRetAttr(64); Type *I32Ty = Type::getInt32Ty(Mod->getContext()); // We could do a single 64-bit load here, but it's likely that the basic // 32-bit and extract sequence is already present, and it is probably easier // to CSE this. The loads should be mergeable later anyway. Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1); LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4)); Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2); LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4)); MDNode *MD = MDNode::get(Mod->getContext(), {}); LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD); LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD); ST.makeLIDRangeMetadata(LoadZU); // Extract y component. Upper half of LoadZU should be zero already. Value *Y = Builder.CreateLShr(LoadXY, 16); return std::pair(Y, LoadZU); } Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder, unsigned N) { Function *F = Builder.GetInsertBlock()->getParent(); const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F); Intrinsic::ID IntrID = Intrinsic::not_intrinsic; StringRef AttrName; switch (N) { case 0: IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x : (Intrinsic::ID)Intrinsic::r600_read_tidig_x; AttrName = "amdgpu-no-workitem-id-x"; break; case 1: IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y : (Intrinsic::ID)Intrinsic::r600_read_tidig_y; AttrName = "amdgpu-no-workitem-id-y"; break; case 2: IntrID = IsAMDGCN ? 
(Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z : (Intrinsic::ID)Intrinsic::r600_read_tidig_z; AttrName = "amdgpu-no-workitem-id-z"; break; default: llvm_unreachable("invalid dimension"); } Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID); CallInst *CI = Builder.CreateCall(WorkitemIdFn); ST.makeLIDRangeMetadata(CI); F->removeFnAttr(AttrName); return CI; } static bool isCallPromotable(CallInst *CI) { IntrinsicInst *II = dyn_cast(CI); if (!II) return false; switch (II->getIntrinsicID()) { case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: case Intrinsic::invariant_start: case Intrinsic::invariant_end: case Intrinsic::launder_invariant_group: case Intrinsic::strip_invariant_group: case Intrinsic::objectsize: return true; default: return false; } } bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca( Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0, int OpIdx1) const { // Figure out which operand is the one we might not be promoting. Value *OtherOp = Inst->getOperand(OpIdx0); if (Val == OtherOp) OtherOp = Inst->getOperand(OpIdx1); if (isa(OtherOp)) return true; // TODO: getUnderlyingObject will not work on a vector getelementptr Value *OtherObj = getUnderlyingObject(OtherOp); if (!isa(OtherObj)) return false; // TODO: We should be able to replace undefs with the right pointer type. // TODO: If we know the other base object is another promotable // alloca, not necessarily this alloca, we can do this. The // important part is both must have the same address space at // the end. if (OtherObj != BaseAlloca) { LLVM_DEBUG( dbgs() << "Found a binary instruction with another alloca object\n"); return false; } return true; } void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const { if (DisablePromoteAllocaToLDS) { LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n"); return; } // Don't promote the alloca to LDS for shader calling conventions as the work // item ID intrinsics are not supported for these calling conventions. // Furthermore not all LDS is available for some of the stages. const Function &ContainingFunction = *AA.Alloca->getFunction(); CallingConv::ID CC = ContainingFunction.getCallingConv(); switch (CC) { case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: break; default: LLVM_DEBUG( dbgs() << " promote alloca to LDS not supported with calling convention.\n"); return; } for (Use *Use : AA.Uses) { auto *User = Use->getUser(); if (CallInst *CI = dyn_cast(User)) { if (!isCallPromotable(CI)) return; if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) AA.LDS.Worklist.push_back(User); continue; } Instruction *UseInst = cast(User); if (UseInst->getOpcode() == Instruction::PtrToInt) return; if (LoadInst *LI = dyn_cast(UseInst)) { if (LI->isVolatile()) return; continue; } if (StoreInst *SI = dyn_cast(UseInst)) { if (SI->isVolatile()) return; continue; } if (AtomicRMWInst *RMW = dyn_cast(UseInst)) { if (RMW->isVolatile()) return; continue; } if (AtomicCmpXchgInst *CAS = dyn_cast(UseInst)) { if (CAS->isVolatile()) return; continue; } // Only promote a select if we know that the other select operand // is from another pointer that will also be promoted. if (ICmpInst *ICmp = dyn_cast(UseInst)) { if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Use->get(), ICmp, 0, 1)) return; // May need to rewrite constant operands. 
if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) AA.LDS.Worklist.push_back(ICmp); continue; } if (GetElementPtrInst *GEP = dyn_cast(UseInst)) { // Be conservative if an address could be computed outside the bounds of // the alloca. if (!GEP->isInBounds()) return; } else if (!isa(User)) { // Do not promote vector/aggregate type instructions. It is hard to track // their users. // Do not promote addrspacecast. // // TODO: If we know the address is only observed through flat pointers, we // could still promote. return; } if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) AA.LDS.Worklist.push_back(User); } AA.LDS.Enable = true; } bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { FunctionType *FTy = F.getFunctionType(); const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); // If the function has any arguments in the local address space, then it's // possible these arguments require the entire local memory space, so // we cannot use local memory in the pass. for (Type *ParamTy : FTy->params()) { PointerType *PtrTy = dyn_cast(ParamTy); if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { LocalMemLimit = 0; LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to " "local memory disabled.\n"); return false; } } LocalMemLimit = ST.getAddressableLocalMemorySize(); if (LocalMemLimit == 0) return false; SmallVector Stack; SmallPtrSet VisitedConstants; SmallPtrSet UsedLDS; auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool { for (const User *U : Val->users()) { if (const Instruction *Use = dyn_cast(U)) { if (Use->getFunction() == &F) return true; } else { const Constant *C = cast(U); if (VisitedConstants.insert(C).second) Stack.push_back(C); } } return false; }; for (GlobalVariable &GV : Mod->globals()) { if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) continue; if (visitUsers(&GV, &GV)) { UsedLDS.insert(&GV); Stack.clear(); continue; } // For any ConstantExpr uses, we need to recursively search the users until // we see a function. while (!Stack.empty()) { const Constant *C = Stack.pop_back_val(); if (visitUsers(&GV, C)) { UsedLDS.insert(&GV); Stack.clear(); break; } } } const DataLayout &DL = Mod->getDataLayout(); SmallVector, 16> AllocatedSizes; AllocatedSizes.reserve(UsedLDS.size()); for (const GlobalVariable *GV : UsedLDS) { Align Alignment = DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType()); uint64_t AllocSize = GV->getGlobalSize(DL); // HIP uses an extern unsized array in local address space for dynamically // allocated shared memory. In that case, we have to disable the promotion. if (GV->hasExternalLinkage() && AllocSize == 0) { LocalMemLimit = 0; LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated " "local memory. Promoting to local memory " "disabled.\n"); return false; } AllocatedSizes.emplace_back(AllocSize, Alignment); } // Sort to try to estimate the worst case alignment padding // // FIXME: We should really do something to fix the addresses to a more optimal // value instead llvm::sort(AllocatedSizes, llvm::less_second()); // Check how much local memory is being used by global objects CurrentLocalMemUsage = 0; // FIXME: Try to account for padding here. The real padding and address is // currently determined from the inverse order of uses in the function when // legalizing, which could also potentially change. We try to estimate the // worst case here, but we probably should fix the addresses earlier. 
for (auto Alloc : AllocatedSizes) { CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second); CurrentLocalMemUsage += Alloc.first; } unsigned MaxOccupancy = ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F) .second; // Round up to the next tier of usage. unsigned MaxSizeWithWaveCount = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F); // Program may already use more LDS than is usable at maximum occupancy. if (CurrentLocalMemUsage > MaxSizeWithWaveCount) return false; LocalMemLimit = MaxSizeWithWaveCount; LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n" << " Rounding size to " << MaxSizeWithWaveCount << " with a maximum occupancy of " << MaxOccupancy << '\n' << " and " << (LocalMemLimit - CurrentLocalMemUsage) << " available for promotion\n"); return true; } // FIXME: Should try to pick the most likely to be profitable allocas first. bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS( AllocaAnalysis &AA, bool SufficientLDS, SetVector &DeferredIntrs) { LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n'); // Not likely to have sufficient local memory for promotion. if (!SufficientLDS) return false; const DataLayout &DL = Mod->getDataLayout(); IRBuilder<> Builder(AA.Alloca); const Function &ContainingFunction = *AA.Alloca->getParent()->getParent(); const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; Align Alignment = AA.Alloca->getAlign(); // FIXME: This computed padding is likely wrong since it depends on inverse // usage order. // // FIXME: It is also possible that if we're allowed to use all of the memory // could end up using more than the maximum due to alignment padding. 
uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment); std::optional ElemSize = AA.Alloca->getAllocationSize(DL); if (!ElemSize || ElemSize->isScalable()) return false; TypeSize AllocSize = WorkGroupSize * *ElemSize; NewSize += AllocSize.getFixedValue(); if (NewSize > LocalMemLimit) { LLVM_DEBUG(dbgs() << " " << AllocSize << " bytes of local memory not available to promote\n"); return false; } CurrentLocalMemUsage = NewSize; LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n"); Function *F = AA.Alloca->getFunction(); Type *GVTy = ArrayType::get(AA.Alloca->getAllocatedType(), WorkGroupSize); GlobalVariable *GV = new GlobalVariable( *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy), Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); GV->setAlignment(AA.Alloca->getAlign()); Value *TCntY, *TCntZ; std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder); Value *TIdX = getWorkitemID(Builder, 0); Value *TIdY = getWorkitemID(Builder, 1); Value *TIdZ = getWorkitemID(Builder, 2); Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true); Tmp0 = Builder.CreateMul(Tmp0, TIdX); Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true); Value *TID = Builder.CreateAdd(Tmp0, Tmp1); TID = Builder.CreateAdd(TID, TIdZ); LLVMContext &Context = Mod->getContext(); Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID}; Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices); AA.Alloca->mutateType(Offset->getType()); AA.Alloca->replaceAllUsesWith(Offset); AA.Alloca->eraseFromParent(); PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS); for (Value *V : AA.LDS.Worklist) { CallInst *Call = dyn_cast(V); if (!Call) { if (ICmpInst *CI = dyn_cast(V)) { Value *LHS = CI->getOperand(0); Value *RHS = CI->getOperand(1); Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy); if (isa(LHS)) CI->setOperand(0, Constant::getNullValue(NewTy)); if (isa(RHS)) CI->setOperand(1, Constant::getNullValue(NewTy)); continue; } // The operand's value should be corrected on its own and we don't want to // touch the users. if (isa(V)) continue; assert(V->getType()->isPtrOrPtrVectorTy()); Type *NewTy = V->getType()->getWithNewType(NewPtrTy); V->mutateType(NewTy); // Adjust the types of any constant operands. if (SelectInst *SI = dyn_cast(V)) { if (isa(SI->getOperand(1))) SI->setOperand(1, Constant::getNullValue(NewTy)); if (isa(SI->getOperand(2))) SI->setOperand(2, Constant::getNullValue(NewTy)); } else if (PHINode *Phi = dyn_cast(V)) { for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) { if (isa( Phi->getIncomingValue(I))) Phi->setIncomingValue(I, Constant::getNullValue(NewTy)); } } continue; } IntrinsicInst *Intr = cast(Call); Builder.SetInsertPoint(Intr); switch (Intr->getIntrinsicID()) { case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: // These intrinsics are for address space 0 only Intr->eraseFromParent(); continue; case Intrinsic::memcpy: case Intrinsic::memmove: // These have 2 pointer operands. In case if second pointer also needs // to be replaced we defer processing of these intrinsics until all // other values are processed. 
DeferredIntrs.insert(Intr); continue; case Intrinsic::memset: { MemSetInst *MemSet = cast(Intr); Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(), MemSet->getDestAlign(), MemSet->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::invariant_start: case Intrinsic::invariant_end: case Intrinsic::launder_invariant_group: case Intrinsic::strip_invariant_group: { SmallVector Args; if (Intr->getIntrinsicID() == Intrinsic::invariant_start) { Args.emplace_back(Intr->getArgOperand(0)); } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) { Args.emplace_back(Intr->getArgOperand(0)); Args.emplace_back(Intr->getArgOperand(1)); } Args.emplace_back(Offset); Function *F = Intrinsic::getOrInsertDeclaration( Intr->getModule(), Intr->getIntrinsicID(), Offset->getType()); CallInst *NewIntr = CallInst::Create(F, Args, Intr->getName(), Intr->getIterator()); Intr->mutateType(NewIntr->getType()); Intr->replaceAllUsesWith(NewIntr); Intr->eraseFromParent(); continue; } case Intrinsic::objectsize: { Value *Src = Intr->getOperand(0); CallInst *NewCall = Builder.CreateIntrinsic( Intrinsic::objectsize, {Intr->getType(), PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)}, {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)}); Intr->replaceAllUsesWith(NewCall); Intr->eraseFromParent(); continue; } default: Intr->print(errs()); llvm_unreachable("Don't know how to promote alloca intrinsic use."); } } return true; } void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion( SetVector &DeferredIntrs) { for (IntrinsicInst *Intr : DeferredIntrs) { IRBuilder<> Builder(Intr); Builder.SetInsertPoint(Intr); Intrinsic::ID ID = Intr->getIntrinsicID(); assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove); MemTransferInst *MI = cast(Intr); auto *B = Builder.CreateMemTransferInst( ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(), MI->getSourceAlign(), MI->getLength(), MI->isVolatile()); for (unsigned I = 0; I != 2; ++I) { if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) { B->addDereferenceableParamAttr(I, Bytes); } } Intr->eraseFromParent(); } }
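// Illustrative sketch of the vector promotion, for exposition only; this
// comment uses hypothetical values and is not required by the pass. Given an
// input such as:
//
//   %stack = alloca [4 x i32], align 4, addrspace(5)
//   %gep = getelementptr [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
//   store i32 7, ptr addrspace(5) %gep
//   %val = load i32, ptr addrspace(5) %stack
//
// promoteAllocaToVector rewrites the alloca into an SSA value of type
// <4 x i32>, roughly:
//
//   %init = freeze <4 x i32> poison
//   %vec = insertelement <4 x i32> %init, i32 7, i32 %idx
//   %val = extractelement <4 x i32> %vec, i32 0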