Extend copyMetadata to every call-to-call replacement in AMDGPULowerIntrinsics, not just the single-wave s_barrier → wave_barrier path. This covers:

- s_cluster_barrier → wave_barrier (single-wave)
- s_cluster_barrier → signal_isfirst + wait + signal + wait (multi-wave)
- s_barrier → signal + wait (split barriers)

Add GFX11 and GFX12 RUN lines and test functions for all lowering paths to verify metadata preservation.

Made-with: Cursor
220 lines
7.2 KiB
C++
220 lines
7.2 KiB
C++
//===-- AMDGPULowerIntrinsics.cpp -------------------------------------------=//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// Lower intrinsics that would otherwise require separate handling in both
|
|
// SelectionDAG and GlobalISel.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "AMDGPU.h"
|
|
#include "AMDGPUTargetMachine.h"
|
|
#include "GCNSubtarget.h"
|
|
#include "llvm/IR/IRBuilder.h"
|
|
#include "llvm/IR/IntrinsicInst.h"
|
|
#include "llvm/IR/IntrinsicsAMDGPU.h"
|
|
#include "llvm/InitializePasses.h"
|
|
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
|
|
|
#define DEBUG_TYPE "amdgpu-lower-intrinsics"
|
|
|
|
using namespace llvm;
|
|
|
|
namespace {
|
|
|
|
class AMDGPULowerIntrinsicsImpl {
|
|
public:
|
|
Module &M;
|
|
const AMDGPUTargetMachine &TM;
|
|
|
|
AMDGPULowerIntrinsicsImpl(Module &M, const AMDGPUTargetMachine &TM)
|
|
: M(M), TM(TM) {}
|
|
|
|
bool run();
|
|
|
|
private:
|
|
bool visitBarrier(IntrinsicInst &I);
|
|
};
|
|
|
|
class AMDGPULowerIntrinsicsLegacy : public ModulePass {
|
|
public:
|
|
static char ID;
|
|
|
|
AMDGPULowerIntrinsicsLegacy() : ModulePass(ID) {}
|
|
|
|
bool runOnModule(Module &M) override;
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.addRequired<TargetPassConfig>();
|
|
}
|
|
};
|
|
|
|
/// Invokes \p Callback on every user of \p Intrin that is an IntrinsicInst.
/// The user list is walked with an early-increment range, so the callback may
/// erase the instruction it is handed.
template <class T> static void forEachCall(Function &Intrin, T Callback) {
  for (User *IntrinUser : make_early_inc_range(Intrin.users())) {
    auto *Call = dyn_cast<IntrinsicInst>(IntrinUser);
    if (!Call)
      continue;
    Callback(Call);
  }
}
|
|
|
|
} // anonymous namespace
|
|
|
|
bool AMDGPULowerIntrinsicsImpl::run() {
|
|
bool Changed = false;
|
|
|
|
for (Function &F : M) {
|
|
switch (F.getIntrinsicID()) {
|
|
default:
|
|
continue;
|
|
case Intrinsic::amdgcn_s_barrier:
|
|
case Intrinsic::amdgcn_s_barrier_signal:
|
|
case Intrinsic::amdgcn_s_barrier_signal_isfirst:
|
|
case Intrinsic::amdgcn_s_barrier_wait:
|
|
case Intrinsic::amdgcn_s_cluster_barrier:
|
|
forEachCall(F, [&](IntrinsicInst *II) { Changed |= visitBarrier(*II); });
|
|
break;
|
|
}
|
|
}
|
|
|
|
return Changed;
|
|
}
|
|
|
|
// Optimize barriers and lower s_(cluster_)barrier to a sequence of split
|
|
// barrier intrinsics.
|
|
bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {
|
|
assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
|
|
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
|
|
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst ||
|
|
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait ||
|
|
I.getIntrinsicID() == Intrinsic::amdgcn_s_cluster_barrier);
|
|
|
|
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*I.getFunction());
|
|
bool IsSingleWaveWG = false;
|
|
|
|
if (TM.getOptLevel() > CodeGenOptLevel::None)
|
|
IsSingleWaveWG = ST.isSingleWavefrontWorkgroup(*I.getFunction());
|
|
|
|
IRBuilder<> B(&I);
|
|
|
|
// Lower the s_cluster_barrier intrinsic first. There is no corresponding
|
|
// hardware instruction in any subtarget.
|
|
if (I.getIntrinsicID() == Intrinsic::amdgcn_s_cluster_barrier) {
|
|
// The default cluster barrier expects one signal per workgroup. So we need
|
|
// a workgroup barrier first.
|
|
if (IsSingleWaveWG) {
|
|
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {})
|
|
->copyMetadata(I);
|
|
} else {
|
|
Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP);
|
|
Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP);
|
|
CallInst *IsFirst = B.CreateIntrinsic(
|
|
B.getInt1Ty(), Intrinsic::amdgcn_s_barrier_signal_isfirst,
|
|
{BarrierID_32});
|
|
IsFirst->copyMetadata(I);
|
|
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
|
|
{BarrierID_16})
|
|
->copyMetadata(I);
|
|
|
|
Instruction *ThenTerm =
|
|
SplitBlockAndInsertIfThen(IsFirst, I.getIterator(), false);
|
|
B.SetInsertPoint(ThenTerm);
|
|
}
|
|
|
|
// Now we can signal the cluster barrier from a single wave and wait for the
|
|
// barrier in all waves.
|
|
Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::CLUSTER);
|
|
Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::CLUSTER);
|
|
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal,
|
|
{BarrierID_32})
|
|
->copyMetadata(I);
|
|
|
|
B.SetInsertPoint(&I);
|
|
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
|
|
{BarrierID_16})
|
|
->copyMetadata(I);
|
|
|
|
I.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool IsWorkgroupScope = false;
|
|
|
|
if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait ||
|
|
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
|
|
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst) {
|
|
int BarrierID = cast<ConstantInt>(I.getArgOperand(0))->getSExtValue();
|
|
if (BarrierID == AMDGPU::Barrier::TRAP ||
|
|
BarrierID == AMDGPU::Barrier::WORKGROUP ||
|
|
(BarrierID >= AMDGPU::Barrier::NAMED_BARRIER_FIRST &&
|
|
BarrierID <= AMDGPU::Barrier::NAMED_BARRIER_LAST))
|
|
IsWorkgroupScope = true;
|
|
} else {
|
|
assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier);
|
|
IsWorkgroupScope = true;
|
|
}
|
|
|
|
if (IsWorkgroupScope && IsSingleWaveWG) {
|
|
// Down-grade waits, remove split signals.
|
|
if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
|
|
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait) {
|
|
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {})
|
|
->copyMetadata(I);
|
|
} else if (I.getIntrinsicID() ==
|
|
Intrinsic::amdgcn_s_barrier_signal_isfirst) {
|
|
// If we're the only wave of the workgroup, we're always first.
|
|
I.replaceAllUsesWith(B.getInt1(true));
|
|
}
|
|
I.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier &&
|
|
ST.hasSplitBarriers()) {
|
|
// Lower to split barriers.
|
|
Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP);
|
|
Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP);
|
|
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal,
|
|
{BarrierID_32})
|
|
->copyMetadata(I);
|
|
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
|
|
{BarrierID_16})
|
|
->copyMetadata(I);
|
|
I.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/// New pass manager entry point. Reports all analyses as preserved when the
/// module was left unchanged and none otherwise.
PreservedAnalyses AMDGPULowerIntrinsicsPass::run(Module &M,
                                                 ModuleAnalysisManager &MAM) {
  const bool Changed = AMDGPULowerIntrinsicsImpl(M, TM).run();
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
|
|
|
|
/// Legacy pass manager entry point: fetches the AMDGPU target machine from
/// TargetPassConfig and delegates to the shared implementation.
bool AMDGPULowerIntrinsicsLegacy::runOnModule(Module &M) {
  const auto &TM =
      getAnalysis<TargetPassConfig>().getTM<AMDGPUTargetMachine>();
  return AMDGPULowerIntrinsicsImpl(M, TM).run();
}
|
|
|
|
#define PASS_DESC "AMDGPU lower intrinsics"
|
|
INITIALIZE_PASS_BEGIN(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
|
|
false)
|
|
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
|
|
INITIALIZE_PASS_END(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
|
|
false)
|
|
|
|
char AMDGPULowerIntrinsicsLegacy::ID = 0;
|
|
|
|
/// Factory for the legacy pass manager version of the pass. The returned
/// object is heap-allocated with `new`.
ModulePass *llvm::createAMDGPULowerIntrinsicsLegacyPass() {
  return new AMDGPULowerIntrinsicsLegacy();
}
|