Reapply "[AMDGPU] Propagate alias information in AMDGPULowerKernelArguments." (#174977)
Emit `!noalias` and `!alias.scope` metadata for `noalias` kernel arguments. Fixes sanitizer issues in #161375. --------- Co-authored-by: Leon Clark <leoclark@amd.com>
This commit is contained in:
@@ -12,14 +12,26 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPU.h"
|
||||
#include "AMDGPUAsanInstrumentation.h"
|
||||
#include "GCNSubtarget.h"
|
||||
#include "llvm/Analysis/AliasAnalysis.h"
|
||||
#include "llvm/Analysis/CaptureTracking.h"
|
||||
#include "llvm/Analysis/ScopedNoAliasAA.h"
|
||||
#include "llvm/Analysis/ValueTracking.h"
|
||||
#include "llvm/CodeGen/TargetPassConfig.h"
|
||||
#include "llvm/IR/Argument.h"
|
||||
#include "llvm/IR/Attributes.h"
|
||||
#include "llvm/IR/Dominators.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InstIterator.h"
|
||||
#include "llvm/IR/Instruction.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/IntrinsicsAMDGPU.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/MDBuilder.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
|
||||
|
||||
@@ -37,6 +49,7 @@ public:
|
||||
|
||||
// Declares the analyses this legacy pass depends on and what it preserves.
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
// runOnFunction() obtains the TargetMachine through TargetPassConfig.
AU.addRequired<TargetPassConfig>();
|
||||
// runOnFunction() passes the dominator tree to lowerKernelArguments(), which
// forwards it to addAliasScopeMetadata() for its capture-before queries.
AU.addRequired<DominatorTreeWrapperPass>();
|
||||
// All analyses are reported as preserved.
AU.setPreservesAll();
|
||||
}
|
||||
};
|
||||
@@ -58,7 +71,125 @@ static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
|
||||
return InsPt;
|
||||
}
|
||||
|
||||
static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
|
||||
/// Translate the `noalias` attribute of the arguments of \p F into scoped
/// alias metadata on the instructions of \p F.
///
/// One anonymous alias scope is created per used `noalias` argument, all in a
/// single new domain named after the function. For every instruction that has
/// a memory location, or is a call that may access memory and takes pointer
/// operands:
///  - `!noalias` receives the scope of each noalias argument the accessed
///    pointers are known not to be based on;
///  - `!alias.scope` receives the scope of each noalias argument the accessed
///    pointers are based on, provided every underlying object of the access is
///    a noalias argument (or constant data).
/// Metadata already present on an instruction is kept and concatenated with
/// the new nodes.
///
/// \param F  Function to annotate.
/// \param DL Data layout of the module; currently unused.
/// \param DT Dominator tree of \p F, used by the capture-before queries.
static void addAliasScopeMetadata(Function &F, const DataLayout &DL,
                                  DominatorTree &DT) {
  // Collect the noalias arguments that are actually used.
  SmallVector<const Argument *, 4u> NoAliasArgs;
  for (Argument &Arg : F.args())
    if (Arg.hasNoAliasAttr() && !Arg.use_empty())
      NoAliasArgs.push_back(&Arg);

  if (NoAliasArgs.empty())
    return;

  // Create one alias scope per noalias argument, all in a fresh domain.
  MDBuilder MDB(F.getContext());
  DenseMap<const Argument *, MDNode *> NewScopes;
  MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain(F.getName());

  for (const Argument *Arg : NoAliasArgs)
    NewScopes.insert(
        {Arg, MDB.createAnonymousAliasScope(NewDomain, Arg->getName())});

  for (Instruction &Inst : instructions(F)) {
    Instruction *I = &Inst;

    // Gather the pointers this instruction may access: either its single
    // memory location, or every pointer operand of a memory-accessing call.
    SmallVector<const Value *, 2u> PtrArgs;
    if (std::optional<MemoryLocation> MO = MemoryLocation::getOrNone(I)) {
      PtrArgs.push_back(MO->Ptr);
    } else if (const CallBase *Call = dyn_cast<CallBase>(I)) {
      if (Call->doesNotAccessMemory())
        continue;

      for (Value *Arg : Call->args())
        if (Arg->getType()->isPointerTy())
          PtrArgs.push_back(Arg);
    }

    if (PtrArgs.empty())
      continue;

    // Collect the underlying objects of all accessed pointers.
    SmallPtrSet<const Value *, 4u> ObjSet;
    for (const Value *Val : PtrArgs) {
      SmallVector<const Value *, 4u> Objects;
      getUnderlyingObjects(Val, Objects);
      ObjSet.insert_range(Objects);
    }

    // Set when some underlying object is an escape source; such an object can
    // only alias a noalias argument that was captured before this instruction.
    bool RequiresNoCaptureBefore = false;
    // Set when some underlying object is neither an escape source, an
    // argument, nor an identified object; nothing can be concluded then.
    bool UsesUnknownObject = false;
    // Set when some underlying object is anything other than a noalias
    // argument; alias.scope cannot describe such an access.
    bool UsesAliasingPtr = false;

    for (const Value *Val : ObjSet) {
      // Constant data is ignored when classifying the access.
      if (isa<ConstantData>(Val))
        continue;

      const Argument *Arg = dyn_cast<Argument>(Val);
      if (!Arg || !Arg->hasAttribute(Attribute::NoAlias))
        UsesAliasingPtr = true;

      if (isEscapeSource(Val))
        RequiresNoCaptureBefore = true;
      else if (!Arg && !isIdentifiedObject(Val))
        // Fix: only objects that are NOT identified are unknown. The previous
        // condition lacked the negation, so it skipped accesses to identified
        // objects and annotated accesses to unknown ones.
        UsesUnknownObject = true;
    }

    // Be conservative: add nothing when the access may touch an unknown
    // object.
    if (UsesUnknownObject)
      continue;

    // Collect the scopes of the noalias arguments this access is not based on.
    SmallVector<Metadata *, 4u> NoAliases;
    for (const Argument *Arg : NoAliasArgs) {
      if (ObjSet.contains(Arg))
        continue;

      // When an escape source is involved, the argument's scope is only added
      // if the argument's provenance cannot have been captured before I.
      if (!RequiresNoCaptureBefore ||
          !capturesAnything(PointerMayBeCapturedBefore(
              Arg, /*ReturnCaptures=*/false, I, &DT, /*IncludeI=*/false,
              CaptureComponents::Provenance)))
        NoAliases.push_back(NewScopes.lookup(Arg));
    }

    // Add noalias metadata to the instruction, keeping any existing entries.
    if (!NoAliases.empty()) {
      MDNode *NewMD =
          MDNode::concatenate(I->getMetadata(LLVMContext::MD_noalias),
                              MDNode::get(F.getContext(), NoAliases));
      I->setMetadata(LLVMContext::MD_noalias, NewMD);
    }

    // Collect the scopes of the noalias arguments this access is based on,
    // but only when every underlying object is a noalias argument.
    SmallVector<Metadata *, 4u> Scopes;
    if (!UsesAliasingPtr)
      for (const Argument *Arg : NoAliasArgs)
        if (ObjSet.contains(Arg))
          Scopes.push_back(NewScopes.lookup(Arg));

    // Add alias.scope metadata to the instruction, keeping existing entries.
    if (!Scopes.empty()) {
      MDNode *NewMD =
          MDNode::concatenate(I->getMetadata(LLVMContext::MD_alias_scope),
                              MDNode::get(F.getContext(), Scopes));
      I->setMetadata(LLVMContext::MD_alias_scope, NewMD);
    }
  }
}
|
||||
|
||||
static bool lowerKernelArguments(Function &F, const TargetMachine &TM,
|
||||
DominatorTree &DT) {
|
||||
CallingConv::ID CC = F.getCallingConv();
|
||||
if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
|
||||
return false;
|
||||
@@ -86,6 +217,9 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
|
||||
Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
|
||||
|
||||
uint64_t ExplicitArgOffset = 0;
|
||||
|
||||
addAliasScopeMetadata(F, F.getParent()->getDataLayout(), DT);
|
||||
|
||||
for (Argument &Arg : F.args()) {
|
||||
const bool IsByRef = Arg.hasByRefAttr();
|
||||
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
|
||||
@@ -124,11 +258,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
|
||||
PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
|
||||
!ST.hasUsableDSOffset())
|
||||
continue;
|
||||
|
||||
// FIXME: We can replace this with equivalent alias.scope/noalias
|
||||
// metadata, but this appears to be a lot of work.
|
||||
if (Arg.hasNoAliasAttr())
|
||||
continue;
|
||||
}
|
||||
|
||||
auto *VT = dyn_cast<FixedVectorType>(ArgTy);
|
||||
@@ -215,8 +344,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Convert noalias arg to !noalias
|
||||
|
||||
if (DoShiftOpt) {
|
||||
Value *ExtractBits = OffsetDiff == 0 ?
|
||||
Load : Builder.CreateLShr(Load, OffsetDiff * 8);
|
||||
@@ -245,7 +372,8 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
|
||||
bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
|
||||
auto &TPC = getAnalysis<TargetPassConfig>();
|
||||
const TargetMachine &TM = TPC.getTM<TargetMachine>();
|
||||
return lowerKernelArguments(F, TM);
|
||||
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||
return lowerKernelArguments(F, TM, DT);
|
||||
}
|
||||
|
||||
INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
|
||||
@@ -261,7 +389,8 @@ FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
|
||||
|
||||
PreservedAnalyses
|
||||
AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) {
|
||||
bool Changed = lowerKernelArguments(F, TM);
|
||||
DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
|
||||
bool Changed = lowerKernelArguments(F, TM, DT);
|
||||
if (Changed) {
|
||||
// TODO: Preserves a lot more.
|
||||
PreservedAnalyses PA;
|
||||
|
||||
@@ -105,11 +105,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out,
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v2, v0
|
||||
; VI-NEXT: flat_load_dword v2, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v2, v2
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -181,8 +181,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v1, v1
|
||||
; VI-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
@@ -261,8 +261,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v3, v3
|
||||
; VI-NEXT: v_ffbh_u32_e32 v2, v2
|
||||
@@ -534,13 +534,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
|
||||
; VI-LABEL: s_ctlz_zero_undef_i64_with_select:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v3, 0
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_flbit_i32_b64 s2, s[2:3]
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_flbit_i32_b64 s0, s[2:3]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: s_ctlz_zero_undef_i64_with_select:
|
||||
@@ -605,15 +605,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
|
||||
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; VI-NEXT: v_ffbh_u32_e32 v1, v1
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
|
||||
; VI-NEXT: flat_load_ubyte v2, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v2
|
||||
; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; VI-NEXT: v_ffbh_u32_e32 v3, v3
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc
|
||||
; VI-NEXT: flat_store_byte v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -706,20 +706,20 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: flat_load_ubyte v2, v[2:3]
|
||||
; VI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_readfirstlane_b32 s2, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_readfirstlane_b32 s3, v0
|
||||
; VI-NEXT: s_lshl_b32 s2, s2, 8
|
||||
; VI-NEXT: s_or_b32 s2, s2, s3
|
||||
; VI-NEXT: s_lshl_b32 s3, s2, 16
|
||||
; VI-NEXT: s_flbit_i32_b32 s3, s3
|
||||
; VI-NEXT: s_and_b32 s2, s2, 0xffff
|
||||
; VI-NEXT: s_cselect_b32 s2, s3, 32
|
||||
; VI-NEXT: flat_load_ubyte v3, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_readfirstlane_b32 s0, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_readfirstlane_b32 s1, v3
|
||||
; VI-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; VI-NEXT: s_or_b32 s0, s0, s1
|
||||
; VI-NEXT: s_lshl_b32 s1, s0, 16
|
||||
; VI-NEXT: s_flbit_i32_b32 s1, s1
|
||||
; VI-NEXT: s_and_b32 s0, s0, 0xffff
|
||||
; VI-NEXT: s_cselect_b32 s0, s1, 32
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_short v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -813,37 +813,37 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_add_u32 s4, s2, 3
|
||||
; VI-NEXT: s_add_u32 s4, s2, 1
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: s_add_u32 s4, s2, 2
|
||||
; VI-NEXT: s_add_u32 s4, s2, 3
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: s_add_u32 s2, s2, 1
|
||||
; VI-NEXT: s_addc_u32 s3, s3, 0
|
||||
; VI-NEXT: s_add_u32 s2, s2, 2
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s3
|
||||
; VI-NEXT: s_addc_u32 s3, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v6, s2
|
||||
; VI-NEXT: flat_load_ubyte v2, v[2:3]
|
||||
; VI-NEXT: flat_load_ubyte v3, v[4:5]
|
||||
; VI-NEXT: flat_load_ubyte v4, v[6:7]
|
||||
; VI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(3)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; VI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; VI-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; VI-NEXT: v_min_u32_e32 v2, 32, v0
|
||||
; VI-NEXT: flat_load_ubyte v3, v[0:1]
|
||||
; VI-NEXT: flat_load_ubyte v4, v[4:5]
|
||||
; VI-NEXT: flat_load_ubyte v5, v[6:7]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(3)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v2, v3, v2
|
||||
; VI-NEXT: v_ffbh_u32_e32 v2, v2
|
||||
; VI-NEXT: v_min_u32_e32 v2, 32, v2
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -964,29 +964,30 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v6, s4
|
||||
; VI-NEXT: s_add_u32 s4, s2, 3
|
||||
; VI-NEXT: s_add_u32 s4, s2, 1
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v9, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v8, s4
|
||||
; VI-NEXT: s_add_u32 s4, s2, 2
|
||||
; VI-NEXT: s_add_u32 s4, s2, 3
|
||||
; VI-NEXT: flat_load_ubyte v10, v[0:1]
|
||||
; VI-NEXT: flat_load_ubyte v11, v[2:3]
|
||||
; VI-NEXT: flat_load_ubyte v12, v[4:5]
|
||||
; VI-NEXT: flat_load_ubyte v6, v[6:7]
|
||||
; VI-NEXT: flat_load_ubyte v7, v[8:9]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; VI-NEXT: s_add_u32 s4, s2, 1
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: s_add_u32 s2, s2, 2
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: s_addc_u32 s3, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s2
|
||||
; VI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; VI-NEXT: flat_load_ubyte v8, v[0:1]
|
||||
; VI-NEXT: flat_load_ubyte v2, v[2:3]
|
||||
; VI-NEXT: flat_load_ubyte v3, v[4:5]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(7)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v10
|
||||
; VI-NEXT: s_waitcnt vmcnt(6)
|
||||
@@ -1000,19 +1001,18 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7
|
||||
; VI-NEXT: v_ffbh_u32_e32 v4, v4
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v5, v5, v8
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; VI-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; VI-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
|
||||
; VI-NEXT: v_min_u32_e32 v0, v0, v4
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_min_u32_e32 v0, 64, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v2, v2, v5
|
||||
; VI-NEXT: v_ffbh_u32_e32 v2, v2
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
|
||||
; VI-NEXT: v_min_u32_e32 v2, v2, v4
|
||||
; VI-NEXT: v_min_u32_e32 v2, 64, v2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, 0
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: v_ctlz_zero_undef_i64_with_select:
|
||||
@@ -1118,12 +1118,12 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
|
||||
; VI-NEXT: v_ffbh_u32_e32 v2, v0
|
||||
; VI-NEXT: flat_load_ubyte v2, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v2
|
||||
; VI-NEXT: v_ffbh_u32_e32 v2, v2
|
||||
; VI-NEXT: flat_store_byte v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -1258,10 +1258,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_flbit_i32_b64 s2, s[2:3]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_flbit_i32_b64 s0, s[2:3]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -1504,11 +1504,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v2, v0
|
||||
; VI-NEXT: flat_load_dword v2, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v2, v2
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -1583,11 +1583,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v2, v0
|
||||
; VI-NEXT: flat_load_dword v2, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v2, v2
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -1660,11 +1660,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v2, v0
|
||||
; VI-NEXT: flat_load_ubyte v2, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v2, v2
|
||||
; VI-NEXT: flat_store_byte v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -1857,13 +1857,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v1, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v2, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v3, v2
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -1941,13 +1941,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v1, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v2, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v3, v2
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -2025,13 +2025,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v1, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v2, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v3, v2
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -2110,13 +2110,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v1, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v2, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v3, v2
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
|
||||
@@ -92,11 +92,11 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out,
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v0
|
||||
; VI-NEXT: flat_load_dword v2, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v2
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -168,8 +168,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbl_b32_e32 v1, v1
|
||||
; VI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
@@ -248,8 +248,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbl_b32_e32 v3, v3
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v2
|
||||
@@ -511,13 +511,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
|
||||
; VI-LABEL: s_cttz_zero_undef_i64_with_select:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v3, 0
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_ff1_i32_b64 s2, s[2:3]
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_ff1_i32_b64 s0, s[2:3]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: s_cttz_zero_undef_i64_with_select:
|
||||
@@ -581,14 +581,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbl_b32_e32 v1, v0
|
||||
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
|
||||
; VI-NEXT: flat_load_ubyte v2, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbl_b32_e32 v3, v2
|
||||
; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc
|
||||
; VI-NEXT: flat_store_byte v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -677,17 +677,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: flat_load_ubyte v2, v[2:3]
|
||||
; VI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; VI-NEXT: v_ffbl_b32_e32 v1, v0
|
||||
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
|
||||
; VI-NEXT: flat_load_ubyte v3, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; VI-NEXT: v_ffbl_b32_e32 v3, v2
|
||||
; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc
|
||||
; VI-NEXT: flat_store_short v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -778,37 +778,37 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_add_u32 s4, s2, 3
|
||||
; VI-NEXT: s_add_u32 s4, s2, 1
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: s_add_u32 s4, s2, 2
|
||||
; VI-NEXT: s_add_u32 s4, s2, 3
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: s_add_u32 s2, s2, 1
|
||||
; VI-NEXT: s_addc_u32 s3, s3, 0
|
||||
; VI-NEXT: s_add_u32 s2, s2, 2
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s3
|
||||
; VI-NEXT: s_addc_u32 s3, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v6, s2
|
||||
; VI-NEXT: flat_load_ubyte v2, v[2:3]
|
||||
; VI-NEXT: flat_load_ubyte v3, v[4:5]
|
||||
; VI-NEXT: flat_load_ubyte v4, v[6:7]
|
||||
; VI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(3)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; VI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; VI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; VI-NEXT: v_min_u32_e32 v2, 32, v0
|
||||
; VI-NEXT: flat_load_ubyte v3, v[0:1]
|
||||
; VI-NEXT: flat_load_ubyte v4, v[4:5]
|
||||
; VI-NEXT: flat_load_ubyte v5, v[6:7]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(3)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v2, v3, v2
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v2
|
||||
; VI-NEXT: v_min_u32_e32 v2, 32, v2
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -929,55 +929,55 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v6, s4
|
||||
; VI-NEXT: s_add_u32 s4, s2, 3
|
||||
; VI-NEXT: s_add_u32 s4, s2, 1
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v9, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v8, s4
|
||||
; VI-NEXT: s_add_u32 s4, s2, 2
|
||||
; VI-NEXT: flat_load_ubyte v10, v[0:1]
|
||||
; VI-NEXT: flat_load_ubyte v11, v[2:3]
|
||||
; VI-NEXT: flat_load_ubyte v12, v[4:5]
|
||||
; VI-NEXT: flat_load_ubyte v6, v[6:7]
|
||||
; VI-NEXT: flat_load_ubyte v7, v[8:9]
|
||||
; VI-NEXT: s_add_u32 s4, s2, 3
|
||||
; VI-NEXT: v_mov_b32_e32 v11, s3
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v10, s2
|
||||
; VI-NEXT: flat_load_ubyte v12, v[0:1]
|
||||
; VI-NEXT: flat_load_ubyte v13, v[2:3]
|
||||
; VI-NEXT: flat_load_ubyte v4, v[4:5]
|
||||
; VI-NEXT: flat_load_ubyte v5, v[6:7]
|
||||
; VI-NEXT: flat_load_ubyte v6, v[8:9]
|
||||
; VI-NEXT: flat_load_ubyte v7, v[10:11]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; VI-NEXT: s_add_u32 s2, s2, 2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; VI-NEXT: s_add_u32 s4, s2, 1
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s2
|
||||
; VI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; VI-NEXT: s_addc_u32 s3, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s3
|
||||
; VI-NEXT: flat_load_ubyte v8, v[0:1]
|
||||
; VI-NEXT: flat_load_ubyte v2, v[2:3]
|
||||
; VI-NEXT: flat_load_ubyte v3, v[4:5]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(7)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v10
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12
|
||||
; VI-NEXT: s_waitcnt vmcnt(6)
|
||||
; VI-NEXT: v_or_b32_e32 v4, v4, v11
|
||||
; VI-NEXT: v_or_b32_e32 v3, v3, v13
|
||||
; VI-NEXT: s_waitcnt vmcnt(5)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v12
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
|
||||
; VI-NEXT: s_waitcnt vmcnt(4)
|
||||
; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v4, v5, v4
|
||||
; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v3, v4, v3
|
||||
; VI-NEXT: s_waitcnt vmcnt(3)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7
|
||||
; VI-NEXT: v_ffbl_b32_e32 v4, v4
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v4
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v4, v4, v7
|
||||
; VI-NEXT: v_ffbl_b32_e32 v3, v3
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v3
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v8
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; VI-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; VI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; VI-NEXT: v_min_u32_e32 v0, v4, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_min_u32_e32 v0, 64, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v2, v2, v4
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v2
|
||||
; VI-NEXT: v_min_u32_e32 v2, v3, v2
|
||||
; VI-NEXT: v_min_u32_e32 v2, 64, v2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, 0
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: v_cttz_zero_undef_i64_with_select:
|
||||
@@ -1091,36 +1091,36 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_add_u32 s4, s2, 3
|
||||
; VI-NEXT: s_add_u32 s4, s2, 1
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: s_add_u32 s4, s2, 2
|
||||
; VI-NEXT: s_add_u32 s4, s2, 3
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: s_add_u32 s2, s2, 1
|
||||
; VI-NEXT: s_addc_u32 s3, s3, 0
|
||||
; VI-NEXT: s_add_u32 s2, s2, 2
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s3
|
||||
; VI-NEXT: s_addc_u32 s3, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v6, s2
|
||||
; VI-NEXT: flat_load_ubyte v2, v[2:3]
|
||||
; VI-NEXT: flat_load_ubyte v3, v[4:5]
|
||||
; VI-NEXT: flat_load_ubyte v4, v[6:7]
|
||||
; VI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(3)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; VI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v0
|
||||
; VI-NEXT: flat_load_ubyte v3, v[0:1]
|
||||
; VI-NEXT: flat_load_ubyte v4, v[4:5]
|
||||
; VI-NEXT: flat_load_ubyte v5, v[6:7]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(3)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v2, v3, v2
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v2
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -1213,36 +1213,36 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_add_u32 s4, s2, 3
|
||||
; VI-NEXT: s_add_u32 s4, s2, 1
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: s_add_u32 s4, s2, 2
|
||||
; VI-NEXT: s_add_u32 s4, s2, 3
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: s_add_u32 s2, s2, 1
|
||||
; VI-NEXT: s_addc_u32 s3, s3, 0
|
||||
; VI-NEXT: s_add_u32 s2, s2, 2
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s3
|
||||
; VI-NEXT: s_addc_u32 s3, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v6, s2
|
||||
; VI-NEXT: flat_load_ubyte v2, v[2:3]
|
||||
; VI-NEXT: flat_load_ubyte v3, v[4:5]
|
||||
; VI-NEXT: flat_load_ubyte v4, v[6:7]
|
||||
; VI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(3)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; VI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v0
|
||||
; VI-NEXT: flat_load_ubyte v3, v[0:1]
|
||||
; VI-NEXT: flat_load_ubyte v4, v[4:5]
|
||||
; VI-NEXT: flat_load_ubyte v5, v[6:7]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(3)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v2, v3, v2
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v2
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -1338,39 +1338,39 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_add_u32 s4, s2, 3
|
||||
; VI-NEXT: s_add_u32 s4, s2, 1
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: s_add_u32 s4, s2, 2
|
||||
; VI-NEXT: s_add_u32 s4, s2, 3
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: s_add_u32 s2, s2, 1
|
||||
; VI-NEXT: s_addc_u32 s3, s3, 0
|
||||
; VI-NEXT: s_add_u32 s2, s2, 2
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s3
|
||||
; VI-NEXT: s_addc_u32 s3, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v6, s2
|
||||
; VI-NEXT: flat_load_ubyte v2, v[2:3]
|
||||
; VI-NEXT: flat_load_ubyte v3, v[4:5]
|
||||
; VI-NEXT: flat_load_ubyte v4, v[6:7]
|
||||
; VI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(3)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; VI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; VI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; VI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
|
||||
; VI-NEXT: flat_load_ubyte v3, v[0:1]
|
||||
; VI-NEXT: flat_load_ubyte v4, v[4:5]
|
||||
; VI-NEXT: flat_load_ubyte v5, v[6:7]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(3)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v2, v3, v2
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v2
|
||||
; VI-NEXT: v_min_u32_e32 v2, 32, v2
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -1455,11 +1455,11 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v0
|
||||
; VI-NEXT: flat_load_ubyte v2, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v2
|
||||
; VI-NEXT: flat_store_byte v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@@ -1541,19 +1541,19 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_add_u32 s4, s2, 1
|
||||
; VI-NEXT: s_addc_u32 s5, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s3
|
||||
; VI-NEXT: flat_load_ubyte v4, v[0:1]
|
||||
; VI-NEXT: flat_load_ubyte v2, v[2:3]
|
||||
; VI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_e32 v2, v3, v2
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v2
|
||||
; VI-NEXT: flat_store_short v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
|
||||
@@ -1466,10 +1466,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
|
||||
; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
|
||||
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2
|
||||
; SI-NEXT: s_mov_b32 s10, -1
|
||||
; SI-NEXT: s_mov_b32 s8, s2
|
||||
; SI-NEXT: s_mov_b32 s9, s3
|
||||
; SI-NEXT: s_mov_b32 s2, s10
|
||||
; SI-NEXT: s_mov_b32 s3, s11
|
||||
; SI-NEXT: s_mov_b32 s8, s0
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
; SI-NEXT: s_mov_b32 s6, s10
|
||||
; SI-NEXT: s_mov_b32 s7, s11
|
||||
; SI-NEXT: s_mov_b32 s4, s2
|
||||
; SI-NEXT: s_mov_b32 s5, s3
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
@@ -1485,15 +1487,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
|
||||
; SI-NEXT: v_alignbit_b32 v4, v4, v5, 24
|
||||
; SI-NEXT: v_or_b32_e32 v4, v4, v6
|
||||
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0
|
||||
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
|
||||
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; VI-NEXT: s_mov_b32 s8, 0x4000405
|
||||
; VI-NEXT: s_mov_b32 s12, 0x4000405
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v0
|
||||
@@ -1515,10 +1517,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
|
||||
; VI-NEXT: flat_load_ubyte v4, v[0:1]
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_mov_b32 s4, s2
|
||||
; VI-NEXT: s_mov_b32 s5, s3
|
||||
; VI-NEXT: s_mov_b32 s2, s6
|
||||
; VI-NEXT: s_mov_b32 s3, s7
|
||||
; VI-NEXT: s_mov_b32 s4, s0
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: s_mov_b32 s10, s6
|
||||
; VI-NEXT: s_mov_b32 s11, s7
|
||||
; VI-NEXT: s_mov_b32 s8, s2
|
||||
; VI-NEXT: s_mov_b32 s9, s3
|
||||
; VI-NEXT: s_waitcnt vmcnt(3)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v6
|
||||
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6
|
||||
@@ -1531,9 +1535,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
|
||||
; VI-NEXT: v_or_b32_e32 v4, v5, v4
|
||||
; VI-NEXT: v_or_b32_e32 v5, v7, v3
|
||||
; VI-NEXT: v_mov_b32_e32 v3, v1
|
||||
; VI-NEXT: v_perm_b32 v4, v4, v5, s8
|
||||
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0
|
||||
; VI-NEXT: v_perm_b32 v4, v4, v5, s12
|
||||
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
|
||||
@@ -1628,21 +1632,23 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
|
||||
define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind {
|
||||
; SI-LABEL: load_v4i8_to_v4f32_2_uses:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s10, 0
|
||||
; SI-NEXT: s_mov_b32 s11, s3
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, 0
|
||||
; SI-NEXT: s_mov_b32 s3, s7
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_mov_b32 s10, s6
|
||||
; SI-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s0, s6
|
||||
; SI-NEXT: s_mov_b32 s1, s7
|
||||
; SI-NEXT: s_mov_b32 s6, s2
|
||||
; SI-NEXT: s_mov_b32 s7, s3
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_mov_b32 s8, s2
|
||||
; SI-NEXT: s_mov_b32 s9, s3
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
|
||||
; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4
|
||||
@@ -1664,29 +1670,31 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; SI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: load_v4i8_to_v4f32_2_uses:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: v_mov_b32_e32 v5, 0xffffff00
|
||||
; VI-NEXT: v_mov_b32_e32 v6, 9
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v4, v[0:1]
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: v_mov_b32_e32 v6, 9
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: v_mov_b32_e32 v7, 0x900
|
||||
; VI-NEXT: s_mov_b32 s10, s6
|
||||
; VI-NEXT: s_mov_b32 s11, s7
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_mov_b32 s4, s2
|
||||
; VI-NEXT: s_mov_b32 s5, s3
|
||||
; VI-NEXT: s_mov_b32 s2, s6
|
||||
; VI-NEXT: s_mov_b32 s3, s7
|
||||
; VI-NEXT: s_mov_b32 s4, s0
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: s_mov_b32 s8, s2
|
||||
; VI-NEXT: s_mov_b32 s9, s3
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
|
||||
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
|
||||
@@ -1696,14 +1704,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
|
||||
; VI-NEXT: v_add_u16_e32 v9, 9, v4
|
||||
; VI-NEXT: v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; VI-NEXT: s_nop 0
|
||||
; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; VI-NEXT: v_or_b32_sdwa v1, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; VI-NEXT: v_add_u16_e32 v0, 0x900, v0
|
||||
; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_v4i8_to_v4f32_2_uses:
|
||||
|
||||
@@ -54,6 +54,7 @@
|
||||
; GCN-O0-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
|
||||
; GCN-O0-NEXT: Scalarize Masked Memory Intrinsics
|
||||
; GCN-O0-NEXT: Expand reduction intrinsics
|
||||
; GCN-O0-NEXT: Dominator Tree Construction
|
||||
; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments
|
||||
; GCN-O0-NEXT: Lower buffer fat pointer operations to buffer resources
|
||||
; GCN-O0-NEXT: AMDGPU lower intrinsics
|
||||
@@ -244,8 +245,8 @@
|
||||
; GCN-O1-NEXT: Expand reduction intrinsics
|
||||
; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments
|
||||
; GCN-O1-NEXT: FunctionPass Manager
|
||||
; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments
|
||||
; GCN-O1-NEXT: Dominator Tree Construction
|
||||
; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments
|
||||
; GCN-O1-NEXT: Natural Loop Information
|
||||
; GCN-O1-NEXT: CodeGen Prepare
|
||||
; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources
|
||||
@@ -551,8 +552,8 @@
|
||||
; GCN-O1-OPTS-NEXT: Early CSE
|
||||
; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments
|
||||
; GCN-O1-OPTS-NEXT: FunctionPass Manager
|
||||
; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments
|
||||
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
|
||||
; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments
|
||||
; GCN-O1-OPTS-NEXT: Natural Loop Information
|
||||
; GCN-O1-OPTS-NEXT: CodeGen Prepare
|
||||
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
|
||||
@@ -874,8 +875,8 @@
|
||||
; GCN-O2-NEXT: Early CSE
|
||||
; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments
|
||||
; GCN-O2-NEXT: FunctionPass Manager
|
||||
; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments
|
||||
; GCN-O2-NEXT: Dominator Tree Construction
|
||||
; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments
|
||||
; GCN-O2-NEXT: Natural Loop Information
|
||||
; GCN-O2-NEXT: CodeGen Prepare
|
||||
; GCN-O2-NEXT: Dominator Tree Construction
|
||||
@@ -1212,8 +1213,8 @@
|
||||
; GCN-O3-NEXT: Global Value Numbering
|
||||
; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments
|
||||
; GCN-O3-NEXT: FunctionPass Manager
|
||||
; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments
|
||||
; GCN-O3-NEXT: Dominator Tree Construction
|
||||
; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments
|
||||
; GCN-O3-NEXT: Natural Loop Information
|
||||
; GCN-O3-NEXT: CodeGen Prepare
|
||||
; GCN-O3-NEXT: Dominator Tree Construction
|
||||
|
||||
@@ -1119,21 +1119,44 @@ define amdgpu_kernel void @kern_align32_global_ptr(ptr addrspace(1) align 1024 %
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @kern_noalias_global_ptr(ptr addrspace(1) noalias %ptr) #0 {
|
||||
; GCN-LABEL: @kern_noalias_global_ptr(
|
||||
; GCN-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; GCN-NEXT: store volatile ptr addrspace(1) [[PTR:%.*]], ptr addrspace(1) poison, align 8
|
||||
; GCN-NEXT: ret void
|
||||
; HSA-LABEL: @kern_noalias_global_ptr(
|
||||
; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
|
||||
; HSA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
|
||||
; HSA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META5:![0-9]+]]
|
||||
; HSA-NEXT: ret void
|
||||
;
|
||||
; MESA-LABEL: @kern_noalias_global_ptr(
|
||||
; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
|
||||
; MESA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; MESA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META5:![0-9]+]]
|
||||
; MESA-NEXT: ret void
|
||||
;
|
||||
store volatile ptr addrspace(1) %ptr, ptr addrspace(1) poison
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @kern_noalias_global_ptr_x2(ptr addrspace(1) noalias %ptr0, ptr addrspace(1) noalias %ptr1) #0 {
|
||||
; GCN-LABEL: @kern_noalias_global_ptr_x2(
|
||||
; GCN-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; GCN-NEXT: store volatile ptr addrspace(1) [[PTR0:%.*]], ptr addrspace(1) poison, align 8
|
||||
; GCN-NEXT: store volatile ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(1) poison, align 8
|
||||
; GCN-NEXT: ret void
|
||||
; HSA-LABEL: @kern_noalias_global_ptr_x2(
|
||||
; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; HSA-NEXT: [[PTR0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 0
|
||||
; HSA-NEXT: [[PTR0_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR0_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
|
||||
; HSA-NEXT: [[PTR1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 8
|
||||
; HSA-NEXT: [[PTR1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
|
||||
; HSA-NEXT: store volatile ptr addrspace(1) [[PTR0_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8:![0-9]+]]
|
||||
; HSA-NEXT: store volatile ptr addrspace(1) [[PTR1_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8]]
|
||||
; HSA-NEXT: ret void
|
||||
;
|
||||
; MESA-LABEL: @kern_noalias_global_ptr_x2(
|
||||
; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; MESA-NEXT: [[PTR0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 36
|
||||
; MESA-NEXT: [[PTR0_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR0_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; MESA-NEXT: [[PTR1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 44
|
||||
; MESA-NEXT: [[PTR1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; MESA-NEXT: store volatile ptr addrspace(1) [[PTR0_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8:![0-9]+]]
|
||||
; MESA-NEXT: store volatile ptr addrspace(1) [[PTR1_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8]]
|
||||
; MESA-NEXT: ret void
|
||||
;
|
||||
store volatile ptr addrspace(1) %ptr0, ptr addrspace(1) poison
|
||||
store volatile ptr addrspace(1) %ptr1, ptr addrspace(1) poison
|
||||
@@ -1855,10 +1878,24 @@ attributes #2 = { nounwind "target-cpu"="tahiti" }
|
||||
; HSA: [[META2]] = !{i64 42}
|
||||
; HSA: [[META3]] = !{i64 128}
|
||||
; HSA: [[META4]] = !{i64 1024}
|
||||
; HSA: [[META5]] = !{[[META6:![0-9]+]]}
|
||||
; HSA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"ptr"}
|
||||
; HSA: [[META7]] = distinct !{[[META7]], !"kern_noalias_global_ptr"}
|
||||
; HSA: [[META8]] = !{[[META9:![0-9]+]], [[META11:![0-9]+]]}
|
||||
; HSA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"ptr0"}
|
||||
; HSA: [[META10]] = distinct !{[[META10]], !"kern_noalias_global_ptr_x2"}
|
||||
; HSA: [[META11]] = distinct !{[[META11]], [[META10]], !"ptr1"}
|
||||
;.
|
||||
; MESA: [[META0]] = !{}
|
||||
; MESA: [[RNG1]] = !{i32 0, i32 8}
|
||||
; MESA: [[META2]] = !{i64 42}
|
||||
; MESA: [[META3]] = !{i64 128}
|
||||
; MESA: [[META4]] = !{i64 1024}
|
||||
; MESA: [[META5]] = !{[[META6:![0-9]+]]}
|
||||
; MESA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"ptr"}
|
||||
; MESA: [[META7]] = distinct !{[[META7]], !"kern_noalias_global_ptr"}
|
||||
; MESA: [[META8]] = !{[[META9:![0-9]+]], [[META11:![0-9]+]]}
|
||||
; MESA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"ptr0"}
|
||||
; MESA: [[META10]] = distinct !{[[META10]], !"kern_noalias_global_ptr_x2"}
|
||||
; MESA: [[META11]] = distinct !{[[META11]], [[META10]], !"ptr1"}
|
||||
;.
|
||||
|
||||
434
llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll
Normal file
434
llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll
Normal file
@@ -0,0 +1,434 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
|
||||
; RUN: opt -mtriple=amdgcn-- -S -o - -passes=amdgpu-lower-kernel-arguments %s | FileCheck %s
|
||||
|
||||
; Baseline for the two-pointer kernel: neither argument is noalias and the
; input load/store carry no scoped-alias metadata. The expected output only
; shows the kernarg-segment loads replacing %out/%in; the user load and store
; are checked without any !alias.scope or !noalias.
define amdgpu_kernel void @aliasinfo_2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
|
||||
; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_2i32(
|
||||
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN:%.*]]) #[[ATTR0:[0-9]+]] {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ALIASINFO_2I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_KERNARG_SEGMENT]], i64 36
|
||||
; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0:![0-9]+]]
|
||||
; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_KERNARG_SEGMENT]], i64 44
|
||||
; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
|
||||
; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4
|
||||
; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR3:[0-9]+]]
|
||||
; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
|
||||
%val = load i32, ptr addrspace(1) %in.gep, align 4
|
||||
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
|
||||
store i32 %ctlz, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Same body as @aliasinfo_2i32, but both pointer arguments are noalias (NA).
; The expected output attaches scoped-alias metadata to the user accesses:
; the load through %in gets !alias.scope META1 / !noalias META4, and the store
; through %out gets the mirrored pair (!alias.scope META4 / !noalias META1).
define amdgpu_kernel void @aliasinfo_2i32_NA(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
|
||||
; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_2i32_NA(
|
||||
; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[IN:%.*]]) #[[ATTR0]] {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ALIASINFO_2I32_NA_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_NA_KERNARG_SEGMENT]], i64 36
|
||||
; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_NA_KERNARG_SEGMENT]], i64 44
|
||||
; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
|
||||
; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META1:![0-9]+]], !noalias [[META4:![0-9]+]]
|
||||
; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR3]]
|
||||
; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META4]], !noalias [[META1]]
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
|
||||
%val = load i32, ptr addrspace(1) %in.gep, align 4
|
||||
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
|
||||
store i32 %ctlz, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Same body as @aliasinfo_2i32 with no noalias arguments, but the input load
; and store already carry !alias.scope/!noalias (AS) using the file-level
; lists !2 and !4. The expected output still has one mirrored pair of scope
; lists: load = scope META6 / noalias META9, store = scope META9 / noalias META6.
define amdgpu_kernel void @aliasinfo_2i32_AS(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
|
||||
; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_2i32_AS(
|
||||
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ALIASINFO_2I32_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_AS_KERNARG_SEGMENT]], i64 36
|
||||
; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_AS_KERNARG_SEGMENT]], i64 44
|
||||
; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
|
||||
; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META9:![0-9]+]]
|
||||
; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR3]]
|
||||
; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META9]], !noalias [[META6]]
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
|
||||
%val = load i32, ptr addrspace(1) %in.gep, align 4, !alias.scope !4, !noalias !2
|
||||
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
|
||||
store i32 %ctlz, ptr addrspace(1) %out, align 4, !alias.scope !2, !noalias !4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Combination case: both arguments are noalias (NA) and the input accesses
; already carry !alias.scope/!noalias (AS). The expected output uses fresh
; captures (META11/META14) rather than the META6/META9 pair checked in the
; AS-only test, again mirrored between the load and the store.
define amdgpu_kernel void @aliasinfo_2i32_NA_AS(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
|
||||
; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_2i32_NA_AS(
|
||||
; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[IN:%.*]]) #[[ATTR0]] {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ALIASINFO_2I32_NA_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_NA_AS_KERNARG_SEGMENT]], i64 36
|
||||
; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_NA_AS_KERNARG_SEGMENT]], i64 44
|
||||
; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
|
||||
; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META14:![0-9]+]]
|
||||
; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR3]]
|
||||
; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META14]], !noalias [[META11]]
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
|
||||
%val = load i32, ptr addrspace(1) %in.gep, align 4, !alias.scope !4, !noalias !2
|
||||
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
|
||||
store i32 %ctlz, ptr addrspace(1) %out, align 4, !alias.scope !2, !noalias !4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline for the four-pointer kernel (two inputs, two outputs): no argument
; is noalias and no access carries scoped-alias metadata. The two vector loads
; and two stores are expected to come out without !alias.scope or !noalias.
define amdgpu_kernel void @aliasinfo_v4f32_3v4i8(ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %in, ptr addrspace(1) %in1) nounwind {
|
||||
; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_v4f32_3v4i8(
|
||||
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]]) #[[ATTR0]] {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ALIASINFO_V4F32_3V4I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_KERNARG_SEGMENT]], i64 36
|
||||
; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_KERNARG_SEGMENT]], i64 44
|
||||
; CHECK-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_KERNARG_SEGMENT]], i64 52
|
||||
; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_KERNARG_SEGMENT]], i64 60
|
||||
; CHECK-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
; CHECK-NEXT: [[GEP:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
|
||||
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN1_LOAD]], i32 [[TID]]
|
||||
; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1
|
||||
; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1
|
||||
; CHECK-NEXT: [[SHUFFLE0_0:%.*]] = shufflevector <4 x i8> [[LOAD]], <4 x i8> [[LOAD1]], <4 x i32> <i32 3, i32 2, i32 6, i32 2>
|
||||
; CHECK-NEXT: [[CVT:%.*]] = uitofp <4 x i8> [[SHUFFLE0_0]] to <4 x float>
|
||||
; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT_LOAD]], align 16
|
||||
; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1_LOAD]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
|
||||
%gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
|
||||
%load = load <4 x i8>, ptr addrspace(1) %gep, align 1
|
||||
%load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1
|
||||
%shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32> <i32 3, i32 2, i32 6, i32 2>
|
||||
%cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float>
|
||||
store <4 x float> %cvt, ptr addrspace(1) %out, align 16
|
||||
store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Four-pointer kernel with every argument marked noalias (NA). Each of the
; four user accesses is expected to receive its own !alias.scope/!noalias
; pair (META16/19, META23/24, META25/26, META27/28); unlike the two-pointer
; case the lists are not simply swapped between accesses.
define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_NA(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind {
|
||||
; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_NA(
|
||||
; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[OUT1:%.*]], ptr addrspace(1) noalias [[IN:%.*]], ptr addrspace(1) noalias [[IN1:%.*]]) #[[ATTR0]] {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ALIASINFO_V4F32_3V4I8_NA_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_NA_KERNARG_SEGMENT]], i64 36
|
||||
; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_NA_KERNARG_SEGMENT]], i64 44
|
||||
; CHECK-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_NA_KERNARG_SEGMENT]], i64 52
|
||||
; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_NA_KERNARG_SEGMENT]], i64 60
|
||||
; CHECK-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
; CHECK-NEXT: [[GEP:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
|
||||
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN1_LOAD]], i32 [[TID]]
|
||||
; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1, !alias.scope [[META16:![0-9]+]], !noalias [[META19:![0-9]+]]
|
||||
; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1, !alias.scope [[META23:![0-9]+]], !noalias [[META24:![0-9]+]]
|
||||
; CHECK-NEXT: [[SHUFFLE0_0:%.*]] = shufflevector <4 x i8> [[LOAD]], <4 x i8> [[LOAD1]], <4 x i32> <i32 3, i32 2, i32 6, i32 2>
|
||||
; CHECK-NEXT: [[CVT:%.*]] = uitofp <4 x i8> [[SHUFFLE0_0]] to <4 x float>
|
||||
; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT_LOAD]], align 16, !alias.scope [[META25:![0-9]+]], !noalias [[META26:![0-9]+]]
|
||||
; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1_LOAD]], align 4, !alias.scope [[META27:![0-9]+]], !noalias [[META28:![0-9]+]]
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
|
||||
%gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
|
||||
%load = load <4 x i8>, ptr addrspace(1) %gep, align 1
|
||||
%load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1
|
||||
%shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32> <i32 3, i32 2, i32 6, i32 2>
|
||||
%cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float>
|
||||
store <4 x float> %cvt, ptr addrspace(1) %out, align 16
|
||||
store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Four-pointer kernel with no noalias arguments but pre-existing scoped-alias
; metadata (AS): both loads use !alias.scope !4 / !noalias !2 and both stores
; use the reverse. The expected output reuses the META6/META9 captures first
; bound in @aliasinfo_2i32_AS: loads = META6/META9, stores = META9/META6.
define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_AS(ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %in, ptr addrspace(1) %in1) nounwind {
|
||||
; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_AS(
|
||||
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]]) #[[ATTR0]] {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ALIASINFO_V4F32_3V4I8_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_AS_KERNARG_SEGMENT]], i64 36
|
||||
; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_AS_KERNARG_SEGMENT]], i64 44
|
||||
; CHECK-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_AS_KERNARG_SEGMENT]], i64 52
|
||||
; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_AS_KERNARG_SEGMENT]], i64 60
|
||||
; CHECK-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
; CHECK-NEXT: [[GEP:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
|
||||
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN1_LOAD]], i32 [[TID]]
|
||||
; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1, !alias.scope [[META6]], !noalias [[META9]]
|
||||
; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1, !alias.scope [[META6]], !noalias [[META9]]
|
||||
; CHECK-NEXT: [[SHUFFLE0_0:%.*]] = shufflevector <4 x i8> [[LOAD]], <4 x i8> [[LOAD1]], <4 x i32> <i32 3, i32 2, i32 6, i32 2>
|
||||
; CHECK-NEXT: [[CVT:%.*]] = uitofp <4 x i8> [[SHUFFLE0_0]] to <4 x float>
|
||||
; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT_LOAD]], align 16, !alias.scope [[META9]], !noalias [[META6]]
|
||||
; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1_LOAD]], align 4, !alias.scope [[META9]], !noalias [[META6]]
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
|
||||
%gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
|
||||
%load = load <4 x i8>, ptr addrspace(1) %gep, align 1, !alias.scope !4, !noalias !2
|
||||
%load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1, !alias.scope !4, !noalias !2
|
||||
%shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32> <i32 3, i32 2, i32 6, i32 2>
|
||||
%cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float>
|
||||
store <4 x float> %cvt, ptr addrspace(1) %out, align 16, !alias.scope !2, !noalias !4
|
||||
store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4, !alias.scope !2, !noalias !4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Four-pointer combination case: every argument is noalias (NA) and every
; access already carries !alias.scope/!noalias (AS). Each access is expected
; to end up with its own distinct pair of lists (META29/32, META36/37,
; META38/39, META40/41) instead of the shared META6/META9 of the AS-only test.
define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_NA_AS(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind {
|
||||
; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_NA_AS(
|
||||
; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[OUT1:%.*]], ptr addrspace(1) noalias [[IN:%.*]], ptr addrspace(1) noalias [[IN1:%.*]]) #[[ATTR0]] {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ALIASINFO_V4F32_3V4I8_NA_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_NA_AS_KERNARG_SEGMENT]], i64 36
|
||||
; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_NA_AS_KERNARG_SEGMENT]], i64 44
|
||||
; CHECK-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_NA_AS_KERNARG_SEGMENT]], i64 52
|
||||
; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_NA_AS_KERNARG_SEGMENT]], i64 60
|
||||
; CHECK-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
; CHECK-NEXT: [[GEP:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
|
||||
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN1_LOAD]], i32 [[TID]]
|
||||
; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1, !alias.scope [[META29:![0-9]+]], !noalias [[META32:![0-9]+]]
|
||||
; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1, !alias.scope [[META36:![0-9]+]], !noalias [[META37:![0-9]+]]
|
||||
; CHECK-NEXT: [[SHUFFLE0_0:%.*]] = shufflevector <4 x i8> [[LOAD]], <4 x i8> [[LOAD1]], <4 x i32> <i32 3, i32 2, i32 6, i32 2>
|
||||
; CHECK-NEXT: [[CVT:%.*]] = uitofp <4 x i8> [[SHUFFLE0_0]] to <4 x float>
|
||||
; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT_LOAD]], align 16, !alias.scope [[META38:![0-9]+]], !noalias [[META39:![0-9]+]]
|
||||
; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1_LOAD]], align 4, !alias.scope [[META40:![0-9]+]], !noalias [[META41:![0-9]+]]
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
|
||||
%gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
|
||||
%load = load <4 x i8>, ptr addrspace(1) %gep, align 1, !alias.scope !4, !noalias !2
|
||||
%load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1, !alias.scope !4, !noalias !2
|
||||
%shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32> <i32 3, i32 2, i32 6, i32 2>
|
||||
%cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float>
|
||||
store <4 x float> %cvt, ptr addrspace(1) %out, align 16, !alias.scope !2, !noalias !4
|
||||
store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4, !alias.scope !2, !noalias !4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Baseline for memory accesses made through calls rather than plain
; load/store: two llvm.amdgcn.global.load.tr.b256 calls (on %in and %inout),
; a memcpy from %inout to %out, and a final store to %inout. No argument is
; noalias, and none of the calls or the store is expected to carry
; scoped-alias metadata.
define amdgpu_kernel void @aliasinfo_mixed_intrinsics(ptr addrspace(1) %in, ptr addrspace(1) %inout, ptr addrspace(1) %out) {
|
||||
; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_mixed_intrinsics(
|
||||
; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[INOUT:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ALIASINFO_MIXED_INTRINSICS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_KERNARG_SEGMENT]], i64 36
|
||||
; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[INOUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_KERNARG_SEGMENT]], i64 44
|
||||
; CHECK-NEXT: [[INOUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[INOUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_KERNARG_SEGMENT]], i64 52
|
||||
; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[VAL1:%.*]] = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) [[IN_LOAD]])
|
||||
; CHECK-NEXT: [[VAL2:%.*]] = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) [[INOUT_LOAD]])
|
||||
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[OUT_LOAD]], ptr addrspace(1) [[INOUT_LOAD]], i64 16, i1 false)
|
||||
; CHECK-NEXT: [[VAL3:%.*]] = fmul <4 x float> [[VAL1]], [[VAL2]]
|
||||
; CHECK-NEXT: store <4 x float> [[VAL3]], ptr addrspace(1) [[INOUT_LOAD]], align 16
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%val1 = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) %in)
|
||||
%val2 = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) %inout)
|
||||
call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %out, ptr addrspace(1) %inout, i64 16, i1 false)
|
||||
%val3 = fmul <4 x float> %val1, %val2
|
||||
store <4 x float> %val3, ptr addrspace(1) %inout
|
||||
ret void
|
||||
}
|
||||
|
||||
; Intrinsic/memcpy kernel with all three arguments noalias (NA). Expected
; metadata: the load.tr call on %in gets META42/META45 and the memcpy gets the
; mirrored pair META45/META42; the load.tr call on %inout and the final store
; to %inout both get the same pair, !alias.scope META48 / !noalias META49.
define amdgpu_kernel void @aliasinfo_mixed_intrinsics_NA(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %inout, ptr addrspace(1) noalias %out) {
|
||||
; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_mixed_intrinsics_NA(
|
||||
; CHECK-SAME: ptr addrspace(1) noalias [[IN:%.*]], ptr addrspace(1) noalias [[INOUT:%.*]], ptr addrspace(1) noalias [[OUT:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ALIASINFO_MIXED_INTRINSICS_NA_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_NA_KERNARG_SEGMENT]], i64 36
|
||||
; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[INOUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_NA_KERNARG_SEGMENT]], i64 44
|
||||
; CHECK-NEXT: [[INOUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[INOUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_NA_KERNARG_SEGMENT]], i64 52
|
||||
; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[VAL1:%.*]] = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) [[IN_LOAD]]), !alias.scope [[META42:![0-9]+]], !noalias [[META45:![0-9]+]]
|
||||
; CHECK-NEXT: [[VAL2:%.*]] = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) [[INOUT_LOAD]]), !alias.scope [[META48:![0-9]+]], !noalias [[META49:![0-9]+]]
|
||||
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[OUT_LOAD]], ptr addrspace(1) [[INOUT_LOAD]], i64 16, i1 false), !alias.scope [[META45]], !noalias [[META42]]
|
||||
; CHECK-NEXT: [[VAL3:%.*]] = fmul <4 x float> [[VAL1]], [[VAL2]]
|
||||
; CHECK-NEXT: store <4 x float> [[VAL3]], ptr addrspace(1) [[INOUT_LOAD]], align 16, !alias.scope [[META48]], !noalias [[META49]]
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%val1 = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) %in)
|
||||
%val2 = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) %inout)
|
||||
call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %out, ptr addrspace(1) %inout, i64 16, i1 false)
|
||||
%val3 = fmul <4 x float> %val1, %val2
|
||||
store <4 x float> %val3, ptr addrspace(1) %inout
|
||||
ret void
|
||||
}
|
||||
|
||||
; Intrinsic/memcpy kernel with no noalias arguments but pre-existing
; scoped-alias metadata (AS): the two load.tr calls use !alias.scope !4 /
; !noalias !2, the memcpy and the store use the reverse. The expected output
; reuses the META6/META9 captures bound in @aliasinfo_2i32_AS in the same
; arrangement (calls: META6/META9; memcpy and store: META9/META6).
define amdgpu_kernel void @aliasinfo_mixed_intrinsics_AS(ptr addrspace(1) %in, ptr addrspace(1) %inout, ptr addrspace(1) %out) {
|
||||
; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_mixed_intrinsics_AS(
|
||||
; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[INOUT:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ALIASINFO_MIXED_INTRINSICS_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_AS_KERNARG_SEGMENT]], i64 36
|
||||
; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[INOUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_AS_KERNARG_SEGMENT]], i64 44
|
||||
; CHECK-NEXT: [[INOUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[INOUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_AS_KERNARG_SEGMENT]], i64 52
|
||||
; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[VAL1:%.*]] = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) [[IN_LOAD]]), !alias.scope [[META6]], !noalias [[META9]]
|
||||
; CHECK-NEXT: [[VAL2:%.*]] = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) [[INOUT_LOAD]]), !alias.scope [[META6]], !noalias [[META9]]
|
||||
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[OUT_LOAD]], ptr addrspace(1) [[INOUT_LOAD]], i64 16, i1 false), !alias.scope [[META9]], !noalias [[META6]]
|
||||
; CHECK-NEXT: [[VAL3:%.*]] = fmul <4 x float> [[VAL1]], [[VAL2]]
|
||||
; CHECK-NEXT: store <4 x float> [[VAL3]], ptr addrspace(1) [[INOUT_LOAD]], align 16, !alias.scope [[META9]], !noalias [[META6]]
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%val1 = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) %in), !alias.scope !4, !noalias !2
|
||||
%val2 = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) %inout), !alias.scope !4, !noalias !2
|
||||
call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %out, ptr addrspace(1) %inout, i64 16, i1 false), !alias.scope !2, !noalias !4
|
||||
%val3 = fmul <4 x float> %val1, %val2
|
||||
store <4 x float> %val3, ptr addrspace(1) %inout, !alias.scope !2, !noalias !4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Intrinsic/memcpy combination case: all three arguments are noalias (NA) and
; every access already carries !alias.scope/!noalias (AS). Expected metadata:
; the load.tr call on %in gets META50/META53 and the memcpy the mirrored
; META53/META50; the load.tr call on %inout gets META56/META57; the final
; store gets its own pair META58/META59 (unlike the NA-only test, where the
; store shares its lists with the %inout load).
define amdgpu_kernel void @aliasinfo_mixed_intrinsics_NA_AS(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %inout, ptr addrspace(1) noalias %out) {
|
||||
; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_mixed_intrinsics_NA_AS(
|
||||
; CHECK-SAME: ptr addrspace(1) noalias [[IN:%.*]], ptr addrspace(1) noalias [[INOUT:%.*]], ptr addrspace(1) noalias [[OUT:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ALIASINFO_MIXED_INTRINSICS_NA_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
||||
; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_NA_AS_KERNARG_SEGMENT]], i64 36
|
||||
; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[INOUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_NA_AS_KERNARG_SEGMENT]], i64 44
|
||||
; CHECK-NEXT: [[INOUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[INOUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_NA_AS_KERNARG_SEGMENT]], i64 52
|
||||
; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
|
||||
; CHECK-NEXT: [[VAL1:%.*]] = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) [[IN_LOAD]]), !alias.scope [[META50:![0-9]+]], !noalias [[META53:![0-9]+]]
|
||||
; CHECK-NEXT: [[VAL2:%.*]] = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) [[INOUT_LOAD]]), !alias.scope [[META56:![0-9]+]], !noalias [[META57:![0-9]+]]
|
||||
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[OUT_LOAD]], ptr addrspace(1) [[INOUT_LOAD]], i64 16, i1 false), !alias.scope [[META53]], !noalias [[META50]]
|
||||
; CHECK-NEXT: [[VAL3:%.*]] = fmul <4 x float> [[VAL1]], [[VAL2]]
|
||||
; CHECK-NEXT: store <4 x float> [[VAL3]], ptr addrspace(1) [[INOUT_LOAD]], align 16, !alias.scope [[META58:![0-9]+]], !noalias [[META59:![0-9]+]]
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%val1 = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) %in), !alias.scope !4, !noalias !2
|
||||
%val2 = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) %inout), !alias.scope !4, !noalias !2
|
||||
call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %out, ptr addrspace(1) %inout, i64 16, i1 false), !alias.scope !2, !noalias !4
|
||||
%val3 = fmul <4 x float> %val1, %val2
|
||||
store <4 x float> %val3, ptr addrspace(1) %inout, !alias.scope !2, !noalias !4
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #2
|
||||
declare <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1))
|
||||
declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1), ptr addrspace(1), i64, i1)
|
||||
|
||||
attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,32" }
|
||||
attributes #1 = { nounwind }
|
||||
attributes #2 = { nounwind readnone speculatable }
|
||||
|
||||
!0 = distinct !{!0, !"alias_scope_0"}
|
||||
!1 = distinct !{!1, !0, !"alias_scope_1"}
|
||||
!2 = !{!1}
|
||||
!3 = distinct !{!3, !0, !"alias_scope_3"}
|
||||
!4 = !{!3}
|
||||
;.
|
||||
; CHECK: [[META0]] = !{}
|
||||
; CHECK: [[META1]] = !{[[META2:![0-9]+]]}
|
||||
; CHECK: [[META2]] = distinct !{[[META2]], [[META3:![0-9]+]], !"in"}
|
||||
; CHECK: [[META3]] = distinct !{[[META3]], !"aliasinfo_2i32_NA"}
|
||||
; CHECK: [[META4]] = !{[[META5:![0-9]+]]}
|
||||
; CHECK: [[META5]] = distinct !{[[META5]], [[META3]], !"out"}
|
||||
; CHECK: [[META6]] = !{[[META7:![0-9]+]]}
|
||||
; CHECK: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]], !"alias_scope_3"}
|
||||
; CHECK: [[META8]] = distinct !{[[META8]], !"alias_scope_0"}
|
||||
; CHECK: [[META9]] = !{[[META10:![0-9]+]]}
|
||||
; CHECK: [[META10]] = distinct !{[[META10]], [[META8]], !"alias_scope_1"}
|
||||
; CHECK: [[META11]] = !{[[META7]], [[META12:![0-9]+]]}
|
||||
; CHECK: [[META12]] = distinct !{[[META12]], [[META13:![0-9]+]], !"in"}
|
||||
; CHECK: [[META13]] = distinct !{[[META13]], !"aliasinfo_2i32_NA_AS"}
|
||||
; CHECK: [[META14]] = !{[[META10]], [[META15:![0-9]+]]}
|
||||
; CHECK: [[META15]] = distinct !{[[META15]], [[META13]], !"out"}
|
||||
; CHECK: [[META16]] = !{[[META17:![0-9]+]]}
|
||||
; CHECK: [[META17]] = distinct !{[[META17]], [[META18:![0-9]+]], !"in"}
|
||||
; CHECK: [[META18]] = distinct !{[[META18]], !"aliasinfo_v4f32_3v4i8_NA"}
|
||||
; CHECK: [[META19]] = !{[[META20:![0-9]+]], [[META21:![0-9]+]], [[META22:![0-9]+]]}
|
||||
; CHECK: [[META20]] = distinct !{[[META20]], [[META18]], !"out"}
|
||||
; CHECK: [[META21]] = distinct !{[[META21]], [[META18]], !"out1"}
|
||||
; CHECK: [[META22]] = distinct !{[[META22]], [[META18]], !"in1"}
|
||||
; CHECK: [[META23]] = !{[[META22]]}
|
||||
; CHECK: [[META24]] = !{[[META20]], [[META21]], [[META17]]}
|
||||
; CHECK: [[META25]] = !{[[META20]]}
|
||||
; CHECK: [[META26]] = !{[[META21]], [[META17]], [[META22]]}
|
||||
; CHECK: [[META27]] = !{[[META21]]}
|
||||
; CHECK: [[META28]] = !{[[META20]], [[META17]], [[META22]]}
|
||||
; CHECK: [[META29]] = !{[[META7]], [[META30:![0-9]+]]}
|
||||
; CHECK: [[META30]] = distinct !{[[META30]], [[META31:![0-9]+]], !"in"}
|
||||
; CHECK: [[META31]] = distinct !{[[META31]], !"aliasinfo_v4f32_3v4i8_NA_AS"}
|
||||
; CHECK: [[META32]] = !{[[META10]], [[META33:![0-9]+]], [[META34:![0-9]+]], [[META35:![0-9]+]]}
|
||||
; CHECK: [[META33]] = distinct !{[[META33]], [[META31]], !"out"}
|
||||
; CHECK: [[META34]] = distinct !{[[META34]], [[META31]], !"out1"}
|
||||
; CHECK: [[META35]] = distinct !{[[META35]], [[META31]], !"in1"}
|
||||
; CHECK: [[META36]] = !{[[META7]], [[META35]]}
|
||||
; CHECK: [[META37]] = !{[[META10]], [[META33]], [[META34]], [[META30]]}
|
||||
; CHECK: [[META38]] = !{[[META10]], [[META33]]}
|
||||
; CHECK: [[META39]] = !{[[META7]], [[META34]], [[META30]], [[META35]]}
|
||||
; CHECK: [[META40]] = !{[[META10]], [[META34]]}
|
||||
; CHECK: [[META41]] = !{[[META7]], [[META33]], [[META30]], [[META35]]}
|
||||
; CHECK: [[META42]] = !{[[META43:![0-9]+]]}
|
||||
; CHECK: [[META43]] = distinct !{[[META43]], [[META44:![0-9]+]], !"in"}
|
||||
; CHECK: [[META44]] = distinct !{[[META44]], !"aliasinfo_mixed_intrinsics_NA"}
|
||||
; CHECK: [[META45]] = !{[[META46:![0-9]+]], [[META47:![0-9]+]]}
|
||||
; CHECK: [[META46]] = distinct !{[[META46]], [[META44]], !"inout"}
|
||||
; CHECK: [[META47]] = distinct !{[[META47]], [[META44]], !"out"}
|
||||
; CHECK: [[META48]] = !{[[META46]]}
|
||||
; CHECK: [[META49]] = !{[[META43]], [[META47]]}
|
||||
; CHECK: [[META50]] = !{[[META7]], [[META51:![0-9]+]]}
|
||||
; CHECK: [[META51]] = distinct !{[[META51]], [[META52:![0-9]+]], !"in"}
|
||||
; CHECK: [[META52]] = distinct !{[[META52]], !"aliasinfo_mixed_intrinsics_NA_AS"}
|
||||
; CHECK: [[META53]] = !{[[META10]], [[META54:![0-9]+]], [[META55:![0-9]+]]}
|
||||
; CHECK: [[META54]] = distinct !{[[META54]], [[META52]], !"inout"}
|
||||
; CHECK: [[META55]] = distinct !{[[META55]], [[META52]], !"out"}
|
||||
; CHECK: [[META56]] = !{[[META7]], [[META54]]}
|
||||
; CHECK: [[META57]] = !{[[META10]], [[META51]], [[META55]]}
|
||||
; CHECK: [[META58]] = !{[[META10]], [[META54]]}
|
||||
; CHECK: [[META59]] = !{[[META7]], [[META51]], [[META55]]}
|
||||
;.
|
||||
@@ -1047,46 +1047,40 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(ptr addrspace(1)
|
||||
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_aggressive) #1 {
|
||||
; SI-STD-LABEL: aggressive_combine_to_mad_fsub_0_f32:
|
||||
; SI-STD: ; %bb.0:
|
||||
; SI-STD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
|
||||
; SI-STD-NEXT: s_load_dword s6, s[4:5], 0xd
|
||||
; SI-STD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-STD-NEXT: s_load_dword s8, s[4:5], 0xd
|
||||
; SI-STD-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-STD-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-STD-NEXT: s_mov_b32 s2, 0
|
||||
; SI-STD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-STD-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-STD-NEXT: s_mov_b32 s6, 0
|
||||
; SI-STD-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-STD-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-STD-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-STD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-STD-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-STD-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-STD-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
|
||||
; SI-STD-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
|
||||
; SI-STD-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-STD-NEXT: buffer_load_dword v5, v[0:1], s[0:3], 0 addr64 offset:8 glc
|
||||
; SI-STD-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:8 glc
|
||||
; SI-STD-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-STD-NEXT: buffer_load_dword v6, v[0:1], s[0:3], 0 addr64 offset:12 glc
|
||||
; SI-STD-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
|
||||
; SI-STD-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-STD-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc
|
||||
; SI-STD-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc
|
||||
; SI-STD-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-STD-NEXT: s_bitcmp1_b32 s6, 0
|
||||
; SI-STD-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; SI-STD-NEXT: s_and_b64 vcc, exec, s[0:1]
|
||||
; SI-STD-NEXT: s_bitcmp1_b32 s8, 0
|
||||
; SI-STD-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; SI-STD-NEXT: s_and_b64 vcc, exec, s[2:3]
|
||||
; SI-STD-NEXT: s_cbranch_vccnz .LBB12_2
|
||||
; SI-STD-NEXT: ; %bb.1: ; %normal
|
||||
; SI-STD-NEXT: v_mul_f32_e32 v4, v6, v1
|
||||
; SI-STD-NEXT: v_fma_f32 v4, v2, v3, v4
|
||||
; SI-STD-NEXT: v_sub_f32_e32 v4, v4, v5
|
||||
; SI-STD-NEXT: s_mov_b64 s[2:3], 0
|
||||
; SI-STD-NEXT: s_branch .LBB12_3
|
||||
; SI-STD-NEXT: s_cbranch_execz .LBB12_3
|
||||
; SI-STD-NEXT: s_branch .LBB12_4
|
||||
; SI-STD-NEXT: .LBB12_2:
|
||||
; SI-STD-NEXT: s_mov_b64 s[2:3], -1
|
||||
; SI-STD-NEXT: ; implicit-def: $vgpr4
|
||||
; SI-STD-NEXT: .LBB12_3: ; %Flow
|
||||
; SI-STD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-STD-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
||||
; SI-STD-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-STD-NEXT: s_mov_b64 vcc, vcc
|
||||
; SI-STD-NEXT: s_cbranch_vccnz .LBB12_5
|
||||
; SI-STD-NEXT: ; %bb.4: ; %aggressive
|
||||
; SI-STD-NEXT: .LBB12_3: ; %aggressive
|
||||
; SI-STD-NEXT: v_mad_f32 v4, v6, v1, -v5
|
||||
; SI-STD-NEXT: v_mac_f32_e32 v4, v2, v3
|
||||
; SI-STD-NEXT: .LBB12_5: ; %exit
|
||||
; SI-STD-NEXT: .LBB12_4: ; %exit
|
||||
; SI-STD-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-STD-NEXT: s_mov_b32 s2, 0
|
||||
; SI-STD-NEXT: v_mov_b32_e32 v1, 0
|
||||
@@ -1095,93 +1089,81 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1)
|
||||
;
|
||||
; SI-DENORM-FASTFMAF-LABEL: aggressive_combine_to_mad_fsub_0_f32:
|
||||
; SI-DENORM-FASTFMAF: ; %bb.0:
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_load_dword s6, s[4:5], 0xd
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_load_dword s8, s[4:5], 0xd
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s6, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:8 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:8 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v5, v[0:1], s[0:3], 0 addr64 offset:12 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_bitcmp1_b32 s6, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_and_b64 vcc, exec, s[0:1]
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_bitcmp1_b32 s8, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_and_b64 vcc, exec, s[2:3]
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_cbranch_vccnz .LBB12_2
|
||||
; SI-DENORM-FASTFMAF-NEXT: ; %bb.1: ; %normal
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_mul_f32_e32 v6, v5, v1
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v6, v2, v3, v6
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_sub_f32_e32 v6, v6, v4
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[2:3], 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_branch .LBB12_3
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_mul_f32_e32 v4, v6, v1
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v4, v2, v3, v4
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_sub_f32_e32 v4, v4, v5
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_cbranch_execz .LBB12_3
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_branch .LBB12_4
|
||||
; SI-DENORM-FASTFMAF-NEXT: .LBB12_2:
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[2:3], -1
|
||||
; SI-DENORM-FASTFMAF-NEXT: ; implicit-def: $vgpr6
|
||||
; SI-DENORM-FASTFMAF-NEXT: .LBB12_3: ; %Flow
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 vcc, vcc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_cbranch_vccnz .LBB12_5
|
||||
; SI-DENORM-FASTFMAF-NEXT: ; %bb.4: ; %aggressive
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v1, v5, v1, -v4
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v6, v2, v3, v1
|
||||
; SI-DENORM-FASTFMAF-NEXT: .LBB12_5: ; %exit
|
||||
; SI-DENORM-FASTFMAF-NEXT: ; implicit-def: $vgpr4
|
||||
; SI-DENORM-FASTFMAF-NEXT: .LBB12_3: ; %aggressive
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v1, v6, v1, -v5
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v4, v2, v3, v1
|
||||
; SI-DENORM-FASTFMAF-NEXT: .LBB12_4: ; %exit
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_store_dword v6, v[0:1], s[0:3], 0 addr64
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_store_dword v4, v[0:1], s[0:3], 0 addr64
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_endpgm
|
||||
;
|
||||
; SI-DENORM-SLOWFMAF-LABEL: aggressive_combine_to_mad_fsub_0_f32:
|
||||
; SI-DENORM-SLOWFMAF: ; %bb.0:
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_load_dword s6, s[4:5], 0xd
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_load_dword s8, s[4:5], 0xd
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s6, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:4 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 offset:8 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v5, v[0:1], s[0:3], 0 addr64 offset:12 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:12 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_bitcmp1_b32 s6, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_and_b64 vcc, exec, s[0:1]
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_bitcmp1_b32 s8, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_and_b64 vcc, exec, s[2:3]
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_mul_f32_e32 v1, v5, v1
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_fma_f32 v1, v3, v4, v1
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_vccnz .LBB12_2
|
||||
; SI-DENORM-SLOWFMAF-NEXT: ; %bb.1: ; %normal
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_sub_f32_e32 v3, v1, v2
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[2:3], 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_branch .LBB12_3
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_execz .LBB12_3
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_branch .LBB12_4
|
||||
; SI-DENORM-SLOWFMAF-NEXT: .LBB12_2:
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[2:3], -1
|
||||
; SI-DENORM-SLOWFMAF-NEXT: ; implicit-def: $vgpr3
|
||||
; SI-DENORM-SLOWFMAF-NEXT: .LBB12_3: ; %Flow
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 vcc, vcc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_vccnz .LBB12_5
|
||||
; SI-DENORM-SLOWFMAF-NEXT: ; %bb.4: ; %aggressive
|
||||
; SI-DENORM-SLOWFMAF-NEXT: .LBB12_3: ; %aggressive
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_sub_f32_e32 v3, v1, v2
|
||||
; SI-DENORM-SLOWFMAF-NEXT: .LBB12_5: ; %exit
|
||||
; SI-DENORM-SLOWFMAF-NEXT: .LBB12_4: ; %exit
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_mov_b32_e32 v1, 0
|
||||
@@ -1275,142 +1257,124 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(ptr addrspace(1)
|
||||
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_aggressive) #1 {
|
||||
; SI-STD-LABEL: aggressive_combine_to_mad_fsub_2_f32:
|
||||
; SI-STD: ; %bb.0:
|
||||
; SI-STD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
|
||||
; SI-STD-NEXT: s_load_dword s6, s[4:5], 0xd
|
||||
; SI-STD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-STD-NEXT: s_load_dword s8, s[4:5], 0xd
|
||||
; SI-STD-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-STD-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-STD-NEXT: s_mov_b32 s2, 0
|
||||
; SI-STD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-STD-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-STD-NEXT: s_mov_b32 s6, 0
|
||||
; SI-STD-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-STD-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-STD-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-STD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-STD-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-STD-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-STD-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
|
||||
; SI-STD-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
|
||||
; SI-STD-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-STD-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:8 glc
|
||||
; SI-STD-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:8 glc
|
||||
; SI-STD-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-STD-NEXT: buffer_load_dword v6, v[0:1], s[0:3], 0 addr64 offset:12 glc
|
||||
; SI-STD-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
|
||||
; SI-STD-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-STD-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc
|
||||
; SI-STD-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc
|
||||
; SI-STD-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-STD-NEXT: s_bitcmp1_b32 s6, 0
|
||||
; SI-STD-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; SI-STD-NEXT: s_and_b64 vcc, exec, s[0:1]
|
||||
; SI-STD-NEXT: s_bitcmp1_b32 s8, 0
|
||||
; SI-STD-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; SI-STD-NEXT: s_and_b64 vcc, exec, s[2:3]
|
||||
; SI-STD-NEXT: s_cbranch_vccnz .LBB14_2
|
||||
; SI-STD-NEXT: ; %bb.1: ; %normal
|
||||
; SI-STD-NEXT: v_mul_f32_e32 v5, v6, v1
|
||||
; SI-STD-NEXT: v_mac_f32_e32 v5, v2, v3
|
||||
; SI-STD-NEXT: v_sub_f32_e32 v5, v5, v4
|
||||
; SI-STD-NEXT: s_mov_b64 s[2:3], 0
|
||||
; SI-STD-NEXT: s_branch .LBB14_3
|
||||
; SI-STD-NEXT: v_mul_f32_e32 v4, v6, v1
|
||||
; SI-STD-NEXT: v_mac_f32_e32 v4, v2, v3
|
||||
; SI-STD-NEXT: v_sub_f32_e32 v4, v4, v5
|
||||
; SI-STD-NEXT: s_cbranch_execz .LBB14_3
|
||||
; SI-STD-NEXT: s_branch .LBB14_4
|
||||
; SI-STD-NEXT: .LBB14_2:
|
||||
; SI-STD-NEXT: s_mov_b64 s[2:3], -1
|
||||
; SI-STD-NEXT: ; implicit-def: $vgpr5
|
||||
; SI-STD-NEXT: .LBB14_3: ; %Flow
|
||||
; SI-STD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-STD-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
||||
; SI-STD-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-STD-NEXT: s_mov_b64 vcc, vcc
|
||||
; SI-STD-NEXT: s_cbranch_vccnz .LBB14_5
|
||||
; SI-STD-NEXT: ; %bb.4: ; %aggressive
|
||||
; SI-STD-NEXT: v_mad_f32 v5, v6, v1, -v4
|
||||
; SI-STD-NEXT: v_mac_f32_e32 v5, v2, v3
|
||||
; SI-STD-NEXT: .LBB14_5: ; %exit
|
||||
; SI-STD-NEXT: ; implicit-def: $vgpr4
|
||||
; SI-STD-NEXT: .LBB14_3: ; %aggressive
|
||||
; SI-STD-NEXT: v_mad_f32 v4, v6, v1, -v5
|
||||
; SI-STD-NEXT: v_mac_f32_e32 v4, v2, v3
|
||||
; SI-STD-NEXT: .LBB14_4: ; %exit
|
||||
; SI-STD-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-STD-NEXT: s_mov_b32 s2, 0
|
||||
; SI-STD-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-STD-NEXT: buffer_store_dword v5, v[0:1], s[0:3], 0 addr64
|
||||
; SI-STD-NEXT: buffer_store_dword v4, v[0:1], s[0:3], 0 addr64
|
||||
; SI-STD-NEXT: s_endpgm
|
||||
;
|
||||
; SI-DENORM-FASTFMAF-LABEL: aggressive_combine_to_mad_fsub_2_f32:
|
||||
; SI-DENORM-FASTFMAF: ; %bb.0:
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_load_dword s6, s[4:5], 0xd
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_load_dword s8, s[4:5], 0xd
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s6, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:8 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:8 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v5, v[0:1], s[0:3], 0 addr64 offset:12 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_bitcmp1_b32 s6, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_and_b64 vcc, exec, s[0:1]
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_bitcmp1_b32 s8, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_and_b64 vcc, exec, s[2:3]
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_cbranch_vccnz .LBB14_2
|
||||
; SI-DENORM-FASTFMAF-NEXT: ; %bb.1: ; %normal
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_mul_f32_e32 v6, v5, v1
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v6, v2, v3, v6
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_sub_f32_e32 v6, v6, v4
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[2:3], 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_branch .LBB14_3
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_mul_f32_e32 v4, v6, v1
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v4, v2, v3, v4
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_sub_f32_e32 v4, v4, v5
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_cbranch_execz .LBB14_3
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_branch .LBB14_4
|
||||
; SI-DENORM-FASTFMAF-NEXT: .LBB14_2:
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[2:3], -1
|
||||
; SI-DENORM-FASTFMAF-NEXT: ; implicit-def: $vgpr6
|
||||
; SI-DENORM-FASTFMAF-NEXT: .LBB14_3: ; %Flow
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 vcc, vcc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_cbranch_vccnz .LBB14_5
|
||||
; SI-DENORM-FASTFMAF-NEXT: ; %bb.4: ; %aggressive
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v1, v5, v1, -v4
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v6, v2, v3, v1
|
||||
; SI-DENORM-FASTFMAF-NEXT: .LBB14_5: ; %exit
|
||||
; SI-DENORM-FASTFMAF-NEXT: ; implicit-def: $vgpr4
|
||||
; SI-DENORM-FASTFMAF-NEXT: .LBB14_3: ; %aggressive
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v1, v6, v1, -v5
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v4, v2, v3, v1
|
||||
; SI-DENORM-FASTFMAF-NEXT: .LBB14_4: ; %exit
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_store_dword v6, v[0:1], s[0:3], 0 addr64
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_store_dword v4, v[0:1], s[0:3], 0 addr64
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_endpgm
|
||||
;
|
||||
; SI-DENORM-SLOWFMAF-LABEL: aggressive_combine_to_mad_fsub_2_f32:
|
||||
; SI-DENORM-SLOWFMAF: ; %bb.0:
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_load_dword s6, s[4:5], 0xd
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_load_dword s8, s[4:5], 0xd
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s6, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:4 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 offset:8 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v5, v[0:1], s[0:3], 0 addr64 offset:12 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:12 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_bitcmp1_b32 s6, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_and_b64 vcc, exec, s[0:1]
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_bitcmp1_b32 s8, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_and_b64 vcc, exec, s[2:3]
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_mul_f32_e32 v3, v3, v4
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_mul_f32_e32 v1, v5, v1
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_add_f32_e32 v1, v3, v1
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_vccnz .LBB14_2
|
||||
; SI-DENORM-SLOWFMAF-NEXT: ; %bb.1: ; %normal
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_sub_f32_e32 v3, v1, v2
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[2:3], 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_branch .LBB14_3
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_execz .LBB14_3
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_branch .LBB14_4
|
||||
; SI-DENORM-SLOWFMAF-NEXT: .LBB14_2:
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[2:3], -1
|
||||
; SI-DENORM-SLOWFMAF-NEXT: ; implicit-def: $vgpr3
|
||||
; SI-DENORM-SLOWFMAF-NEXT: .LBB14_3: ; %Flow
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 vcc, vcc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_vccnz .LBB14_5
|
||||
; SI-DENORM-SLOWFMAF-NEXT: ; %bb.4: ; %aggressive
|
||||
; SI-DENORM-SLOWFMAF-NEXT: .LBB14_3: ; %aggressive
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_sub_f32_e32 v3, v1, v2
|
||||
; SI-DENORM-SLOWFMAF-NEXT: .LBB14_5: ; %exit
|
||||
; SI-DENORM-SLOWFMAF-NEXT: .LBB14_4: ; %exit
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_mov_b32_e32 v1, 0
|
||||
@@ -1455,142 +1419,124 @@ exit:
|
||||
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_aggressive) #1 {
|
||||
; SI-STD-LABEL: aggressive_combine_to_mad_fsub_3_f32:
|
||||
; SI-STD: ; %bb.0:
|
||||
; SI-STD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
|
||||
; SI-STD-NEXT: s_load_dword s6, s[4:5], 0xd
|
||||
; SI-STD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-STD-NEXT: s_load_dword s8, s[4:5], 0xd
|
||||
; SI-STD-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-STD-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-STD-NEXT: s_mov_b32 s2, 0
|
||||
; SI-STD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-STD-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-STD-NEXT: s_mov_b32 s6, 0
|
||||
; SI-STD-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-STD-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-STD-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-STD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-STD-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-STD-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-STD-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
|
||||
; SI-STD-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
|
||||
; SI-STD-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-STD-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:8 glc
|
||||
; SI-STD-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
|
||||
; SI-STD-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-STD-NEXT: buffer_load_dword v5, v[0:1], s[0:3], 0 addr64 offset:12 glc
|
||||
; SI-STD-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
|
||||
; SI-STD-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-STD-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc
|
||||
; SI-STD-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc
|
||||
; SI-STD-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-STD-NEXT: s_bitcmp1_b32 s6, 0
|
||||
; SI-STD-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; SI-STD-NEXT: s_and_b64 vcc, exec, s[0:1]
|
||||
; SI-STD-NEXT: s_bitcmp1_b32 s8, 0
|
||||
; SI-STD-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; SI-STD-NEXT: s_and_b64 vcc, exec, s[2:3]
|
||||
; SI-STD-NEXT: s_cbranch_vccnz .LBB15_2
|
||||
; SI-STD-NEXT: ; %bb.1: ; %normal
|
||||
; SI-STD-NEXT: v_mul_f32_e32 v6, v5, v1
|
||||
; SI-STD-NEXT: v_mac_f32_e32 v6, v3, v4
|
||||
; SI-STD-NEXT: v_sub_f32_e32 v6, v2, v6
|
||||
; SI-STD-NEXT: s_mov_b64 s[2:3], 0
|
||||
; SI-STD-NEXT: s_branch .LBB15_3
|
||||
; SI-STD-NEXT: v_mul_f32_e32 v5, v6, v1
|
||||
; SI-STD-NEXT: v_mac_f32_e32 v5, v3, v4
|
||||
; SI-STD-NEXT: v_sub_f32_e32 v5, v2, v5
|
||||
; SI-STD-NEXT: s_cbranch_execz .LBB15_3
|
||||
; SI-STD-NEXT: s_branch .LBB15_4
|
||||
; SI-STD-NEXT: .LBB15_2:
|
||||
; SI-STD-NEXT: s_mov_b64 s[2:3], -1
|
||||
; SI-STD-NEXT: ; implicit-def: $vgpr6
|
||||
; SI-STD-NEXT: .LBB15_3: ; %Flow
|
||||
; SI-STD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-STD-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
||||
; SI-STD-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-STD-NEXT: s_mov_b64 vcc, vcc
|
||||
; SI-STD-NEXT: s_cbranch_vccnz .LBB15_5
|
||||
; SI-STD-NEXT: ; %bb.4: ; %aggressive
|
||||
; SI-STD-NEXT: v_mad_f32 v1, -v5, v1, v2
|
||||
; SI-STD-NEXT: v_mad_f32 v6, -v3, v4, v1
|
||||
; SI-STD-NEXT: .LBB15_5: ; %exit
|
||||
; SI-STD-NEXT: ; implicit-def: $vgpr5
|
||||
; SI-STD-NEXT: .LBB15_3: ; %aggressive
|
||||
; SI-STD-NEXT: v_mad_f32 v1, -v6, v1, v2
|
||||
; SI-STD-NEXT: v_mad_f32 v5, -v3, v4, v1
|
||||
; SI-STD-NEXT: .LBB15_4: ; %exit
|
||||
; SI-STD-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-STD-NEXT: s_mov_b32 s2, 0
|
||||
; SI-STD-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-STD-NEXT: buffer_store_dword v6, v[0:1], s[0:3], 0 addr64
|
||||
; SI-STD-NEXT: buffer_store_dword v5, v[0:1], s[0:3], 0 addr64
|
||||
; SI-STD-NEXT: s_endpgm
|
||||
;
|
||||
; SI-DENORM-FASTFMAF-LABEL: aggressive_combine_to_mad_fsub_3_f32:
|
||||
; SI-DENORM-FASTFMAF: ; %bb.0:
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_load_dword s6, s[4:5], 0xd
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_load_dword s8, s[4:5], 0xd
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s6, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:8 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v5, v[0:1], s[0:3], 0 addr64 offset:12 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_bitcmp1_b32 s6, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_and_b64 vcc, exec, s[0:1]
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_bitcmp1_b32 s8, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_and_b64 vcc, exec, s[2:3]
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_cbranch_vccnz .LBB15_2
|
||||
; SI-DENORM-FASTFMAF-NEXT: ; %bb.1: ; %normal
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_mul_f32_e32 v6, v5, v1
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v6, v3, v4, v6
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_sub_f32_e32 v6, v2, v6
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[2:3], 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_branch .LBB15_3
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_mul_f32_e32 v5, v6, v1
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v5, v3, v4, v5
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_sub_f32_e32 v5, v2, v5
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_cbranch_execz .LBB15_3
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_branch .LBB15_4
|
||||
; SI-DENORM-FASTFMAF-NEXT: .LBB15_2:
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[2:3], -1
|
||||
; SI-DENORM-FASTFMAF-NEXT: ; implicit-def: $vgpr6
|
||||
; SI-DENORM-FASTFMAF-NEXT: .LBB15_3: ; %Flow
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 vcc, vcc
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_cbranch_vccnz .LBB15_5
|
||||
; SI-DENORM-FASTFMAF-NEXT: ; %bb.4: ; %aggressive
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v1, -v5, v1, v2
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v6, -v3, v4, v1
|
||||
; SI-DENORM-FASTFMAF-NEXT: .LBB15_5: ; %exit
|
||||
; SI-DENORM-FASTFMAF-NEXT: ; implicit-def: $vgpr5
|
||||
; SI-DENORM-FASTFMAF-NEXT: .LBB15_3: ; %aggressive
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v1, -v6, v1, v2
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v5, -v3, v4, v1
|
||||
; SI-DENORM-FASTFMAF-NEXT: .LBB15_4: ; %exit
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_store_dword v6, v[0:1], s[0:3], 0 addr64
|
||||
; SI-DENORM-FASTFMAF-NEXT: buffer_store_dword v5, v[0:1], s[0:3], 0 addr64
|
||||
; SI-DENORM-FASTFMAF-NEXT: s_endpgm
|
||||
;
|
||||
; SI-DENORM-SLOWFMAF-LABEL: aggressive_combine_to_mad_fsub_3_f32:
|
||||
; SI-DENORM-SLOWFMAF: ; %bb.0:
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_load_dword s6, s[4:5], 0xd
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_load_dword s8, s[4:5], 0xd
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s6, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:8 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v5, v[0:1], s[0:3], 0 addr64 offset:12 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:12 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_bitcmp1_b32 s6, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_and_b64 vcc, exec, s[0:1]
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_bitcmp1_b32 s8, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_and_b64 vcc, exec, s[2:3]
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_mul_f32_e32 v3, v3, v4
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_mul_f32_e32 v1, v5, v1
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_add_f32_e32 v1, v3, v1
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_vccnz .LBB15_2
|
||||
; SI-DENORM-SLOWFMAF-NEXT: ; %bb.1: ; %normal
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_sub_f32_e32 v3, v2, v1
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[2:3], 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_branch .LBB15_3
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_execz .LBB15_3
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_branch .LBB15_4
|
||||
; SI-DENORM-SLOWFMAF-NEXT: .LBB15_2:
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[2:3], -1
|
||||
; SI-DENORM-SLOWFMAF-NEXT: ; implicit-def: $vgpr3
|
||||
; SI-DENORM-SLOWFMAF-NEXT: .LBB15_3: ; %Flow
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 vcc, vcc
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_vccnz .LBB15_5
|
||||
; SI-DENORM-SLOWFMAF-NEXT: ; %bb.4: ; %aggressive
|
||||
; SI-DENORM-SLOWFMAF-NEXT: .LBB15_3: ; %aggressive
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_sub_f32_e32 v3, v2, v1
|
||||
; SI-DENORM-SLOWFMAF-NEXT: .LBB15_5: ; %exit
|
||||
; SI-DENORM-SLOWFMAF-NEXT: .LBB15_4: ; %exit
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0
|
||||
; SI-DENORM-SLOWFMAF-NEXT: v_mov_b32_e32 v1, 0
|
||||
|
||||
@@ -646,13 +646,15 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1
|
||||
define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind {
|
||||
; GFX6-LABEL: s_sub_i64:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
|
||||
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_sub_u32 s0, s0, s2
|
||||
; GFX6-NEXT: s_subb_u32 s1, s1, s3
|
||||
; GFX6-NEXT: s_mov_b32 s4, s0
|
||||
; GFX6-NEXT: s_sub_u32 s0, s2, s8
|
||||
; GFX6-NEXT: s_mov_b32 s5, s1
|
||||
; GFX6-NEXT: s_subb_u32 s1, s3, s9
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
@@ -660,41 +662,41 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64
|
||||
;
|
||||
; GFX8-LABEL: s_sub_i64:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_sub_u32 s0, s0, s2
|
||||
; GFX8-NEXT: s_subb_u32 s1, s1, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: s_sub_u32 s0, s2, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: s_subb_u32 s1, s3, s5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: s_sub_i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_sub_u32 s0, s0, s2
|
||||
; GFX9-NEXT: s_subb_u32 s1, s1, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX9-NEXT: s_sub_u32 s2, s2, s6
|
||||
; GFX9-NEXT: s_subb_u32 s3, s3, s7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: s_sub_i64:
|
||||
; GFX12: ; %bb.0:
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX12-NEXT: s_sub_nc_u64 s[2:3], s[2:3], s[4:5]
|
||||
; GFX12-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
||||
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
|
||||
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX12-NEXT: s_endpgm
|
||||
%result = sub i64 %a, %b
|
||||
store i64 %result, ptr addrspace(1) %out, align 8
|
||||
@@ -739,12 +741,12 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac
|
||||
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
|
||||
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: v_sub_i64:
|
||||
@@ -831,14 +833,14 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace
|
||||
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
|
||||
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
||||
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v8, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v9, s1
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
|
||||
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
|
||||
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
|
||||
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: v_test_sub_v2i64:
|
||||
|
||||
@@ -298,35 +298,33 @@ exit:
|
||||
define amdgpu_kernel void @multi_vcond_loop(ptr addrspace(1) noalias nocapture %arg, ptr addrspace(1) noalias nocapture readonly %arg1, ptr addrspace(1) noalias nocapture readonly %arg2, ptr addrspace(1) noalias nocapture readonly %arg3) #1 {
|
||||
; SI-LABEL: multi_vcond_loop:
|
||||
; SI: ; %bb.0: ; %bb
|
||||
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf
|
||||
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
|
||||
; SI-NEXT: v_mov_b32_e32 v7, 0
|
||||
; SI-NEXT: s_mov_b32 s11, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s10, 0
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v0, v[6:7], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0
|
||||
; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
||||
; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc
|
||||
; SI-NEXT: s_cbranch_execz .LBB5_5
|
||||
; SI-NEXT: ; %bb.1: ; %bb10.preheader
|
||||
; SI-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
|
||||
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v6
|
||||
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v6
|
||||
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v7, s3
|
||||
; SI-NEXT: v_add_i32_e32 v6, vcc, s2, v6
|
||||
; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
|
||||
; SI-NEXT: s_mov_b64 s[2:3], 0
|
||||
; SI-NEXT: s_mov_b32 s8, s10
|
||||
; SI-NEXT: s_mov_b32 s9, s10
|
||||
; SI-NEXT: ; implicit-def: $sgpr4_sgpr5
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v3, s13
|
||||
; SI-NEXT: v_add_i32_e32 v2, vcc, s12, v6
|
||||
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v5, s1
|
||||
; SI-NEXT: v_add_i32_e32 v4, vcc, s0, v6
|
||||
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v7, s15
|
||||
; SI-NEXT: v_add_i32_e32 v6, vcc, s14, v6
|
||||
; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
|
||||
; SI-NEXT: s_mov_b64 s[6:7], 0
|
||||
; SI-NEXT: .LBB5_2: ; %bb10
|
||||
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
|
||||
Reference in New Issue
Block a user