//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "LoopVectorizationPlanner.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <cassert>

using namespace llvm;
using namespace llvm::VPlanPatternMatch;

using VectorParts = SmallVector<Value *, 2>;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPRecipeID()) {
  case VPExpressionSC:
    return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
  case VPInstructionSC: {
    auto *VPI = cast<VPInstruction>(this);
    // Loads read from memory but don't write to memory.
    if (VPI->getOpcode() == Instruction::Load)
      return false;
    return VPI->opcodeMayReadOrWriteFromMemory();
  }
  case VPInterleaveEVLSC:
  case VPInterleaveSC:
    return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
  case VPActiveLaneMaskPHISC:
  case VPCurrentIterationPHISC:
  case VPBranchOnMaskSC:
  case VPDerivedIVSC:
  case VPFirstOrderRecurrencePHISC:
  case VPReductionPHISC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPRecipeID()) {
  case VPExpressionSC:
    return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
  case VPInstructionSC:
    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPDerivedIVSC:
  case VPCurrentIterationPHISC:
  case VPFirstOrderRecurrencePHISC:
  case VPReductionPHISC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    // FIXME: Return false if the recipe represents an interleaved store.
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPRecipeID()) {
  case VPExpressionSC:
    return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
  case VPActiveLaneMaskPHISC:
  case VPDerivedIVSC:
  case VPCurrentIterationPHISC:
  case VPFirstOrderRecurrencePHISC:
  case VPReductionPHISC:
  case VPPredInstPHISC:
  case VPVectorEndPointerSC:
    return false;
  case VPInstructionSC: {
    auto *VPI = cast<VPInstruction>(this);
    return mayWriteToMemory() ||
           VPI->getOpcode() == VPInstruction::BranchOnCount ||
           VPI->getOpcode() == VPInstruction::BranchOnCond ||
           VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
  }
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveEVLSC:
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  // Get the underlying instruction for the recipe, if there is one. It is
  // used to
  //   * decide if cost computation should be skipped for this recipe,
  //   * apply forced target instruction cost.
  Instruction *UI = nullptr;
  if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
    UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
    UI = IG->getInsertPos();
  else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
    UI = &WidenMem->getIngredient();

  InstructionCost RecipeCost;
  if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
    RecipeCost = 0;
  } else {
    RecipeCost = computeCost(VF, Ctx);
    if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
        RecipeCost.isValid()) {
      if (UI)
        RecipeCost = InstructionCost(ForceTargetInstructionCost);
      else
        RecipeCost = InstructionCost(0);
    }
  }

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}

InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  llvm_unreachable("subclasses should implement computeCost");
}

bool VPRecipeBase::isPhi() const {
  return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) ||
         isa<VPPhi, VPIRPhi>(this);
}

bool VPRecipeBase::isScalarCast() const {
  auto *VPI = dyn_cast<VPInstruction>(this);
  return VPI && Instruction::isCast(VPI->getOpcode());
}

void VPIRFlags::intersectFlags(const VPIRFlags &Other) {
  assert(OpType == Other.OpType && "OpType must match");
  switch (OpType) {
  case OperationType::OverflowingBinOp:
    WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
    WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
    break;
  case OperationType::Trunc:
    TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
    TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
    break;
  case OperationType::DisjointOp:
    DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
    break;
  case OperationType::PossiblyExactOp:
    ExactFlags.IsExact &= Other.ExactFlags.IsExact;
    break;
  case OperationType::GEPOp:
    GEPFlagsStorage &= Other.GEPFlagsStorage;
    break;
  case OperationType::FPMathOp:
  case OperationType::FCmp:
    assert((OpType != OperationType::FCmp ||
            FCmpFlags.CmpPredStorage == Other.FCmpFlags.CmpPredStorage) &&
           "Cannot drop CmpPredicate");
    getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
    getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
    break;
  case OperationType::NonNegOp:
    NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
    break;
  case OperationType::Cmp:
    assert(CmpPredStorage == Other.CmpPredStorage &&
           "Cannot drop CmpPredicate");
    break;
  case OperationType::ReductionOp:
    assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
           "Cannot change RecurKind");
    assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
           "Cannot change IsOrdered");
    assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
           "Cannot change IsInLoop");
    getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
    getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
    break;
  case OperationType::Other:
    break;
  }
}

FastMathFlags VPIRFlags::getFastMathFlags() const {
  assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp ||
          OpType == OperationType::ReductionOp ||
          OpType == OperationType::Other) &&
         "recipe doesn't have fast math flags");
  if (OpType == OperationType::Other)
    return FastMathFlags();
  const FastMathFlagsTy &F = getFMFsRef();
  FastMathFlags Res;
  Res.setAllowReassoc(F.AllowReassoc);
  Res.setNoNaNs(F.NoNaNs);
  Res.setNoInfs(F.NoInfs);
  Res.setNoSignedZeros(F.NoSignedZeros);
  Res.setAllowReciprocal(F.AllowReciprocal);
  Res.setAllowContract(F.AllowContract);
  Res.setApproxFunc(F.ApproxFunc);
  return Res;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPSingleDefRecipe::dump() const { VPRecipeBase::dump(); }

void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
                         VPSlotTracker &SlotTracker) const {
  printRecipe(O, Indent, SlotTracker);
  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }

  if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
    Metadata->print(O, SlotTracker);
}
#endif

template <unsigned PartOpIdx>
VPValue *
VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(const VPUser &U) const {
  if (U.getNumOperands() == PartOpIdx + 1)
    return U.getOperand(PartOpIdx);
  return nullptr;
}

template <unsigned PartOpIdx>
unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(const VPUser &U) const {
  if (auto *UnrollPartOp = getUnrollPartOperand(U))
    return cast<VPConstantInt>(UnrollPartOp)->getZExtValue();
  return 0;
}

namespace llvm {
template class VPUnrollPartAccessor<1>;
template class VPUnrollPartAccessor<2>;
template class VPUnrollPartAccessor<3>;
}

VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                             const VPIRFlags &Flags, const VPIRMetadata &MD,
                             DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPRecipeBase::VPInstructionSC, Operands, Flags, DL),
      VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  assert(hasRequiredFlagsForOpcode(getOpcode()) &&
         "Opcode requires specific flags to be set");
  assert((getNumOperandsForOpcode() == -1u ||
          getNumOperandsForOpcode() == getNumOperands() ||
          (isMasked() && getNumOperandsForOpcode() + 1 == getNumOperands())) &&
         "number of operands does not match opcode");
}

/// For call VPInstructions, return the operand index of the called function.
/// The function is either the last operand (for unmasked calls) or the
/// second-to-last operand (for masked calls).
static unsigned getCalledFnOperandIndex(const VPInstruction &VPI) {
  assert(VPI.getOpcode() == Instruction::Call && "must be a call");
  unsigned NumOps = VPI.getNumOperands();
  auto *LastOp = dyn_cast<VPIRValue>(VPI.getOperand(NumOps - 1));
  if (LastOp && isa<Function>(LastOp->getValue()))
    return NumOps - 1;
  assert(
      isa<Function>(cast<VPIRValue>(VPI.getOperand(NumOps - 2))->getValue()) &&
      "expected function operand");
  return NumOps - 2;
}

/// For call VPInstructions, return the called function.
static Function *getCalledFunction(const VPInstruction &VPI) {
  unsigned Idx = getCalledFnOperandIndex(VPI);
  return cast<Function>(cast<VPIRValue>(VPI.getOperand(Idx))->getValue());
}

unsigned VPInstruction::getNumOperandsForOpcode() const {
  if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
    return 1;

  if (Instruction::isBinaryOp(Opcode))
    return 2;

  switch (Opcode) {
  case VPInstruction::StepVector:
  case VPInstruction::VScale:
    return 0;
  case Instruction::Alloca:
  case Instruction::ExtractValue:
  case Instruction::Freeze:
  case Instruction::Load:
  case VPInstruction::BranchOnCond:
  case VPInstruction::Broadcast:
  case VPInstruction::ExitingIVValue:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::ExtractLastLane:
  case VPInstruction::ExtractLastPart:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::MaskedCond:
  case VPInstruction::Not:
  case VPInstruction::ResumeForEpilogue:
  case VPInstruction::Reverse:
  case VPInstruction::Unpack:
    return 1;
  case Instruction::ICmp:
  case Instruction::FCmp:
  case Instruction::ExtractElement:
  case Instruction::Store:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::LogicalOr:
  case VPInstruction::PtrAdd:
  case VPInstruction::WidePtrAdd:
  case VPInstruction::WideIVStep:
  case VPInstruction::CalculateTripCountMinusVF:
    return 2;
  case Instruction::Select:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ReductionStartVector:
    return 3;
  case Instruction::Call:
    return getCalledFnOperandIndex(*this) + 1;
  case Instruction::GetElementPtr:
  case Instruction::PHI:
  case Instruction::Switch:
  case VPInstruction::AnyOf:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ComputeReductionResult:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::LastActiveLane:
  case VPInstruction::ExtractLane:
  case VPInstruction::ExtractLastActive:
    // Cannot determine the number of operands from the opcode.
    return -1u;
  }
  llvm_unreachable("all cases should be handled above");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::Freeze:
  case Instruction::ICmp:
  case Instruction::PHI:
  case Instruction::Select:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::AnyOf:
  case VPInstruction::Not:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generate(VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      applyFlags(*I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ExtractElement: {
    assert(State.VF.isVector() && "Only extract elements from vectors");
    if (auto *Idx = dyn_cast<VPConstantInt>(getOperand(1)))
      return State.get(getOperand(0), VPLane(Idx->getZExtValue()));
    Value *Vec = State.get(getOperand(0));
    Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
    return Builder.CreateExtractElement(Vec, Idx, Name);
  }
  case Instruction::Freeze: {
    Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this));
    return Builder.CreateFreeze(Op, Name);
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::PHI: {
    llvm_unreachable("should be handled by VPPhi::execute");
  }
  case Instruction::Select: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *Cond =
        State.get(getOperand(0),
                  OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0)));
    Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
    Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
    return Builder.CreateSelectFMF(Cond, Op1, Op2, getFastMathFlags(), Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPLane(0));

    // If this part of the active lane mask is scalar, generate the CMP
    // directly to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

    ElementCount EC = State.VF.multiplyCoefficientBy(
        cast<VPConstantInt>(getOperand(2))->getZExtValue());
    auto *PredTy = VectorType::get(Builder.getInt1Ty(), EC);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    auto *V1 = State.get(getOperand(0));
    if (!V1->getType()->isVectorTy())
      return V1;
    Value *V2 = State.get(getOperand(1));
    return Builder.CreateVectorSpliceRight(V1, V2, 1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    Value *ScalarTC = State.get(getOperand(0), VPLane(0));
    Value *VFxUF = State.get(getOperand(1), VPLane(0));
    Value *Sub = Builder.CreateSub(ScalarTC, VFxUF);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, VFxUF);
    Value *Zero = ConstantInt::getNullValue(ScalarTC->getType());
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
    // Compute EVL
    assert(AVL->getType()->isIntegerTy() &&
           "Requested vector length should be an integer.");

    assert(State.VF.isScalable() && "Expected scalable vector factor.");
    Value *VFArg = Builder.getInt32(State.VF.getKnownMinValue());

    Value *EVL = Builder.CreateIntrinsic(
        Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
        {AVL, VFArg, Builder.getTrue()});
    return EVL;
  }
  case VPInstruction::BranchOnCond: {
    Value *Cond = State.get(getOperand(0), VPLane(0));
    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for latch blocks now, and
    // to forward destination(s) later when they are created.
    // Second successor may be backwards - iff it is already in VPBB2IRBB.
    VPBasicBlock *SecondVPSucc =
        cast<VPBasicBlock>(getParent()->getSuccessors()[1]);
    BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
    BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
    auto *Br = Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
    // First successor is always forward, reset it to nullptr.
    Br->setSuccessor(0, nullptr);
    IRBB->getTerminator()->eraseFromParent();
    applyMetadata(*Br);
    return Br;
  }
  case VPInstruction::Broadcast: {
    return Builder.CreateVectorSplat(
        State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
  }
  case VPInstruction::BuildStructVector: {
    // For struct types, we need to build a new 'wide' struct type, where each
    // element is widened, i.e., we create a struct of vectors.
    auto *StructTy =
        cast<StructType>(State.TypeAnalysis.inferScalarType(getOperand(0)));
    Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
    for (const auto &[LaneIndex, Op] : enumerate(operands())) {
      for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
           FieldIndex++) {
        Value *ScalarValue =
            Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
        Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
        VectorValue =
            Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
        Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
      }
    }
    return Res;
  }
  case VPInstruction::BuildVector: {
    auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
    auto NumOfElements = ElementCount::getFixed(getNumOperands());
    Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
    for (const auto &[Idx, Op] : enumerate(operands()))
      Res = Builder.CreateInsertElement(Res, State.get(Op, true),
                                        Builder.getInt32(Idx));
    return Res;
  }
  case VPInstruction::ReductionStartVector: {
    if (State.VF.isScalar())
      return State.get(getOperand(0), true);
    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
    Builder.setFastMathFlags(getFastMathFlags());
    // If this start vector is scaled then it should produce a vector with
    // fewer elements than the VF.
    ElementCount VF = State.VF.divideCoefficientBy(
        cast<VPConstantInt>(getOperand(2))->getZExtValue());
    auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
    return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
                                       Builder.getInt32(0));
  }
  case VPInstruction::ComputeReductionResult: {
    RecurKind RK = getRecurKind();
    bool IsOrdered = isReductionOrdered();
    bool IsInLoop = isReductionInLoop();
    assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
           "FindIV should use min/max reduction kinds");

    // The recipe may have multiple operands to be reduced together.
    unsigned NumOperandsToReduce = getNumOperands();
    VectorParts RdxParts(NumOperandsToReduce);
    for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
      RdxParts[Part] = State.get(getOperand(Part), IsInLoop);

    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
    Builder.setFastMathFlags(getFastMathFlags());

    // Reduce multiple operands into one.
    Value *ReducedPartRdx = RdxParts[0];
    if (IsOrdered) {
      ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
    } else {
      // Floating-point operations should have some FMF to enable the
      // reduction.
      for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
        else {
          // For sub-recurrences, each part's reduction variable is already
          // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
          Instruction::BinaryOps Opcode =
              RK == RecurKind::Sub
                  ? Instruction::Add
                  : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK);
          ReducedPartRdx =
              Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx");
        }
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if (State.VF.isVector() && !IsInLoop) {
      // TODO: Support in-order reductions based on the recurrence descriptor.
      // All ops in the reduction inherit fast-math-flags from the recurrence
      // descriptor.
      ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractLastLane:
  case VPInstruction::ExtractPenultimateElement: {
    unsigned Offset =
        getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1;
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
    } else {
      // TODO: Remove ExtractLastLane for scalar VFs.
      assert(Offset <= 1 && "invalid offset to extract from");
      Res = State.get(getOperand(0));
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::LogicalOr: {
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    return Builder.CreateLogicalOr(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), VPLane(0));
    Value *Addend = State.get(getOperand(1), VPLane(0));
    return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
  }
  case VPInstruction::WidePtrAdd: {
    Value *Ptr =
        State.get(getOperand(0), vputils::isSingleScalar(getOperand(0)));
    Value *Addend = State.get(getOperand(1));
    return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
  }
  case VPInstruction::AnyOf: {
    Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
    for (VPValue *Op : drop_begin(operands()))
      Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
    return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
  }
  case VPInstruction::ExtractLane: {
    assert(getNumOperands() != 2 && "ExtractLane from single source should be "
                                    "simplified to ExtractElement.");
    Value *LaneToExtract = State.get(getOperand(0), true);
    Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0));
    Value *Res = nullptr;
    Value *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);

    for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
      Value *VectorStart =
          Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
      Value *VectorIdx = Idx == 1
                             ? LaneToExtract
                             : Builder.CreateSub(LaneToExtract, VectorStart);
      Value *Ext = State.VF.isScalar()
                       ? State.get(getOperand(Idx))
                       : Builder.CreateExtractElement(
                             State.get(getOperand(Idx)), VectorIdx);
      if (Res) {
        Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
        Res = Builder.CreateSelect(Cmp, Ext, Res);
      } else {
        Res = Ext;
      }
    }
    return Res;
  }
  case VPInstruction::FirstActiveLane: {
    Type *Ty = State.TypeAnalysis.inferScalarType(this);
    if (getNumOperands() == 1) {
      Value *Mask = State.get(getOperand(0));
      return Builder.CreateCountTrailingZeroElems(Ty, Mask,
                                                  /*ZeroIsPoison=*/false, Name);
    }
    // If there are multiple operands, create a chain of selects to pick the
    // first operand with an active lane and add the number of lanes of the
    // preceding operands.
    Value *RuntimeVF = getRuntimeVF(Builder, Ty, State.VF);
    unsigned LastOpIdx = getNumOperands() - 1;
    Value *Res = nullptr;
    for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
      Value *TrailingZeros =
          State.VF.isScalar()
              ? Builder.CreateZExt(
                    Builder.CreateICmpEQ(State.get(getOperand(Idx)),
                                         Builder.getFalse()),
                    Ty)
              : Builder.CreateCountTrailingZeroElems(
                    Ty, State.get(getOperand(Idx)),
                    /*ZeroIsPoison=*/false, Name);
      Value *Current = Builder.CreateAdd(
          Builder.CreateMul(RuntimeVF, ConstantInt::get(Ty, Idx)),
          TrailingZeros);
      if (Res) {
        Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
        Res = Builder.CreateSelect(Cmp, Current, Res);
      } else {
        Res = Current;
      }
    }

    return Res;
  }
  case VPInstruction::ResumeForEpilogue:
    return State.get(getOperand(0), true);
  case VPInstruction::Reverse:
    return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
  case VPInstruction::ExtractLastActive: {
    Value *Result = State.get(getOperand(0), /*IsScalar=*/true);
    for (unsigned Idx = 1; Idx < getNumOperands(); Idx += 2) {
      Value *Data = State.get(getOperand(Idx));
      Value *Mask = State.get(getOperand(Idx + 1));
      Type *VTy = Data->getType();

      if (State.VF.isScalar())
        Result = Builder.CreateSelect(Mask, Data, Result);
      else
        Result = Builder.CreateIntrinsic(
            Intrinsic::experimental_vector_extract_last_active, {VTy},
            {Data, Mask, Result});
    }

    return Result;
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
    unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
  Type *ScalarTy = Ctx.Types.inferScalarType(this);
  Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
  switch (Opcode) {
  case Instruction::FNeg:
    return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Certain instructions can be cheaper if they have a constant second
    // operand. One example of this are shifts on x86.
    VPValue *RHS = getOperand(1);
    TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS);

    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
        getOperand(1)->isDefinedOutsideLoopRegions())
      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;

    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
    SmallVector<const Value *, 4> Operands;
    if (CtxI)
      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, ResultTy, Ctx.CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        RHSInfo, Operands, CtxI, &Ctx.TLI);
  }
  case Instruction::Freeze:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy,
                                          Ctx.CostKind);
  case Instruction::ExtractValue:
    return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
                                             Ctx.CostKind);
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0));
    Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
    return Ctx.TTI.getCmpSelInstrCost(
        Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
        Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
        {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
  }
  case Instruction::BitCast: {
    Type *ScalarTy = Ctx.Types.inferScalarType(this);
    if (ScalarTy->isPointerTy())
      return 0;
    [[fallthrough]];
  }
  case Instruction::SExt:
  case Instruction::ZExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::PtrToAddr:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::AddrSpaceCast: {
    // Computes the CastContextHint from a recipe that may access memory.
    auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
      if (isa<VPInterleaveBase>(R))
        return TTI::CastContextHint::Interleave;
      if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) {
        // Only compute CCH for memory operations, matching the legacy model
        // which only considers loads/stores for cast context hints.
        auto *UI = cast<Instruction>(ReplicateRecipe->getUnderlyingValue());
        if (!isa<LoadInst, StoreInst>(UI))
          return TTI::CastContextHint::None;
        return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
                                               : TTI::CastContextHint::Normal;
      }
      const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
      if (WidenMemoryRecipe == nullptr)
        return TTI::CastContextHint::None;
      if (VF.isScalar())
        return TTI::CastContextHint::Normal;
      if (!WidenMemoryRecipe->isConsecutive())
        return TTI::CastContextHint::GatherScatter;
      if (WidenMemoryRecipe->isMasked())
        return TTI::CastContextHint::Masked;
      return TTI::CastContextHint::Normal;
    };

    VPValue *Operand = getOperand(0);
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    bool IsReverse = false;
    // For Trunc/FPTrunc, get the context from the only user.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
        if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
          return nullptr;
        return dyn_cast<VPRecipeBase>(*R->user_begin());
      };
      if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
        if (match(Recipe,
                  m_CombineOr(
                      m_Reverse(m_VPValue()),
                      m_Intrinsic<Intrinsic::experimental_vp_reverse>()))) {
          Recipe = GetOnlyUser(cast<VPSingleDefRecipe>(Recipe));
          IsReverse = true;
        }
        if (Recipe)
          CCH = ComputeCCH(Recipe);
      }
    }
    // For Z/Sext, get the context from the operand.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (auto *Recipe = Operand->getDefiningRecipe()) {
        VPValue *ReverseOp;
        if (match(Recipe,
                  m_CombineOr(m_Reverse(m_VPValue(ReverseOp)),
                              m_Intrinsic<Intrinsic::experimental_vp_reverse>(
                                  m_VPValue(ReverseOp))))) {
          Recipe = ReverseOp->getDefiningRecipe();
          IsReverse = true;
        }
        if (Recipe)
          CCH = ComputeCCH(Recipe);
      }
    }
    if (IsReverse && CCH != TTI::CastContextHint::None)
      CCH = TTI::CastContextHint::Reversed;

    auto *ScalarSrcTy = Ctx.Types.inferScalarType(Operand);
    Type *SrcTy = VF.isVector() ? toVectorTy(ScalarSrcTy, VF) : ScalarSrcTy;
    // Arm TTI will use the underlying instruction to determine the cost.
    return Ctx.TTI.getCastInstrCost(
        Opcode, ResultTy, SrcTy, CCH, Ctx.CostKind,
        dyn_cast_if_present<Instruction>(getUnderlyingValue()));
  }
  case Instruction::Select: {
    SelectInst *SI = cast_or_null<SelectInst>(getUnderlyingValue());
    bool IsScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
    Type *ScalarTy = Ctx.Types.inferScalarType(this);

    VPValue *Op0, *Op1;
    bool IsLogicalAnd =
        match(this, m_c_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1)));
    bool IsLogicalOr =
        match(this, m_c_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
    // Also match the inverted forms:
    //   select x, false, y --> !x & y (still AND)
    //   select x, y, true  --> !x | y (still OR)
    IsLogicalAnd |=
        match(this, m_Select(m_VPValue(Op0), m_False(), m_VPValue(Op1)));
    IsLogicalOr |=
        match(this, m_Select(m_VPValue(Op0), m_VPValue(Op1), m_True()));

    if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
        (IsLogicalAnd || IsLogicalOr)) {
      // select x, y, false --> x & y
      // select x, true, y  --> x | y
      const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
      const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);

      SmallVector<const Value *, 2> Operands;
      if (SI && all_of(operands(),
                       [](VPValue *Op) { return Op->getUnderlyingValue(); }))
        append_range(Operands, SI->operands());
      return Ctx.TTI.getArithmeticInstrCost(
          IsLogicalOr ? Instruction::Or : Instruction::And, ResultTy,
          Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
    }

    Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
    if (!IsScalarCond && VF.isVector())
      CondTy = VectorType::get(CondTy, VF);

    llvm::CmpPredicate Pred;
    if (!match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue())))
      if (auto *CondIRV = dyn_cast<VPIRValue>(getOperand(0)))
        if (auto *Cmp = dyn_cast<CmpInst>(CondIRV->getValue()))
          Pred = Cmp->getPredicate();
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getCmpSelInstrCost(
        Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
        {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
  }
  }
  llvm_unreachable("called for unsupported opcode");
}

InstructionCost VPInstruction::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  if (Instruction::isBinaryOp(getOpcode())) {
    if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
      // TODO: Compute cost for VPInstructions without underlying values once
      // the legacy cost model has been retired.
      return 0;
    }

    assert(!doesGeneratePerAllLanes() &&
           "Should only generate a vector value or single scalar, not scalars "
           "for all lanes.");
    return getCostForRecipeWithOpcode(
        getOpcode(),
        vputils::onlyFirstLaneUsed(this) ? ElementCount::getFixed(1) : VF, Ctx);
  }

  switch (getOpcode()) {
  case Instruction::Select: {
    llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
    match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue()));
    auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
    auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
    if (!vputils::onlyFirstLaneUsed(this)) {
      CondTy = toVectorTy(CondTy, VF);
      VecTy = toVectorTy(VecTy, VF);
    }
    return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
                                      Ctx.CostKind);
  }
  case Instruction::ExtractElement:
  case VPInstruction::ExtractLane: {
    if (VF.isScalar()) {
      // ExtractLane with VF=1 takes care of handling extraction across
      // multiple parts.
      return 0;
    }

    // Add on the cost of extracting the element.
    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                      Ctx.CostKind);
  }
  case VPInstruction::AnyOf: {
    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getArithmeticReductionCost(
        Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
  }
  case VPInstruction::FirstActiveLane: {
    Type *Ty = Ctx.Types.inferScalarType(this);
    Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
    if (VF.isScalar())
      return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
                                        CmpInst::makeCmpResultType(ScalarTy),
                                        CmpInst::ICMP_EQ, Ctx.CostKind);
    // Calculate the cost of determining the lane index.
    auto *PredTy = toVectorTy(ScalarTy, VF);
    IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
                                  {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  case VPInstruction::LastActiveLane: {
    Type *Ty = Ctx.Types.inferScalarType(this);
    Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
    if (VF.isScalar())
      return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
                                        CmpInst::makeCmpResultType(ScalarTy),
                                        CmpInst::ICMP_EQ, Ctx.CostKind);
    // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
    auto *PredTy = toVectorTy(ScalarTy, VF);
    IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
                                  {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
    InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
    // Add cost of NOT operation on the predicate.
    Cost += Ctx.TTI.getArithmeticInstrCost(
        Instruction::Xor, PredTy, Ctx.CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        {TargetTransformInfo::OK_UniformConstantValue,
         TargetTransformInfo::OP_None});
    // Add cost of SUB operation on the index.
    Cost += Ctx.TTI.getArithmeticInstrCost(Instruction::Sub, Ty, Ctx.CostKind);
    return Cost;
  }
  case VPInstruction::ExtractLastActive: {
    Type *ScalarTy = Ctx.Types.inferScalarType(this);
    Type *VecTy = toVectorTy(ScalarTy, VF);
    Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
    IntrinsicCostAttributes ICA(
        Intrinsic::experimental_vector_extract_last_active, ScalarTy,
        {VecTy, MaskTy, ScalarTy});
    return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getShuffleCost(
        TargetTransformInfo::SK_Splice, cast<VectorType>(VectorTy),
        cast<VectorType>(VectorTy), {}, Ctx.CostKind, -1);
  }
  case VPInstruction::ActiveLaneMask: {
    Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
    unsigned Multiplier = cast<VPConstantInt>(getOperand(2))->getZExtValue();
    Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
    IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
                                  {ArgTy, ArgTy});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  case VPInstruction::ExplicitVectorLength: {
    Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
    Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
    Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
    IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
                                  I32Ty, {Arg0Ty, I32Ty, I1Ty});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  case VPInstruction::Reverse: {
    assert(VF.isVector() && "Reverse operation must be vector type");
    Type *EltTy = Ctx.Types.inferScalarType(this);
    // Skip the reverse operation cost for the mask.
    // FIXME: Remove this once redundant mask reverse operations can be
    // eliminated by VPlanTransforms::cse before cost computation.
    if (EltTy->isIntegerTy(1))
      return 0;
    auto *VectorTy = cast<VectorType>(toVectorTy(EltTy, VF));
    return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
                                  VectorTy, /*Mask=*/{}, Ctx.CostKind,
                                  /*Index=*/0);
  }
  case VPInstruction::ExtractLastLane: {
    // Add on the cost of extracting the element.
    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
                                                    VecTy, Ctx.CostKind, 0);
  }
  case VPInstruction::ExtractPenultimateElement:
    if (VF == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();
    [[fallthrough]];
  default:
    // TODO: Compute cost for other VPInstructions once the legacy cost model
    // has been retired.
    assert(!getUnderlyingValue() &&
           "unexpected VPInstruction with underlying value");
    return 0;
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractLastLane ||
         getOpcode() == VPInstruction::ExtractPenultimateElement ||
         getOpcode() == Instruction::ExtractElement ||
         getOpcode() == VPInstruction::ExtractLane ||
         getOpcode() == VPInstruction::FirstActiveLane ||
         getOpcode() == VPInstruction::LastActiveLane ||
         getOpcode() == VPInstruction::ExtractLastActive ||
         getOpcode() == VPInstruction::ComputeReductionResult ||
         getOpcode() == VPInstruction::AnyOf;
}

bool VPInstruction::isSingleScalar() const {
  switch (getOpcode()) {
  case Instruction::Load:
  case Instruction::PHI:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::ResumeForEpilogue:
  case VPInstruction::VScale:
    return true;
  default:
    return isScalarCast();
  }
}

void VPInstruction::execute(VPTransformState &State) {
  assert(!isMasked() && "cannot execute masked VPInstruction");
  assert(!State.Lane && "VPInstruction executing a Lane");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  assert(hasRequiredFlagsForOpcode(getOpcode()) &&
         "Opcode requires specific flags to be set");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  Value *GeneratedValue = generate(State);
  if (!hasResult())
    return;
  assert(GeneratedValue && "generate must produce a value");
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  assert((((GeneratedValue->getType()->isVectorTy() ||
            GeneratedValue->getType()->isStructTy()) ==
           !GeneratesPerFirstLaneOnly) ||
          State.VF.isScalar()) &&
         "scalar value but not only first lane defined");
  State.set(this, GeneratedValue,
            /*IsScalar*/ GeneratesPerFirstLaneOnly);
  if (getOpcode() == VPInstruction::ResumeForEpilogue) {
    // FIXME: This is a workaround to enable reliable updates of the scalar
    // loop resume phis when vectorizing the epilogue. Must be removed once
    // epilogue vectorization explicitly connects VPlans.
    setUnderlyingValue(GeneratedValue);
  }
}

bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
  if (Instruction::isBinaryOp(getOpcode()) ||
      Instruction::isUnaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return false;
  switch (getOpcode()) {
  case Instruction::ExtractValue:
  case Instruction::InsertValue:
  case Instruction::GetElementPtr:
  case Instruction::ExtractElement:
  case Instruction::Freeze:
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::PHI:
  case VPInstruction::AnyOf:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::BranchOnCount:
  case VPInstruction::Broadcast:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ExtractLane:
  case VPInstruction::ExtractLastLane:
  case VPInstruction::ExtractLastPart:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExitingIVValue:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::LastActiveLane:
  case VPInstruction::ExtractLastActive:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::LogicalOr:
  case VPInstruction::MaskedCond:
  case VPInstruction::Not:
  case VPInstruction::PtrAdd:
  case VPInstruction::WideIVStep:
  case VPInstruction::WidePtrAdd:
  case VPInstruction::StepVector:
  case VPInstruction::ReductionStartVector:
  case VPInstruction::Reverse:
  case VPInstruction::VScale:
  case VPInstruction::Unpack:
    return false;
  case Instruction::Call:
    return !getCalledFunction(*this)->doesNotAccessMemory();
  default:
    return true;
  }
}

bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ExtractElement:
    return Op == getOperand(1);
  case Instruction::PHI:
    return true;
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::Or:
  case Instruction::Freeze:
  case VPInstruction::Not:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case Instruction::Load:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::Broadcast:
  case VPInstruction::ReductionStartVector:
    return true;
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
    // Before replicating by VF, Build(Struct)Vector uses all lanes of the
    // operand; after replicating its operands, only the first lane is used.
    // Before replicating, it will have only a single operand.
    return getNumOperands() > 1;
  case VPInstruction::PtrAdd:
    return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
  case VPInstruction::WidePtrAdd:
    // WidePtrAdd supports scalar and vector base addresses.
    return false;
  case VPInstruction::ExitingIVValue:
  case VPInstruction::ExtractLane:
    return Op == getOperand(0);
  }
  llvm_unreachable("switch should return");
}

bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  }
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  printRecipe(dbgs(), "", SlotTracker);
}

void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::BranchOnTwoConds:
    O << "branch-on-two-conds";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::Broadcast:
    O << "broadcast";
    break;
  case VPInstruction::BuildStructVector:
    O << "buildstructvector";
    break;
  case VPInstruction::BuildVector:
    O << "buildvector";
    break;
  case VPInstruction::ExitingIVValue:
    O << "exiting-iv-value";
    break;
  case VPInstruction::MaskedCond:
    O << "masked-cond";
    break;
  case VPInstruction::ExtractLane:
    O << "extract-lane";
    break;
  case VPInstruction::ExtractLastLane:
    O << "extract-last-lane";
    break;
  case VPInstruction::ExtractLastPart:
    O << "extract-last-part";
    break;
  case VPInstruction::ExtractPenultimateElement:
    O << "extract-penultimate-element";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::LogicalOr:
    O << "logical-or";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  case VPInstruction::WidePtrAdd:
    O << "wide-ptradd";
    break;
  case VPInstruction::AnyOf:
    O << "any-of";
    break;
  case VPInstruction::FirstActiveLane:
    O << "first-active-lane";
    break;
  case VPInstruction::LastActiveLane:
    O << "last-active-lane";
    break;
  case VPInstruction::ReductionStartVector:
    O << "reduction-start-vector";
    break;
  case VPInstruction::ResumeForEpilogue:
    O << "resume-for-epilogue";
    break;
  case VPInstruction::Reverse:
    O << "reverse";
    break;
  case VPInstruction::Unpack:
    O << "unpack";
    break;
  case VPInstruction::ExtractLastActive:
    O << "extract-last-active";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPInstructionWithType::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  if (isScalarCast()) {
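    // A scalar cast produces a single scalar value, generated for lane 0 only.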
    Value *Op = State.get(getOperand(0), VPLane(0));
    Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
                                           Op, ResultTy);
    State.set(this, Cast, VPLane(0));
    return;
  }
  switch (getOpcode()) {
  case VPInstruction::StepVector: {
    Value *StepVector =
        State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
    State.set(this, StepVector);
    break;
  }
  case VPInstruction::VScale: {
    Value *VScale = State.Builder.CreateVScale(ResultTy);
    State.set(this, VScale, true);
    break;
  }

  default:
    llvm_unreachable("opcode not implemented yet");
  }
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = ";
|
|
|
|
switch (getOpcode()) {
|
|
case VPInstruction::WideIVStep:
|
|
O << "wide-iv-step ";
|
|
printOperands(O, SlotTracker);
|
|
break;
|
|
case VPInstruction::StepVector:
|
|
O << "step-vector " << *ResultTy;
|
|
break;
|
|
case VPInstruction::VScale:
|
|
O << "vscale " << *ResultTy;
|
|
break;
|
|
case Instruction::Load:
|
|
O << "load ";
|
|
printOperands(O, SlotTracker);
|
|
break;
|
|
default:
|
|
assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
|
|
O << Instruction::getOpcodeName(getOpcode()) << " ";
|
|
printOperands(O, SlotTracker);
|
|
O << " to " << *ResultTy;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
void VPPhi::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  PHINode *NewPhi = State.Builder.CreatePHI(
      State.TypeAnalysis.inferScalarType(this), 2, getName());
  unsigned NumIncoming = getNumIncoming();
  // Detect header phis: the parent block dominates its second incoming block
  // (the latch). The IR value incoming from the latch has not been generated
  // yet and is added later, once the latch has been executed.
  if (NumIncoming == 2 &&
      State.VPDT.dominates(getParent(), getIncomingBlock(1))) {
    NumIncoming = 1;
  }
  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
    Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
    BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
    NewPhi->addIncoming(IncV, PredBB);
  }
  State.set(this, NewPhi, VPLane(0));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
                        VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
  printAsOperand(O, SlotTracker);
  O << " = phi";
  printFlags(O);
  printPhiOperands(O, SlotTracker);
}
#endif

VPIRInstruction *VPIRInstruction::create(Instruction &I) {
  if (auto *Phi = dyn_cast<PHINode>(&I))
    return new VPIRPhi(*Phi);
  return new VPIRInstruction(I);
}

void VPIRInstruction::execute(VPTransformState &State) {
  assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
         "PHINodes must be handled by VPIRPhi");
  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
}
|
|
|
|
InstructionCost VPIRInstruction::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
// The recipe wraps an existing IR instruction on the border of VPlan's scope,
|
|
// hence it does not contribute to the cost-modeling for the VPlan.
|
|
return 0;
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPIRInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "IR " << I;
|
|
}
|
|
#endif
|
|
|
|
void VPIRPhi::execute(VPTransformState &State) {
  PHINode *Phi = &getIRPhi();
  for (const auto &[Idx, Op] : enumerate(operands())) {
    VPValue *ExitValue = Op;
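    // Single-scalar (uniform) exit values are read from lane 0; otherwise the
    // exiting value is taken from the last lane of the vector.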
    auto Lane = vputils::isSingleScalar(ExitValue)
                    ? VPLane::getFirstLane()
                    : VPLane::getLastLaneForVF(State.VF);
    VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
    auto *PredVPBB = Pred->getExitingBasicBlock();
    BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
    // Set insertion point in PredBB in case an extract needs to be generated.
    // TODO: Model extracts explicitly.
    State.Builder.SetInsertPoint(PredBB->getTerminator());
    Value *V = State.get(ExitValue, VPLane(Lane));
    // If there is no existing block for PredBB in the phi, add a new incoming
    // value. Otherwise update the existing incoming value for PredBB.
    if (Phi->getBasicBlockIndex(PredBB) == -1)
      Phi->addIncoming(V, PredBB);
    else
      Phi->setIncomingValueForBlock(PredBB, V);
  }

  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
}
|
|
|
|
void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const {
|
|
VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
|
|
assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
|
|
"Number of phi operands must match number of predecessors");
|
|
unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
|
|
R->removeOperand(Position);
|
|
}
|
|
|
|
VPValue *
|
|
VPPhiAccessors::getIncomingValueForBlock(const VPBasicBlock *VPBB) const {
|
|
VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
|
|
return getIncomingValue(R->getParent()->getIndexForPredecessor(VPBB));
|
|
}
|
|
|
|
void VPPhiAccessors::setIncomingValueForBlock(const VPBasicBlock *VPBB,
|
|
VPValue *V) const {
|
|
VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
|
|
R->setOperand(R->getParent()->getIndexForPredecessor(VPBB), V);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPPhiAccessors::printPhiOperands(raw_ostream &O,
|
|
VPSlotTracker &SlotTracker) const {
|
|
interleaveComma(enumerate(getAsRecipe()->operands()), O,
|
|
[this, &O, &SlotTracker](auto Op) {
|
|
O << "[ ";
|
|
Op.value()->printAsOperand(O, SlotTracker);
|
|
O << ", ";
|
|
getIncomingBlock(Op.index())->printAsOperand(O);
|
|
O << " ]";
|
|
});
|
|
}
|
|
#endif
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
VPIRInstruction::printRecipe(O, Indent, SlotTracker);
|
|
|
|
if (getNumOperands() != 0) {
|
|
O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
|
|
interleaveComma(incoming_values_and_blocks(), O,
|
|
[&O, &SlotTracker](auto Op) {
|
|
std::get<0>(Op)->printAsOperand(O, SlotTracker);
|
|
O << " from ";
|
|
std::get<1>(Op)->printAsOperand(O);
|
|
});
|
|
O << ")";
|
|
}
|
|
}
|
|
#endif
|
|
|
|
void VPIRMetadata::applyMetadata(Instruction &I) const {
  for (const auto &[Kind, Node] : Metadata)
    I.setMetadata(Kind, Node);
}

void VPIRMetadata::intersect(const VPIRMetadata &Other) {
  SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
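  // Keep only the (kind, node) pairs that appear identically in both sets.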
  for (const auto &[KindA, MDA] : Metadata) {
    for (const auto &[KindB, MDB] : Other.Metadata) {
      if (KindA == KindB && MDA == MDB) {
        MetadataIntersection.emplace_back(KindA, MDA);
        break;
      }
    }
  }
  Metadata = std::move(MetadataIntersection);
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPIRMetadata::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
|
|
const Module *M = SlotTracker.getModule();
|
|
if (Metadata.empty() || !M)
|
|
return;
|
|
|
|
ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
|
|
O << " (";
|
|
interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
|
|
auto [Kind, Node] = KindNodePair;
|
|
assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
|
|
"Unexpected unnamed metadata kind");
|
|
O << "!" << MDNames[Kind] << " ";
|
|
Node->printAsOperand(O, M);
|
|
});
|
|
O << ")";
|
|
}
|
|
#endif
|
|
|
|
void VPWidenCallRecipe::execute(VPTransformState &State) {
|
|
assert(State.VF.isVector() && "not widening");
|
|
assert(Variant != nullptr && "Can't create vector function.");
|
|
|
|
FunctionType *VFTy = Variant->getFunctionType();
|
|
  // Collect the call arguments, using scalar values where the vector function
  // variant expects a scalar parameter.
|
|
SmallVector<Value *, 4> Args;
|
|
for (const auto &I : enumerate(args())) {
|
|
Value *Arg;
|
|
// Some vectorized function variants may also take a scalar argument,
|
|
// e.g. linear parameters for pointers. This needs to be the scalar value
|
|
// from the start of the respective part when interleaving.
|
|
if (!VFTy->getParamType(I.index())->isVectorTy())
|
|
Arg = State.get(I.value(), VPLane(0));
|
|
else
|
|
Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
|
|
Args.push_back(Arg);
|
|
}
|
|
|
|
auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
|
|
SmallVector<OperandBundleDef, 1> OpBundles;
|
|
if (CI)
|
|
CI->getOperandBundlesAsDefs(OpBundles);
|
|
|
|
CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
|
|
applyFlags(*V);
|
|
applyMetadata(*V);
|
|
V->setCallingConv(Variant->getCallingConv());
|
|
|
|
if (!V->getType()->isVoidTy())
|
|
State.set(this, V);
|
|
}
|
|
|
|
InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
|
|
Variant->getFunctionType()->params(),
|
|
Ctx.CostKind);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenCallRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN-CALL ";
|
|
|
|
Function *CalledFn = getCalledScalarFunction();
|
|
if (CalledFn->getReturnType()->isVoidTy())
|
|
O << "void ";
|
|
else {
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = ";
|
|
}
|
|
|
|
O << "call";
|
|
printFlags(O);
|
|
O << " @" << CalledFn->getName() << "(";
|
|
interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
|
|
Op->printAsOperand(O, SlotTracker);
|
|
});
|
|
O << ")";
|
|
|
|
O << " (using library function";
|
|
if (Variant->hasName())
|
|
O << ": " << Variant->getName();
|
|
O << ")";
|
|
}
|
|
#endif
|
|
|
|
void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
|
|
assert(State.VF.isVector() && "not widening");
|
|
|
|
SmallVector<Type *, 2> TysForDecl;
|
|
// Add return type if intrinsic is overloaded on it.
|
|
if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1,
|
|
State.TTI)) {
|
|
Type *RetTy = toVectorizedTy(getResultType(), State.VF);
|
|
ArrayRef<Type *> ContainedTys = getContainedTypes(RetTy);
|
|
for (auto [Idx, Ty] : enumerate(ContainedTys)) {
|
|
if (isVectorIntrinsicWithStructReturnOverloadAtField(VectorIntrinsicID,
|
|
Idx, State.TTI))
|
|
TysForDecl.push_back(Ty);
|
|
}
|
|
}
|
|
SmallVector<Value *, 4> Args;
|
|
for (const auto &I : enumerate(operands())) {
|
|
// Some intrinsics have a scalar argument - don't replace it with a
|
|
// vector.
|
|
Value *Arg;
|
|
if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
|
|
State.TTI))
|
|
Arg = State.get(I.value(), VPLane(0));
|
|
else
|
|
Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
|
|
if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
|
|
State.TTI))
|
|
TysForDecl.push_back(Arg->getType());
|
|
Args.push_back(Arg);
|
|
}
|
|
|
|
// Use vector version of the intrinsic.
|
|
Module *M = State.Builder.GetInsertBlock()->getModule();
|
|
Function *VectorF =
|
|
Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
|
|
assert(VectorF &&
|
|
"Can't retrieve vector intrinsic or vector-predication intrinsics.");
|
|
|
|
auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
|
|
SmallVector<OperandBundleDef, 1> OpBundles;
|
|
if (CI)
|
|
CI->getOperandBundlesAsDefs(OpBundles);
|
|
|
|
CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
|
|
|
|
applyFlags(*V);
|
|
applyMetadata(*V);
|
|
|
|
if (!V->getType()->isVoidTy())
|
|
State.set(this, V);
|
|
}
|
|
|
|
/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
|
|
static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
|
|
ArrayRef<const VPValue *> Operands,
|
|
const VPRecipeWithIRFlags &R,
|
|
ElementCount VF,
|
|
VPCostContext &Ctx) {
|
|
Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
|
|
// Skip the reverse operation cost for the mask.
|
|
// FIXME: Remove this once redundant mask reverse operations can be eliminated
|
|
// by VPlanTransforms::cse before cost computation.
|
|
if (ID == Intrinsic::experimental_vp_reverse && ScalarRetTy->isIntegerTy(1))
|
|
return InstructionCost(0);
|
|
|
|
// Some backends analyze intrinsic arguments to determine cost. Use the
|
|
// underlying value for the operand if it has one. Otherwise try to use the
|
|
// operand of the underlying call instruction, if there is one. Otherwise
|
|
// clear Arguments.
|
|
// TODO: Rework TTI interface to be independent of concrete IR values.
|
|
SmallVector<const Value *> Arguments;
|
|
for (const auto &[Idx, Op] : enumerate(Operands)) {
|
|
auto *V = Op->getUnderlyingValue();
|
|
if (!V) {
|
|
if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
|
|
Arguments.push_back(UI->getArgOperand(Idx));
|
|
continue;
|
|
}
|
|
Arguments.clear();
|
|
break;
|
|
}
|
|
Arguments.push_back(V);
|
|
}
|
|
|
|
Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
|
|
SmallVector<Type *> ParamTys;
|
|
for (const VPValue *Op : Operands) {
|
|
ParamTys.push_back(VF.isVector()
|
|
? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
|
|
: Ctx.Types.inferScalarType(Op));
|
|
}
|
|
|
|
// TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
|
|
IntrinsicCostAttributes CostAttrs(
|
|
ID, RetTy, Arguments, ParamTys, R.getFastMathFlags(),
|
|
dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
|
|
InstructionCost::getInvalid());
|
|
return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
|
|
}
|
|
|
|
InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
SmallVector<const VPValue *> ArgOps(operands());
|
|
return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
|
|
}
|
|
|
|
StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
|
|
return Intrinsic::getBaseName(VectorIntrinsicID);
|
|
}
|
|
|
|
bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
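  // Only the first lane of Op is used if every operand position holding Op
  // expects a scalar argument for this intrinsic.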
  return all_of(enumerate(operands()), [this, &Op](const auto &X) {
    auto [Idx, V] = X;
    return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(),
                                                         Idx, nullptr);
  });
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN-INTRINSIC ";
|
|
if (ResultTy->isVoidTy()) {
|
|
O << "void ";
|
|
} else {
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = ";
|
|
}
|
|
|
|
O << "call";
|
|
printFlags(O);
|
|
O << getIntrinsicName() << "(";
|
|
|
|
interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
|
|
Op->printAsOperand(O, SlotTracker);
|
|
});
|
|
O << ")";
|
|
}
|
|
#endif
|
|
|
|
void VPHistogramRecipe::execute(VPTransformState &State) {
|
|
IRBuilderBase &Builder = State.Builder;
|
|
|
|
Value *Address = State.get(getOperand(0));
|
|
Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
|
|
VectorType *VTy = cast<VectorType>(Address->getType());
|
|
|
|
// The histogram intrinsic requires a mask even if the recipe doesn't;
|
|
// if the mask operand was omitted then all lanes should be executed and
|
|
// we just need to synthesize an all-true mask.
|
|
Value *Mask = nullptr;
|
|
if (VPValue *VPMask = getMask())
|
|
Mask = State.get(VPMask);
|
|
else
|
|
Mask =
|
|
Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
|
|
|
|
// If this is a subtract, we want to invert the increment amount. We may
|
|
  // add a separate intrinsic in the future, but for now we'll try this.
|
|
if (Opcode == Instruction::Sub)
|
|
IncAmt = Builder.CreateNeg(IncAmt);
|
|
else
|
|
assert(Opcode == Instruction::Add && "only add or sub supported for now");
|
|
|
|
State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
|
|
{VTy, IncAmt->getType()},
|
|
{Address, IncAmt, Mask});
|
|
}
|
|
|
|
InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
// FIXME: Take the gather and scatter into account as well. For now we're
|
|
// generating the same cost as the fallback path, but we'll likely
|
|
// need to create a new TTI method for determining the cost, including
|
|
// whether we can use base + vec-of-smaller-indices or just
|
|
// vec-of-pointers.
|
|
assert(VF.isVector() && "Invalid VF for histogram cost");
|
|
Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
|
|
VPValue *IncAmt = getOperand(1);
|
|
Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
|
|
VectorType *VTy = VectorType::get(IncTy, VF);
|
|
|
|
// Assume that a non-constant update value (or a constant != 1) requires
|
|
// a multiply, and add that into the cost.
|
|
InstructionCost MulCost =
|
|
Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
|
|
if (match(IncAmt, m_One()))
|
|
MulCost = TTI::TCC_Free;
|
|
|
|
// Find the cost of the histogram operation itself.
|
|
Type *PtrTy = VectorType::get(AddressTy, VF);
|
|
Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
|
|
IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
|
|
Type::getVoidTy(Ctx.LLVMCtx),
|
|
{PtrTy, IncTy, MaskTy});
|
|
|
|
// Add the costs together with the add/sub operation.
|
|
return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
|
|
Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPHistogramRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN-HISTOGRAM buckets: ";
|
|
getOperand(0)->printAsOperand(O, SlotTracker);
|
|
|
|
if (Opcode == Instruction::Sub)
|
|
O << ", dec: ";
|
|
else {
|
|
assert(Opcode == Instruction::Add);
|
|
O << ", inc: ";
|
|
}
|
|
getOperand(1)->printAsOperand(O, SlotTracker);
|
|
|
|
if (VPValue *Mask = getMask()) {
|
|
O << ", mask: ";
|
|
Mask->printAsOperand(O, SlotTracker);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
|
|
AllowReassoc = FMF.allowReassoc();
|
|
NoNaNs = FMF.noNaNs();
|
|
NoInfs = FMF.noInfs();
|
|
NoSignedZeros = FMF.noSignedZeros();
|
|
AllowReciprocal = FMF.allowReciprocal();
|
|
AllowContract = FMF.allowContract();
|
|
ApproxFunc = FMF.approxFunc();
|
|
}
|
|
|
|
VPIRFlags VPIRFlags::getDefaultFlags(unsigned Opcode) {
|
|
switch (Opcode) {
|
|
case Instruction::Add:
|
|
case Instruction::Sub:
|
|
case Instruction::Mul:
|
|
case Instruction::Shl:
|
|
case VPInstruction::CanonicalIVIncrementForPart:
|
|
return WrapFlagsTy(false, false);
|
|
case Instruction::Trunc:
|
|
return TruncFlagsTy(false, false);
|
|
case Instruction::Or:
|
|
return DisjointFlagsTy(false);
|
|
case Instruction::AShr:
|
|
case Instruction::LShr:
|
|
case Instruction::UDiv:
|
|
case Instruction::SDiv:
|
|
return ExactFlagsTy(false);
|
|
case Instruction::GetElementPtr:
|
|
case VPInstruction::PtrAdd:
|
|
case VPInstruction::WidePtrAdd:
|
|
return GEPNoWrapFlags::none();
|
|
case Instruction::ZExt:
|
|
case Instruction::UIToFP:
|
|
return NonNegFlagsTy(false);
|
|
case Instruction::FAdd:
|
|
case Instruction::FSub:
|
|
case Instruction::FMul:
|
|
case Instruction::FDiv:
|
|
case Instruction::FRem:
|
|
case Instruction::FNeg:
|
|
case Instruction::FPExt:
|
|
case Instruction::FPTrunc:
|
|
return FastMathFlags();
|
|
case Instruction::ICmp:
|
|
case Instruction::FCmp:
|
|
case VPInstruction::ComputeReductionResult:
|
|
llvm_unreachable("opcode requires explicit flags");
|
|
default:
|
|
return VPIRFlags();
|
|
}
|
|
}
|
|
|
|
#if !defined(NDEBUG)
|
|
bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
|
|
switch (OpType) {
|
|
case OperationType::OverflowingBinOp:
|
|
return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
|
|
Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
|
|
           Opcode == VPInstruction::CanonicalIVIncrementForPart;
|
|
case OperationType::Trunc:
|
|
return Opcode == Instruction::Trunc;
|
|
case OperationType::DisjointOp:
|
|
return Opcode == Instruction::Or;
|
|
case OperationType::PossiblyExactOp:
|
|
return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
|
|
Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
|
|
case OperationType::GEPOp:
|
|
return Opcode == Instruction::GetElementPtr ||
|
|
Opcode == VPInstruction::PtrAdd ||
|
|
Opcode == VPInstruction::WidePtrAdd;
|
|
case OperationType::FPMathOp:
|
|
return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
|
|
Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
|
|
Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
|
|
Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
|
|
Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI ||
|
|
Opcode == Instruction::Select ||
|
|
Opcode == VPInstruction::WideIVStep ||
|
|
Opcode == VPInstruction::ReductionStartVector;
|
|
case OperationType::FCmp:
|
|
return Opcode == Instruction::FCmp;
|
|
case OperationType::NonNegOp:
|
|
return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
|
|
case OperationType::Cmp:
|
|
return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
|
|
case OperationType::ReductionOp:
|
|
return Opcode == VPInstruction::ComputeReductionResult;
|
|
case OperationType::Other:
|
|
return true;
|
|
}
|
|
llvm_unreachable("Unknown OperationType enum");
|
|
}
|
|
|
|
bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const {
|
|
// Handle opcodes without default flags.
|
|
if (Opcode == Instruction::ICmp)
|
|
return OpType == OperationType::Cmp;
|
|
if (Opcode == Instruction::FCmp)
|
|
return OpType == OperationType::FCmp;
|
|
if (Opcode == VPInstruction::ComputeReductionResult)
|
|
return OpType == OperationType::ReductionOp;
|
|
|
|
OperationType Required = getDefaultFlags(Opcode).OpType;
|
|
return Required == OperationType::Other || Required == OpType;
|
|
}
|
|
#endif
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPIRFlags::printFlags(raw_ostream &O) const {
|
|
switch (OpType) {
|
|
case OperationType::Cmp:
|
|
O << " " << CmpInst::getPredicateName(getPredicate());
|
|
break;
|
|
case OperationType::FCmp:
|
|
O << " " << CmpInst::getPredicateName(getPredicate());
|
|
getFastMathFlags().print(O);
|
|
break;
|
|
case OperationType::DisjointOp:
|
|
if (DisjointFlags.IsDisjoint)
|
|
O << " disjoint";
|
|
break;
|
|
case OperationType::PossiblyExactOp:
|
|
if (ExactFlags.IsExact)
|
|
O << " exact";
|
|
break;
|
|
case OperationType::OverflowingBinOp:
|
|
if (WrapFlags.HasNUW)
|
|
O << " nuw";
|
|
if (WrapFlags.HasNSW)
|
|
O << " nsw";
|
|
break;
|
|
case OperationType::Trunc:
|
|
if (TruncFlags.HasNUW)
|
|
O << " nuw";
|
|
if (TruncFlags.HasNSW)
|
|
O << " nsw";
|
|
break;
|
|
case OperationType::FPMathOp:
|
|
getFastMathFlags().print(O);
|
|
break;
|
|
case OperationType::GEPOp: {
|
|
GEPNoWrapFlags Flags = getGEPNoWrapFlags();
|
|
if (Flags.isInBounds())
|
|
O << " inbounds";
|
|
else if (Flags.hasNoUnsignedSignedWrap())
|
|
O << " nusw";
|
|
if (Flags.hasNoUnsignedWrap())
|
|
O << " nuw";
|
|
break;
|
|
}
|
|
case OperationType::NonNegOp:
|
|
if (NonNegFlags.NonNeg)
|
|
O << " nneg";
|
|
break;
|
|
case OperationType::ReductionOp: {
|
|
RecurKind RK = getRecurKind();
|
|
O << " (";
|
|
switch (RK) {
|
|
case RecurKind::AnyOf:
|
|
O << "any-of";
|
|
break;
|
|
case RecurKind::FindLast:
|
|
O << "find-last";
|
|
break;
|
|
case RecurKind::SMax:
|
|
O << "smax";
|
|
break;
|
|
case RecurKind::SMin:
|
|
O << "smin";
|
|
break;
|
|
case RecurKind::UMax:
|
|
O << "umax";
|
|
break;
|
|
case RecurKind::UMin:
|
|
O << "umin";
|
|
break;
|
|
case RecurKind::FMinNum:
|
|
O << "fminnum";
|
|
break;
|
|
case RecurKind::FMaxNum:
|
|
O << "fmaxnum";
|
|
break;
|
|
case RecurKind::FMinimum:
|
|
O << "fminimum";
|
|
break;
|
|
case RecurKind::FMaximum:
|
|
O << "fmaximum";
|
|
break;
|
|
case RecurKind::FMinimumNum:
|
|
O << "fminimumnum";
|
|
break;
|
|
case RecurKind::FMaximumNum:
|
|
O << "fmaximumnum";
|
|
break;
|
|
default:
|
|
O << Instruction::getOpcodeName(RecurrenceDescriptor::getOpcode(RK));
|
|
break;
|
|
}
|
|
if (isReductionInLoop())
|
|
O << ", in-loop";
|
|
if (isReductionOrdered())
|
|
O << ", ordered";
|
|
O << ")";
|
|
getFastMathFlags().print(O);
|
|
break;
|
|
}
|
|
case OperationType::Other:
|
|
break;
|
|
}
|
|
O << " ";
|
|
}
|
|
#endif
|
|
|
|
void VPWidenRecipe::execute(VPTransformState &State) {
|
|
auto &Builder = State.Builder;
|
|
switch (Opcode) {
|
|
case Instruction::Call:
|
|
case Instruction::UncondBr:
|
|
case Instruction::CondBr:
|
|
case Instruction::PHI:
|
|
case Instruction::GetElementPtr:
|
|
llvm_unreachable("This instruction is handled by a different recipe.");
|
|
case Instruction::UDiv:
|
|
case Instruction::SDiv:
|
|
case Instruction::SRem:
|
|
case Instruction::URem:
|
|
case Instruction::Add:
|
|
case Instruction::FAdd:
|
|
case Instruction::Sub:
|
|
case Instruction::FSub:
|
|
case Instruction::FNeg:
|
|
case Instruction::Mul:
|
|
case Instruction::FMul:
|
|
case Instruction::FDiv:
|
|
case Instruction::FRem:
|
|
case Instruction::Shl:
|
|
case Instruction::LShr:
|
|
case Instruction::AShr:
|
|
case Instruction::And:
|
|
case Instruction::Or:
|
|
case Instruction::Xor: {
|
|
// Just widen unops and binops.
|
|
SmallVector<Value *, 2> Ops;
|
|
for (VPValue *VPOp : operands())
|
|
Ops.push_back(State.get(VPOp));
|
|
|
|
Value *V = Builder.CreateNAryOp(Opcode, Ops);
|
|
|
|
if (auto *VecOp = dyn_cast<Instruction>(V)) {
|
|
applyFlags(*VecOp);
|
|
applyMetadata(*VecOp);
|
|
}
|
|
|
|
// Use this vector value for all users of the original instruction.
|
|
State.set(this, V);
|
|
break;
|
|
}
|
|
case Instruction::ExtractValue: {
|
|
assert(getNumOperands() == 2 && "expected single level extractvalue");
|
|
Value *Op = State.get(getOperand(0));
|
|
Value *Extract = Builder.CreateExtractValue(
|
|
Op, cast<VPConstantInt>(getOperand(1))->getZExtValue());
|
|
State.set(this, Extract);
|
|
break;
|
|
}
|
|
case Instruction::Freeze: {
|
|
Value *Op = State.get(getOperand(0));
|
|
Value *Freeze = Builder.CreateFreeze(Op);
|
|
State.set(this, Freeze);
|
|
break;
|
|
}
|
|
case Instruction::ICmp:
|
|
case Instruction::FCmp: {
|
|
// Widen compares. Generate vector compares.
|
|
bool FCmp = Opcode == Instruction::FCmp;
|
|
Value *A = State.get(getOperand(0));
|
|
Value *B = State.get(getOperand(1));
|
|
Value *C = nullptr;
|
|
if (FCmp) {
|
|
C = Builder.CreateFCmp(getPredicate(), A, B);
|
|
} else {
|
|
C = Builder.CreateICmp(getPredicate(), A, B);
|
|
}
|
|
if (auto *I = dyn_cast<Instruction>(C)) {
|
|
applyFlags(*I);
|
|
applyMetadata(*I);
|
|
}
|
|
State.set(this, C);
|
|
break;
|
|
}
|
|
case Instruction::Select: {
|
|
VPValue *CondOp = getOperand(0);
|
|
Value *Cond = State.get(CondOp, vputils::isSingleScalar(CondOp));
|
|
Value *Op0 = State.get(getOperand(1));
|
|
Value *Op1 = State.get(getOperand(2));
|
|
Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
|
|
State.set(this, Sel);
|
|
if (auto *I = dyn_cast<Instruction>(Sel)) {
|
|
if (isa<FPMathOperator>(I))
|
|
applyFlags(*I);
|
|
applyMetadata(*I);
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
// This instruction is not vectorized by simple widening.
|
|
LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
|
|
<< Instruction::getOpcodeName(Opcode));
|
|
llvm_unreachable("Unhandled instruction!");
|
|
} // end of switch.
|
|
|
|
#if !defined(NDEBUG)
|
|
// Verify that VPlan type inference results agree with the type of the
|
|
// generated values.
|
|
assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
|
|
State.get(this)->getType() &&
|
|
"inferred type and type from generated instructions do not match");
|
|
#endif
|
|
}
|
|
|
|
InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
switch (Opcode) {
|
|
case Instruction::UDiv:
|
|
case Instruction::SDiv:
|
|
case Instruction::SRem:
|
|
case Instruction::URem:
|
|
// If the div/rem operation isn't safe to speculate and requires
|
|
// predication, then the only way we can even create a vplan is to insert
|
|
// a select on the second input operand to ensure we use the value of 1
|
|
// for the inactive lanes. The select will be costed separately.
|
|
case Instruction::FNeg:
|
|
case Instruction::Add:
|
|
case Instruction::FAdd:
|
|
case Instruction::Sub:
|
|
case Instruction::FSub:
|
|
case Instruction::Mul:
|
|
case Instruction::FMul:
|
|
case Instruction::FDiv:
|
|
case Instruction::FRem:
|
|
case Instruction::Shl:
|
|
case Instruction::LShr:
|
|
case Instruction::AShr:
|
|
case Instruction::And:
|
|
case Instruction::Or:
|
|
case Instruction::Xor:
|
|
case Instruction::Freeze:
|
|
case Instruction::ExtractValue:
|
|
case Instruction::ICmp:
|
|
case Instruction::FCmp:
|
|
case Instruction::Select:
|
|
return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
|
|
default:
|
|
llvm_unreachable("Unsupported opcode for instruction");
|
|
}
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = " << Instruction::getOpcodeName(Opcode);
|
|
printFlags(O);
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
void VPWidenCastRecipe::execute(VPTransformState &State) {
|
|
auto &Builder = State.Builder;
|
|
  // Vectorize casts.
|
|
assert(State.VF.isVector() && "Not vectorizing?");
|
|
Type *DestTy = VectorType::get(getResultType(), State.VF);
|
|
VPValue *Op = getOperand(0);
|
|
Value *A = State.get(Op);
|
|
Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
|
|
State.set(this, Cast);
|
|
if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
|
|
applyFlags(*CastOp);
|
|
applyMetadata(*CastOp);
|
|
}
|
|
}
|
|
|
|
InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
// TODO: In some cases, VPWidenCastRecipes are created but not considered in
|
|
// the legacy cost model, including truncates/extends when evaluating a
|
|
// reduction in a smaller type.
|
|
if (!getUnderlyingValue())
|
|
return 0;
|
|
return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenCastRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN-CAST ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = " << Instruction::getOpcodeName(Opcode);
|
|
printFlags(O);
|
|
printOperands(O, SlotTracker);
|
|
O << " to " << *getResultType();
|
|
}
|
|
#endif
|
|
|
|
InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenIntOrFpInductionRecipe::printRecipe(
|
|
raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
|
|
O << Indent;
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = WIDEN-INDUCTION";
|
|
printFlags(O);
|
|
printOperands(O, SlotTracker);
|
|
|
|
if (auto *TI = getTruncInst())
|
|
O << " (truncated to " << *TI->getType() << ")";
|
|
}
|
|
#endif
|
|
|
|
bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
|
|
// The step may be defined by a recipe in the preheader (e.g. if it requires
|
|
// SCEV expansion), but for the canonical induction the step is required to be
|
|
// 1, which is represented as live-in.
|
|
return match(getStartValue(), m_ZeroInt()) &&
|
|
match(getStepValue(), m_One()) &&
|
|
getScalarType() == getRegion()->getCanonicalIVType();
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPDerivedIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent;
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = DERIVED-IV ";
|
|
getStartValue()->printAsOperand(O, SlotTracker);
|
|
O << " + ";
|
|
getOperand(1)->printAsOperand(O, SlotTracker);
|
|
O << " * ";
|
|
getStepValue()->printAsOperand(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  State.Builder.setFastMathFlags(getFastMathFlags());

  // Compute scalar induction steps: the scalar IV (operand 0) is the value on
  // which to base the steps, and the step operand gives the size of each step.

  Value *BaseIV = State.get(getOperand(0), VPLane(0));
  Value *Step = State.get(getStepValue(), VPLane(0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.

  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Lane) {
    StartLane = State.Lane->getKnownLane();
    EndLane = StartLane + 1;
  }
  Value *StartIdx0 = getStartIndex() ? State.get(getStartIndex(), true)
                                     : Constant::getNullValue(BaseIVTy);
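  // Each lane's value is computed as BaseIV + (StartIdx0 + Lane) * Step.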
  for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
    // It is okay if the induction variable type cannot hold the lane number;
    // we expect truncation in this case.
    Constant *LaneValue =
        BaseIVTy->isIntegerTy()
            ? ConstantInt::get(BaseIVTy, Lane, /*IsSigned=*/false,
                               /*ImplicitTrunc=*/true)
            : ConstantFP::get(BaseIVTy, Lane);
    Value *StartIdx = Builder.CreateBinOp(AddOp, StartIdx0, LaneValue);
    assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
           "Expected StartIdx to be folded to a constant when VF is not "
           "scalable");
    auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
    auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
    State.set(this, Add, VPLane(Lane));
  }
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPScalarIVStepsRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent;
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = SCALAR-STEPS ";
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
bool VPWidenGEPRecipe::usesFirstLaneOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
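  // The widened GEP only reads lane 0 of operands that are single scalars
  // (uniform across lanes).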
  return vputils::isSingleScalar(Op);
}
|
|
|
|
void VPWidenGEPRecipe::execute(VPTransformState &State) {
|
|
assert(State.VF.isVector() && "not widening");
|
|
// Construct a vector GEP by widening the operands of the scalar GEP as
|
|
// necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
|
|
// results in a vector of pointers when at least one operand of the GEP
|
|
// is vector-typed. Thus, to keep the representation compact, we only use
|
|
// vector-typed operands for loop-varying values.
|
|
|
|
bool AllOperandsAreInvariant = all_of(operands(), [](VPValue *Op) {
|
|
return Op->isDefinedOutsideLoopRegions();
|
|
});
|
|
if (AllOperandsAreInvariant) {
|
|
// If we are vectorizing, but the GEP has only loop-invariant operands,
|
|
// the GEP we build (by only using vector-typed operands for
|
|
// loop-varying values) would be a scalar pointer. Thus, to ensure we
|
|
// produce a vector of pointers, we need to either arbitrarily pick an
|
|
// operand to broadcast, or broadcast a clone of the original GEP.
|
|
// Here, we broadcast a clone of the original.
|
|
|
|
SmallVector<Value *> Ops;
|
|
for (unsigned I = 0, E = getNumOperands(); I != E; I++)
|
|
Ops.push_back(State.get(getOperand(I), VPLane(0)));
|
|
|
|
auto *NewGEP =
|
|
State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops),
|
|
"", getGEPNoWrapFlags());
|
|
Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
|
|
State.set(this, Splat);
|
|
return;
|
|
}
|
|
|
|
// If the GEP has at least one loop-varying operand, we are sure to
|
|
// produce a vector of pointers unless VF is scalar.
|
|
// The pointer operand of the new GEP. If it's loop-invariant, we
|
|
// won't broadcast it.
|
|
auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
|
|
|
|
// Collect all the indices for the new GEP. If any index is
|
|
// loop-invariant, we won't broadcast it.
|
|
SmallVector<Value *, 4> Indices;
|
|
for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
|
|
VPValue *Operand = getOperand(I);
|
|
Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
|
|
}
|
|
|
|
// Create the new GEP. Note that this GEP may be a scalar if VF == 1,
|
|
  // but otherwise it should be a vector.
|
|
auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
|
|
"", getGEPNoWrapFlags());
|
|
assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
|
|
"NewGEP is not a pointer vector");
|
|
State.set(this, NewGEP);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenGEPRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN-GEP ";
|
|
O << (isPointerLoopInvariant() ? "Inv" : "Var");
|
|
for (size_t I = 0; I < getNumOperands() - 1; ++I)
|
|
O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
|
|
|
|
O << " ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = getelementptr";
|
|
printFlags(O);
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
void VPVectorEndPointerRecipe::materializeOffset(unsigned Part) {
|
|
assert(!getOffset() && "Unexpected offset operand");
|
|
VPBuilder Builder(this);
|
|
VPlan &Plan = *getParent()->getPlan();
|
|
VPValue *VFVal = getVFValue();
|
|
VPTypeAnalysis TypeInfo(Plan);
|
|
const DataLayout &DL = Plan.getDataLayout();
|
|
Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(this));
|
|
VPValue *Stride =
|
|
Plan.getConstantInt(IndexTy, getStride(), /*IsSigned=*/true);
|
|
Type *VFTy = TypeInfo.inferScalarType(VFVal);
|
|
VPValue *VF = Builder.createScalarZExtOrTrunc(VFVal, IndexTy, VFTy,
|
|
DebugLoc::getUnknown());
|
|
|
|
// Offset for Part0 = Offset0 = Stride * (VF - 1).
|
|
VPInstruction *VFMinusOne =
|
|
Builder.createSub(VF, Plan.getConstantInt(IndexTy, 1u),
|
|
DebugLoc::getUnknown(), "", {true, true});
|
|
VPInstruction *Offset0 =
|
|
Builder.createOverflowingOp(Instruction::Mul, {VFMinusOne, Stride});
|
|
|
|
// Offset for PartN = Offset0 + Part * Stride * VF.
|
|
VPValue *PartxStride =
|
|
Plan.getConstantInt(IndexTy, Part * getStride(), /*IsSigned=*/true);
|
|
VPValue *Offset = Builder.createAdd(
|
|
Offset0,
|
|
Builder.createOverflowingOp(Instruction::Mul, {PartxStride, VF}));
|
|
addOperand(Offset);
|
|
}
|
|
|
|
void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
|
|
auto &Builder = State.Builder;
|
|
assert(getOffset() && "Expected prior materialization of offset");
|
|
Value *Ptr = State.get(getPointer(), true);
|
|
Value *Offset = State.get(getOffset(), true);
|
|
Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
|
|
getGEPNoWrapFlags());
|
|
State.set(this, ResultPtr, /*IsScalar*/ true);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPVectorEndPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent;
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = vector-end-pointer";
|
|
printFlags(O);
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
void VPVectorPointerRecipe::execute(VPTransformState &State) {
|
|
auto &Builder = State.Builder;
|
|
assert(getOffset() &&
|
|
"Expected prior simplification of recipe without offset");
|
|
Value *Ptr = State.get(getOperand(0), VPLane(0));
|
|
Value *Offset = State.get(getOffset(), true);
|
|
Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
|
|
getGEPNoWrapFlags());
|
|
State.set(this, ResultPtr, /*IsScalar*/ true);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPVectorPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent;
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = vector-pointer";
|
|
printFlags(O);
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  // A blend will be expanded to a select VPInstruction, which will generate a
  // scalar select if only the first lane is used.
  if (vputils::onlyFirstLaneUsed(this))
    VF = ElementCount::getFixed(1);

  Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
  Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
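  // A blend with N incoming values is expanded to N - 1 chained selects.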
  return (getNumIncomingValues() - 1) *
         Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
                                    CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "BLEND ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " =";
|
|
printFlags(O);
|
|
if (getNumIncomingValues() == 1) {
|
|
// Not a User of any mask: not really blending, this is a
|
|
// single-predecessor phi.
|
|
getIncomingValue(0)->printAsOperand(O, SlotTracker);
|
|
} else {
|
|
for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
|
|
if (I != 0)
|
|
O << " ";
|
|
getIncomingValue(I)->printAsOperand(O, SlotTracker);
|
|
if (I == 0 && isNormalized())
|
|
continue;
|
|
O << "/";
|
|
getMask(I)->printAsOperand(O, SlotTracker);
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Reduction being replicated.");
  RecurKind Kind = getRecurrenceKind();
  assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
         "In-loop AnyOf reductions aren't currently supported");
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(getFastMathFlags());
  Value *NewVecOp = State.get(getVecOp());
  if (VPValue *Cond = getCondOp()) {
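    // For conditional reductions, replace masked-off lanes with the
    // reduction's identity value so they do not affect the reduced result.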
    Value *NewCond = State.get(Cond, State.VF.isScalar());
    VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
    Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();

    Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
    if (State.VF.isVector())
      Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);

    Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
    NewVecOp = Select;
  }
|
|
Value *NewRed;
|
|
Value *NextInChain;
|
|
if (isOrdered()) {
|
|
Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
|
|
if (State.VF.isVector())
|
|
NewRed =
|
|
createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
|
|
else
|
|
NewRed = State.Builder.CreateBinOp(
|
|
(Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
|
|
PrevInChain, NewVecOp);
|
|
PrevInChain = NewRed;
|
|
NextInChain = NewRed;
|
|
} else if (isPartialReduction()) {
|
|
assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
|
|
"Unexpected partial reduction kind");
|
|
Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ false);
|
|
NewRed = State.Builder.CreateIntrinsic(
|
|
PrevInChain->getType(),
|
|
Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
|
|
: Intrinsic::vector_partial_reduce_fadd,
|
|
{PrevInChain, NewVecOp}, State.Builder.getFastMathFlags(),
|
|
"partial.reduce");
|
|
PrevInChain = NewRed;
|
|
NextInChain = NewRed;
|
|
} else {
|
|
assert(isInLoop() &&
|
|
"The reduction must either be ordered, partial or in-loop");
|
|
Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
|
|
NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
|
|
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
|
|
NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
|
|
else
|
|
NextInChain = State.Builder.CreateBinOp(
|
|
(Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
|
|
PrevInChain, NewRed);
|
|
}
|
|
State.set(this, NextInChain, /*IsScalar*/ !isPartialReduction());
|
|
}
|
|
|
|
void VPReductionEVLRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Reduction being replicated.");

  auto &Builder = State.Builder;
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
  Builder.setFastMathFlags(getFastMathFlags());

  RecurKind Kind = getRecurrenceKind();
  Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
  Value *VecOp = State.get(getVecOp());
  Value *EVL = State.get(getEVL(), VPLane(0));

  Value *Mask;
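  // The EVL-based reduction always takes a mask; synthesize an all-true mask
  // when the recipe has no condition operand.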
  if (VPValue *CondOp = getCondOp())
    Mask = State.get(CondOp);
  else
    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());

  Value *NewRed;
  if (isOrdered()) {
    NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
  } else {
    NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
      NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
    else
      NewRed = Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed,
          Prev);
  }
  State.set(this, NewRed, /*IsScalar*/ true);
}
|
|
|
|
InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
RecurKind RdxKind = getRecurrenceKind();
|
|
Type *ElementTy = Ctx.Types.inferScalarType(this);
|
|
auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
|
|
unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
|
|
FastMathFlags FMFs = getFastMathFlags();
|
|
std::optional<FastMathFlags> OptionalFMF =
|
|
ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
|
|
|
|
if (isPartialReduction()) {
|
|
InstructionCost CondCost = 0;
|
|
if (isConditional()) {
|
|
CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
|
|
auto *CondTy = cast<VectorType>(
|
|
toVectorTy(Ctx.Types.inferScalarType(getCondOp()), VF));
|
|
CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy,
|
|
CondTy, Pred, Ctx.CostKind);
|
|
}
|
|
return CondCost + Ctx.TTI.getPartialReductionCost(
|
|
Opcode, ElementTy, ElementTy, ElementTy, VF,
|
|
TTI::PR_None, TTI::PR_None, {}, Ctx.CostKind,
|
|
OptionalFMF);
|
|
}
|
|
|
|
// TODO: Support any-of reductions.
|
|
assert(
|
|
(!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
|
|
ForceTargetInstructionCost.getNumOccurrences() > 0) &&
|
|
"Any-of reduction not implemented in VPlan-based cost model currently.");
|
|
|
|
// Note that TTI should model the cost of moving result to the scalar register
|
|
// and the BinOp cost in the getMinMaxReductionCost().
|
|
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
|
|
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
|
|
return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
|
|
}
|
|
|
|
// Note that TTI should model the cost of moving result to the scalar register
|
|
// and the BinOp cost in the getArithmeticReductionCost().
|
|
return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
|
|
Ctx.CostKind);
|
|
}
|
|
|
|
VPExpressionRecipe::VPExpressionRecipe(
|
|
ExpressionTypes ExpressionType,
|
|
ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
|
|
: VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {}, {}),
|
|
ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
|
|
assert(!ExpressionRecipes.empty() && "Nothing to combine?");
|
|
assert(
|
|
none_of(ExpressionRecipes,
|
|
[](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
|
|
"expression cannot contain recipes with side-effects");
|
|
|
|
// Maintain a copy of the expression recipes as a set of users.
|
|
SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
|
|
for (auto *R : ExpressionRecipes)
|
|
ExpressionRecipesAsSetOfUsers.insert(R);
|
|
|
|
// Recipes in the expression, except the last one, must only be used by
|
|
// (other) recipes inside the expression. If there are other users, external
|
|
// to the expression, use a clone of the recipe for external users.
|
|
for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
|
|
if (R != ExpressionRecipes.back() &&
|
|
any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
|
|
return !ExpressionRecipesAsSetOfUsers.contains(U);
|
|
})) {
|
|
// There are users outside of the expression. Clone the recipe and use the
|
|
      // clone for those external users.
|
|
VPSingleDefRecipe *CopyForExtUsers = R->clone();
|
|
R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
|
|
VPUser &U, unsigned) {
|
|
return !ExpressionRecipesAsSetOfUsers.contains(&U);
|
|
});
|
|
CopyForExtUsers->insertBefore(R);
|
|
}
|
|
if (R->getParent())
|
|
R->removeFromParent();
|
|
}
|
|
|
|
// Internalize all external operands to the expression recipes. To do so,
|
|
// create new temporary VPValues for all operands defined by a recipe outside
|
|
// the expression. The original operands are added as operands of the
|
|
// VPExpressionRecipe itself.
|
|
for (auto *R : ExpressionRecipes) {
|
|
for (const auto &[Idx, Op] : enumerate(R->operands())) {
|
|
auto *Def = Op->getDefiningRecipe();
|
|
if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
|
|
continue;
|
|
addOperand(Op);
|
|
LiveInPlaceholders.push_back(new VPSymbolicValue());
|
|
}
|
|
}
|
|
|
|
// Replace each external operand with the first one created for it in
|
|
// LiveInPlaceholders.
|
|
for (auto *R : ExpressionRecipes)
|
|
for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
|
|
R->replaceUsesOfWith(LiveIn, Tmp);
|
|
}
|
|
|
|
void VPExpressionRecipe::decompose() {
|
|
for (auto *R : ExpressionRecipes)
|
|
// Since the list could contain duplicates, make sure the recipe hasn't
|
|
// already been inserted.
|
|
if (!R->getParent())
|
|
R->insertBefore(this);
|
|
|
|
for (const auto &[Idx, Op] : enumerate(operands()))
|
|
LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
|
|
|
|
replaceAllUsesWith(ExpressionRecipes.back());
|
|
ExpressionRecipes.clear();
|
|
}
|
|
|
|
InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
  Type *RedTy = Ctx.Types.inferScalarType(this);
  auto *SrcVecTy = cast<VectorType>(
      toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
  unsigned Opcode = RecurrenceDescriptor::getOpcode(
      cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
  switch (ExpressionType) {
  case ExpressionTypes::ExtendedReduction: {
    unsigned Opcode = RecurrenceDescriptor::getOpcode(
        cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
    auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
    auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());

    if (RedR->isPartialReduction())
      return Ctx.TTI.getPartialReductionCost(
          Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr, RedTy, VF,
          TargetTransformInfo::getPartialReductionExtendKind(ExtR->getOpcode()),
          TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
          RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
                                     : std::nullopt);
    else if (!RedTy->isFloatingPointTy())
      // TTI::getExtendedReductionCost only supports integer types.
      return Ctx.TTI.getExtendedReductionCost(
          Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy, SrcVecTy,
          std::nullopt, Ctx.CostKind);
    else
      return InstructionCost::getInvalid();
  }
  case ExpressionTypes::MulAccReduction:
    return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
                                          Ctx.CostKind);

  case ExpressionTypes::ExtNegatedMulAccReduction:
    assert(Opcode == Instruction::Add && "Unexpected opcode");
    Opcode = Instruction::Sub;
    [[fallthrough]];
  case ExpressionTypes::ExtMulAccReduction: {
    auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
    if (RedR->isPartialReduction()) {
      auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
      auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
      auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
      return Ctx.TTI.getPartialReductionCost(
          Opcode, Ctx.Types.inferScalarType(getOperand(0)),
          Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
          TargetTransformInfo::getPartialReductionExtendKind(
              Ext0R->getOpcode()),
          TargetTransformInfo::getPartialReductionExtendKind(
              Ext1R->getOpcode()),
          Mul->getOpcode(), Ctx.CostKind,
          RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
                                     : std::nullopt);
    }
    return Ctx.TTI.getMulAccReductionCost(
        cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
            Instruction::ZExt,
        Opcode, RedTy, SrcVecTy, Ctx.CostKind);
  }
  }
  llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
}

bool VPExpressionRecipe::mayReadOrWriteMemory() const {
  return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
    return R->mayReadFromMemory() || R->mayWriteToMemory();
  });
}

bool VPExpressionRecipe::mayHaveSideEffects() const {
  assert(
      none_of(ExpressionRecipes,
              [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
      "expression cannot contain recipes with side-effects");
  return false;
}

bool VPExpressionRecipe::isSingleScalar() const {
  // Cannot use vputils::isSingleScalar(), because all external operands
  // of the expression will be live-ins while bundled.
  auto *RR = dyn_cast<VPReductionRecipe>(ExpressionRecipes.back());
  return RR && !RR->isPartialReduction();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "EXPRESSION ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = ";
|
|
auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
|
|
unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
|
|
|
|
switch (ExpressionType) {
|
|
case ExpressionTypes::ExtendedReduction: {
|
|
getOperand(1)->printAsOperand(O, SlotTracker);
|
|
O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
|
|
O << Instruction::getOpcodeName(Opcode) << " (";
|
|
getOperand(0)->printAsOperand(O, SlotTracker);
|
|
Red->printFlags(O);
|
|
|
|
auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
|
|
O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
|
|
<< *Ext0->getResultType();
|
|
if (Red->isConditional()) {
|
|
O << ", ";
|
|
Red->getCondOp()->printAsOperand(O, SlotTracker);
|
|
}
|
|
O << ")";
|
|
break;
|
|
}
|
|
case ExpressionTypes::ExtNegatedMulAccReduction: {
|
|
getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
|
|
O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
|
|
O << Instruction::getOpcodeName(
|
|
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
|
|
<< " (sub (0, mul";
|
|
auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
|
|
Mul->printFlags(O);
|
|
O << "(";
|
|
getOperand(0)->printAsOperand(O, SlotTracker);
|
|
auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
|
|
O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
|
|
<< *Ext0->getResultType() << "), (";
|
|
getOperand(1)->printAsOperand(O, SlotTracker);
|
|
auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
|
|
O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
|
|
<< *Ext1->getResultType() << ")";
|
|
if (Red->isConditional()) {
|
|
O << ", ";
|
|
Red->getCondOp()->printAsOperand(O, SlotTracker);
|
|
}
|
|
O << "))";
|
|
break;
|
|
}
|
|
case ExpressionTypes::MulAccReduction:
|
|
case ExpressionTypes::ExtMulAccReduction: {
|
|
getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
|
|
O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
|
|
O << Instruction::getOpcodeName(
|
|
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
|
|
<< " (";
|
|
O << "mul";
|
|
bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
|
|
auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
|
|
: ExpressionRecipes[0]);
|
|
Mul->printFlags(O);
|
|
if (IsExtended)
|
|
O << "(";
|
|
getOperand(0)->printAsOperand(O, SlotTracker);
|
|
if (IsExtended) {
|
|
auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
|
|
O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
|
|
<< *Ext0->getResultType() << "), (";
|
|
} else {
|
|
O << ", ";
|
|
}
|
|
getOperand(1)->printAsOperand(O, SlotTracker);
|
|
if (IsExtended) {
|
|
auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
|
|
O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
|
|
<< *Ext1->getResultType() << ")";
|
|
}
|
|
if (Red->isConditional()) {
|
|
O << ", ";
|
|
Red->getCondOp()->printAsOperand(O, SlotTracker);
|
|
}
|
|
O << ")";
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
void VPReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
if (isPartialReduction())
|
|
O << Indent << "PARTIAL-REDUCE ";
|
|
else
|
|
O << Indent << "REDUCE ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = ";
|
|
getChainOp()->printAsOperand(O, SlotTracker);
|
|
O << " +";
|
|
printFlags(O);
|
|
O << " reduce."
|
|
<< Instruction::getOpcodeName(
|
|
RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
|
|
<< " (";
|
|
getVecOp()->printAsOperand(O, SlotTracker);
|
|
if (isConditional()) {
|
|
O << ", ";
|
|
getCondOp()->printAsOperand(O, SlotTracker);
|
|
}
|
|
O << ")";
|
|
}
|
|
|
|
void VPReductionEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "REDUCE ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = ";
|
|
getChainOp()->printAsOperand(O, SlotTracker);
|
|
O << " +";
|
|
printFlags(O);
|
|
O << " vp.reduce."
|
|
<< Instruction::getOpcodeName(
|
|
RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
|
|
<< " (";
|
|
getVecOp()->printAsOperand(O, SlotTracker);
|
|
O << ", ";
|
|
getEVL()->printAsOperand(O, SlotTracker);
|
|
if (isConditional()) {
|
|
O << ", ";
|
|
getCondOp()->printAsOperand(O, SlotTracker);
|
|
}
|
|
O << ")";
|
|
}
|
|
|
|
#endif
|
|
|
|
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
/// operands from \p RepRecipe instead of \p Instr's operands.
static void scalarizeInstruction(const Instruction *Instr,
                                 VPReplicateRecipe *RepRecipe,
                                 const VPLane &Lane, VPTransformState &State) {
  assert((!Instr->getType()->isAggregateType() ||
          canVectorizeTy(Instr->getType())) &&
         "Expected vectorizable or non-aggregate type.");

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy) {
    Cloned->setName(Instr->getName() + ".cloned");
    Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe);
    // The operands of the replicate recipe may have been narrowed, resulting
    // in a narrower result type. Update the type of the cloned instruction to
    // the correct type.
    if (ResultTy != Cloned->getType())
      Cloned->mutateType(ResultTy);
  }

  RepRecipe->applyFlags(*Cloned);
  RepRecipe->applyMetadata(*Cloned);

  if (RepRecipe->hasPredicate())
    cast<CmpInst>(Cloned)->setPredicate(RepRecipe->getPredicate());

  if (auto DL = RepRecipe->getDebugLoc())
    State.setDebugLocFrom(DL);

  // Replace the operands of the cloned instruction with their scalar
  // equivalents in the new loop.
  for (const auto &I : enumerate(RepRecipe->operands())) {
    auto InputLane = Lane;
    VPValue *Operand = I.value();
    if (vputils::isSingleScalar(Operand))
      InputLane = VPLane::getFirstLane();
    Cloned->setOperand(I.index(), State.get(Operand, InputLane));
  }

  // Place the cloned scalar in the new loop.
  State.Builder.Insert(Cloned);

  State.set(RepRecipe, Cloned, Lane);

  // If we just cloned a new assumption, add it to the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Cloned))
    State.AC->registerAssumption(II);

  assert(
      (RepRecipe->getRegion() ||
       !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
       all_of(RepRecipe->operands(),
              [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
      "Expected a recipe is either within a region or all of its operands "
      "are defined outside the vectorized region.");
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  Instruction *UI = getUnderlyingInstr();

  if (!State.Lane) {
    assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
                             "must have already been unrolled");
    scalarizeInstruction(UI, this, VPLane(0), State);
    return;
  }

  assert((State.VF.isScalar() || !isSingleScalar()) &&
         "uniform recipe shouldn't be predicated");
  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
  scalarizeInstruction(UI, this, *State.Lane, State);
  // Insert scalar instance packing it into a vector.
  if (State.VF.isVector() && shouldPack()) {
    Value *WideValue =
        State.Lane->isFirstLane()
            ? PoisonValue::get(toVectorizedTy(UI->getType(), State.VF))
            : State.get(this);
    State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
                                                        *State.Lane));
  }
}

bool VPReplicateRecipe::shouldPack() const {
  // Find if the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return !vputils::onlyScalarValuesUsed(PredR);
    return false;
  });
}

/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
/// which the legacy cost model computes a SCEV expression when computing the
/// address cost. Computing SCEVs for VPValues is incomplete and returns
/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
                                        PredicatedScalarEvolution &PSE,
                                        const Loop *L) {
  const SCEV *Addr = vputils::getSCEVExprForVPValue(Ptr, PSE, L);
  if (isa<SCEVCouldNotCompute>(Addr))
    return Addr;

  return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), L) ? Addr : nullptr;
}

/// Returns true if \p V is used as part of the address of another load or
/// store.
static bool isUsedByLoadStoreAddress(const VPUser *V) {
  SmallPtrSet<const VPUser *, 4> Seen;
  SmallVector<const VPUser *> WorkList = {V};

  while (!WorkList.empty()) {
    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
    if (!Cur || !Seen.insert(Cur).second)
      continue;

    auto *Blend = dyn_cast<VPBlendRecipe>(Cur);
    // Skip blends that use V only through a compare by checking if any
    // incoming value was already visited.
    if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()),
                         [&](unsigned I) {
                           return Seen.contains(
                               Blend->getIncomingValue(I)->getDefiningRecipe());
                         }))
      continue;

    for (VPUser *U : Cur->users()) {
      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
        if (InterleaveR->getAddr() == Cur)
          return true;
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
        if (RepR->getOpcode() == Instruction::Load &&
            RepR->getOperand(0) == Cur)
          return true;
        if (RepR->getOpcode() == Instruction::Store &&
            RepR->getOperand(1) == Cur)
          return true;
      }
      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
        if (MemR->getAddr() == Cur && MemR->isConsecutive())
          return true;
      }
    }

    // The legacy cost model only supports scalarized loads/stores with phi
    // addresses, if the phi is directly used as load/store address. Don't
    // traverse further for Blends.
    if (Blend)
      continue;

    append_range(WorkList, Cur->users());
  }
  return false;
}

/// Return true if \p R is a predicated load/store with a loop-invariant
/// address only masked by the header mask.
static bool isPredicatedUniformMemOpAfterTailFolding(const VPReplicateRecipe &R,
                                                     const SCEV *PtrSCEV,
                                                     VPCostContext &Ctx) {
  const VPRegionBlock *ParentRegion = R.getRegion();
  if (!ParentRegion || !ParentRegion->isReplicator() || !PtrSCEV ||
      !Ctx.PSE.getSE()->isLoopInvariant(PtrSCEV, Ctx.L))
    return false;
  auto *BOM =
      cast<VPBranchOnMaskRecipe>(&ParentRegion->getEntryBasicBlock()->front());
  return vputils::isHeaderMask(BOM->getOperand(0), *ParentRegion->getPlan());
}

InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
Instruction *UI = cast<Instruction>(getUnderlyingValue());
|
|
  // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
  // transform; avoid computing its cost multiple times for now.
  Ctx.SkipCostComputation.insert(UI);
|
|
|
|
if (VF.isScalable() && !isSingleScalar())
|
|
return InstructionCost::getInvalid();
|
|
|
|
switch (UI->getOpcode()) {
|
|
case Instruction::Alloca:
|
|
if (VF.isScalable())
|
|
return InstructionCost::getInvalid();
|
|
return Ctx.TTI.getArithmeticInstrCost(
|
|
Instruction::Mul, Ctx.Types.inferScalarType(this), Ctx.CostKind);
|
|
case Instruction::GetElementPtr:
|
|
// We mark this instruction as zero-cost because the cost of GEPs in
|
|
// vectorized code depends on whether the corresponding memory instruction
|
|
// is scalarized or not. Therefore, we handle GEPs with the memory
|
|
// instruction cost.
|
|
return 0;
|
|
case Instruction::Call: {
|
|
auto *CalledFn =
|
|
cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
|
|
|
|
SmallVector<const VPValue *> ArgOps(drop_end(operands()));
|
|
SmallVector<Type *, 4> Tys;
|
|
for (const VPValue *ArgOp : ArgOps)
|
|
Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
|
|
|
|
if (CalledFn->isIntrinsic())
|
|
// Various pseudo-intrinsics with costs of 0 are scalarized instead of
|
|
// vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
|
|
switch (CalledFn->getIntrinsicID()) {
|
|
case Intrinsic::assume:
|
|
case Intrinsic::lifetime_end:
|
|
case Intrinsic::lifetime_start:
|
|
case Intrinsic::sideeffect:
|
|
case Intrinsic::pseudoprobe:
|
|
case Intrinsic::experimental_noalias_scope_decl: {
|
|
assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
|
|
ElementCount::getFixed(1), Ctx) == 0 &&
|
|
"scalarizing intrinsic should be free");
|
|
return InstructionCost(0);
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
Type *ResultTy = Ctx.Types.inferScalarType(this);
|
|
InstructionCost ScalarCallCost =
|
|
Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
|
|
if (isSingleScalar()) {
|
|
if (CalledFn->isIntrinsic())
|
|
ScalarCallCost = std::min(
|
|
ScalarCallCost,
|
|
getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
|
|
ElementCount::getFixed(1), Ctx));
|
|
return ScalarCallCost;
|
|
}
|
|
|
|
return ScalarCallCost * VF.getFixedValue() +
|
|
Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
|
|
}
|
|
case Instruction::Add:
|
|
case Instruction::Sub:
|
|
case Instruction::FAdd:
|
|
case Instruction::FSub:
|
|
case Instruction::Mul:
|
|
case Instruction::FMul:
|
|
case Instruction::FDiv:
|
|
case Instruction::FRem:
|
|
case Instruction::Shl:
|
|
case Instruction::LShr:
|
|
case Instruction::AShr:
|
|
case Instruction::And:
|
|
case Instruction::Or:
|
|
case Instruction::Xor:
|
|
case Instruction::ICmp:
|
|
case Instruction::FCmp:
|
|
return getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1),
|
|
Ctx) *
|
|
(isSingleScalar() ? 1 : VF.getFixedValue());
|
|
case Instruction::SDiv:
|
|
case Instruction::UDiv:
|
|
case Instruction::SRem:
|
|
case Instruction::URem: {
|
|
InstructionCost ScalarCost =
|
|
getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), Ctx);
|
|
if (isSingleScalar())
|
|
return ScalarCost;
|
|
|
|
// If any of the operands is from a different replicate region and has its
|
|
// cost skipped, it may have been forced to scalar. Fall back to legacy cost
|
|
// model to avoid cost mis-match.
|
|
if (any_of(operands(), [&Ctx, VF](VPValue *Op) {
|
|
auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Op);
|
|
if (!PredR)
|
|
return false;
|
|
return Ctx.skipCostComputation(
|
|
dyn_cast_or_null<Instruction>(
|
|
PredR->getOperand(0)->getUnderlyingValue()),
|
|
VF.isVector());
|
|
}))
|
|
break;
|
|
|
|
ScalarCost = ScalarCost * VF.getFixedValue() +
|
|
Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this),
|
|
to_vector(operands()), VF);
|
|
// If the recipe is not predicated (i.e. not in a replicate region), return
|
|
// the scalar cost. Otherwise handle predicated cost.
|
|
if (!getRegion()->isReplicator())
|
|
return ScalarCost;
|
|
|
|
// Account for the phi nodes that we will create.
|
|
ScalarCost += VF.getFixedValue() *
|
|
Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
|
|
// Scale the cost by the probability of executing the predicated blocks.
|
|
// This assumes the predicated block for each vector lane is equally
|
|
// likely.
|
|
ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent());
|
|
return ScalarCost;
|
|
}
|
|
case Instruction::Load:
|
|
case Instruction::Store: {
|
|
bool IsLoad = UI->getOpcode() == Instruction::Load;
|
|
const VPValue *PtrOp = getOperand(!IsLoad);
|
|
const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.PSE, Ctx.L);
|
|
if (isa_and_nonnull<SCEVCouldNotCompute>(PtrSCEV))
|
|
break;
|
|
|
|
Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
|
|
Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
|
|
const Align Alignment = getLoadStoreAlignment(UI);
|
|
unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
|
|
TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
|
|
bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
|
|
bool UsedByLoadStoreAddress =
|
|
!PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
|
|
InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
|
|
UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo,
|
|
UsedByLoadStoreAddress ? UI : nullptr);
|
|
|
|
// Check if this is a predicated load/store with a loop-invariant address
|
|
// only masked by the header mask. If so, return the uniform mem op cost.
|
|
if (isPredicatedUniformMemOpAfterTailFolding(*this, PtrSCEV, Ctx)) {
|
|
InstructionCost UniformCost =
|
|
ScalarMemOpCost +
|
|
Ctx.TTI.getAddressComputationCost(ScalarPtrTy, /*SE=*/nullptr,
|
|
/*Ptr=*/nullptr, Ctx.CostKind);
|
|
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
|
|
if (IsLoad) {
|
|
return UniformCost +
|
|
Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
|
|
VectorTy, VectorTy, {}, Ctx.CostKind);
|
|
}
|
|
|
|
VPValue *StoredVal = getOperand(0);
|
|
if (!StoredVal->isDefinedOutsideLoopRegions())
|
|
UniformCost += Ctx.TTI.getIndexedVectorInstrCostFromEnd(
|
|
Instruction::ExtractElement, VectorTy, Ctx.CostKind, 0);
|
|
return UniformCost;
|
|
}
|
|
|
|
Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
|
|
InstructionCost ScalarCost =
|
|
ScalarMemOpCost +
|
|
Ctx.TTI.getAddressComputationCost(
|
|
PtrTy, UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), PtrSCEV,
|
|
Ctx.CostKind);
|
|
if (isSingleScalar())
|
|
return ScalarCost;
|
|
|
|
SmallVector<const VPValue *> OpsToScalarize;
|
|
Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
|
|
// Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
|
|
// don't assign scalarization overhead in general, if the target prefers
|
|
// vectorized addressing or the loaded value is used as part of an address
|
|
// of another load or store.
|
|
if (!UsedByLoadStoreAddress) {
|
|
bool EfficientVectorLoadStore =
|
|
Ctx.TTI.supportsEfficientVectorElementLoadStore();
|
|
if (!(IsLoad && !PreferVectorizedAddressing) &&
|
|
!(!IsLoad && EfficientVectorLoadStore))
|
|
append_range(OpsToScalarize, operands());
|
|
|
|
if (!EfficientVectorLoadStore)
|
|
ResultTy = Ctx.Types.inferScalarType(this);
|
|
}
|
|
|
|
TTI::VectorInstrContext VIC =
|
|
IsLoad ? TTI::VectorInstrContext::Load : TTI::VectorInstrContext::Store;
|
|
InstructionCost Cost =
|
|
(ScalarCost * VF.getFixedValue()) +
|
|
Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, VIC, true);
|
|
|
|
const VPRegionBlock *ParentRegion = getRegion();
|
|
if (ParentRegion && ParentRegion->isReplicator()) {
|
|
if (!PtrSCEV)
|
|
break;
|
|
Cost /= Ctx.getPredBlockCostDivisor(UI->getParent());
|
|
Cost += Ctx.TTI.getCFInstrCost(Instruction::CondBr, Ctx.CostKind);
|
|
|
|
auto *VecI1Ty = VectorType::get(
|
|
IntegerType::getInt1Ty(Ctx.L->getHeader()->getContext()), VF);
|
|
Cost += Ctx.TTI.getScalarizationOverhead(
|
|
VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
|
|
/*Insert=*/false, /*Extract=*/true, Ctx.CostKind);
|
|
|
|
if (Ctx.useEmulatedMaskMemRefHack(this, VF)) {
|
|
// Artificially setting to a high enough value to practically disable
|
|
// vectorization with such operations.
|
|
return 3000000;
|
|
}
|
|
}
|
|
return Cost;
|
|
}
|
|
case Instruction::SExt:
|
|
case Instruction::ZExt:
|
|
case Instruction::FPToUI:
|
|
case Instruction::FPToSI:
|
|
case Instruction::FPExt:
|
|
case Instruction::PtrToInt:
|
|
case Instruction::PtrToAddr:
|
|
case Instruction::IntToPtr:
|
|
case Instruction::SIToFP:
|
|
case Instruction::UIToFP:
|
|
case Instruction::Trunc:
|
|
case Instruction::FPTrunc:
|
|
case Instruction::Select:
|
|
case Instruction::AddrSpaceCast: {
|
|
return getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1),
|
|
Ctx) *
|
|
(isSingleScalar() ? 1 : VF.getFixedValue());
|
|
}
|
|
case Instruction::ExtractValue:
|
|
case Instruction::InsertValue:
|
|
return Ctx.TTI.getInsertExtractValueCost(getOpcode(), Ctx.CostKind);
|
|
}
|
|
|
|
return Ctx.getLegacyCost(UI, VF);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPReplicateRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
|
|
|
|
if (!getUnderlyingInstr()->getType()->isVoidTy()) {
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = ";
|
|
}
|
|
if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
|
|
O << "call";
|
|
printFlags(O);
|
|
O << "@" << CB->getCalledFunction()->getName() << "(";
|
|
interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
|
|
O, [&O, &SlotTracker](VPValue *Op) {
|
|
Op->printAsOperand(O, SlotTracker);
|
|
});
|
|
O << ")";
|
|
} else {
|
|
O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
|
|
printFlags(O);
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
|
|
if (shouldPack())
|
|
O << " (S->V)";
|
|
}
|
|
#endif
|
|
|
|
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
|
|
assert(State.Lane && "Branch on Mask works only on single instance.");
|
|
|
|
VPValue *BlockInMask = getOperand(0);
|
|
Value *ConditionBit = State.get(BlockInMask, *State.Lane);
|
|
|
|
// Replace the temporary unreachable terminator with a new conditional branch,
|
|
// whose two destinations will be set later when they are created.
|
|
auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
|
|
assert(isa<UnreachableInst>(CurrentTerminator) &&
|
|
"Expected to replace unreachable terminator with conditional branch.");
|
|
auto CondBr =
|
|
State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr);
|
|
CondBr->setSuccessor(0, nullptr);
|
|
CurrentTerminator->eraseFromParent();
|
|
}
|
|
|
|
InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
// The legacy cost model doesn't assign costs to branches for individual
|
|
// replicate regions. Match the current behavior in the VPlan cost model for
|
|
// now.
|
|
return 0;
|
|
}
|
|
|
|
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
|
|
assert(State.Lane && "Predicated instruction PHI works per instance.");
|
|
Instruction *ScalarPredInst =
|
|
cast<Instruction>(State.get(getOperand(0), *State.Lane));
|
|
BasicBlock *PredicatedBB = ScalarPredInst->getParent();
|
|
BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
|
|
assert(PredicatingBB && "Predicated block has no single predecessor.");
|
|
assert(isa<VPReplicateRecipe>(getOperand(0)) &&
|
|
"operand must be VPReplicateRecipe");
|
|
|
|
// By current pack/unpack logic we need to generate only a single phi node: if
|
|
// a vector value for the predicated instruction exists at this point it means
|
|
// the instruction has vector users only, and a phi for the vector value is
|
|
// needed. In this case the recipe of the predicated instruction is marked to
|
|
// also do that packing, thereby "hoisting" the insert-element sequence.
|
|
// Otherwise, a phi node for the scalar value is needed.
|
|
if (State.hasVectorValue(getOperand(0))) {
|
|
auto *VecI = cast<Instruction>(State.get(getOperand(0)));
|
|
assert((isa<InsertElementInst, InsertValueInst>(VecI)) &&
|
|
"Packed operands must generate an insertelement or insertvalue");
|
|
|
|
// If VectorI is a struct, it will be a sequence like:
|
|
// %1 = insertvalue %unmodified, %x, 0
|
|
// %2 = insertvalue %1, %y, 1
|
|
// %VectorI = insertvalue %2, %z, 2
|
|
// To get the unmodified vector we need to look through the chain.
|
|
if (auto *StructTy = dyn_cast<StructType>(VecI->getType()))
|
|
for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
|
|
VecI = cast<InsertValueInst>(VecI->getOperand(0));
|
|
|
|
PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2);
|
|
VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector.
|
|
VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element.
|
|
if (State.hasVectorValue(this))
|
|
State.reset(this, VPhi);
|
|
else
|
|
State.set(this, VPhi);
|
|
// NOTE: Currently we need to update the value of the operand, so the next
|
|
// predicated iteration inserts its generated value in the correct vector.
|
|
State.reset(getOperand(0), VPhi);
|
|
} else {
|
|
if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
|
|
return;
|
|
|
|
Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
|
|
PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
|
|
Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
|
|
PredicatingBB);
|
|
Phi->addIncoming(ScalarPredInst, PredicatedBB);
|
|
if (State.hasScalarValue(this, *State.Lane))
|
|
State.reset(this, Phi, *State.Lane);
|
|
else
|
|
State.set(this, Phi, *State.Lane);
|
|
// NOTE: Currently we need to update the value of the operand, so the next
|
|
// predicated iteration inserts its generated value in the correct vector.
|
|
State.reset(getOperand(0), Phi, *State.Lane);
|
|
}
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPPredInstPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "PHI-PREDICATED-INSTRUCTION ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = ";
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
|
|
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
|
|
->getAddressSpace();
|
|
unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
|
|
? Instruction::Load
|
|
: Instruction::Store;
|
|
|
|
if (!Consecutive) {
|
|
// TODO: Using the original IR may not be accurate.
|
|
// Currently, ARM will use the underlying IR to calculate gather/scatter
|
|
// instruction cost.
|
|
[[maybe_unused]] auto IsReverseMask = [this]() {
|
|
VPValue *Mask = getMask();
|
|
if (!Mask)
|
|
return false;
|
|
|
|
if (isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(this))
|
|
return match(Mask, m_Intrinsic<Intrinsic::experimental_vp_reverse>());
|
|
|
|
return match(Mask, m_Reverse(m_VPValue()));
|
|
};
|
|
assert(!IsReverseMask() &&
|
|
"Inconsecutive memory access should not have reverse order");
|
|
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
|
|
Type *PtrTy = Ptr->getType();
|
|
|
|
// If the address value is uniform across all lanes, then the address can be
|
|
// calculated with scalar type and broadcast.
|
|
if (!vputils::isSingleScalar(getAddr()))
|
|
PtrTy = toVectorTy(PtrTy, VF);
|
|
|
|
unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_gather
|
|
: isa<VPWidenStoreRecipe>(this) ? Intrinsic::masked_scatter
|
|
: isa<VPWidenLoadEVLRecipe>(this) ? Intrinsic::vp_gather
|
|
: Intrinsic::vp_scatter;
|
|
return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
|
|
Ctx.CostKind) +
|
|
Ctx.TTI.getMemIntrinsicInstrCost(
|
|
MemIntrinsicCostAttributes(IID, Ty, Ptr, IsMasked, Alignment,
|
|
&Ingredient),
|
|
Ctx.CostKind);
|
|
}
|
|
|
|
InstructionCost Cost = 0;
|
|
if (IsMasked) {
|
|
unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_load
|
|
: Intrinsic::masked_store;
|
|
Cost += Ctx.TTI.getMemIntrinsicInstrCost(
|
|
MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), Ctx.CostKind);
|
|
} else {
|
|
TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
|
|
isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) ? getOperand(0)
|
|
: getOperand(1));
|
|
Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
|
|
OpInfo, &Ingredient);
|
|
}
|
|
return Cost;
|
|
}
|
|
|
|
void VPWidenLoadRecipe::execute(VPTransformState &State) {
|
|
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
|
|
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
|
|
bool CreateGather = !isConsecutive();
|
|
|
|
auto &Builder = State.Builder;
|
|
Value *Mask = nullptr;
|
|
if (auto *VPMask = getMask())
|
|
Mask = State.get(VPMask);
|
|
|
|
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
|
|
Value *NewLI;
|
|
if (CreateGather) {
|
|
NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
|
|
"wide.masked.gather");
|
|
} else if (Mask) {
|
|
NewLI =
|
|
Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
|
|
PoisonValue::get(DataTy), "wide.masked.load");
|
|
} else {
|
|
NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
|
|
}
|
|
applyMetadata(*cast<Instruction>(NewLI));
|
|
State.set(this, NewLI);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = load ";
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
|
|
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
|
|
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
|
|
bool CreateGather = !isConsecutive();
|
|
|
|
auto &Builder = State.Builder;
|
|
CallInst *NewLI;
|
|
Value *EVL = State.get(getEVL(), VPLane(0));
|
|
Value *Addr = State.get(getAddr(), !CreateGather);
|
|
Value *Mask = nullptr;
|
|
if (VPValue *VPMask = getMask())
|
|
Mask = State.get(VPMask);
|
|
else
|
|
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
|
|
|
|
if (CreateGather) {
|
|
NewLI =
|
|
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
|
|
nullptr, "wide.masked.gather");
|
|
} else {
|
|
NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
|
|
{Addr, Mask, EVL}, nullptr, "vp.op.load");
|
|
}
|
|
NewLI->addParamAttr(
|
|
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
|
|
applyMetadata(*NewLI);
|
|
Instruction *Res = NewLI;
|
|
State.set(this, Res);
|
|
}
|
|
|
|
InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes use EVL to replace the tail mask, while the
  // legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once we
  // no longer need to compare against the legacy cost model.
  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
                    ->getAddressSpace();
  return Ctx.TTI.getMemIntrinsicInstrCost(
      MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
      Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = vp.load ";
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
void VPWidenStoreRecipe::execute(VPTransformState &State) {
|
|
VPValue *StoredVPValue = getStoredValue();
|
|
bool CreateScatter = !isConsecutive();
|
|
|
|
auto &Builder = State.Builder;
|
|
|
|
Value *Mask = nullptr;
|
|
if (auto *VPMask = getMask())
|
|
Mask = State.get(VPMask);
|
|
|
|
Value *StoredVal = State.get(StoredVPValue);
|
|
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
|
|
Instruction *NewSI = nullptr;
|
|
if (CreateScatter)
|
|
NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
|
|
else if (Mask)
|
|
NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
|
|
else
|
|
NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
|
|
applyMetadata(*NewSI);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenStoreRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN store ";
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
|
|
VPValue *StoredValue = getStoredValue();
|
|
bool CreateScatter = !isConsecutive();
|
|
|
|
auto &Builder = State.Builder;
|
|
|
|
CallInst *NewSI = nullptr;
|
|
Value *StoredVal = State.get(StoredValue);
|
|
Value *EVL = State.get(getEVL(), VPLane(0));
|
|
Value *Mask = nullptr;
|
|
if (VPValue *VPMask = getMask())
|
|
Mask = State.get(VPMask);
|
|
else
|
|
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
|
|
|
|
Value *Addr = State.get(getAddr(), !CreateScatter);
|
|
if (CreateScatter) {
|
|
NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
|
|
Intrinsic::vp_scatter,
|
|
{StoredVal, Addr, Mask, EVL});
|
|
} else {
|
|
NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
|
|
Intrinsic::vp_store,
|
|
{StoredVal, Addr, Mask, EVL});
|
|
}
|
|
NewSI->addParamAttr(
|
|
1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
|
|
applyMetadata(*NewSI);
|
|
}
|
|
|
|
InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
                                                   VPCostContext &Ctx) const {
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes use EVL to replace the tail mask, while the
  // legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once we
  // no longer need to compare against the legacy cost model.
  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
                    ->getAddressSpace();
  return Ctx.TTI.getMemIntrinsicInstrCost(
      MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
      Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenStoreEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "WIDEN vp.store ";
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
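
/// Cast \p V, which must be a vector with the same number of elements and the
/// same element bit width as \p DstVTy, to \p DstVTy element-wise. If the
/// element types are not directly castable, cast through an intermediate
/// integer vector of the same element width.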
static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
                                     VectorType *DstVTy, const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  auto VF = DstVTy->getElementCount();
  auto *SrcVecTy = cast<VectorType>(V->getType());
  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstVTy);
  }
  // V cannot be directly casted to desired vector type.
  // May happen when V is a floating point vector but DstVTy is a vector of
  // pointers or vice-versa. Handle this using a two-step bitcast using an
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  auto *VecIntTy = VectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}

/// Return a vector containing interleaved elements from multiple
|
|
/// smaller input vectors.
|
|
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
|
|
const Twine &Name) {
|
|
unsigned Factor = Vals.size();
|
|
assert(Factor > 1 && "Tried to interleave invalid number of vectors");
|
|
|
|
VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
|
|
#ifndef NDEBUG
|
|
for (Value *Val : Vals)
|
|
assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
|
|
#endif
|
|
|
|
// Scalable vectors cannot use arbitrary shufflevectors (only splats), so
|
|
// must use intrinsics to interleave.
|
|
if (VecTy->isScalableTy()) {
|
|
assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
|
|
return Builder.CreateVectorInterleave(Vals, Name);
|
|
}
|
|
|
|
// Fixed length. Start by concatenating all vectors into a wide vector.
|
|
Value *WideVec = concatenateVectors(Builder, Vals);
|
|
|
|
// Interleave the elements into the wide vector.
|
|
const unsigned NumElts = VecTy->getElementCount().getFixedValue();
|
|
return Builder.CreateShuffleVector(
|
|
WideVec, createInterleaveMask(NumElts, Factor), Name);
|
|
}
|
|
|
|
// Try to vectorize the interleave group that \p Instr belongs to.
|
|
//
|
|
// E.g. Translate following interleaved load group (factor = 3):
|
|
// for (i = 0; i < N; i+=3) {
|
|
// R = Pic[i]; // Member of index 0
|
|
// G = Pic[i+1]; // Member of index 1
|
|
// B = Pic[i+2]; // Member of index 2
|
|
// ... // do something to R, G, B
|
|
// }
|
|
// To:
|
|
// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
|
|
// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
|
|
// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
|
|
// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
|
|
//
|
|
// Or translate following interleaved store group (factor = 3):
|
|
// for (i = 0; i < N; i+=3) {
|
|
// ... do something to R, G, B
|
|
// Pic[i] = R; // Member of index 0
|
|
// Pic[i+1] = G; // Member of index 1
|
|
// Pic[i+2] = B; // Member of index 2
|
|
// }
|
|
// To:
|
|
// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
|
|
// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
|
|
// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
|
|
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
|
|
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
|
|
void VPInterleaveRecipe::execute(VPTransformState &State) {
|
|
assert(!State.Lane && "Interleave group being replicated.");
|
|
assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
|
|
"Masking gaps for scalable vectors is not yet supported.");
|
|
const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
|
|
Instruction *Instr = Group->getInsertPos();
|
|
|
|
// Prepare for the vector type of the interleaved load/store.
|
|
Type *ScalarTy = getLoadStoreType(Instr);
|
|
unsigned InterleaveFactor = Group->getFactor();
|
|
auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
|
|
|
|
VPValue *BlockInMask = getMask();
|
|
VPValue *Addr = getAddr();
|
|
Value *ResAddr = State.get(Addr, VPLane(0));
|
|
|
|
auto CreateGroupMask = [&BlockInMask, &State,
|
|
&InterleaveFactor](Value *MaskForGaps) -> Value * {
|
|
if (State.VF.isScalable()) {
|
|
assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
|
|
assert(InterleaveFactor <= 8 &&
|
|
"Unsupported deinterleave factor for scalable vectors");
|
|
auto *ResBlockInMask = State.get(BlockInMask);
|
|
SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
|
|
return interleaveVectors(State.Builder, Ops, "interleaved.mask");
|
|
}
|
|
|
|
if (!BlockInMask)
|
|
return MaskForGaps;
|
|
|
|
Value *ResBlockInMask = State.get(BlockInMask);
|
|
Value *ShuffledMask = State.Builder.CreateShuffleVector(
|
|
ResBlockInMask,
|
|
createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
|
|
"interleaved.mask");
|
|
return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
|
|
ShuffledMask, MaskForGaps)
|
|
: ShuffledMask;
|
|
};
|
|
|
|
const DataLayout &DL = Instr->getDataLayout();
|
|
// Vectorize the interleaved load group.
|
|
if (isa<LoadInst>(Instr)) {
|
|
Value *MaskForGaps = nullptr;
|
|
if (needsMaskForGaps()) {
|
|
MaskForGaps =
|
|
createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
|
|
assert(MaskForGaps && "Mask for Gaps is required but it is null");
|
|
}
|
|
|
|
Instruction *NewLoad;
|
|
if (BlockInMask || MaskForGaps) {
|
|
Value *GroupMask = CreateGroupMask(MaskForGaps);
|
|
Value *PoisonVec = PoisonValue::get(VecTy);
|
|
NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
|
|
Group->getAlign(), GroupMask,
|
|
PoisonVec, "wide.masked.vec");
|
|
} else
|
|
NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
|
|
Group->getAlign(), "wide.vec");
|
|
applyMetadata(*NewLoad);
|
|
// TODO: Also manage existing metadata using VPIRMetadata.
|
|
Group->addMetadata(NewLoad);
|
|
|
|
ArrayRef<VPRecipeValue *> VPDefs = definedValues();
|
|
if (VecTy->isScalableTy()) {
|
|
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
|
|
// so must use intrinsics to deinterleave.
|
|
assert(InterleaveFactor <= 8 &&
|
|
"Unsupported deinterleave factor for scalable vectors");
|
|
NewLoad = State.Builder.CreateIntrinsic(
|
|
Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
|
|
NewLoad->getType(), NewLoad,
|
|
/*FMFSource=*/nullptr, "strided.vec");
|
|
}
|
|
|
|
auto CreateStridedVector = [&InterleaveFactor, &State,
|
|
&NewLoad](unsigned Index) -> Value * {
|
|
assert(Index < InterleaveFactor && "Illegal group index");
|
|
if (State.VF.isScalable())
|
|
return State.Builder.CreateExtractValue(NewLoad, Index);
|
|
|
|
// For fixed length VF, use shuffle to extract the sub-vectors from the
|
|
// wide load.
|
|
auto StrideMask =
|
|
createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
|
|
return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
|
|
"strided.vec");
|
|
};
|
|
|
|
for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
|
|
Instruction *Member = Group->getMember(I);
|
|
|
|
// Skip the gaps in the group.
|
|
if (!Member)
|
|
continue;
|
|
|
|
Value *StridedVec = CreateStridedVector(I);
|
|
|
|
// If this member has different type, cast the result type.
|
|
if (Member->getType() != ScalarTy) {
|
|
VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
|
|
StridedVec =
|
|
createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
|
|
}
|
|
|
|
if (Group->isReverse())
|
|
StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
|
|
|
|
State.set(VPDefs[J], StridedVec);
|
|
++J;
|
|
}
|
|
return;
|
|
}
|
|
|
|
// The sub vector type for current instruction.
|
|
auto *SubVT = VectorType::get(ScalarTy, State.VF);
|
|
|
|
// Vectorize the interleaved store group.
|
|
Value *MaskForGaps =
|
|
createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
|
|
assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
|
|
"Mismatch between NeedsMaskForGaps and MaskForGaps");
|
|
ArrayRef<VPValue *> StoredValues = getStoredValues();
|
|
// Collect the stored vector from each member.
|
|
SmallVector<Value *, 4> StoredVecs;
|
|
unsigned StoredIdx = 0;
|
|
for (unsigned i = 0; i < InterleaveFactor; i++) {
|
|
assert((Group->getMember(i) || MaskForGaps) &&
|
|
"Fail to get a member from an interleaved store group");
|
|
Instruction *Member = Group->getMember(i);
|
|
|
|
// Skip the gaps in the group.
|
|
if (!Member) {
|
|
Value *Undef = PoisonValue::get(SubVT);
|
|
StoredVecs.push_back(Undef);
|
|
continue;
|
|
}
|
|
|
|
Value *StoredVec = State.get(StoredValues[StoredIdx]);
|
|
++StoredIdx;
|
|
|
|
if (Group->isReverse())
|
|
StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
|
|
|
|
// If this member has different type, cast it to a unified type.
|
|
|
|
if (StoredVec->getType() != SubVT)
|
|
StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
|
|
|
|
StoredVecs.push_back(StoredVec);
|
|
}
|
|
|
|
// Interleave all the smaller vectors into one wider vector.
|
|
Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
|
|
Instruction *NewStoreInstr;
|
|
if (BlockInMask || MaskForGaps) {
|
|
Value *GroupMask = CreateGroupMask(MaskForGaps);
|
|
NewStoreInstr = State.Builder.CreateMaskedStore(
|
|
IVec, ResAddr, Group->getAlign(), GroupMask);
|
|
} else
|
|
NewStoreInstr =
|
|
State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
|
|
|
|
applyMetadata(*NewStoreInstr);
|
|
// TODO: Also manage existing metadata using VPIRMetadata.
|
|
Group->addMetadata(NewStoreInstr);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPInterleaveRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
|
|
O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
|
|
IG->getInsertPos()->printAsOperand(O, false);
|
|
O << ", ";
|
|
getAddr()->printAsOperand(O, SlotTracker);
|
|
VPValue *Mask = getMask();
|
|
if (Mask) {
|
|
O << ", ";
|
|
Mask->printAsOperand(O, SlotTracker);
|
|
}
|
|
|
|
unsigned OpIdx = 0;
|
|
for (unsigned i = 0; i < IG->getFactor(); ++i) {
|
|
if (!IG->getMember(i))
|
|
continue;
|
|
if (getNumStoreOperands() > 0) {
|
|
O << "\n" << Indent << " store ";
|
|
getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
|
|
O << " to index " << i;
|
|
} else {
|
|
O << "\n" << Indent << " ";
|
|
getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
|
|
O << " = load from index " << i;
|
|
}
|
|
++OpIdx;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
|
|
assert(!State.Lane && "Interleave group being replicated.");
|
|
assert(State.VF.isScalable() &&
|
|
"Only support scalable VF for EVL tail-folding.");
|
|
assert(!needsMaskForGaps() &&
|
|
"Masking gaps for scalable vectors is not yet supported.");
|
|
const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
|
|
Instruction *Instr = Group->getInsertPos();
|
|
|
|
// Prepare for the vector type of the interleaved load/store.
|
|
Type *ScalarTy = getLoadStoreType(Instr);
|
|
unsigned InterleaveFactor = Group->getFactor();
|
|
assert(InterleaveFactor <= 8 &&
|
|
"Unsupported deinterleave/interleave factor for scalable vectors");
|
|
ElementCount WideVF = State.VF * InterleaveFactor;
|
|
auto *VecTy = VectorType::get(ScalarTy, WideVF);
|
|
|
|
VPValue *Addr = getAddr();
|
|
Value *ResAddr = State.get(Addr, VPLane(0));
|
|
Value *EVL = State.get(getEVL(), VPLane(0));
|
|
Value *InterleaveEVL = State.Builder.CreateMul(
|
|
EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
|
|
/* NUW= */ true, /* NSW= */ true);
|
|
LLVMContext &Ctx = State.Builder.getContext();
|
|
|
|
Value *GroupMask = nullptr;
|
|
if (VPValue *BlockInMask = getMask()) {
|
|
SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
|
|
GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
|
|
} else {
|
|
GroupMask =
|
|
State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
|
|
}
|
|
|
|
// Vectorize the interleaved load group.
|
|
if (isa<LoadInst>(Instr)) {
|
|
CallInst *NewLoad = State.Builder.CreateIntrinsic(
|
|
VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
|
|
"wide.vp.load");
|
|
NewLoad->addParamAttr(0,
|
|
Attribute::getWithAlignment(Ctx, Group->getAlign()));
|
|
|
|
applyMetadata(*NewLoad);
|
|
// TODO: Also manage existing metadata using VPIRMetadata.
|
|
Group->addMetadata(NewLoad);
|
|
|
|
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
|
|
// so must use intrinsics to deinterleave.
|
|
NewLoad = State.Builder.CreateIntrinsic(
|
|
Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
|
|
NewLoad->getType(), NewLoad,
|
|
/*FMFSource=*/nullptr, "strided.vec");
|
|
|
|
const DataLayout &DL = Instr->getDataLayout();
|
|
for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
|
|
Instruction *Member = Group->getMember(I);
|
|
// Skip the gaps in the group.
|
|
if (!Member)
|
|
continue;
|
|
|
|
Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
|
|
// If this member has different type, cast the result type.
|
|
if (Member->getType() != ScalarTy) {
|
|
VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
|
|
StridedVec =
|
|
createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
|
|
}
|
|
|
|
State.set(getVPValue(J), StridedVec);
|
|
++J;
|
|
}
|
|
return;
|
|
} // End for interleaved load.
|
|
|
|
// The sub vector type for current instruction.
|
|
auto *SubVT = VectorType::get(ScalarTy, State.VF);
|
|
// Vectorize the interleaved store group.
|
|
ArrayRef<VPValue *> StoredValues = getStoredValues();
|
|
// Collect the stored vector from each member.
|
|
SmallVector<Value *, 4> StoredVecs;
|
|
const DataLayout &DL = Instr->getDataLayout();
|
|
for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
|
|
Instruction *Member = Group->getMember(I);
|
|
// Skip the gaps in the group.
|
|
if (!Member) {
|
|
StoredVecs.push_back(PoisonValue::get(SubVT));
|
|
continue;
|
|
}
|
|
|
|
Value *StoredVec = State.get(StoredValues[StoredIdx]);
|
|
// If this member has different type, cast it to a unified type.
|
|
if (StoredVec->getType() != SubVT)
|
|
StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
|
|
|
|
StoredVecs.push_back(StoredVec);
|
|
++StoredIdx;
|
|
}
|
|
|
|
// Interleave all the smaller vectors into one wider vector.
|
|
Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
|
|
CallInst *NewStore =
|
|
State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
|
|
{IVec, ResAddr, GroupMask, InterleaveEVL});
|
|
NewStore->addParamAttr(1,
|
|
Attribute::getWithAlignment(Ctx, Group->getAlign()));
|
|
|
|
applyMetadata(*NewStore);
|
|
// TODO: Also manage existing metadata using VPIRMetadata.
|
|
Group->addMetadata(NewStore);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPInterleaveEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
|
|
O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
|
|
IG->getInsertPos()->printAsOperand(O, false);
|
|
O << ", ";
|
|
getAddr()->printAsOperand(O, SlotTracker);
|
|
O << ", ";
|
|
getEVL()->printAsOperand(O, SlotTracker);
|
|
if (VPValue *Mask = getMask()) {
|
|
O << ", ";
|
|
Mask->printAsOperand(O, SlotTracker);
|
|
}
|
|
|
|
unsigned OpIdx = 0;
|
|
for (unsigned i = 0; i < IG->getFactor(); ++i) {
|
|
if (!IG->getMember(i))
|
|
continue;
|
|
if (getNumStoreOperands() > 0) {
|
|
O << "\n" << Indent << " vp.store ";
|
|
getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker);
|
|
O << " to index " << i;
|
|
} else {
|
|
O << "\n" << Indent << " ";
|
|
getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
|
|
O << " = vp.load from index " << i;
|
|
}
|
|
++OpIdx;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
|
|
VPCostContext &Ctx) const {
|
|
Instruction *InsertPos = getInsertPos();
|
|
// Find the VPValue index of the interleave group. We need to skip gaps.
|
|
unsigned InsertPosIdx = 0;
|
|
for (unsigned Idx = 0; IG->getFactor(); ++Idx)
|
|
if (auto *Member = IG->getMember(Idx)) {
|
|
if (Member == InsertPos)
|
|
break;
|
|
InsertPosIdx++;
|
|
}
|
|
Type *ValTy = Ctx.Types.inferScalarType(
|
|
getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
|
|
: getStoredValues()[InsertPosIdx]);
|
|
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
|
|
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
|
|
->getAddressSpace();
|
|
|
|
unsigned InterleaveFactor = IG->getFactor();
|
|
auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
|
|
|
|
// Holds the indices of existing members in the interleaved group.
|
|
SmallVector<unsigned, 4> Indices;
|
|
for (unsigned IF = 0; IF < InterleaveFactor; IF++)
|
|
if (IG->getMember(IF))
|
|
Indices.push_back(IF);
|
|
|
|
// Calculate the cost of the whole interleaved group.
|
|
InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
|
|
InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
|
|
IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
|
|
|
|
if (!IG->isReverse())
|
|
return Cost;
|
|
|
|
return Cost + IG->getNumMembers() *
|
|
Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
|
|
VectorTy, VectorTy, {}, Ctx.CostKind,
|
|
0);
|
|
}
|
|
|
|
bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
|
|
return vputils::onlyScalarValuesUsed(this) &&
|
|
(!IsScalable || vputils::onlyFirstLaneUsed(this));
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenPointerInductionRecipe::printRecipe(
|
|
raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
|
|
assert((getNumOperands() == 3 || getNumOperands() == 5) &&
|
|
"unexpected number of operands");
|
|
O << Indent << "EMIT ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = WIDEN-POINTER-INDUCTION ";
|
|
getStartValue()->printAsOperand(O, SlotTracker);
|
|
O << ", ";
|
|
getStepValue()->printAsOperand(O, SlotTracker);
|
|
O << ", ";
|
|
getOperand(2)->printAsOperand(O, SlotTracker);
|
|
if (getNumOperands() == 5) {
|
|
O << ", ";
|
|
getOperand(3)->printAsOperand(O, SlotTracker);
|
|
O << ", ";
|
|
getOperand(4)->printAsOperand(O, SlotTracker);
|
|
}
|
|
}
|
|
|
|
void VPExpandSCEVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "EMIT ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = EXPAND SCEV " << *Expr;
|
|
}
|
|
#endif
|
|
|
|
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
|
|
Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
|
|
Type *STy = CanonicalIV->getType();
|
|
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
|
|
ElementCount VF = State.VF;
|
|
Value *VStart = VF.isScalar()
|
|
? CanonicalIV
|
|
: Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
|
|
Value *VStep = Builder.CreateElementCount(
|
|
STy, VF.multiplyCoefficientBy(getUnrollPart(*this)));
|
|
if (VF.isVector()) {
|
|
VStep = Builder.CreateVectorSplat(VF, VStep);
|
|
VStep =
|
|
Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
|
|
}
|
|
Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
|
|
State.set(this, CanonicalVectorIV);
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void VPWidenCanonicalIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
|
|
VPSlotTracker &SlotTracker) const {
|
|
O << Indent << "EMIT ";
|
|
printAsOperand(O, SlotTracker);
|
|
O << " = WIDEN-CANONICAL-INDUCTION ";
|
|
printOperands(O, SlotTracker);
|
|
}
|
|
#endif
|
|
|
|
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
|
|
auto &Builder = State.Builder;
|
|
// Create a vector from the initial value.
|
|
auto *VectorInit = getStartValue()->getLiveInIRValue();
|
|
|
|
Type *VecTy = State.VF.isScalar()
|
|
? VectorInit->getType()
|
|
: VectorType::get(VectorInit->getType(), State.VF);
|
|
|
|
BasicBlock *VectorPH =
|
|
State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
|
|
if (State.VF.isVector()) {
|
|
auto *IdxTy = Builder.getInt32Ty();
|
|
auto *One = ConstantInt::get(IdxTy, 1);
|
|
IRBuilder<>::InsertPointGuard Guard(Builder);
|
|
Builder.SetInsertPoint(VectorPH->getTerminator());
|
|
auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
|
|
auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
|
|
VectorInit = Builder.CreateInsertElement(
|
|
PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
|
|
}
|
|
|
|
// Create a phi node for the new recurrence.
|
|
PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
|
|
Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
|
|
Phi->addIncoming(VectorInit, VectorPH);
|
|
State.set(this, Phi);
|
|
}
|
|
|
|
InstructionCost
VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
  if (VF.isScalar())
    return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);

  return 0;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPReductionPHIRecipe::execute(VPTransformState &State) {
  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
  bool ScalarPHI = State.VF.isScalar() || isInLoop();
  Value *StartV = State.get(StartVPV, ScalarPHI);
  Type *VecTy = StartV->getType();

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
  Phi->insertBefore(HeaderBB->getFirstInsertionPt());
  State.set(this, Phi, isInLoop());

  Phi->addIncoming(StartV, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi";
  printFlags(O);
  printOperands(O, SlotTracker);
  if (getVFScaleFactor() > 1)
    O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
}
#endif

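// Only the first lane of an operand is demanded when the blend itself is only
// used for its first lane.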
bool VPBlendRecipe::usesFirstLaneOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) &&
         "Op must be an operand of the recipe");
  return vputils::onlyFirstLaneUsed(this);
}

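// Widen a phi by creating an empty vector phi of the first operand's type; the
// incoming values are expected to be filled in later, once the predecessor
// blocks have been emitted.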
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  Value *Op0 = State.get(getOperand(0));
  Type *VecTy = Op0->getType();
  Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
  State.set(this, VecPhi);
}

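// Cost the widened phi as a single PHI control-flow instruction.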
InstructionCost VPWidenPHIRecipe::computeCost(ElementCount VF,
                                              VPCostContext &Ctx) const {
  return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printPhiOperands(O, SlotTracker);
}
#endif

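// Create the header phi for the active lane mask, seeded with the mask
// computed in the preheader; the value coming from the loop latch is filled in
// later.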
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
  Value *StartMask = State.get(getOperand(0));
  PHINode *Phi =
      State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
  Phi->addIncoming(StartMask, VectorPH);
  State.set(this, Phi);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCurrentIterationPHIRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  O << Indent << "CURRENT-ITERATION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif
