Files
llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Benjamin Maxwell f22a178b13 Reland "[LV] Support conditional scalar assignments of masked operations" (#180708)
This patch extends the support added in #158088 to loops where the
assignment is non-speculatable (e.g. a conditional load or divide).

For example, the following loop can now be vectorized:

```
int simple_csa_int_load(
  int* a, int* b, int default_val, int N, int threshold)
{
  int result = default_val;
  for (int i = 0; i < N; ++i)
    if (a[i] > threshold)
      result = b[i];
  return result;
}
```

It does this by extending the recurrence matching from only looking for
selects to also matching phis whose incoming values are all the header
phi, except for one, which may be an arbitrary value from outside the
recurrence, as sketched below.
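
For illustration, the loop above lowers to IR roughly like the following
(a hand-written sketch; block and value names are hypothetical). The latch
phi `%res.next` is the newly matched pattern: all of its incoming values
are the header phi `%res`, except the one conditionally loaded `%b.val`:

```
loop:                                      ; header
  %res = phi i32 [ %default_val, %entry ], [ %res.next, %latch ]
  ...
  br i1 %cmp, label %if.then, label %latch

if.then:
  %b.val = load i32, ptr %b.gep
  br label %latch

latch:
  %res.next = phi i32 [ %res, %loop ], [ %b.val, %if.then ]
  ...
```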

---

Reverts llvm/llvm-project#180275 (original PR: #178862)

Additional type legalization for `ISD::VECTOR_FIND_LAST_ACTIVE` was
added in #180290, which should resolve the backend crashes on x86.
2026-02-10 09:57:48 +00:00

4653 lines
173 KiB
C++

//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//
#include "LoopVectorizationPlanner.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <cassert>
using namespace llvm;
using namespace llvm::VPlanPatternMatch;
using VectorParts = SmallVector<Value *, 2>;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
bool VPRecipeBase::mayWriteToMemory() const {
switch (getVPRecipeID()) {
case VPExpressionSC:
return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
case VPInstructionSC: {
auto *VPI = cast<VPInstruction>(this);
// Loads read from memory but don't write to memory.
if (VPI->getOpcode() == Instruction::Load)
return false;
return VPI->opcodeMayReadOrWriteFromMemory();
}
case VPInterleaveEVLSC:
case VPInterleaveSC:
return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
case VPWidenStoreEVLSC:
case VPWidenStoreSC:
return true;
case VPReplicateSC:
return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
->mayWriteToMemory();
case VPWidenCallSC:
return !cast<VPWidenCallRecipe>(this)
->getCalledScalarFunction()
->onlyReadsMemory();
case VPWidenIntrinsicSC:
return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
case VPActiveLaneMaskPHISC:
case VPCanonicalIVPHISC:
case VPBranchOnMaskSC:
case VPDerivedIVSC:
case VPFirstOrderRecurrencePHISC:
case VPReductionPHISC:
case VPScalarIVStepsSC:
case VPPredInstPHISC:
return false;
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
case VPWidenGEPSC:
case VPWidenIntOrFpInductionSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
case VPWidenPHISC:
case VPWidenPointerInductionSC:
case VPWidenSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
assert((!I || !I->mayWriteToMemory()) &&
"underlying instruction may write to memory");
return false;
}
default:
return true;
}
}
bool VPRecipeBase::mayReadFromMemory() const {
switch (getVPRecipeID()) {
case VPExpressionSC:
return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
case VPInstructionSC:
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
return true;
case VPReplicateSC:
return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
->mayReadFromMemory();
case VPWidenCallSC:
return !cast<VPWidenCallRecipe>(this)
->getCalledScalarFunction()
->onlyWritesMemory();
case VPWidenIntrinsicSC:
return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
case VPBranchOnMaskSC:
case VPDerivedIVSC:
case VPFirstOrderRecurrencePHISC:
case VPReductionPHISC:
case VPPredInstPHISC:
case VPScalarIVStepsSC:
case VPWidenStoreEVLSC:
case VPWidenStoreSC:
return false;
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
case VPWidenGEPSC:
case VPWidenIntOrFpInductionSC:
case VPWidenPHISC:
case VPWidenPointerInductionSC:
case VPWidenSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
assert((!I || !I->mayReadFromMemory()) &&
"underlying instruction may read from memory");
return false;
}
default:
// FIXME: Return false if the recipe represents an interleaved store.
return true;
}
}
bool VPRecipeBase::mayHaveSideEffects() const {
switch (getVPRecipeID()) {
case VPExpressionSC:
return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
case VPActiveLaneMaskPHISC:
case VPDerivedIVSC:
case VPFirstOrderRecurrencePHISC:
case VPReductionPHISC:
case VPPredInstPHISC:
case VPVectorEndPointerSC:
return false;
case VPInstructionSC: {
auto *VPI = cast<VPInstruction>(this);
return mayWriteToMemory() ||
VPI->getOpcode() == VPInstruction::BranchOnCount ||
VPI->getOpcode() == VPInstruction::BranchOnCond ||
VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
}
case VPWidenCallSC: {
Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
}
case VPWidenIntrinsicSC:
return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
case VPScalarIVStepsSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
case VPWidenGEPSC:
case VPWidenIntOrFpInductionSC:
case VPWidenPHISC:
case VPWidenPointerInductionSC:
case VPWidenSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
assert((!I || !I->mayHaveSideEffects()) &&
"underlying instruction has side-effects");
return false;
}
case VPInterleaveEVLSC:
case VPInterleaveSC:
return mayWriteToMemory();
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
case VPWidenStoreEVLSC:
case VPWidenStoreSC:
assert(
cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
mayWriteToMemory() &&
"mayHaveSideffects result for ingredient differs from this "
"implementation");
return mayWriteToMemory();
case VPReplicateSC: {
auto *R = cast<VPReplicateRecipe>(this);
return R->getUnderlyingInstr()->mayHaveSideEffects();
}
default:
return true;
}
}
void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
assert(!Parent && "Recipe already in some VPBasicBlock");
assert(InsertPos->getParent() &&
"Insertion position not in any VPBasicBlock");
InsertPos->getParent()->insert(this, InsertPos->getIterator());
}
void VPRecipeBase::insertBefore(VPBasicBlock &BB,
iplist<VPRecipeBase>::iterator I) {
assert(!Parent && "Recipe already in some VPBasicBlock");
assert(I == BB.end() || I->getParent() == &BB);
BB.insert(this, I);
}
void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
assert(!Parent && "Recipe already in some VPBasicBlock");
assert(InsertPos->getParent() &&
"Insertion position not in any VPBasicBlock");
InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}
void VPRecipeBase::removeFromParent() {
assert(getParent() && "Recipe not in any VPBasicBlock");
getParent()->getRecipeList().remove(getIterator());
Parent = nullptr;
}
iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
assert(getParent() && "Recipe not in any VPBasicBlock");
return getParent()->getRecipeList().erase(getIterator());
}
void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
removeFromParent();
insertAfter(InsertPos);
}
void VPRecipeBase::moveBefore(VPBasicBlock &BB,
iplist<VPRecipeBase>::iterator I) {
removeFromParent();
insertBefore(BB, I);
}
InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
// Get the underlying instruction for the recipe, if there is one. It is used
// to
// * decide if cost computation should be skipped for this recipe,
// * apply forced target instruction cost.
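// For example, passing -force-target-instruction-cost=1 (an existing
// loop-vectorize flag) makes every recipe with an underlying instruction
// report a cost of 1, and recipes without one a cost of 0.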
Instruction *UI = nullptr;
if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
UI = IG->getInsertPos();
else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
UI = &WidenMem->getIngredient();
InstructionCost RecipeCost;
if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
RecipeCost = 0;
} else {
RecipeCost = computeCost(VF, Ctx);
if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
RecipeCost.isValid()) {
if (UI)
RecipeCost = InstructionCost(ForceTargetInstructionCost);
else
RecipeCost = InstructionCost(0);
}
}
LLVM_DEBUG({
dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
dump();
});
return RecipeCost;
}
InstructionCost VPRecipeBase::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
llvm_unreachable("subclasses should implement computeCost");
}
bool VPRecipeBase::isPhi() const {
return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) ||
isa<VPPhi, VPIRPhi>(this);
}
bool VPRecipeBase::isScalarCast() const {
auto *VPI = dyn_cast<VPInstruction>(this);
return VPI && Instruction::isCast(VPI->getOpcode());
}
void VPIRFlags::intersectFlags(const VPIRFlags &Other) {
assert(OpType == Other.OpType && "OpType must match");
switch (OpType) {
case OperationType::OverflowingBinOp:
WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
break;
case OperationType::Trunc:
TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
break;
case OperationType::DisjointOp:
DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
break;
case OperationType::PossiblyExactOp:
ExactFlags.IsExact &= Other.ExactFlags.IsExact;
break;
case OperationType::GEPOp:
GEPFlags &= Other.GEPFlags;
break;
case OperationType::FPMathOp:
case OperationType::FCmp:
assert((OpType != OperationType::FCmp ||
FCmpFlags.Pred == Other.FCmpFlags.Pred) &&
"Cannot drop CmpPredicate");
getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
break;
case OperationType::NonNegOp:
NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
break;
case OperationType::Cmp:
assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate");
break;
case OperationType::ReductionOp:
assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
"Cannot change RecurKind");
assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
"Cannot change IsOrdered");
assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
"Cannot change IsInLoop");
getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
break;
case OperationType::Other:
assert(AllFlags == Other.AllFlags && "Cannot drop other flags");
break;
}
}
FastMathFlags VPIRFlags::getFastMathFlags() const {
assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp ||
OpType == OperationType::ReductionOp) &&
"recipe doesn't have fast math flags");
const FastMathFlagsTy &F = getFMFsRef();
FastMathFlags Res;
Res.setAllowReassoc(F.AllowReassoc);
Res.setNoNaNs(F.NoNaNs);
Res.setNoInfs(F.NoInfs);
Res.setNoSignedZeros(F.NoSignedZeros);
Res.setAllowReciprocal(F.AllowReciprocal);
Res.setAllowContract(F.AllowContract);
Res.setApproxFunc(F.ApproxFunc);
return Res;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPSingleDefRecipe::dump() const { VPRecipeBase::dump(); }
void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
printRecipe(O, Indent, SlotTracker);
if (auto DL = getDebugLoc()) {
O << ", !dbg ";
DL.print(O);
}
if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
Metadata->print(O, SlotTracker);
}
#endif
template <unsigned PartOpIdx>
VPValue *
VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(const VPUser &U) const {
if (U.getNumOperands() == PartOpIdx + 1)
return U.getOperand(PartOpIdx);
return nullptr;
}
template <unsigned PartOpIdx>
unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(const VPUser &U) const {
if (auto *UnrollPartOp = getUnrollPartOperand(U))
return cast<VPConstantInt>(UnrollPartOp)->getZExtValue();
return 0;
}
namespace llvm {
template class VPUnrollPartAccessor<1>;
template class VPUnrollPartAccessor<2>;
template class VPUnrollPartAccessor<3>;
}
VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
const VPIRFlags &Flags, const VPIRMetadata &MD,
DebugLoc DL, const Twine &Name)
: VPRecipeWithIRFlags(VPRecipeBase::VPInstructionSC, Operands, Flags, DL),
VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
assert(flagsValidForOpcode(getOpcode()) &&
"Set flags not supported for the provided opcode");
assert(hasRequiredFlagsForOpcode(getOpcode()) &&
"Opcode requires specific flags to be set");
assert((getNumOperandsForOpcode() == -1u ||
getNumOperandsForOpcode() == getNumOperands()) &&
"number of operands does not match opcode");
}
unsigned VPInstruction::getNumOperandsForOpcode() const {
if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
return 1;
if (Instruction::isBinaryOp(Opcode))
return 2;
switch (Opcode) {
case VPInstruction::StepVector:
case VPInstruction::VScale:
return 0;
case Instruction::Alloca:
case Instruction::ExtractValue:
case Instruction::Freeze:
case Instruction::Load:
case VPInstruction::BranchOnCond:
case VPInstruction::Broadcast:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::ExplicitVectorLength:
case VPInstruction::ExtractLastLane:
case VPInstruction::ExtractLastPart:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::Not:
case VPInstruction::ResumeForEpilogue:
case VPInstruction::Reverse:
case VPInstruction::Unpack:
return 1;
case Instruction::ICmp:
case Instruction::FCmp:
case Instruction::ExtractElement:
case Instruction::Store:
case VPInstruction::BranchOnCount:
case VPInstruction::BranchOnTwoConds:
case VPInstruction::ExitingIVValue:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::PtrAdd:
case VPInstruction::WidePtrAdd:
case VPInstruction::WideIVStep:
return 2;
case Instruction::Select:
case VPInstruction::ActiveLaneMask:
case VPInstruction::ReductionStartVector:
case VPInstruction::ExtractLastActive:
return 3;
case Instruction::Call: {
// For unmasked calls, the last argument will be the called function. Use that
// to compute the number of operands without the mask.
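// Illustrative operand layouts:
//   (%a, %b, @fn)        -> unmasked call, all operands counted;
//   (%a, %b, @fn, %mask) -> masked call, the trailing mask is not counted.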
VPValue *LastOp = getOperand(getNumOperands() - 1);
if (isa<VPIRValue>(LastOp) && isa<Function>(LastOp->getLiveInIRValue()))
return getNumOperands();
return getNumOperands() - 1;
}
case Instruction::GetElementPtr:
case Instruction::PHI:
case Instruction::Switch:
case VPInstruction::AnyOf:
case VPInstruction::BuildStructVector:
case VPInstruction::BuildVector:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ComputeReductionResult:
case VPInstruction::FirstActiveLane:
case VPInstruction::LastActiveLane:
case VPInstruction::SLPLoad:
case VPInstruction::SLPStore:
case VPInstruction::ExtractLane:
// Cannot determine the number of operands from the opcode.
return -1u;
}
llvm_unreachable("all cases should be handled above");
}
bool VPInstruction::doesGeneratePerAllLanes() const {
return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}
bool VPInstruction::canGenerateScalarForFirstLane() const {
if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
return true;
if (isSingleScalar() || isVectorToScalar())
return true;
switch (Opcode) {
case Instruction::Freeze:
case Instruction::ICmp:
case Instruction::PHI:
case Instruction::Select:
case VPInstruction::BranchOnCond:
case VPInstruction::BranchOnTwoConds:
case VPInstruction::BranchOnCount:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::PtrAdd:
case VPInstruction::ExplicitVectorLength:
case VPInstruction::AnyOf:
case VPInstruction::Not:
return true;
default:
return false;
}
}
Value *VPInstruction::generate(VPTransformState &State) {
IRBuilderBase &Builder = State.Builder;
if (Instruction::isBinaryOp(getOpcode())) {
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
auto *Res =
Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
if (auto *I = dyn_cast<Instruction>(Res))
applyFlags(*I);
return Res;
}
switch (getOpcode()) {
case VPInstruction::Not: {
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
return Builder.CreateNot(A, Name);
}
case Instruction::ExtractElement: {
assert(State.VF.isVector() && "Only extract elements from vectors");
if (auto *Idx = dyn_cast<VPConstantInt>(getOperand(1)))
return State.get(getOperand(0), VPLane(Idx->getZExtValue()));
Value *Vec = State.get(getOperand(0));
Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
return Builder.CreateExtractElement(Vec, Idx, Name);
}
case Instruction::Freeze: {
Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this));
return Builder.CreateFreeze(Op, Name);
}
case Instruction::FCmp:
case Instruction::ICmp: {
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
return Builder.CreateCmp(getPredicate(), A, B, Name);
}
case Instruction::PHI: {
llvm_unreachable("should be handled by VPPhi::execute");
}
case Instruction::Select: {
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
Value *Cond =
State.get(getOperand(0),
OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0)));
Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
FastMathFlags FMFs =
hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
return Builder.CreateSelectFMF(Cond, Op1, Op2, FMFs, Name);
}
case VPInstruction::ActiveLaneMask: {
// Get first lane of vector induction variable.
Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
// Get the original loop tripcount.
Value *ScalarTC = State.get(getOperand(1), VPLane(0));
// If this part of the active lane mask is scalar, generate the CMP directly
// to avoid unnecessary extracts.
if (State.VF.isScalar())
return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
Name);
ElementCount EC = State.VF.multiplyCoefficientBy(
cast<VPConstantInt>(getOperand(2))->getZExtValue());
auto *PredTy = VectorType::get(Builder.getInt1Ty(), EC);
return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
{PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, Name);
}
case VPInstruction::FirstOrderRecurrenceSplice: {
// Generate code to combine the previous and current values in vector v3.
//
// vector.ph:
// v_init = vector(..., ..., ..., a[-1])
// br vector.body
//
// vector.body
// i = phi [0, vector.ph], [i+4, vector.body]
// v1 = phi [v_init, vector.ph], [v2, vector.body]
// v2 = a[i, i+1, i+2, i+3];
// v3 = vector(v1(3), v2(0, 1, 2))
auto *V1 = State.get(getOperand(0));
if (!V1->getType()->isVectorTy())
return V1;
Value *V2 = State.get(getOperand(1));
return Builder.CreateVectorSpliceRight(V1, V2, 1, Name);
}
case VPInstruction::CalculateTripCountMinusVF: {
unsigned UF = getParent()->getPlan()->getUF();
Value *ScalarTC = State.get(getOperand(0), VPLane(0));
Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
Value *Sub = Builder.CreateSub(ScalarTC, Step);
Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
Value *Zero = ConstantInt::getNullValue(ScalarTC->getType());
return Builder.CreateSelect(Cmp, Sub, Zero);
}
case VPInstruction::ExplicitVectorLength: {
// TODO: Restructure this code with an explicit remainder loop so that
// vsetvli can be outside of the main loop.
Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
// Compute EVL
assert(AVL->getType()->isIntegerTy() &&
"Requested vector length should be an integer.");
assert(State.VF.isScalable() && "Expected scalable vector factor.");
Value *VFArg = Builder.getInt32(State.VF.getKnownMinValue());
Value *EVL = Builder.CreateIntrinsic(
Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
{AVL, VFArg, Builder.getTrue()});
return EVL;
}
case VPInstruction::CanonicalIVIncrementForPart: {
unsigned Part = getUnrollPart(*this);
auto *IV = State.get(getOperand(0), VPLane(0));
assert(Part != 0 && "Must have a positive part");
// The canonical IV is incremented by the vectorization factor (num of
// SIMD elements) times the unroll part.
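// For example (illustrative): with a fixed VF of 4 and Part == 2, Step
// below is 8; for scalable VFs the step is additionally scaled by vscale.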
Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
hasNoSignedWrap());
}
case VPInstruction::BranchOnCond: {
Value *Cond = State.get(getOperand(0), VPLane(0));
// Replace the temporary unreachable terminator with a new conditional
// branch, hooking it up to backward destination for latch blocks now, and
// to forward destination(s) later when they are created.
// Second successor may be backwards - iff it is already in VPBB2IRBB.
VPBasicBlock *SecondVPSucc =
cast<VPBasicBlock>(getParent()->getSuccessors()[1]);
BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
auto *Br = Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
// First successor is always forward, reset it to nullptr.
Br->setSuccessor(0, nullptr);
IRBB->getTerminator()->eraseFromParent();
applyMetadata(*Br);
return Br;
}
case VPInstruction::Broadcast: {
return Builder.CreateVectorSplat(
State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
}
case VPInstruction::BuildStructVector: {
// For struct types, we need to build a new 'wide' struct type, where each
// element is widened, i.e., we create a struct of vectors.
auto *StructTy =
cast<StructType>(State.TypeAnalysis.inferScalarType(getOperand(0)));
Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
for (const auto &[LaneIndex, Op] : enumerate(operands())) {
for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
FieldIndex++) {
Value *ScalarValue =
Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
VectorValue =
Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
}
}
return Res;
}
case VPInstruction::BuildVector: {
auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
auto NumOfElements = ElementCount::getFixed(getNumOperands());
Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
for (const auto &[Idx, Op] : enumerate(operands()))
Res = Builder.CreateInsertElement(Res, State.get(Op, true),
Builder.getInt32(Idx));
return Res;
}
case VPInstruction::ReductionStartVector: {
if (State.VF.isScalar())
return State.get(getOperand(0), true);
IRBuilderBase::FastMathFlagGuard FMFG(Builder);
Builder.setFastMathFlags(getFastMathFlags());
// If this start vector is scaled then it should produce a vector with fewer
// elements than the VF.
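// For example (illustrative): with VF = 8 and a scale of 4 in operand 2,
// the splat below produces a 2-element start vector.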
ElementCount VF = State.VF.divideCoefficientBy(
cast<VPConstantInt>(getOperand(2))->getZExtValue());
auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
Builder.getInt32(0));
}
case VPInstruction::ComputeAnyOfResult: {
Value *Start = State.get(getOperand(0), VPLane(0));
Value *NewVal = State.get(getOperand(1), VPLane(0));
Value *ReducedResult = State.get(getOperand(2));
for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx)
ReducedResult =
Builder.CreateBinOp(Instruction::Or, State.get(getOperand(Idx)),
ReducedResult, "bin.rdx");
// If any predicate is true it means that we want to select the new value.
if (ReducedResult->getType()->isVectorTy())
ReducedResult = Builder.CreateOrReduce(ReducedResult);
// The compares in the loop may yield poison, which propagates through the
// bitwise ORs. Freeze it here before the condition is used.
ReducedResult = Builder.CreateFreeze(ReducedResult);
return Builder.CreateSelect(ReducedResult, NewVal, Start, "rdx.select");
}
case VPInstruction::ComputeReductionResult: {
RecurKind RK = getRecurKind();
bool IsOrdered = isReductionOrdered();
bool IsInLoop = isReductionInLoop();
assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
"FindIV should use min/max reduction kinds");
// The recipe may have multiple operands to be reduced together.
unsigned NumOperandsToReduce = getNumOperands();
VectorParts RdxParts(NumOperandsToReduce);
for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
RdxParts[Part] = State.get(getOperand(Part), IsInLoop);
IRBuilderBase::FastMathFlagGuard FMFG(Builder);
if (hasFastMathFlags())
Builder.setFastMathFlags(getFastMathFlags());
// Reduce multiple operands into one.
Value *ReducedPartRdx = RdxParts[0];
if (IsOrdered) {
ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
} else {
// Floating-point operations should have some FMF to enable the reduction.
for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
Value *RdxPart = RdxParts[Part];
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
else {
// For sub-recurrences, each part's reduction variable is already
// negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
Instruction::BinaryOps Opcode =
RK == RecurKind::Sub
? Instruction::Add
: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK);
ReducedPartRdx =
Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx");
}
}
}
// Create the reduction after the loop. Note that inloop reductions create
// the target reduction in the loop using a Reduction recipe.
if (State.VF.isVector() && !IsInLoop) {
// TODO: Support in-order reductions based on the recurrence descriptor.
// All ops in the reduction inherit fast-math-flags from the recurrence
// descriptor.
ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
}
return ReducedPartRdx;
}
case VPInstruction::ExtractLastLane:
case VPInstruction::ExtractPenultimateElement: {
unsigned Offset =
getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1;
Value *Res;
if (State.VF.isVector()) {
assert(Offset <= State.VF.getKnownMinValue() &&
"invalid offset to extract from");
// Extract lane VF - Offset from the operand.
Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
} else {
// TODO: Remove ExtractLastLane for scalar VFs.
assert(Offset <= 1 && "invalid offset to extract from");
Res = State.get(getOperand(0));
}
if (isa<ExtractElementInst>(Res))
Res->setName(Name);
return Res;
}
case VPInstruction::LogicalAnd: {
Value *A = State.get(getOperand(0));
Value *B = State.get(getOperand(1));
return Builder.CreateLogicalAnd(A, B, Name);
}
case VPInstruction::PtrAdd: {
assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
"can only generate first lane for PtrAdd");
Value *Ptr = State.get(getOperand(0), VPLane(0));
Value *Addend = State.get(getOperand(1), VPLane(0));
return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
}
case VPInstruction::WidePtrAdd: {
Value *Ptr =
State.get(getOperand(0), vputils::isSingleScalar(getOperand(0)));
Value *Addend = State.get(getOperand(1));
return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
}
case VPInstruction::AnyOf: {
Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
for (VPValue *Op : drop_begin(operands()))
Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
}
case VPInstruction::ExtractLane: {
assert(getNumOperands() != 2 && "ExtractLane from single source should be "
"simplified to ExtractElement.");
Value *LaneToExtract = State.get(getOperand(0), true);
Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0));
Value *Res = nullptr;
Value *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
Value *VectorStart =
Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
Value *VectorIdx = Idx == 1
? LaneToExtract
: Builder.CreateSub(LaneToExtract, VectorStart);
Value *Ext = State.VF.isScalar()
? State.get(getOperand(Idx))
: Builder.CreateExtractElement(
State.get(getOperand(Idx)), VectorIdx);
if (Res) {
Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
Res = Builder.CreateSelect(Cmp, Ext, Res);
} else {
Res = Ext;
}
}
return Res;
}
case VPInstruction::FirstActiveLane: {
if (getNumOperands() == 1) {
Value *Mask = State.get(getOperand(0));
return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
/*ZeroIsPoison=*/false, Name);
}
// If there are multiple operands, create a chain of selects to pick the
// first operand with an active lane and add the number of lanes of the
// preceding operands.
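// For example (illustrative): with a fixed VF of 4 and two mask operands,
// if operand 0 has no active lanes (cttz == 4) and operand 1's first active
// lane is 1, the chain below yields 4 * 1 + 1 = 5.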
Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt64Ty(), State.VF);
unsigned LastOpIdx = getNumOperands() - 1;
Value *Res = nullptr;
for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
Value *TrailingZeros =
State.VF.isScalar()
? Builder.CreateZExt(
Builder.CreateICmpEQ(State.get(getOperand(Idx)),
Builder.getFalse()),
Builder.getInt64Ty())
: Builder.CreateCountTrailingZeroElems(
Builder.getInt64Ty(), State.get(getOperand(Idx)),
/*ZeroIsPoison=*/false, Name);
Value *Current = Builder.CreateAdd(
Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
if (Res) {
Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
Res = Builder.CreateSelect(Cmp, Current, Res);
} else {
Res = Current;
}
}
return Res;
}
case VPInstruction::ResumeForEpilogue:
return State.get(getOperand(0), true);
case VPInstruction::Reverse:
return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
case VPInstruction::ExtractLastActive: {
Value *Data = State.get(getOperand(0));
Value *Mask = State.get(getOperand(1));
Value *Default = State.get(getOperand(2), /*IsScalar=*/true);
Type *VTy = Data->getType();
return Builder.CreateIntrinsic(
Intrinsic::experimental_vector_extract_last_active, {VTy},
{Data, Mask, Default});
}
default:
llvm_unreachable("Unsupported opcode for instruction");
}
}
InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
Type *ScalarTy = Ctx.Types.inferScalarType(this);
Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
switch (Opcode) {
case Instruction::FNeg:
return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::SRem:
case Instruction::URem:
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::FDiv:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
TargetTransformInfo::OperandValueInfo RHSInfo = {
TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
if (VF.isVector()) {
// Certain instructions can be cheaper to vectorize if they have a
// constant second vector operand. Examples of this are shifts on x86.
VPValue *RHS = getOperand(1);
RHSInfo = Ctx.getOperandInfo(RHS);
if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
getOperand(1)->isDefinedOutsideLoopRegions())
RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
}
Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
SmallVector<const Value *, 4> Operands;
if (CtxI)
Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
return Ctx.TTI.getArithmeticInstrCost(
Opcode, ResultTy, Ctx.CostKind,
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
RHSInfo, Operands, CtxI, &Ctx.TLI);
}
case Instruction::Freeze:
// The cost of this opcode is unknown. Assume that it is the same as 'mul'.
return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy,
Ctx.CostKind);
case Instruction::ExtractValue:
return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
Ctx.CostKind);
case Instruction::ICmp:
case Instruction::FCmp: {
Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0));
Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
return Ctx.TTI.getCmpSelInstrCost(
Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
{TTI::OK_AnyValue, TTI::OP_None}, CtxI);
}
case Instruction::BitCast: {
Type *ScalarTy = Ctx.Types.inferScalarType(this);
if (ScalarTy->isPointerTy())
return 0;
[[fallthrough]];
}
case Instruction::SExt:
case Instruction::ZExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::PtrToAddr:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::AddrSpaceCast: {
// Computes the CastContextHint from a recipe that may access memory.
auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
if (isa<VPInterleaveBase>(R))
return TTI::CastContextHint::Interleave;
if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) {
// Only compute CCH for memory operations, matching the legacy model
// which only considers loads/stores for cast context hints.
auto *UI = cast<Instruction>(ReplicateRecipe->getUnderlyingValue());
if (!isa<LoadInst, StoreInst>(UI))
return TTI::CastContextHint::None;
return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
: TTI::CastContextHint::Normal;
}
const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
if (WidenMemoryRecipe == nullptr)
return TTI::CastContextHint::None;
if (VF.isScalar())
return TTI::CastContextHint::Normal;
if (!WidenMemoryRecipe->isConsecutive())
return TTI::CastContextHint::GatherScatter;
if (WidenMemoryRecipe->isReverse())
return TTI::CastContextHint::Reversed;
if (WidenMemoryRecipe->isMasked())
return TTI::CastContextHint::Masked;
return TTI::CastContextHint::Normal;
};
VPValue *Operand = getOperand(0);
TTI::CastContextHint CCH = TTI::CastContextHint::None;
// For Trunc/FPTrunc, get the context from the only user.
if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
return nullptr;
return dyn_cast<VPRecipeBase>(*R->user_begin());
};
if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
if (match(Recipe, m_Reverse(m_VPValue())))
Recipe = GetOnlyUser(cast<VPInstruction>(Recipe));
if (Recipe)
CCH = ComputeCCH(Recipe);
}
}
// For Z/Sext, get the context from the operand.
else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
Opcode == Instruction::FPExt) {
if (auto *Recipe = Operand->getDefiningRecipe()) {
VPValue *ReverseOp;
if (match(Recipe, m_Reverse(m_VPValue(ReverseOp))))
Recipe = ReverseOp->getDefiningRecipe();
if (Recipe)
CCH = ComputeCCH(Recipe);
}
}
auto *ScalarSrcTy = Ctx.Types.inferScalarType(Operand);
Type *SrcTy = VF.isVector() ? toVectorTy(ScalarSrcTy, VF) : ScalarSrcTy;
// Arm TTI will use the underlying instruction to determine the cost.
return Ctx.TTI.getCastInstrCost(
Opcode, ResultTy, SrcTy, CCH, Ctx.CostKind,
dyn_cast_if_present<Instruction>(getUnderlyingValue()));
}
case Instruction::Select: {
SelectInst *SI = cast_or_null<SelectInst>(getUnderlyingValue());
bool IsScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
Type *ScalarTy = Ctx.Types.inferScalarType(this);
VPValue *Op0, *Op1;
bool IsLogicalAnd =
match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1)));
bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
// Also match the inverted forms:
// select x, false, y --> !x & y (still AND)
// select x, y, true --> !x | y (still OR)
IsLogicalAnd |=
match(this, m_Select(m_VPValue(Op0), m_False(), m_VPValue(Op1)));
IsLogicalOr |=
match(this, m_Select(m_VPValue(Op0), m_VPValue(Op1), m_True()));
if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
(IsLogicalAnd || IsLogicalOr)) {
// select x, y, false --> x & y
// select x, true, y --> x | y
const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
SmallVector<const Value *, 2> Operands;
if (SI && all_of(operands(),
[](VPValue *Op) { return Op->getUnderlyingValue(); }))
append_range(Operands, SI->operands());
return Ctx.TTI.getArithmeticInstrCost(
IsLogicalOr ? Instruction::Or : Instruction::And, ResultTy,
Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
}
Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
if (!IsScalarCond)
CondTy = VectorType::get(CondTy, VF);
llvm::CmpPredicate Pred;
if (!match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue())))
if (auto *CondIRV = dyn_cast<VPIRValue>(getOperand(0)))
if (auto *Cmp = dyn_cast<CmpInst>(CondIRV->getValue()))
Pred = Cmp->getPredicate();
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
return Ctx.TTI.getCmpSelInstrCost(
Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
{TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
}
}
llvm_unreachable("called for unsupported opcode");
}
InstructionCost VPInstruction::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
if (Instruction::isBinaryOp(getOpcode())) {
if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
// TODO: Compute cost for VPInstructions without underlying values once
// the legacy cost model has been retired.
return 0;
}
assert(!doesGeneratePerAllLanes() &&
"Should only generate a vector value or single scalar, not scalars "
"for all lanes.");
return getCostForRecipeWithOpcode(
getOpcode(),
vputils::onlyFirstLaneUsed(this) ? ElementCount::getFixed(1) : VF, Ctx);
}
switch (getOpcode()) {
case Instruction::Select: {
llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue()));
auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
if (!vputils::onlyFirstLaneUsed(this)) {
CondTy = toVectorTy(CondTy, VF);
VecTy = toVectorTy(VecTy, VF);
}
return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
Ctx.CostKind);
}
case Instruction::ExtractElement:
case VPInstruction::ExtractLane: {
if (VF.isScalar()) {
// ExtractLane with VF=1 takes care of extracting across multiple parts.
return 0;
}
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
Ctx.CostKind);
}
case VPInstruction::AnyOf: {
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
return Ctx.TTI.getArithmeticReductionCost(
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
}
case VPInstruction::FirstActiveLane: {
Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
if (VF.isScalar())
return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
CmpInst::makeCmpResultType(ScalarTy),
CmpInst::ICMP_EQ, Ctx.CostKind);
// Calculate the cost of determining the lane index.
auto *PredTy = toVectorTy(ScalarTy, VF);
IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
Type::getInt64Ty(Ctx.LLVMCtx),
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
case VPInstruction::LastActiveLane: {
Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
if (VF.isScalar())
return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
CmpInst::makeCmpResultType(ScalarTy),
CmpInst::ICMP_EQ, Ctx.CostKind);
// Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
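// (Illustrative formula being modeled, for masks that are an active
// prefix: last-active-lane = cttz_elts(~mask) - 1.)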
auto *PredTy = toVectorTy(ScalarTy, VF);
IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
Type::getInt64Ty(Ctx.LLVMCtx),
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
// Add cost of NOT operation on the predicate.
Cost += Ctx.TTI.getArithmeticInstrCost(
Instruction::Xor, PredTy, Ctx.CostKind,
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
{TargetTransformInfo::OK_UniformConstantValue,
TargetTransformInfo::OP_None});
// Add cost of SUB operation on the index.
Cost += Ctx.TTI.getArithmeticInstrCost(
Instruction::Sub, Type::getInt64Ty(Ctx.LLVMCtx), Ctx.CostKind);
return Cost;
}
case VPInstruction::ExtractLastActive: {
Type *ScalarTy = Ctx.Types.inferScalarType(this);
Type *VecTy = toVectorTy(ScalarTy, VF);
Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
IntrinsicCostAttributes ICA(
Intrinsic::experimental_vector_extract_last_active, ScalarTy,
{VecTy, MaskTy, ScalarTy});
return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
}
case VPInstruction::FirstOrderRecurrenceSplice: {
assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
SmallVector<int> Mask(VF.getKnownMinValue());
std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
cast<VectorType>(VectorTy),
cast<VectorType>(VectorTy), Mask,
Ctx.CostKind, VF.getKnownMinValue() - 1);
}
case VPInstruction::ActiveLaneMask: {
Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
unsigned Multiplier = cast<VPConstantInt>(getOperand(2))->getZExtValue();
Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
{ArgTy, ArgTy});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
case VPInstruction::ExplicitVectorLength: {
Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
I32Ty, {Arg0Ty, I32Ty, I1Ty});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
case VPInstruction::Reverse: {
assert(VF.isVector() && "Reverse operation must be vector type");
auto *VectorTy = cast<VectorType>(
toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
VectorTy, /*Mask=*/{}, Ctx.CostKind,
/*Index=*/0);
}
case VPInstruction::ExtractLastLane: {
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
VecTy, Ctx.CostKind, 0);
}
case VPInstruction::ExtractPenultimateElement:
if (VF == ElementCount::getScalable(1))
return InstructionCost::getInvalid();
[[fallthrough]];
default:
// TODO: Compute the cost of other VPInstructions once the legacy cost
// model has been retired.
assert(!getUnderlyingValue() &&
"unexpected VPInstruction witht underlying value");
return 0;
}
}
bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractLastLane ||
getOpcode() == VPInstruction::ExtractPenultimateElement ||
getOpcode() == Instruction::ExtractElement ||
getOpcode() == VPInstruction::ExtractLane ||
getOpcode() == VPInstruction::FirstActiveLane ||
getOpcode() == VPInstruction::LastActiveLane ||
getOpcode() == VPInstruction::ComputeAnyOfResult ||
getOpcode() == VPInstruction::ExtractLastActive ||
getOpcode() == VPInstruction::ComputeReductionResult ||
getOpcode() == VPInstruction::AnyOf;
}
bool VPInstruction::isSingleScalar() const {
switch (getOpcode()) {
case Instruction::PHI:
case VPInstruction::ExplicitVectorLength:
case VPInstruction::ResumeForEpilogue:
case VPInstruction::VScale:
return true;
default:
return isScalarCast();
}
}
void VPInstruction::execute(VPTransformState &State) {
assert(!isMasked() && "cannot execute masked VPInstruction");
assert(!State.Lane && "VPInstruction executing a Lane");
IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
assert(flagsValidForOpcode(getOpcode()) &&
"Set flags not supported for the provided opcode");
assert(hasRequiredFlagsForOpcode(getOpcode()) &&
"Opcode requires specific flags to be set");
if (hasFastMathFlags())
State.Builder.setFastMathFlags(getFastMathFlags());
Value *GeneratedValue = generate(State);
if (!hasResult())
return;
assert(GeneratedValue && "generate must produce a value");
bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
(vputils::onlyFirstLaneUsed(this) ||
isVectorToScalar() || isSingleScalar());
assert((((GeneratedValue->getType()->isVectorTy() ||
GeneratedValue->getType()->isStructTy()) ==
!GeneratesPerFirstLaneOnly) ||
State.VF.isScalar()) &&
"scalar value but not only first lane defined");
State.set(this, GeneratedValue,
/*IsScalar*/ GeneratesPerFirstLaneOnly);
}
bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
return false;
switch (getOpcode()) {
case Instruction::GetElementPtr:
case Instruction::ExtractElement:
case Instruction::Freeze:
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select:
case Instruction::PHI:
case VPInstruction::AnyOf:
case VPInstruction::BranchOnCond:
case VPInstruction::BranchOnTwoConds:
case VPInstruction::BranchOnCount:
case VPInstruction::Broadcast:
case VPInstruction::BuildStructVector:
case VPInstruction::BuildVector:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExtractLane:
case VPInstruction::ExtractLastLane:
case VPInstruction::ExtractLastPart:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::ActiveLaneMask:
case VPInstruction::ExitingIVValue:
case VPInstruction::ExplicitVectorLength:
case VPInstruction::FirstActiveLane:
case VPInstruction::LastActiveLane:
case VPInstruction::ExtractLastActive:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::Not:
case VPInstruction::PtrAdd:
case VPInstruction::WideIVStep:
case VPInstruction::WidePtrAdd:
case VPInstruction::StepVector:
case VPInstruction::ReductionStartVector:
case VPInstruction::Reverse:
case VPInstruction::VScale:
case VPInstruction::Unpack:
return false;
default:
return true;
}
}
bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
return vputils::onlyFirstLaneUsed(this);
switch (getOpcode()) {
default:
return false;
case Instruction::ExtractElement:
return Op == getOperand(1);
case Instruction::PHI:
return true;
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select:
case Instruction::Or:
case Instruction::Freeze:
case VPInstruction::Not:
// TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this);
case VPInstruction::ActiveLaneMask:
case VPInstruction::ExplicitVectorLength:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::BranchOnCount:
case VPInstruction::BranchOnCond:
case VPInstruction::Broadcast:
case VPInstruction::ReductionStartVector:
return true;
case VPInstruction::BuildStructVector:
case VPInstruction::BuildVector:
// Before replicating by VF, Build(Struct)Vector has a single operand and
// uses all its lanes; after replicating its operands, only the first lane
// of each operand is used.
return getNumOperands() > 1;
case VPInstruction::PtrAdd:
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
case VPInstruction::WidePtrAdd:
// WidePtrAdd supports scalar and vector base addresses.
return false;
case VPInstruction::ComputeAnyOfResult:
return Op == getOperand(0) || Op == getOperand(1);
case VPInstruction::ExitingIVValue:
case VPInstruction::ExtractLane:
return Op == getOperand(0);
};
llvm_unreachable("switch should return");
}
bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
if (Instruction::isBinaryOp(getOpcode()))
return vputils::onlyFirstPartUsed(this);
switch (getOpcode()) {
default:
return false;
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select:
return vputils::onlyFirstPartUsed(this);
case VPInstruction::BranchOnCount:
case VPInstruction::BranchOnCond:
case VPInstruction::BranchOnTwoConds:
case VPInstruction::CanonicalIVIncrementForPart:
return true;
};
llvm_unreachable("switch should return");
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
VPSlotTracker SlotTracker(getParent()->getPlan());
printRecipe(dbgs(), "", SlotTracker);
}
void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
if (hasResult()) {
printAsOperand(O, SlotTracker);
O << " = ";
}
switch (getOpcode()) {
case VPInstruction::Not:
O << "not";
break;
case VPInstruction::SLPLoad:
O << "combined load";
break;
case VPInstruction::SLPStore:
O << "combined store";
break;
case VPInstruction::ActiveLaneMask:
O << "active lane mask";
break;
case VPInstruction::ExplicitVectorLength:
O << "EXPLICIT-VECTOR-LENGTH";
break;
case VPInstruction::FirstOrderRecurrenceSplice:
O << "first-order splice";
break;
case VPInstruction::BranchOnCond:
O << "branch-on-cond";
break;
case VPInstruction::BranchOnTwoConds:
O << "branch-on-two-conds";
break;
case VPInstruction::CalculateTripCountMinusVF:
O << "TC > VF ? TC - VF : 0";
break;
case VPInstruction::CanonicalIVIncrementForPart:
O << "VF * Part +";
break;
case VPInstruction::BranchOnCount:
O << "branch-on-count";
break;
case VPInstruction::Broadcast:
O << "broadcast";
break;
case VPInstruction::BuildStructVector:
O << "buildstructvector";
break;
case VPInstruction::BuildVector:
O << "buildvector";
break;
case VPInstruction::ExitingIVValue:
O << "exiting-iv-value";
break;
case VPInstruction::ExtractLane:
O << "extract-lane";
break;
case VPInstruction::ExtractLastLane:
O << "extract-last-lane";
break;
case VPInstruction::ExtractLastPart:
O << "extract-last-part";
break;
case VPInstruction::ExtractPenultimateElement:
O << "extract-penultimate-element";
break;
case VPInstruction::ComputeAnyOfResult:
O << "compute-anyof-result";
break;
case VPInstruction::ComputeReductionResult:
O << "compute-reduction-result";
break;
case VPInstruction::LogicalAnd:
O << "logical-and";
break;
case VPInstruction::PtrAdd:
O << "ptradd";
break;
case VPInstruction::WidePtrAdd:
O << "wide-ptradd";
break;
case VPInstruction::AnyOf:
O << "any-of";
break;
case VPInstruction::FirstActiveLane:
O << "first-active-lane";
break;
case VPInstruction::LastActiveLane:
O << "last-active-lane";
break;
case VPInstruction::ReductionStartVector:
O << "reduction-start-vector";
break;
case VPInstruction::ResumeForEpilogue:
O << "resume-for-epilogue";
break;
case VPInstruction::Reverse:
O << "reverse";
break;
case VPInstruction::Unpack:
O << "unpack";
break;
case VPInstruction::ExtractLastActive:
O << "extract-last-active";
break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
printFlags(O);
printOperands(O, SlotTracker);
}
#endif
void VPInstructionWithType::execute(VPTransformState &State) {
State.setDebugLocFrom(getDebugLoc());
if (isScalarCast()) {
Value *Op = State.get(getOperand(0), VPLane(0));
Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
Op, ResultTy);
State.set(this, Cast, VPLane(0));
return;
}
switch (getOpcode()) {
case VPInstruction::StepVector: {
Value *StepVector =
State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
State.set(this, StepVector);
break;
}
case VPInstruction::VScale: {
Value *VScale = State.Builder.CreateVScale(ResultTy);
State.set(this, VScale, true);
break;
}
default:
llvm_unreachable("opcode not implemented yet");
}
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
printAsOperand(O, SlotTracker);
O << " = ";
switch (getOpcode()) {
case VPInstruction::WideIVStep:
O << "wide-iv-step ";
printOperands(O, SlotTracker);
break;
case VPInstruction::StepVector:
O << "step-vector " << *ResultTy;
break;
case VPInstruction::VScale:
O << "vscale " << *ResultTy;
break;
default:
assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
O << Instruction::getOpcodeName(getOpcode()) << " ";
printOperands(O, SlotTracker);
O << " to " << *ResultTy;
}
}
#endif
void VPPhi::execute(VPTransformState &State) {
State.setDebugLocFrom(getDebugLoc());
PHINode *NewPhi = State.Builder.CreatePHI(
State.TypeAnalysis.inferScalarType(this), 2, getName());
unsigned NumIncoming = getNumIncoming();
if (getParent() != getParent()->getPlan()->getScalarPreheader()) {
// TODO: Fixup all incoming values of header phis once recipes defining them
// are introduced.
NumIncoming = 1;
}
for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
NewPhi->addIncoming(IncV, PredBB);
}
State.set(this, NewPhi, VPLane(0));
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
printAsOperand(O, SlotTracker);
O << " = phi";
printFlags(O);
printPhiOperands(O, SlotTracker);
}
#endif
VPIRInstruction *VPIRInstruction::create(Instruction &I) {
if (auto *Phi = dyn_cast<PHINode>(&I))
return new VPIRPhi(*Phi);
return new VPIRInstruction(I);
}
void VPIRInstruction::execute(VPTransformState &State) {
assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
"PHINodes must be handled by VPIRPhi");
// Advance the insert point after the wrapped IR instruction. This allows
// interleaving VPIRInstructions and other recipes.
State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
}
InstructionCost VPIRInstruction::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
// The recipe wraps an existing IR instruction on the border of VPlan's scope,
// hence it does not contribute to the cost-modeling for the VPlan.
return 0;
}
void VPIRInstruction::extractLastLaneOfLastPartOfFirstOperand(
VPBuilder &Builder) {
assert(isa<PHINode>(getInstruction()) &&
"can only update exiting operands to phi nodes");
assert(getNumOperands() > 0 && "must have at least one operand");
VPValue *Exiting = getOperand(0);
if (isa<VPIRValue>(Exiting))
return;
Exiting = Builder.createNaryOp(VPInstruction::ExtractLastPart, Exiting);
Exiting = Builder.createNaryOp(VPInstruction::ExtractLastLane, Exiting);
setOperand(0, Exiting);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "IR " << I;
}
#endif
void VPIRPhi::execute(VPTransformState &State) {
PHINode *Phi = &getIRPhi();
for (const auto &[Idx, Op] : enumerate(operands())) {
VPValue *ExitValue = Op;
auto Lane = vputils::isSingleScalar(ExitValue)
? VPLane::getFirstLane()
: VPLane::getLastLaneForVF(State.VF);
VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
auto *PredVPBB = Pred->getExitingBasicBlock();
BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
// Set insertion point in PredBB in case an extract needs to be generated.
// TODO: Model extracts explicitly.
State.Builder.SetInsertPoint(PredBB->getTerminator());
Value *V = State.get(ExitValue, VPLane(Lane));
// If there is no existing block for PredBB in the phi, add a new incoming
// value. Otherwise update the existing incoming value for PredBB.
if (Phi->getBasicBlockIndex(PredBB) == -1)
Phi->addIncoming(V, PredBB);
else
Phi->setIncomingValueForBlock(PredBB, V);
}
// Advance the insert point after the wrapped IR instruction. This allows
// interleaving VPIRInstructions and other recipes.
State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
}
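// Phi operands are kept in the same order as the parent block's
// predecessors, so the incoming value for IncomingBlock is the operand at
// the predecessor's index.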
void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const {
VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
"Number of phi operands must match number of predecessors");
unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
R->removeOperand(Position);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPhiAccessors::printPhiOperands(raw_ostream &O,
VPSlotTracker &SlotTracker) const {
interleaveComma(enumerate(getAsRecipe()->operands()), O,
[this, &O, &SlotTracker](auto Op) {
O << "[ ";
Op.value()->printAsOperand(O, SlotTracker);
O << ", ";
getIncomingBlock(Op.index())->printAsOperand(O);
O << " ]";
});
}
#endif
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
VPIRInstruction::printRecipe(O, Indent, SlotTracker);
if (getNumOperands() != 0) {
O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
interleaveComma(incoming_values_and_blocks(), O,
[&O, &SlotTracker](auto Op) {
std::get<0>(Op)->printAsOperand(O, SlotTracker);
O << " from ";
std::get<1>(Op)->printAsOperand(O);
});
O << ")";
}
}
#endif
void VPIRMetadata::applyMetadata(Instruction &I) const {
for (const auto &[Kind, Node] : Metadata)
I.setMetadata(Kind, Node);
}
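// Keep only the (kind, node) metadata entries present in both sets, e.g.
// intersecting {!tbaa A, !range B} with {!tbaa A, !range C} keeps just
// !tbaa A.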
void VPIRMetadata::intersect(const VPIRMetadata &Other) {
SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
for (const auto &[KindA, MDA] : Metadata) {
for (const auto &[KindB, MDB] : Other.Metadata) {
if (KindA == KindB && MDA == MDB) {
MetadataIntersection.emplace_back(KindA, MDA);
break;
}
}
}
Metadata = std::move(MetadataIntersection);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRMetadata::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
const Module *M = SlotTracker.getModule();
if (Metadata.empty() || !M)
return;
ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
O << " (";
interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
auto [Kind, Node] = KindNodePair;
assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
"Unexpected unnamed metadata kind");
O << "!" << MDNames[Kind] << " ";
Node->printAsOperand(O, M);
});
O << ")";
}
#endif
void VPWidenCallRecipe::execute(VPTransformState &State) {
assert(State.VF.isVector() && "not widening");
assert(Variant != nullptr && "Can't create vector function.");
FunctionType *VFTy = Variant->getFunctionType();
SmallVector<Value *, 4> Args;
for (const auto &I : enumerate(args())) {
Value *Arg;
// Some vectorized function variants may also take a scalar argument,
// e.g. linear parameters for pointers. This needs to be the scalar value
// from the start of the respective part when interleaving.
if (!VFTy->getParamType(I.index())->isVectorTy())
Arg = State.get(I.value(), VPLane(0));
else
Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
Args.push_back(Arg);
}
auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
SmallVector<OperandBundleDef, 1> OpBundles;
if (CI)
CI->getOperandBundlesAsDefs(OpBundles);
CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
applyFlags(*V);
applyMetadata(*V);
V->setCallingConv(Variant->getCallingConv());
if (!V->getType()->isVoidTy())
State.set(this, V);
}
InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
Variant->getFunctionType()->params(),
Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-CALL ";
Function *CalledFn = getCalledScalarFunction();
if (CalledFn->getReturnType()->isVoidTy())
O << "void ";
else {
printAsOperand(O, SlotTracker);
O << " = ";
}
O << "call";
printFlags(O);
O << " @" << CalledFn->getName() << "(";
interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
Op->printAsOperand(O, SlotTracker);
});
O << ")";
O << " (using library function";
if (Variant->hasName())
O << ": " << Variant->getName();
O << ")";
}
#endif
void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
assert(State.VF.isVector() && "not widening");
SmallVector<Type *, 2> TysForDecl;
// Add return type if intrinsic is overloaded on it.
if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1,
State.TTI)) {
Type *RetTy = toVectorizedTy(getResultType(), State.VF);
ArrayRef<Type *> ContainedTys = getContainedTypes(RetTy);
for (auto [Idx, Ty] : enumerate(ContainedTys)) {
if (isVectorIntrinsicWithStructReturnOverloadAtField(VectorIntrinsicID,
Idx, State.TTI))
TysForDecl.push_back(Ty);
}
}
SmallVector<Value *, 4> Args;
for (const auto &I : enumerate(operands())) {
// Some intrinsics have a scalar argument - don't replace it with a
// vector.
Value *Arg;
if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
State.TTI))
Arg = State.get(I.value(), VPLane(0));
else
Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
State.TTI))
TysForDecl.push_back(Arg->getType());
Args.push_back(Arg);
}
// Use vector version of the intrinsic.
Module *M = State.Builder.GetInsertBlock()->getModule();
Function *VectorF =
Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
assert(VectorF &&
"Can't retrieve vector intrinsic or vector-predication intrinsics.");
auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
SmallVector<OperandBundleDef, 1> OpBundles;
if (CI)
CI->getOperandBundlesAsDefs(OpBundles);
CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
applyFlags(*V);
applyMetadata(*V);
if (!V->getType()->isVoidTy())
State.set(this, V);
}
/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
ArrayRef<const VPValue *> Operands,
const VPRecipeWithIRFlags &R,
ElementCount VF,
VPCostContext &Ctx) {
// Some backends analyze intrinsic arguments to determine cost. Use the
// underlying value for the operand if it has one. Otherwise try to use the
// operand of the underlying call instruction, if there is one. Otherwise
// clear Arguments.
// TODO: Rework TTI interface to be independent of concrete IR values.
SmallVector<const Value *> Arguments;
for (const auto &[Idx, Op] : enumerate(Operands)) {
auto *V = Op->getUnderlyingValue();
if (!V) {
if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
Arguments.push_back(UI->getArgOperand(Idx));
continue;
}
Arguments.clear();
break;
}
Arguments.push_back(V);
}
Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
SmallVector<Type *> ParamTys;
for (const VPValue *Op : Operands) {
ParamTys.push_back(VF.isVector()
? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
: Ctx.Types.inferScalarType(Op));
}
// TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
FastMathFlags FMF =
R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags();
IntrinsicCostAttributes CostAttrs(
ID, RetTy, Arguments, ParamTys, FMF,
dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
InstructionCost::getInvalid(), &Ctx.TLI);
return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
}
InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
SmallVector<const VPValue *> ArgOps(operands());
return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
}
StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
return Intrinsic::getBaseName(VectorIntrinsicID);
}
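// Op only demands its first lane if every position at which it appears in
// the call is an argument the intrinsic requires to be scalar.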
bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
return all_of(enumerate(operands()), [this, &Op](const auto &X) {
auto [Idx, V] = X;
return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(),
Idx, nullptr);
});
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-INTRINSIC ";
if (ResultTy->isVoidTy()) {
O << "void ";
} else {
printAsOperand(O, SlotTracker);
O << " = ";
}
O << "call";
printFlags(O);
O << getIntrinsicName() << "(";
interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
Op->printAsOperand(O, SlotTracker);
});
O << ")";
}
#endif
void VPHistogramRecipe::execute(VPTransformState &State) {
IRBuilderBase &Builder = State.Builder;
Value *Address = State.get(getOperand(0));
Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
VectorType *VTy = cast<VectorType>(Address->getType());
// The histogram intrinsic requires a mask even if the recipe doesn't;
// if the mask operand was omitted then all lanes should be executed and
// we just need to synthesize an all-true mask.
Value *Mask = nullptr;
if (VPValue *VPMask = getMask())
Mask = State.get(VPMask);
else
Mask =
Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
// If this is a subtract, we want to invert the increment amount. We may
// add a separate intrinsic in future, but for now we'll try this.
if (Opcode == Instruction::Sub)
IncAmt = Builder.CreateNeg(IncAmt);
else
assert(Opcode == Instruction::Add && "only add or sub supported for now");
State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
{VTy, IncAmt->getType()},
{Address, IncAmt, Mask});
}
InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
// FIXME: Take the gather and scatter into account as well. For now we're
// generating the same cost as the fallback path, but we'll likely
// need to create a new TTI method for determining the cost, including
// whether we can use base + vec-of-smaller-indices or just
// vec-of-pointers.
assert(VF.isVector() && "Invalid VF for histogram cost");
Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
VPValue *IncAmt = getOperand(1);
Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
VectorType *VTy = VectorType::get(IncTy, VF);
// Assume that a non-constant update value (or a constant != 1) requires
// a multiply, and add that into the cost.
InstructionCost MulCost =
Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
if (auto *CI = dyn_cast<VPConstantInt>(IncAmt))
if (CI->isOne())
MulCost = TTI::TCC_Free;
// Find the cost of the histogram operation itself.
Type *PtrTy = VectorType::get(AddressTy, VF);
Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
Type::getVoidTy(Ctx.LLVMCtx),
{PtrTy, IncTy, MaskTy});
// Add the costs together with the add/sub operation.
return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPHistogramRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-HISTOGRAM buckets: ";
getOperand(0)->printAsOperand(O, SlotTracker);
if (Opcode == Instruction::Sub)
O << ", dec: ";
else {
assert(Opcode == Instruction::Add);
O << ", inc: ";
}
getOperand(1)->printAsOperand(O, SlotTracker);
if (VPValue *Mask = getMask()) {
O << ", mask: ";
Mask->printAsOperand(O, SlotTracker);
}
}
#endif
VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
AllowReassoc = FMF.allowReassoc();
NoNaNs = FMF.noNaNs();
NoInfs = FMF.noInfs();
NoSignedZeros = FMF.noSignedZeros();
AllowReciprocal = FMF.allowReciprocal();
AllowContract = FMF.allowContract();
ApproxFunc = FMF.approxFunc();
}
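// Returns the most conservative flags (all wrap/exact/fast-math bits
// cleared) valid for Opcode; opcodes that require explicit flags, such as
// compares, are rejected.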
VPIRFlags VPIRFlags::getDefaultFlags(unsigned Opcode) {
switch (Opcode) {
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
case Instruction::Shl:
case VPInstruction::CanonicalIVIncrementForPart:
return WrapFlagsTy(false, false);
case Instruction::Trunc:
return TruncFlagsTy(false, false);
case Instruction::Or:
return DisjointFlagsTy(false);
case Instruction::AShr:
case Instruction::LShr:
case Instruction::UDiv:
case Instruction::SDiv:
return ExactFlagsTy(false);
case Instruction::GetElementPtr:
case VPInstruction::PtrAdd:
case VPInstruction::WidePtrAdd:
return GEPNoWrapFlags::none();
case Instruction::ZExt:
case Instruction::UIToFP:
return NonNegFlagsTy(false);
case Instruction::FAdd:
case Instruction::FSub:
case Instruction::FMul:
case Instruction::FDiv:
case Instruction::FRem:
case Instruction::FNeg:
case Instruction::FPExt:
case Instruction::FPTrunc:
return FastMathFlags();
case Instruction::ICmp:
case Instruction::FCmp:
case VPInstruction::ComputeReductionResult:
llvm_unreachable("opcode requires explicit flags");
default:
return VPIRFlags();
}
}
#if !defined(NDEBUG)
bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
switch (OpType) {
case OperationType::OverflowingBinOp:
return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
         Opcode == VPInstruction::CanonicalIVIncrementForPart;
case OperationType::Trunc:
return Opcode == Instruction::Trunc;
case OperationType::DisjointOp:
return Opcode == Instruction::Or;
case OperationType::PossiblyExactOp:
return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
case OperationType::GEPOp:
return Opcode == Instruction::GetElementPtr ||
Opcode == VPInstruction::PtrAdd ||
Opcode == VPInstruction::WidePtrAdd;
case OperationType::FPMathOp:
return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI ||
Opcode == Instruction::Select ||
Opcode == VPInstruction::WideIVStep ||
Opcode == VPInstruction::ReductionStartVector;
case OperationType::FCmp:
return Opcode == Instruction::FCmp;
case OperationType::NonNegOp:
return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
case OperationType::Cmp:
return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
case OperationType::ReductionOp:
return Opcode == VPInstruction::ComputeReductionResult;
case OperationType::Other:
return true;
}
llvm_unreachable("Unknown OperationType enum");
}
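// Returns true if the kind of flags stored for this recipe matches the kind
// Opcode requires (or Opcode has no required flags).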
bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const {
// Handle opcodes without default flags.
if (Opcode == Instruction::ICmp)
return OpType == OperationType::Cmp;
if (Opcode == Instruction::FCmp)
return OpType == OperationType::FCmp;
if (Opcode == VPInstruction::ComputeReductionResult)
return OpType == OperationType::ReductionOp;
OperationType Required = getDefaultFlags(Opcode).OpType;
return Required == OperationType::Other || Required == OpType;
}
#endif
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRFlags::printFlags(raw_ostream &O) const {
switch (OpType) {
case OperationType::Cmp:
O << " " << CmpInst::getPredicateName(getPredicate());
break;
case OperationType::FCmp:
O << " " << CmpInst::getPredicateName(getPredicate());
getFastMathFlags().print(O);
break;
case OperationType::DisjointOp:
if (DisjointFlags.IsDisjoint)
O << " disjoint";
break;
case OperationType::PossiblyExactOp:
if (ExactFlags.IsExact)
O << " exact";
break;
case OperationType::OverflowingBinOp:
if (WrapFlags.HasNUW)
O << " nuw";
if (WrapFlags.HasNSW)
O << " nsw";
break;
case OperationType::Trunc:
if (TruncFlags.HasNUW)
O << " nuw";
if (TruncFlags.HasNSW)
O << " nsw";
break;
case OperationType::FPMathOp:
getFastMathFlags().print(O);
break;
case OperationType::GEPOp:
if (GEPFlags.isInBounds())
O << " inbounds";
else if (GEPFlags.hasNoUnsignedSignedWrap())
O << " nusw";
if (GEPFlags.hasNoUnsignedWrap())
O << " nuw";
break;
case OperationType::NonNegOp:
if (NonNegFlags.NonNeg)
O << " nneg";
break;
case OperationType::ReductionOp: {
RecurKind RK = getRecurKind();
O << " (";
switch (RK) {
case RecurKind::AnyOf:
O << "any-of";
break;
case RecurKind::SMax:
O << "smax";
break;
case RecurKind::SMin:
O << "smin";
break;
case RecurKind::UMax:
O << "umax";
break;
case RecurKind::UMin:
O << "umin";
break;
case RecurKind::FMinNum:
O << "fminnum";
break;
case RecurKind::FMaxNum:
O << "fmaxnum";
break;
case RecurKind::FMinimum:
O << "fminimum";
break;
case RecurKind::FMaximum:
O << "fmaximum";
break;
case RecurKind::FMinimumNum:
O << "fminimumnum";
break;
case RecurKind::FMaximumNum:
O << "fmaximumnum";
break;
default:
O << Instruction::getOpcodeName(RecurrenceDescriptor::getOpcode(RK));
break;
}
if (isReductionInLoop())
O << ", in-loop";
if (isReductionOrdered())
O << ", ordered";
O << ")";
getFastMathFlags().print(O);
break;
}
case OperationType::Other:
break;
}
O << " ";
}
#endif
void VPWidenRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
switch (Opcode) {
case Instruction::Call:
case Instruction::Br:
case Instruction::PHI:
case Instruction::GetElementPtr:
llvm_unreachable("This instruction is handled by a different recipe.");
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::SRem:
case Instruction::URem:
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::FNeg:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::FDiv:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
// Just widen unops and binops.
SmallVector<Value *, 2> Ops;
for (VPValue *VPOp : operands())
Ops.push_back(State.get(VPOp));
Value *V = Builder.CreateNAryOp(Opcode, Ops);
if (auto *VecOp = dyn_cast<Instruction>(V)) {
applyFlags(*VecOp);
applyMetadata(*VecOp);
}
// Use this vector value for all users of the original instruction.
State.set(this, V);
break;
}
case Instruction::ExtractValue: {
assert(getNumOperands() == 2 && "expected single level extractvalue");
Value *Op = State.get(getOperand(0));
Value *Extract = Builder.CreateExtractValue(
Op, cast<VPConstantInt>(getOperand(1))->getZExtValue());
State.set(this, Extract);
break;
}
case Instruction::Freeze: {
Value *Op = State.get(getOperand(0));
Value *Freeze = Builder.CreateFreeze(Op);
State.set(this, Freeze);
break;
}
case Instruction::ICmp:
case Instruction::FCmp: {
// Widen compares. Generate vector compares.
bool FCmp = Opcode == Instruction::FCmp;
Value *A = State.get(getOperand(0));
Value *B = State.get(getOperand(1));
Value *C = nullptr;
if (FCmp) {
C = Builder.CreateFCmp(getPredicate(), A, B);
} else {
C = Builder.CreateICmp(getPredicate(), A, B);
}
if (auto *I = dyn_cast<Instruction>(C)) {
applyFlags(*I);
applyMetadata(*I);
}
State.set(this, C);
break;
}
case Instruction::Select: {
VPValue *CondOp = getOperand(0);
Value *Cond = State.get(CondOp, vputils::isSingleScalar(CondOp));
Value *Op0 = State.get(getOperand(1));
Value *Op1 = State.get(getOperand(2));
Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
State.set(this, Sel);
if (auto *I = dyn_cast<Instruction>(Sel)) {
if (isa<FPMathOperator>(I))
applyFlags(*I);
applyMetadata(*I);
}
break;
}
default:
// This instruction is not vectorized by simple widening.
LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
<< Instruction::getOpcodeName(Opcode));
llvm_unreachable("Unhandled instruction!");
} // end of switch.
#if !defined(NDEBUG)
// Verify that VPlan type inference results agree with the type of the
// generated values.
assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
State.get(this)->getType() &&
"inferred type and type from generated instructions do not match");
#endif
}
InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
switch (Opcode) {
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::SRem:
case Instruction::URem:
// If the div/rem operation isn't safe to speculate and requires
// predication, then the only way we can even create a vplan is to insert
// a select on the second input operand to ensure we use the value of 1
// for the inactive lanes. The select will be costed separately.
case Instruction::FNeg:
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::FDiv:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
case Instruction::Freeze:
case Instruction::ExtractValue:
case Instruction::ICmp:
case Instruction::FCmp:
case Instruction::Select:
return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
default:
llvm_unreachable("Unsupported opcode for instruction");
}
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN ";
printAsOperand(O, SlotTracker);
O << " = " << Instruction::getOpcodeName(Opcode);
printFlags(O);
printOperands(O, SlotTracker);
}
#endif
void VPWidenCastRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
  // Vectorize casts.
assert(State.VF.isVector() && "Not vectorizing?");
Type *DestTy = VectorType::get(getResultType(), State.VF);
VPValue *Op = getOperand(0);
Value *A = State.get(Op);
Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
State.set(this, Cast);
if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
applyFlags(*CastOp);
applyMetadata(*CastOp);
}
}
InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
// TODO: In some cases, VPWidenCastRecipes are created but not considered in
// the legacy cost model, including truncates/extends when evaluating a
// reduction in a smaller type.
if (!getUnderlyingValue())
return 0;
return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-CAST ";
printAsOperand(O, SlotTracker);
O << " = " << Instruction::getOpcodeName(Opcode);
printFlags(O);
printOperands(O, SlotTracker);
O << " to " << *getResultType();
}
#endif
InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::printRecipe(
raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
O << Indent;
printAsOperand(O, SlotTracker);
O << " = WIDEN-INDUCTION";
printFlags(O);
printOperands(O, SlotTracker);
if (auto *TI = getTruncInst())
O << " (truncated to " << *TI->getType() << ")";
}
#endif
bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
// The step may be defined by a recipe in the preheader (e.g. if it requires
// SCEV expansion), but for the canonical induction the step is required to be
  // 1, which is represented as a live-in.
auto *StepC = dyn_cast<VPConstantInt>(getStepValue());
auto *StartC = dyn_cast<VPConstantInt>(getStartValue());
return StartC && StartC->isZero() && StepC && StepC->isOne() &&
getScalarType() == getRegion()->getCanonicalIVType();
}
void VPDerivedIVRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
// Fast-math-flags propagate from the original induction instruction.
IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
if (FPBinOp)
State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
Value *Step = State.get(getStepValue(), VPLane(0));
Value *Index = State.get(getOperand(1), VPLane(0));
Value *DerivedIV = emitTransformedIndex(
State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
cast_if_present<BinaryOperator>(FPBinOp));
DerivedIV->setName(Name);
State.set(this, DerivedIV, VPLane(0));
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent;
printAsOperand(O, SlotTracker);
O << " = DERIVED-IV ";
getStartValue()->printAsOperand(O, SlotTracker);
O << " + ";
getOperand(1)->printAsOperand(O, SlotTracker);
O << " * ";
getStepValue()->printAsOperand(O, SlotTracker);
}
#endif
void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
// Fast-math-flags propagate from the original induction instruction.
IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
if (hasFastMathFlags())
State.Builder.setFastMathFlags(getFastMathFlags());
  // Compute scalar induction steps. BaseIV is the scalar induction variable on
  // which to base the steps; Step is the size of the step.
Value *BaseIV = State.get(getOperand(0), VPLane(0));
Value *Step = State.get(getStepValue(), VPLane(0));
IRBuilderBase &Builder = State.Builder;
// Ensure step has the same type as that of scalar IV.
Type *BaseIVTy = BaseIV->getType()->getScalarType();
assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
// We build scalar steps for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.
Instruction::BinaryOps AddOp;
Instruction::BinaryOps MulOp;
if (BaseIVTy->isIntegerTy()) {
AddOp = Instruction::Add;
MulOp = Instruction::Mul;
} else {
AddOp = InductionOpcode;
MulOp = Instruction::FMul;
}
// Determine the number of scalars we need to generate.
bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
// Compute the scalar steps and save the results in State.
unsigned StartLane = 0;
unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
if (State.Lane) {
StartLane = State.Lane->getKnownLane();
EndLane = StartLane + 1;
}
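  // Index of the first lane of this part; zero unless an explicit start-index
  // operand is present.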
Value *StartIdx0 = getStartIndex() ? State.get(getStartIndex(), true)
: Constant::getNullValue(BaseIVTy);
for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
// It is okay if the induction variable type cannot hold the lane number,
// we expect truncation in this case.
Constant *LaneValue =
BaseIVTy->isIntegerTy()
? ConstantInt::get(BaseIVTy, Lane, /*IsSigned=*/false,
/*ImplicitTrunc=*/true)
: ConstantFP::get(BaseIVTy, Lane);
Value *StartIdx = Builder.CreateBinOp(AddOp, StartIdx0, LaneValue);
    // StartIdx is only a runtime-evaluated value when VF is scalable (e.g.
    // when the start index was computed by `createStepForVF`); otherwise the
    // add above should have folded it into a Constant.
assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
"Expected StartIdx to be folded to a constant when VF is not "
"scalable");
auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
State.set(this, Add, VPLane(Lane));
}
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent;
printAsOperand(O, SlotTracker);
O << " = SCALAR-STEPS ";
printOperands(O, SlotTracker);
}
#endif
bool VPWidenGEPRecipe::usesFirstLaneOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
return vputils::isSingleScalar(Op);
}
void VPWidenGEPRecipe::execute(VPTransformState &State) {
assert(State.VF.isVector() && "not widening");
// Construct a vector GEP by widening the operands of the scalar GEP as
// necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
// results in a vector of pointers when at least one operand of the GEP
// is vector-typed. Thus, to keep the representation compact, we only use
// vector-typed operands for loop-varying values.
bool AllOperandsAreInvariant = all_of(operands(), [](VPValue *Op) {
return Op->isDefinedOutsideLoopRegions();
});
if (AllOperandsAreInvariant) {
// If we are vectorizing, but the GEP has only loop-invariant operands,
// the GEP we build (by only using vector-typed operands for
// loop-varying values) would be a scalar pointer. Thus, to ensure we
// produce a vector of pointers, we need to either arbitrarily pick an
// operand to broadcast, or broadcast a clone of the original GEP.
// Here, we broadcast a clone of the original.
SmallVector<Value *> Ops;
for (unsigned I = 0, E = getNumOperands(); I != E; I++)
Ops.push_back(State.get(getOperand(I), VPLane(0)));
auto *NewGEP =
State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops),
"", getGEPNoWrapFlags());
Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
State.set(this, Splat);
return;
}
// If the GEP has at least one loop-varying operand, we are sure to
// produce a vector of pointers unless VF is scalar.
// The pointer operand of the new GEP. If it's loop-invariant, we
// won't broadcast it.
auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
// Collect all the indices for the new GEP. If any index is
// loop-invariant, we won't broadcast it.
SmallVector<Value *, 4> Indices;
for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
VPValue *Operand = getOperand(I);
Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
}
// Create the new GEP. Note that this GEP may be a scalar if VF == 1,
// but it should be a vector, otherwise.
auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
"", getGEPNoWrapFlags());
assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
"NewGEP is not a pointer vector");
State.set(this, NewGEP);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-GEP ";
O << (isPointerLoopInvariant() ? "Inv" : "Var");
for (size_t I = 0; I < getNumOperands() - 1; ++I)
O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
O << " ";
printAsOperand(O, SlotTracker);
O << " = getelementptr";
printFlags(O);
printOperands(O, SlotTracker);
}
#endif
void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
unsigned CurrentPart = getUnrollPart(*this);
const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
Type *IndexTy = DL.getIndexType(State.TypeAnalysis.inferScalarType(this));
// The wide store needs to start at the last vector element.
Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
if (IndexTy != RunTimeVF->getType())
RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
// NumElt = Stride * CurrentPart * RunTimeVF
Value *NumElt = Builder.CreateMul(
ConstantInt::getSigned(IndexTy, Stride * (int64_t)CurrentPart),
RunTimeVF);
// LastLane = Stride * (RunTimeVF - 1)
Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1));
if (Stride != 1)
LastLane =
Builder.CreateMul(ConstantInt::getSigned(IndexTy, Stride), LastLane);
Value *Ptr = State.get(getOperand(0), VPLane(0));
Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, NumElt, "",
getGEPNoWrapFlags());
ResultPtr = Builder.CreateGEP(getSourceElementType(), ResultPtr, LastLane, "",
getGEPNoWrapFlags());
State.set(this, ResultPtr, /*IsScalar*/ true);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPVectorEndPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent;
printAsOperand(O, SlotTracker);
O << " = vector-end-pointer";
printFlags(O);
printOperands(O, SlotTracker);
}
#endif
void VPVectorPointerRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
assert(getOffset() &&
"Expected prior simplification of recipe without offset");
Value *Ptr = State.get(getOperand(0), VPLane(0));
Value *Offset = State.get(getOffset(), true);
Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
getGEPNoWrapFlags());
State.set(this, ResultPtr, /*IsScalar*/ true);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPVectorPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent;
printAsOperand(O, SlotTracker);
O << " = vector-pointer";
printFlags(O);
printOperands(O, SlotTracker);
}
#endif
InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
// A blend will be expanded to a select VPInstruction, which will generate a
// scalar select if only the first lane is used.
if (vputils::onlyFirstLaneUsed(this))
VF = ElementCount::getFixed(1);
Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
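  // An N-way blend lowers to a chain of N-1 selects.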
return (getNumIncomingValues() - 1) *
Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "BLEND ";
printAsOperand(O, SlotTracker);
O << " =";
printFlags(O);
if (getNumIncomingValues() == 1) {
// Not a User of any mask: not really blending, this is a
// single-predecessor phi.
getIncomingValue(0)->printAsOperand(O, SlotTracker);
} else {
for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
if (I != 0)
O << " ";
getIncomingValue(I)->printAsOperand(O, SlotTracker);
if (I == 0 && isNormalized())
continue;
O << "/";
getMask(I)->printAsOperand(O, SlotTracker);
}
}
}
#endif
void VPReductionRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "Reduction being replicated.");
RecurKind Kind = getRecurrenceKind();
assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
"In-loop AnyOf reductions aren't currently supported");
// Propagate the fast-math flags carried by the underlying instruction.
IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
State.Builder.setFastMathFlags(getFastMathFlags());
Value *NewVecOp = State.get(getVecOp());
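  // For conditional reductions, blend masked-off lanes with the reduction
  // identity so they do not affect the result.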
if (VPValue *Cond = getCondOp()) {
Value *NewCond = State.get(Cond, State.VF.isScalar());
VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
if (State.VF.isVector())
Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
NewVecOp = Select;
}
Value *NewRed;
Value *NextInChain;
if (isOrdered()) {
Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
if (State.VF.isVector())
NewRed =
createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
else
NewRed = State.Builder.CreateBinOp(
(Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
PrevInChain, NewVecOp);
PrevInChain = NewRed;
NextInChain = NewRed;
} else if (isPartialReduction()) {
assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
"Unexpected partial reduction kind");
Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ false);
NewRed = State.Builder.CreateIntrinsic(
PrevInChain->getType(),
Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
: Intrinsic::vector_partial_reduce_fadd,
{PrevInChain, NewVecOp}, State.Builder.getFastMathFlags(),
"partial.reduce");
PrevInChain = NewRed;
NextInChain = NewRed;
} else {
assert(isInLoop() &&
"The reduction must either be ordered, partial or in-loop");
Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
else
NextInChain = State.Builder.CreateBinOp(
(Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
PrevInChain, NewRed);
}
State.set(this, NextInChain, /*IsScalar*/ !isPartialReduction());
}
void VPReductionEVLRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "Reduction being replicated.");
auto &Builder = State.Builder;
// Propagate the fast-math flags carried by the underlying instruction.
IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
Builder.setFastMathFlags(getFastMathFlags());
RecurKind Kind = getRecurrenceKind();
Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
Value *VecOp = State.get(getVecOp());
Value *EVL = State.get(getEVL(), VPLane(0));
Value *Mask;
if (VPValue *CondOp = getCondOp())
Mask = State.get(CondOp);
else
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
Value *NewRed;
if (isOrdered()) {
NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
} else {
NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
else
NewRed = Builder.CreateBinOp(
(Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed,
Prev);
}
State.set(this, NewRed, /*IsScalar*/ true);
}
InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
RecurKind RdxKind = getRecurrenceKind();
Type *ElementTy = Ctx.Types.inferScalarType(this);
auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
FastMathFlags FMFs = getFastMathFlags();
std::optional<FastMathFlags> OptionalFMF =
ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
if (isPartialReduction()) {
InstructionCost CondCost = 0;
if (isConditional()) {
CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
auto *CondTy = cast<VectorType>(
toVectorTy(Ctx.Types.inferScalarType(getCondOp()), VF));
CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy,
CondTy, Pred, Ctx.CostKind);
}
return CondCost + Ctx.TTI.getPartialReductionCost(
Opcode, ElementTy, ElementTy, ElementTy, VF,
TTI::PR_None, TTI::PR_None, {}, Ctx.CostKind,
OptionalFMF);
}
// TODO: Support any-of reductions.
assert(
(!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
ForceTargetInstructionCost.getNumOccurrences() > 0) &&
"Any-of reduction not implemented in VPlan-based cost model currently.");
// Note that TTI should model the cost of moving result to the scalar register
// and the BinOp cost in the getMinMaxReductionCost().
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
}
// Note that TTI should model the cost of moving result to the scalar register
// and the BinOp cost in the getArithmeticReductionCost().
return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
Ctx.CostKind);
}
VPExpressionRecipe::VPExpressionRecipe(
ExpressionTypes ExpressionType,
ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
: VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {}, {}),
ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
assert(!ExpressionRecipes.empty() && "Nothing to combine?");
assert(
none_of(ExpressionRecipes,
[](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
"expression cannot contain recipes with side-effects");
// Maintain a copy of the expression recipes as a set of users.
SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
for (auto *R : ExpressionRecipes)
ExpressionRecipesAsSetOfUsers.insert(R);
// Recipes in the expression, except the last one, must only be used by
// (other) recipes inside the expression. If there are other users, external
// to the expression, use a clone of the recipe for external users.
for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
if (R != ExpressionRecipes.back() &&
any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
return !ExpressionRecipesAsSetOfUsers.contains(U);
})) {
// There are users outside of the expression. Clone the recipe and use the
      // clone for those external users.
VPSingleDefRecipe *CopyForExtUsers = R->clone();
R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
VPUser &U, unsigned) {
return !ExpressionRecipesAsSetOfUsers.contains(&U);
});
CopyForExtUsers->insertBefore(R);
}
if (R->getParent())
R->removeFromParent();
}
// Internalize all external operands to the expression recipes. To do so,
// create new temporary VPValues for all operands defined by a recipe outside
// the expression. The original operands are added as operands of the
// VPExpressionRecipe itself.
for (auto *R : ExpressionRecipes) {
for (const auto &[Idx, Op] : enumerate(R->operands())) {
auto *Def = Op->getDefiningRecipe();
if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
continue;
addOperand(Op);
LiveInPlaceholders.push_back(new VPSymbolicValue());
}
}
// Replace each external operand with the first one created for it in
// LiveInPlaceholders.
for (auto *R : ExpressionRecipes)
for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
R->replaceUsesOfWith(LiveIn, Tmp);
}
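// Unbundle the expression: re-insert the member recipes before this recipe
// and reconnect the external operands that were replaced by placeholders.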
void VPExpressionRecipe::decompose() {
for (auto *R : ExpressionRecipes)
// Since the list could contain duplicates, make sure the recipe hasn't
// already been inserted.
if (!R->getParent())
R->insertBefore(this);
for (const auto &[Idx, Op] : enumerate(operands()))
LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
replaceAllUsesWith(ExpressionRecipes.back());
ExpressionRecipes.clear();
}
InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
Type *RedTy = Ctx.Types.inferScalarType(this);
auto *SrcVecTy = cast<VectorType>(
toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
unsigned Opcode = RecurrenceDescriptor::getOpcode(
cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
switch (ExpressionType) {
case ExpressionTypes::ExtendedReduction: {
unsigned Opcode = RecurrenceDescriptor::getOpcode(
cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
if (RedR->isPartialReduction())
return Ctx.TTI.getPartialReductionCost(
Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr, RedTy, VF,
TargetTransformInfo::getPartialReductionExtendKind(ExtR->getOpcode()),
TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
: std::nullopt);
else if (!RedTy->isFloatingPointTy())
// TTI::getExtendedReductionCost only supports integer types.
return Ctx.TTI.getExtendedReductionCost(
Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy, SrcVecTy,
std::nullopt, Ctx.CostKind);
else
return InstructionCost::getInvalid();
}
case ExpressionTypes::MulAccReduction:
return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
Ctx.CostKind);
case ExpressionTypes::ExtNegatedMulAccReduction:
assert(Opcode == Instruction::Add && "Unexpected opcode");
Opcode = Instruction::Sub;
[[fallthrough]];
case ExpressionTypes::ExtMulAccReduction: {
auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
if (RedR->isPartialReduction()) {
auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
return Ctx.TTI.getPartialReductionCost(
Opcode, Ctx.Types.inferScalarType(getOperand(0)),
Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
TargetTransformInfo::getPartialReductionExtendKind(
Ext0R->getOpcode()),
TargetTransformInfo::getPartialReductionExtendKind(
Ext1R->getOpcode()),
Mul->getOpcode(), Ctx.CostKind,
RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
: std::nullopt);
}
return Ctx.TTI.getMulAccReductionCost(
cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
Instruction::ZExt,
Opcode, RedTy, SrcVecTy, Ctx.CostKind);
}
}
llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
}
bool VPExpressionRecipe::mayReadOrWriteMemory() const {
return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
return R->mayReadFromMemory() || R->mayWriteToMemory();
});
}
bool VPExpressionRecipe::mayHaveSideEffects() const {
assert(
none_of(ExpressionRecipes,
[](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
"expression cannot contain recipes with side-effects");
return false;
}
bool VPExpressionRecipe::isSingleScalar() const {
// Cannot use vputils::isSingleScalar(), because all external operands
// of the expression will be live-ins while bundled.
auto *RR = dyn_cast<VPReductionRecipe>(ExpressionRecipes.back());
return RR && !RR->isPartialReduction();
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "EXPRESSION ";
printAsOperand(O, SlotTracker);
O << " = ";
auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
switch (ExpressionType) {
case ExpressionTypes::ExtendedReduction: {
getOperand(1)->printAsOperand(O, SlotTracker);
O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
O << Instruction::getOpcodeName(Opcode) << " (";
getOperand(0)->printAsOperand(O, SlotTracker);
Red->printFlags(O);
auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
<< *Ext0->getResultType();
if (Red->isConditional()) {
O << ", ";
Red->getCondOp()->printAsOperand(O, SlotTracker);
}
O << ")";
break;
}
case ExpressionTypes::ExtNegatedMulAccReduction: {
getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
O << Instruction::getOpcodeName(
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
<< " (sub (0, mul";
auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
Mul->printFlags(O);
O << "(";
getOperand(0)->printAsOperand(O, SlotTracker);
auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
<< *Ext0->getResultType() << "), (";
getOperand(1)->printAsOperand(O, SlotTracker);
auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
<< *Ext1->getResultType() << ")";
if (Red->isConditional()) {
O << ", ";
Red->getCondOp()->printAsOperand(O, SlotTracker);
}
O << "))";
break;
}
case ExpressionTypes::MulAccReduction:
case ExpressionTypes::ExtMulAccReduction: {
getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
O << Instruction::getOpcodeName(
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
<< " (";
O << "mul";
bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
: ExpressionRecipes[0]);
Mul->printFlags(O);
if (IsExtended)
O << "(";
getOperand(0)->printAsOperand(O, SlotTracker);
if (IsExtended) {
auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
<< *Ext0->getResultType() << "), (";
} else {
O << ", ";
}
getOperand(1)->printAsOperand(O, SlotTracker);
if (IsExtended) {
auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
<< *Ext1->getResultType() << ")";
}
if (Red->isConditional()) {
O << ", ";
Red->getCondOp()->printAsOperand(O, SlotTracker);
}
O << ")";
break;
}
}
}
void VPReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
if (isPartialReduction())
O << Indent << "PARTIAL-REDUCE ";
else
O << Indent << "REDUCE ";
printAsOperand(O, SlotTracker);
O << " = ";
getChainOp()->printAsOperand(O, SlotTracker);
O << " +";
printFlags(O);
O << " reduce."
<< Instruction::getOpcodeName(
RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
<< " (";
getVecOp()->printAsOperand(O, SlotTracker);
if (isConditional()) {
O << ", ";
getCondOp()->printAsOperand(O, SlotTracker);
}
O << ")";
}
void VPReductionEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "REDUCE ";
printAsOperand(O, SlotTracker);
O << " = ";
getChainOp()->printAsOperand(O, SlotTracker);
O << " +";
printFlags(O);
O << " vp.reduce."
<< Instruction::getOpcodeName(
RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
<< " (";
getVecOp()->printAsOperand(O, SlotTracker);
O << ", ";
getEVL()->printAsOperand(O, SlotTracker);
if (isConditional()) {
O << ", ";
getCondOp()->printAsOperand(O, SlotTracker);
}
O << ")";
}
#endif
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a scalar instance for lane \p Lane. Uses the VPValue
/// operands from \p RepRecipe instead of \p Instr's operands.
static void scalarizeInstruction(const Instruction *Instr,
VPReplicateRecipe *RepRecipe,
const VPLane &Lane, VPTransformState &State) {
assert((!Instr->getType()->isAggregateType() ||
canVectorizeTy(Instr->getType())) &&
"Expected vectorizable or non-aggregate type.");
  // Does this instruction return a value?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy) {
Cloned->setName(Instr->getName() + ".cloned");
Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe);
// The operands of the replicate recipe may have been narrowed, resulting in
// a narrower result type. Update the type of the cloned instruction to the
// correct type.
if (ResultTy != Cloned->getType())
Cloned->mutateType(ResultTy);
}
RepRecipe->applyFlags(*Cloned);
RepRecipe->applyMetadata(*Cloned);
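  // Compare predicates are modeled as recipe flags; restore the predicate on
  // the cloned compare.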
if (RepRecipe->hasPredicate())
cast<CmpInst>(Cloned)->setPredicate(RepRecipe->getPredicate());
if (auto DL = RepRecipe->getDebugLoc())
State.setDebugLocFrom(DL);
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
for (const auto &I : enumerate(RepRecipe->operands())) {
auto InputLane = Lane;
VPValue *Operand = I.value();
if (vputils::isSingleScalar(Operand))
InputLane = VPLane::getFirstLane();
Cloned->setOperand(I.index(), State.get(Operand, InputLane));
}
// Place the cloned scalar in the new loop.
State.Builder.Insert(Cloned);
State.set(RepRecipe, Cloned, Lane);
  // If we just cloned a new assumption, add it to the assumption cache.
if (auto *II = dyn_cast<AssumeInst>(Cloned))
State.AC->registerAssumption(II);
assert(
(RepRecipe->getRegion() ||
!RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
all_of(RepRecipe->operands(),
[](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
"Expected a recipe is either within a region or all of its operands "
"are defined outside the vectorized region.");
}
void VPReplicateRecipe::execute(VPTransformState &State) {
Instruction *UI = getUnderlyingInstr();
if (!State.Lane) {
assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
"must have already been unrolled");
scalarizeInstruction(UI, this, VPLane(0), State);
return;
}
assert((State.VF.isScalar() || !isSingleScalar()) &&
"uniform recipe shouldn't be predicated");
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
scalarizeInstruction(UI, this, *State.Lane, State);
// Insert scalar instance packing it into a vector.
if (State.VF.isVector() && shouldPack()) {
Value *WideValue =
State.Lane->isFirstLane()
? PoisonValue::get(toVectorizedTy(UI->getType(), State.VF))
: State.get(this);
State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
*State.Lane));
}
}
bool VPReplicateRecipe::shouldPack() const {
// Find if the recipe is used by a widened recipe via an intervening
// VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
return any_of(users(), [](const VPUser *U) {
if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
return !vputils::onlyScalarValuesUsed(PredR);
return false;
});
}
/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
/// which the legacy cost model computes a SCEV expression when computing the
/// address cost. Computing SCEVs for VPValues is incomplete and may return
/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs; in
/// those cases the caller falls back to the legacy cost model. Otherwise
/// returns nullptr.
static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
PredicatedScalarEvolution &PSE,
const Loop *L) {
const SCEV *Addr = vputils::getSCEVExprForVPValue(Ptr, PSE, L);
if (isa<SCEVCouldNotCompute>(Addr))
return Addr;
return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), L) ? Addr : nullptr;
}
/// Returns true if \p V is used as part of the address of another load or
/// store.
static bool isUsedByLoadStoreAddress(const VPUser *V) {
SmallPtrSet<const VPUser *, 4> Seen;
SmallVector<const VPUser *> WorkList = {V};
while (!WorkList.empty()) {
auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
if (!Cur || !Seen.insert(Cur).second)
continue;
auto *Blend = dyn_cast<VPBlendRecipe>(Cur);
    // Skip blends that use V only through a compare (i.e. as part of a mask):
    // if none of the blend's incoming values has been visited, V does not flow
    // through the blend's data operands.
if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()),
[&](unsigned I) {
return Seen.contains(
Blend->getIncomingValue(I)->getDefiningRecipe());
}))
continue;
for (VPUser *U : Cur->users()) {
if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
if (InterleaveR->getAddr() == Cur)
return true;
if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
if (RepR->getOpcode() == Instruction::Load &&
RepR->getOperand(0) == Cur)
return true;
if (RepR->getOpcode() == Instruction::Store &&
RepR->getOperand(1) == Cur)
return true;
}
if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
if (MemR->getAddr() == Cur && MemR->isConsecutive())
return true;
}
}
    // The legacy cost model only supports scalarized loads/stores with phi
    // addresses when the phi is directly used as the load/store address. Don't
    // traverse further for blends.
if (Blend)
continue;
append_range(WorkList, Cur->users());
}
return false;
}
InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
Instruction *UI = cast<Instruction>(getUnderlyingValue());
  // VPReplicateRecipes may be cloned as part of an existing VPlan-to-VPlan
  // transform; avoid computing their cost multiple times for now.
Ctx.SkipCostComputation.insert(UI);
if (VF.isScalable() && !isSingleScalar())
return InstructionCost::getInvalid();
switch (UI->getOpcode()) {
case Instruction::Alloca:
if (VF.isScalable())
return InstructionCost::getInvalid();
return Ctx.TTI.getArithmeticInstrCost(
Instruction::Mul, Ctx.Types.inferScalarType(this), Ctx.CostKind);
case Instruction::GetElementPtr:
// We mark this instruction as zero-cost because the cost of GEPs in
// vectorized code depends on whether the corresponding memory instruction
// is scalarized or not. Therefore, we handle GEPs with the memory
// instruction cost.
return 0;
case Instruction::Call: {
auto *CalledFn =
cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
SmallVector<const VPValue *> ArgOps(drop_end(operands()));
SmallVector<Type *, 4> Tys;
for (const VPValue *ArgOp : ArgOps)
Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
if (CalledFn->isIntrinsic())
// Various pseudo-intrinsics with costs of 0 are scalarized instead of
// vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
switch (CalledFn->getIntrinsicID()) {
case Intrinsic::assume:
case Intrinsic::lifetime_end:
case Intrinsic::lifetime_start:
case Intrinsic::sideeffect:
case Intrinsic::pseudoprobe:
case Intrinsic::experimental_noalias_scope_decl: {
assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
ElementCount::getFixed(1), Ctx) == 0 &&
"scalarizing intrinsic should be free");
return InstructionCost(0);
}
default:
break;
}
Type *ResultTy = Ctx.Types.inferScalarType(this);
InstructionCost ScalarCallCost =
Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
if (isSingleScalar()) {
if (CalledFn->isIntrinsic())
ScalarCallCost = std::min(
ScalarCallCost,
getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
ElementCount::getFixed(1), Ctx));
return ScalarCallCost;
}
return ScalarCallCost * VF.getFixedValue() +
Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
}
case Instruction::Add:
case Instruction::Sub:
case Instruction::FAdd:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::FDiv:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
case Instruction::ICmp:
case Instruction::FCmp:
return getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1),
Ctx) *
(isSingleScalar() ? 1 : VF.getFixedValue());
case Instruction::SDiv:
case Instruction::UDiv:
case Instruction::SRem:
case Instruction::URem: {
InstructionCost ScalarCost =
getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), Ctx);
if (isSingleScalar())
return ScalarCost;
    // If any of the operands comes from a different replicate region and has
    // its cost skipped, it may have been forced to scalar. Fall back to the
    // legacy cost model to avoid a cost mismatch.
if (any_of(operands(), [&Ctx, VF](VPValue *Op) {
auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Op);
if (!PredR)
return false;
return Ctx.skipCostComputation(
dyn_cast_or_null<Instruction>(
PredR->getOperand(0)->getUnderlyingValue()),
VF.isVector());
}))
break;
ScalarCost = ScalarCost * VF.getFixedValue() +
Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this),
to_vector(operands()), VF);
// If the recipe is not predicated (i.e. not in a replicate region), return
// the scalar cost. Otherwise handle predicated cost.
if (!getRegion()->isReplicator())
return ScalarCost;
// Account for the phi nodes that we will create.
ScalarCost += VF.getFixedValue() *
Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
// Scale the cost by the probability of executing the predicated blocks.
// This assumes the predicated block for each vector lane is equally
// likely.
ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent());
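    // E.g. (an illustrative sketch, assuming a block-cost divisor of 2, i.e.
    // the predicated block executes for half the lanes): at VF=4, with a
    // scalar div cost of 20 and a PHI cost of 1, this yields
    // (20 * 4 + 1 * 4) / 2 = 42.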
return ScalarCost;
}
case Instruction::Load:
case Instruction::Store: {
bool IsLoad = UI->getOpcode() == Instruction::Load;
const VPValue *PtrOp = getOperand(!IsLoad);
const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.PSE, Ctx.L);
if (isa_and_nonnull<SCEVCouldNotCompute>(PtrSCEV))
break;
Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
const Align Alignment = getLoadStoreAlignment(UI);
unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
bool UsedByLoadStoreAddress =
!PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo,
UsedByLoadStoreAddress ? UI : nullptr);
Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
InstructionCost ScalarCost =
ScalarMemOpCost +
Ctx.TTI.getAddressComputationCost(
PtrTy, UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), PtrSCEV,
Ctx.CostKind);
if (isSingleScalar())
return ScalarCost;
SmallVector<const VPValue *> OpsToScalarize;
Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
    // Set ResultTy and OpsToScalarize if scalarization is needed. Currently we
    // don't assign scalarization overhead in general if the target prefers
    // vectorized addressing or the loaded value is used as part of the address
    // of another load or store.
if (!UsedByLoadStoreAddress) {
bool EfficientVectorLoadStore =
Ctx.TTI.supportsEfficientVectorElementLoadStore();
if (!(IsLoad && !PreferVectorizedAddressing) &&
!(!IsLoad && EfficientVectorLoadStore))
append_range(OpsToScalarize, operands());
if (!EfficientVectorLoadStore)
ResultTy = Ctx.Types.inferScalarType(this);
}
TTI::VectorInstrContext VIC =
IsLoad ? TTI::VectorInstrContext::Load : TTI::VectorInstrContext::Store;
InstructionCost Cost =
(ScalarCost * VF.getFixedValue()) +
Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, VIC, true);
const VPRegionBlock *ParentRegion = getRegion();
if (ParentRegion && ParentRegion->isReplicator()) {
// TODO: Handle loop-invariant pointers in predicated blocks. For now,
// fall back to the legacy cost model.
if (!PtrSCEV || Ctx.PSE.getSE()->isLoopInvariant(PtrSCEV, Ctx.L))
break;
Cost /= Ctx.getPredBlockCostDivisor(UI->getParent());
Cost += Ctx.TTI.getCFInstrCost(Instruction::Br, Ctx.CostKind);
auto *VecI1Ty = VectorType::get(
IntegerType::getInt1Ty(Ctx.L->getHeader()->getContext()), VF);
Cost += Ctx.TTI.getScalarizationOverhead(
VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
/*Insert=*/false, /*Extract=*/true, Ctx.CostKind);
if (Ctx.useEmulatedMaskMemRefHack(UI, VF)) {
// Artificially setting to a high enough value to practically disable
// vectorization with such operations.
return 3000000;
}
}
return Cost;
}
case Instruction::SExt:
case Instruction::ZExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::PtrToAddr:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::AddrSpaceCast: {
return getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1),
Ctx) *
(isSingleScalar() ? 1 : VF.getFixedValue());
}
case Instruction::ExtractValue:
case Instruction::InsertValue:
return Ctx.TTI.getInsertExtractValueCost(getOpcode(), Ctx.CostKind);
}
return Ctx.getLegacyCost(UI, VF);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReplicateRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
if (!getUnderlyingInstr()->getType()->isVoidTy()) {
printAsOperand(O, SlotTracker);
O << " = ";
}
if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
O << "call";
printFlags(O);
O << "@" << CB->getCalledFunction()->getName() << "(";
interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
O, [&O, &SlotTracker](VPValue *Op) {
Op->printAsOperand(O, SlotTracker);
});
O << ")";
} else {
O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
printFlags(O);
printOperands(O, SlotTracker);
}
if (shouldPack())
O << " (S->V)";
}
#endif
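// A sketch of the emitted IR: for the current lane, the block's unreachable
// placeholder terminator is replaced by
//   br i1 %mask.lane, label <null>, label <null>
// with both successors filled in later, once the target blocks exist.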
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
assert(State.Lane && "Branch on Mask works only on single instance.");
VPValue *BlockInMask = getOperand(0);
Value *ConditionBit = State.get(BlockInMask, *State.Lane);
// Replace the temporary unreachable terminator with a new conditional branch,
// whose two destinations will be set later when they are created.
auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
assert(isa<UnreachableInst>(CurrentTerminator) &&
"Expected to replace unreachable terminator with conditional branch.");
auto CondBr =
State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr);
CondBr->setSuccessor(0, nullptr);
CurrentTerminator->eraseFromParent();
}
InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
// The legacy cost model doesn't assign costs to branches for individual
// replicate regions. Match the current behavior in the VPlan cost model for
// now.
return 0;
}
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
assert(State.Lane && "Predicated instruction PHI works per instance.");
Instruction *ScalarPredInst =
cast<Instruction>(State.get(getOperand(0), *State.Lane));
BasicBlock *PredicatedBB = ScalarPredInst->getParent();
BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
assert(PredicatingBB && "Predicated block has no single predecessor.");
assert(isa<VPReplicateRecipe>(getOperand(0)) &&
"operand must be VPReplicateRecipe");
// By current pack/unpack logic we need to generate only a single phi node: if
// a vector value for the predicated instruction exists at this point it means
// the instruction has vector users only, and a phi for the vector value is
// needed. In this case the recipe of the predicated instruction is marked to
// also do that packing, thereby "hoisting" the insert-element sequence.
// Otherwise, a phi node for the scalar value is needed.
if (State.hasVectorValue(getOperand(0))) {
auto *VecI = cast<Instruction>(State.get(getOperand(0)));
assert((isa<InsertElementInst, InsertValueInst>(VecI)) &&
"Packed operands must generate an insertelement or insertvalue");
    // If VecI is a struct, it will be a sequence like:
    //   %1 = insertvalue %unmodified, %x, 0
    //   %2 = insertvalue %1, %y, 1
    //   %VecI = insertvalue %2, %z, 2
    // To get the unmodified vector we need to look through the chain.
if (auto *StructTy = dyn_cast<StructType>(VecI->getType()))
for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
VecI = cast<InsertValueInst>(VecI->getOperand(0));
PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2);
VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector.
VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element.
if (State.hasVectorValue(this))
State.reset(this, VPhi);
else
State.set(this, VPhi);
// NOTE: Currently we need to update the value of the operand, so the next
// predicated iteration inserts its generated value in the correct vector.
State.reset(getOperand(0), VPhi);
} else {
if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
return;
Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
PredicatingBB);
Phi->addIncoming(ScalarPredInst, PredicatedBB);
if (State.hasScalarValue(this, *State.Lane))
State.reset(this, Phi, *State.Lane);
else
State.set(this, Phi, *State.Lane);
// NOTE: Currently we need to update the value of the operand, so the next
// predicated iteration inserts its generated value in the correct vector.
State.reset(getOperand(0), Phi, *State.Lane);
}
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "PHI-PREDICATED-INSTRUCTION ";
printAsOperand(O, SlotTracker);
O << " = ";
printOperands(O, SlotTracker);
}
#endif
InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
->getAddressSpace();
unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
? Instruction::Load
: Instruction::Store;
if (!Consecutive) {
// TODO: Using the original IR may not be accurate.
// Currently, ARM will use the underlying IR to calculate gather/scatter
// instruction cost.
    assert(!Reverse &&
           "Inconsecutive memory accesses should not be reversed");
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
Type *PtrTy = Ptr->getType();
    // If the address value is uniform across all lanes, the address can be
    // computed with a scalar type and broadcast.
if (!vputils::isSingleScalar(getAddr()))
PtrTy = toVectorTy(PtrTy, VF);
unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_gather
: isa<VPWidenStoreRecipe>(this) ? Intrinsic::masked_scatter
: isa<VPWidenLoadEVLRecipe>(this) ? Intrinsic::vp_gather
: Intrinsic::vp_scatter;
return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
Ctx.CostKind) +
Ctx.TTI.getMemIntrinsicInstrCost(
MemIntrinsicCostAttributes(IID, Ty, Ptr, IsMasked, Alignment,
&Ingredient),
Ctx.CostKind);
}
InstructionCost Cost = 0;
if (IsMasked) {
unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_load
: Intrinsic::masked_store;
Cost += Ctx.TTI.getMemIntrinsicInstrCost(
MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), Ctx.CostKind);
} else {
TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) ? getOperand(0)
: getOperand(1));
Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
OpInfo, &Ingredient);
}
return Cost;
}
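// A sketch of the IR emitted below (assuming a consecutive load of i32 at
// VF=4 with alignment 4):
//   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(
//       ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> poison)
// An unmasked load becomes a plain "load <4 x i32>, ptr %addr" instead.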
void VPWidenLoadRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
bool CreateGather = !isConsecutive();
auto &Builder = State.Builder;
Value *Mask = nullptr;
if (auto *VPMask = getMask()) {
    // Mask reversal is only needed for non-all-one masks: a null mask
    // represents an all-one mask, and its reverse is again all-one (null).
Mask = State.get(VPMask);
if (isReverse())
Mask = Builder.CreateVectorReverse(Mask, "reverse");
}
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
Value *NewLI;
if (CreateGather) {
NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
"wide.masked.gather");
} else if (Mask) {
NewLI =
Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
PoisonValue::get(DataTy), "wide.masked.load");
} else {
NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
}
applyMetadata(*cast<Instruction>(NewLI));
State.set(this, NewLI);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN ";
printAsOperand(O, SlotTracker);
O << " = load ";
printOperands(O, SlotTracker);
}
#endif
/// Use an all-true mask for the reverse rather than the actual mask, as this
/// avoids a dependence without affecting the result.
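/// E.g. (a sketch for <vscale x 4 x i32> %v; names hypothetical):
///   %rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(
///       <vscale x 4 x i32> %v, <vscale x 4 x i1> splat (i1 true), i32 %evl)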
static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
Value *EVL, const Twine &Name) {
VectorType *ValTy = cast<VectorType>(Operand->getType());
Value *AllTrueMask =
Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
{Operand, AllTrueMask, EVL}, nullptr, Name);
}
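// A sketch of the IR emitted below for a consecutive, unmasked EVL load of
// i32 at VF=vscale x 4 (alignment assumed to be 4):
//   %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
//       ptr align 4 %addr, <vscale x 4 x i1> splat (i1 true), i32 %evl)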
void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
bool CreateGather = !isConsecutive();
auto &Builder = State.Builder;
CallInst *NewLI;
Value *EVL = State.get(getEVL(), VPLane(0));
Value *Addr = State.get(getAddr(), !CreateGather);
Value *Mask = nullptr;
if (VPValue *VPMask = getMask()) {
Mask = State.get(VPMask);
if (isReverse())
Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
} else {
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
}
if (CreateGather) {
NewLI =
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
nullptr, "wide.masked.gather");
} else {
NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
{Addr, Mask, EVL}, nullptr, "vp.op.load");
}
NewLI->addParamAttr(
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
applyMetadata(*NewLI);
Instruction *Res = NewLI;
State.set(this, Res);
}
InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
if (!Consecutive || IsMasked)
return VPWidenMemoryRecipe::computeCost(VF, Ctx);
  // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes use EVL to replace the tail mask, whereas the
  // legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once we
  // no longer need to compare against the legacy cost model.
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
->getAddressSpace();
return Ctx.TTI.getMemIntrinsicInstrCost(
MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN ";
printAsOperand(O, SlotTracker);
O << " = vp.load ";
printOperands(O, SlotTracker);
}
#endif
void VPWidenStoreRecipe::execute(VPTransformState &State) {
VPValue *StoredVPValue = getStoredValue();
bool CreateScatter = !isConsecutive();
auto &Builder = State.Builder;
Value *Mask = nullptr;
if (auto *VPMask = getMask()) {
    // Mask reversal is only needed for non-all-one masks: a null mask
    // represents an all-one mask, and its reverse is again all-one (null).
Mask = State.get(VPMask);
if (isReverse())
Mask = Builder.CreateVectorReverse(Mask, "reverse");
}
Value *StoredVal = State.get(StoredVPValue);
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
Instruction *NewSI = nullptr;
if (CreateScatter)
NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
else if (Mask)
NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
else
NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
applyMetadata(*NewSI);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStoreRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN store ";
printOperands(O, SlotTracker);
}
#endif
void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
VPValue *StoredValue = getStoredValue();
bool CreateScatter = !isConsecutive();
auto &Builder = State.Builder;
CallInst *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue);
Value *EVL = State.get(getEVL(), VPLane(0));
Value *Mask = nullptr;
if (VPValue *VPMask = getMask()) {
Mask = State.get(VPMask);
if (isReverse())
Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
} else {
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
}
Value *Addr = State.get(getAddr(), !CreateScatter);
if (CreateScatter) {
NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
Intrinsic::vp_scatter,
{StoredVal, Addr, Mask, EVL});
} else {
NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
Intrinsic::vp_store,
{StoredVal, Addr, Mask, EVL});
}
NewSI->addParamAttr(
1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
applyMetadata(*NewSI);
}
InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
if (!Consecutive || IsMasked)
return VPWidenMemoryRecipe::computeCost(VF, Ctx);
  // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes use EVL to replace the tail mask, whereas the
  // legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once we
  // no longer need to compare against the legacy cost model.
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
->getAddressSpace();
return Ctx.TTI.getMemIntrinsicInstrCost(
MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStoreEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN vp.store ";
printOperands(O, SlotTracker);
}
#endif
static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
VectorType *DstVTy, const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as DstVTy.
auto VF = DstVTy->getElementCount();
auto *SrcVecTy = cast<VectorType>(V->getType());
assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
Type *SrcElemTy = SrcVecTy->getElementType();
Type *DstElemTy = DstVTy->getElementType();
assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
"Vector elements must have same size");
// Do a direct cast if element types are castable.
if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
return Builder.CreateBitOrPointerCast(V, DstVTy);
}
  // V cannot be cast directly to the desired vector type. This may happen when
  // V is a floating point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
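  // E.g. (a sketch, assuming 64-bit pointers): casting <4 x double> to
  // <4 x ptr> becomes
  //   %cast = bitcast <4 x double> %v to <4 x i64>
  //   %res = inttoptr <4 x i64> %cast to <4 x ptr>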
assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
"Only one type should be a pointer type");
assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
"Only one type should be a floating point type");
Type *IntTy =
IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
auto *VecIntTy = VectorType::get(IntTy, VF);
Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}
/// Return a vector containing interleaved elements from multiple
/// smaller input vectors.
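/// E.g. (a fixed-width sketch): interleaving %a = <A0, A1> and %b = <B0, B1>
/// (factor 2) yields <A0, B0, A1, B1>, via
///   %wide = shufflevector %a, %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
///   %res = shufflevector %wide, poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>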
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
const Twine &Name) {
unsigned Factor = Vals.size();
assert(Factor > 1 && "Tried to interleave invalid number of vectors");
VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
#ifndef NDEBUG
for (Value *Val : Vals)
assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
#endif
// Scalable vectors cannot use arbitrary shufflevectors (only splats), so
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
return Builder.CreateVectorInterleave(Vals, Name);
}
// Fixed length. Start by concatenating all vectors into a wide vector.
Value *WideVec = concatenateVectors(Builder, Vals);
// Interleave the elements into the wide vector.
const unsigned NumElts = VecTy->getElementCount().getFixedValue();
return Builder.CreateShuffleVector(
WideVec, createInterleaveMask(NumElts, Factor), Name);
}
// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
// for (i = 0; i < N; i+=3) {
// R = Pic[i]; // Member of index 0
// G = Pic[i+1]; // Member of index 1
// B = Pic[i+2]; // Member of index 2
// ... // do something to R, G, B
// }
// To:
// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
//
// Or translate following interleaved store group (factor = 3):
// for (i = 0; i < N; i+=3) {
// ... do something to R, G, B
// Pic[i] = R; // Member of index 0
// Pic[i+1] = G; // Member of index 1
// Pic[i+2] = B; // Member of index 2
// }
// To:
// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "Interleave group being replicated.");
assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
"Masking gaps for scalable vectors is not yet supported.");
const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
Instruction *Instr = Group->getInsertPos();
// Prepare for the vector type of the interleaved load/store.
Type *ScalarTy = getLoadStoreType(Instr);
unsigned InterleaveFactor = Group->getFactor();
auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
VPValue *BlockInMask = getMask();
VPValue *Addr = getAddr();
Value *ResAddr = State.get(Addr, VPLane(0));
auto CreateGroupMask = [&BlockInMask, &State,
&InterleaveFactor](Value *MaskForGaps) -> Value * {
if (State.VF.isScalable()) {
assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
assert(InterleaveFactor <= 8 &&
"Unsupported deinterleave factor for scalable vectors");
auto *ResBlockInMask = State.get(BlockInMask);
SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
return interleaveVectors(State.Builder, Ops, "interleaved.mask");
}
if (!BlockInMask)
return MaskForGaps;
Value *ResBlockInMask = State.get(BlockInMask);
Value *ShuffledMask = State.Builder.CreateShuffleVector(
ResBlockInMask,
createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
"interleaved.mask");
return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
ShuffledMask, MaskForGaps)
: ShuffledMask;
};
const DataLayout &DL = Instr->getDataLayout();
// Vectorize the interleaved load group.
if (isa<LoadInst>(Instr)) {
Value *MaskForGaps = nullptr;
if (needsMaskForGaps()) {
MaskForGaps =
createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
assert(MaskForGaps && "Mask for Gaps is required but it is null");
}
Instruction *NewLoad;
if (BlockInMask || MaskForGaps) {
Value *GroupMask = CreateGroupMask(MaskForGaps);
Value *PoisonVec = PoisonValue::get(VecTy);
NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
Group->getAlign(), GroupMask,
PoisonVec, "wide.masked.vec");
} else
NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
Group->getAlign(), "wide.vec");
applyMetadata(*NewLoad);
// TODO: Also manage existing metadata using VPIRMetadata.
Group->addMetadata(NewLoad);
ArrayRef<VPRecipeValue *> VPDefs = definedValues();
if (VecTy->isScalableTy()) {
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
// so must use intrinsics to deinterleave.
assert(InterleaveFactor <= 8 &&
"Unsupported deinterleave factor for scalable vectors");
NewLoad = State.Builder.CreateIntrinsic(
Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
NewLoad->getType(), NewLoad,
/*FMFSource=*/nullptr, "strided.vec");
}
auto CreateStridedVector = [&InterleaveFactor, &State,
&NewLoad](unsigned Index) -> Value * {
assert(Index < InterleaveFactor && "Illegal group index");
if (State.VF.isScalable())
return State.Builder.CreateExtractValue(NewLoad, Index);
// For fixed length VF, use shuffle to extract the sub-vectors from the
// wide load.
auto StrideMask =
createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
"strided.vec");
};
for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
Instruction *Member = Group->getMember(I);
// Skip the gaps in the group.
if (!Member)
continue;
Value *StridedVec = CreateStridedVector(I);
// If this member has different type, cast the result type.
if (Member->getType() != ScalarTy) {
VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
StridedVec =
createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
}
if (Group->isReverse())
StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
State.set(VPDefs[J], StridedVec);
++J;
}
return;
}
  // The subvector type for the current instruction.
auto *SubVT = VectorType::get(ScalarTy, State.VF);
// Vectorize the interleaved store group.
Value *MaskForGaps =
createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
"Mismatch between NeedsMaskForGaps and MaskForGaps");
ArrayRef<VPValue *> StoredValues = getStoredValues();
// Collect the stored vector from each member.
SmallVector<Value *, 4> StoredVecs;
unsigned StoredIdx = 0;
for (unsigned i = 0; i < InterleaveFactor; i++) {
    assert((Group->getMember(i) || MaskForGaps) &&
           "Failed to get a member from an interleaved store group");
Instruction *Member = Group->getMember(i);
// Skip the gaps in the group.
if (!Member) {
      StoredVecs.push_back(PoisonValue::get(SubVT));
continue;
}
Value *StoredVec = State.get(StoredValues[StoredIdx]);
++StoredIdx;
if (Group->isReverse())
StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
// If this member has different type, cast it to a unified type.
if (StoredVec->getType() != SubVT)
StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
StoredVecs.push_back(StoredVec);
}
// Interleave all the smaller vectors into one wider vector.
Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
Instruction *NewStoreInstr;
if (BlockInMask || MaskForGaps) {
Value *GroupMask = CreateGroupMask(MaskForGaps);
NewStoreInstr = State.Builder.CreateMaskedStore(
IVec, ResAddr, Group->getAlign(), GroupMask);
} else
NewStoreInstr =
State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
applyMetadata(*NewStoreInstr);
// TODO: Also manage existing metadata using VPIRMetadata.
Group->addMetadata(NewStoreInstr);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
IG->getInsertPos()->printAsOperand(O, false);
O << ", ";
getAddr()->printAsOperand(O, SlotTracker);
VPValue *Mask = getMask();
if (Mask) {
O << ", ";
Mask->printAsOperand(O, SlotTracker);
}
unsigned OpIdx = 0;
for (unsigned i = 0; i < IG->getFactor(); ++i) {
if (!IG->getMember(i))
continue;
if (getNumStoreOperands() > 0) {
O << "\n" << Indent << " store ";
getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
O << " to index " << i;
} else {
O << "\n" << Indent << " ";
getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
O << " = load from index " << i;
}
++OpIdx;
}
}
#endif
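// A sketch of the IR emitted below for a factor-2 scalable load group at
// VF=vscale x 4 (i32 elements, alignment assumed to be 4):
//   %interleave.evl = mul nuw nsw i32 %evl, 2
//   %wide.vp.load = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(
//       ptr align 4 %addr, <vscale x 8 x i1> %interleaved.mask,
//       i32 %interleave.evl)
//   %strided.vec = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
//       @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.vp.load)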
void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "Interleave group being replicated.");
assert(State.VF.isScalable() &&
"Only support scalable VF for EVL tail-folding.");
assert(!needsMaskForGaps() &&
"Masking gaps for scalable vectors is not yet supported.");
const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
Instruction *Instr = Group->getInsertPos();
// Prepare for the vector type of the interleaved load/store.
Type *ScalarTy = getLoadStoreType(Instr);
unsigned InterleaveFactor = Group->getFactor();
assert(InterleaveFactor <= 8 &&
"Unsupported deinterleave/interleave factor for scalable vectors");
ElementCount WideVF = State.VF * InterleaveFactor;
auto *VecTy = VectorType::get(ScalarTy, WideVF);
VPValue *Addr = getAddr();
Value *ResAddr = State.get(Addr, VPLane(0));
Value *EVL = State.get(getEVL(), VPLane(0));
Value *InterleaveEVL = State.Builder.CreateMul(
EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
/* NUW= */ true, /* NSW= */ true);
LLVMContext &Ctx = State.Builder.getContext();
Value *GroupMask = nullptr;
if (VPValue *BlockInMask = getMask()) {
SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
} else {
GroupMask =
State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
}
// Vectorize the interleaved load group.
if (isa<LoadInst>(Instr)) {
CallInst *NewLoad = State.Builder.CreateIntrinsic(
VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
"wide.vp.load");
NewLoad->addParamAttr(0,
Attribute::getWithAlignment(Ctx, Group->getAlign()));
applyMetadata(*NewLoad);
// TODO: Also manage existing metadata using VPIRMetadata.
Group->addMetadata(NewLoad);
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
// so must use intrinsics to deinterleave.
NewLoad = State.Builder.CreateIntrinsic(
Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
NewLoad->getType(), NewLoad,
/*FMFSource=*/nullptr, "strided.vec");
const DataLayout &DL = Instr->getDataLayout();
for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
Instruction *Member = Group->getMember(I);
// Skip the gaps in the group.
if (!Member)
continue;
Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
// If this member has different type, cast the result type.
if (Member->getType() != ScalarTy) {
VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
StridedVec =
createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
}
State.set(getVPValue(J), StridedVec);
++J;
}
return;
} // End for interleaved load.
  // The subvector type for the current instruction.
auto *SubVT = VectorType::get(ScalarTy, State.VF);
// Vectorize the interleaved store group.
ArrayRef<VPValue *> StoredValues = getStoredValues();
// Collect the stored vector from each member.
SmallVector<Value *, 4> StoredVecs;
const DataLayout &DL = Instr->getDataLayout();
for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
Instruction *Member = Group->getMember(I);
// Skip the gaps in the group.
if (!Member) {
StoredVecs.push_back(PoisonValue::get(SubVT));
continue;
}
Value *StoredVec = State.get(StoredValues[StoredIdx]);
// If this member has different type, cast it to a unified type.
if (StoredVec->getType() != SubVT)
StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
StoredVecs.push_back(StoredVec);
++StoredIdx;
}
// Interleave all the smaller vectors into one wider vector.
Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
CallInst *NewStore =
State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
{IVec, ResAddr, GroupMask, InterleaveEVL});
NewStore->addParamAttr(1,
Attribute::getWithAlignment(Ctx, Group->getAlign()));
applyMetadata(*NewStore);
// TODO: Also manage existing metadata using VPIRMetadata.
Group->addMetadata(NewStore);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
IG->getInsertPos()->printAsOperand(O, false);
O << ", ";
getAddr()->printAsOperand(O, SlotTracker);
O << ", ";
getEVL()->printAsOperand(O, SlotTracker);
if (VPValue *Mask = getMask()) {
O << ", ";
Mask->printAsOperand(O, SlotTracker);
}
unsigned OpIdx = 0;
for (unsigned i = 0; i < IG->getFactor(); ++i) {
if (!IG->getMember(i))
continue;
if (getNumStoreOperands() > 0) {
O << "\n" << Indent << " vp.store ";
getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker);
O << " to index " << i;
} else {
O << "\n" << Indent << " ";
getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
O << " = vp.load from index " << i;
}
++OpIdx;
}
}
#endif
InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
Instruction *InsertPos = getInsertPos();
  // Find the VPValue index of the insert position within the interleave group,
  // skipping over gaps.
  unsigned InsertPosIdx = 0;
  for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
if (auto *Member = IG->getMember(Idx)) {
if (Member == InsertPos)
break;
InsertPosIdx++;
}
Type *ValTy = Ctx.Types.inferScalarType(
getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
: getStoredValues()[InsertPosIdx]);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
->getAddressSpace();
unsigned InterleaveFactor = IG->getFactor();
auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
// Holds the indices of existing members in the interleaved group.
SmallVector<unsigned, 4> Indices;
for (unsigned IF = 0; IF < InterleaveFactor; IF++)
if (IG->getMember(IF))
Indices.push_back(IF);
// Calculate the cost of the whole interleaved group.
InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
if (!IG->isReverse())
return Cost;
return Cost + IG->getNumMembers() *
Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
VectorTy, VectorTy, {}, Ctx.CostKind,
0);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "EMIT ";
printAsOperand(O, SlotTracker);
O << " = CANONICAL-INDUCTION ";
printOperands(O, SlotTracker);
}
#endif
bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
return vputils::onlyScalarValuesUsed(this) &&
(!IsScalable || vputils::onlyFirstLaneUsed(this));
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::printRecipe(
raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
assert((getNumOperands() == 3 || getNumOperands() == 5) &&
"unexpected number of operands");
O << Indent << "EMIT ";
printAsOperand(O, SlotTracker);
O << " = WIDEN-POINTER-INDUCTION ";
getStartValue()->printAsOperand(O, SlotTracker);
O << ", ";
getStepValue()->printAsOperand(O, SlotTracker);
O << ", ";
getOperand(2)->printAsOperand(O, SlotTracker);
if (getNumOperands() == 5) {
O << ", ";
getOperand(3)->printAsOperand(O, SlotTracker);
O << ", ";
getOperand(4)->printAsOperand(O, SlotTracker);
}
}
void VPExpandSCEVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "EMIT ";
printAsOperand(O, SlotTracker);
O << " = EXPAND SCEV " << *Expr;
}
#endif
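// A sketch of the IR emitted below at a fixed VF=4 (i64 IV, unroll part 0):
//   %broadcast = <4 x i64> splat of %canonical.iv
//   %vec.iv = add <4 x i64> %broadcast, <i64 0, i64 1, i64 2, i64 3>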
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
Type *STy = CanonicalIV->getType();
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
ElementCount VF = State.VF;
Value *VStart = VF.isScalar()
? CanonicalIV
: Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
if (VF.isVector()) {
VStep = Builder.CreateVectorSplat(VF, VStep);
VStep =
Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
}
Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
State.set(this, CanonicalVectorIV);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "EMIT ";
printAsOperand(O, SlotTracker);
O << " = WIDEN-CANONICAL-INDUCTION ";
printOperands(O, SlotTracker);
}
#endif
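// A sketch of the preheader IR emitted below at a fixed VF=4 (i32 values):
// the scalar start value is placed in the last lane, so the first loop
// iteration can splice it in front of the recurrence:
//   %vector.recur.init = insertelement <4 x i32> poison, i32 %init, i32 3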
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
// Create a vector from the initial value.
auto *VectorInit = getStartValue()->getLiveInIRValue();
Type *VecTy = State.VF.isScalar()
? VectorInit->getType()
: VectorType::get(VectorInit->getType(), State.VF);
BasicBlock *VectorPH =
State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
if (State.VF.isVector()) {
auto *IdxTy = Builder.getInt32Ty();
auto *One = ConstantInt::get(IdxTy, 1);
IRBuilder<>::InsertPointGuard Guard(Builder);
Builder.SetInsertPoint(VectorPH->getTerminator());
auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
VectorInit = Builder.CreateInsertElement(
PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
}
// Create a phi node for the new recurrence.
PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
Phi->addIncoming(VectorInit, VectorPH);
State.set(this, Phi);
}
InstructionCost
VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
if (VF.isScalar())
return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
return 0;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::printRecipe(
raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
printAsOperand(O, SlotTracker);
O << " = phi ";
printOperands(O, SlotTracker);
}
#endif
void VPReductionPHIRecipe::execute(VPTransformState &State) {
  // Reductions do not have to start at zero; they can start with any
  // loop-invariant value.
VPValue *StartVPV = getStartValue();
// In order to support recurrences we need to be able to vectorize Phi nodes.
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
// stage #1: We create a new vector PHI node with no incoming edges. We'll use
// this value when we vectorize all of the instructions that use the PHI.
BasicBlock *VectorPH =
State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
bool ScalarPHI = State.VF.isScalar() || isInLoop();
Value *StartV = State.get(StartVPV, ScalarPHI);
Type *VecTy = StartV->getType();
BasicBlock *HeaderBB = State.CFG.PrevBB;
assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
"recipe must be in the vector loop header");
auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
Phi->insertBefore(HeaderBB->getFirstInsertionPt());
State.set(this, Phi, isInLoop());
Phi->addIncoming(StartV, VectorPH);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-REDUCTION-PHI ";
printAsOperand(O, SlotTracker);
O << " = phi ";
printOperands(O, SlotTracker);
if (getVFScaleFactor() > 1)
O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
}
#endif
void VPWidenPHIRecipe::execute(VPTransformState &State) {
Value *Op0 = State.get(getOperand(0));
Type *VecTy = Op0->getType();
Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
State.set(this, VecPhi);
}
InstructionCost VPWidenPHIRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-PHI ";
printAsOperand(O, SlotTracker);
O << " = phi ";
printPhiOperands(O, SlotTracker);
}
#endif
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
BasicBlock *VectorPH =
State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
Value *StartMask = State.get(getOperand(0));
PHINode *Phi =
State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
Phi->addIncoming(StartMask, VectorPH);
State.set(this, Phi);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "ACTIVE-LANE-MASK-PHI ";
printAsOperand(O, SlotTracker);
O << " = phi ";
printOperands(O, SlotTracker);
}
#endif
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
printAsOperand(O, SlotTracker);
O << " = phi ";
printOperands(O, SlotTracker);
}
#endif