Previously, VPlanPredicator only reused the mask of the loop header when a block post-dominates the header. This patch generalizes the optimization to reuse the mask of immediate dominator when a block post-dominates its immediate dominator. This reduces more redundant mask computations, simplifies the generated code, and improves EVL tail folding. Based on #173265 Fix #173260
309 lines
12 KiB
C++
309 lines
12 KiB
C++
//===-- VPlanPredicator.cpp - VPlan predicator ----------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
///
|
|
/// \file
|
|
/// This file implements predication for VPlans.
|
|
///
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "VPRecipeBuilder.h"
|
|
#include "VPlan.h"
|
|
#include "VPlanCFG.h"
|
|
#include "VPlanDominatorTree.h"
|
|
#include "VPlanPatternMatch.h"
|
|
#include "VPlanTransforms.h"
|
|
#include "VPlanUtils.h"
|
|
#include "llvm/ADT/PostOrderIterator.h"
|
|
|
|
using namespace llvm;
|
|
using namespace VPlanPatternMatch;
|
|
|
|
namespace {
|
|
class VPPredicator {
|
|
/// Builder to construct recipes to compute masks.
|
|
VPBuilder Builder;
|
|
|
|
/// Dominator tree for the VPlan.
|
|
VPDominatorTree VPDT;
|
|
|
|
/// Post-dominator tree for the VPlan.
|
|
VPPostDominatorTree VPPDT;
|
|
|
|
/// When we if-convert we need to create edge masks. We have to cache values
|
|
/// so that we don't end up with exponential recursion/IR.
|
|
using EdgeMaskCacheTy =
|
|
DenseMap<std::pair<const VPBasicBlock *, const VPBasicBlock *>,
|
|
VPValue *>;
|
|
using BlockMaskCacheTy = DenseMap<const VPBasicBlock *, VPValue *>;
|
|
EdgeMaskCacheTy EdgeMaskCache;
|
|
|
|
BlockMaskCacheTy BlockMaskCache;
|
|
|
|
/// Create an edge mask for every destination of cases and/or default.
|
|
void createSwitchEdgeMasks(const VPInstruction *SI);
|
|
|
|
/// Computes and return the predicate of the edge between \p Src and \p Dst,
|
|
/// possibly inserting new recipes at \p Dst (using Builder's insertion point)
|
|
VPValue *createEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst);
|
|
|
|
/// Record \p Mask as the *entry* mask of \p VPBB, which is expected to not
|
|
/// already have a mask.
|
|
void setBlockInMask(const VPBasicBlock *VPBB, VPValue *Mask) {
|
|
// TODO: Include the masks as operands in the predicated VPlan directly to
|
|
// avoid keeping the map of masks beyond the predication transform.
|
|
assert(!getBlockInMask(VPBB) && "Mask already set");
|
|
BlockMaskCache[VPBB] = Mask;
|
|
}
|
|
|
|
/// Record \p Mask as the mask of the edge from \p Src to \p Dst. The edge is
|
|
/// expected to not have a mask already.
|
|
VPValue *setEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst,
|
|
VPValue *Mask) {
|
|
assert(Src != Dst && "Src and Dst must be different");
|
|
assert(!getEdgeMask(Src, Dst) && "Mask already set");
|
|
return EdgeMaskCache[{Src, Dst}] = Mask;
|
|
}
|
|
|
|
public:
|
|
VPPredicator(VPlan &Plan) : VPDT(Plan), VPPDT(Plan) {}
|
|
|
|
/// Returns the *entry* mask for \p VPBB.
|
|
VPValue *getBlockInMask(const VPBasicBlock *VPBB) const {
|
|
return BlockMaskCache.lookup(VPBB);
|
|
}
|
|
|
|
/// Returns the precomputed predicate of the edge from \p Src to \p Dst.
|
|
VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const {
|
|
return EdgeMaskCache.lookup({Src, Dst});
|
|
}
|
|
|
|
/// Compute the predicate of \p VPBB.
|
|
void createBlockInMask(VPBasicBlock *VPBB);
|
|
|
|
/// Convert phi recipes in \p VPBB to VPBlendRecipes.
|
|
void convertPhisToBlends(VPBasicBlock *VPBB);
|
|
};
|
|
} // namespace
|
|
|
|
VPValue *VPPredicator::createEdgeMask(const VPBasicBlock *Src,
|
|
const VPBasicBlock *Dst) {
|
|
assert(is_contained(Dst->getPredecessors(), Src) && "Invalid edge");
|
|
|
|
// Look for cached value.
|
|
VPValue *EdgeMask = getEdgeMask(Src, Dst);
|
|
if (EdgeMask)
|
|
return EdgeMask;
|
|
|
|
VPValue *SrcMask = getBlockInMask(Src);
|
|
|
|
// If there's a single successor, there's no terminator recipe.
|
|
if (Src->getNumSuccessors() == 1)
|
|
return setEdgeMask(Src, Dst, SrcMask);
|
|
|
|
auto *Term = cast<VPInstruction>(Src->getTerminator());
|
|
if (Term->getOpcode() == Instruction::Switch) {
|
|
createSwitchEdgeMasks(Term);
|
|
return getEdgeMask(Src, Dst);
|
|
}
|
|
|
|
assert(Term->getOpcode() == VPInstruction::BranchOnCond &&
|
|
"Unsupported terminator");
|
|
if (Src->getSuccessors()[0] == Src->getSuccessors()[1])
|
|
return setEdgeMask(Src, Dst, SrcMask);
|
|
|
|
EdgeMask = Term->getOperand(0);
|
|
assert(EdgeMask && "No Edge Mask found for condition");
|
|
|
|
if (Src->getSuccessors()[0] != Dst)
|
|
EdgeMask = Builder.createNot(EdgeMask, Term->getDebugLoc());
|
|
|
|
if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
|
|
// The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
|
|
// is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
|
|
// instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
|
|
EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, Term->getDebugLoc());
|
|
}
|
|
|
|
return setEdgeMask(Src, Dst, EdgeMask);
|
|
}
|
|
|
|
void VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
|
|
// Start inserting after the block's phis, which be replaced by blends later.
|
|
Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
|
|
|
|
// Reuse the mask of the immediate dominator if the VPBB post-dominates the
|
|
// immediate dominator.
|
|
auto *IDom = VPDT.getNode(VPBB)->getIDom();
|
|
assert(IDom && "Block in loop must have immediate dominator");
|
|
auto *IDomBB = cast<VPBasicBlock>(IDom->getBlock());
|
|
if (VPPDT.properlyDominates(VPBB, IDomBB)) {
|
|
setBlockInMask(VPBB, getBlockInMask(IDomBB));
|
|
return;
|
|
}
|
|
// All-one mask is modelled as no-mask following the convention for masked
|
|
// load/store/gather/scatter. Initialize BlockMask to no-mask.
|
|
VPValue *BlockMask = nullptr;
|
|
// This is the block mask. We OR all unique incoming edges.
|
|
for (auto *Predecessor : SetVector<VPBlockBase *>(
|
|
VPBB->getPredecessors().begin(), VPBB->getPredecessors().end())) {
|
|
VPValue *EdgeMask = createEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
|
|
if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is
|
|
// too.
|
|
setBlockInMask(VPBB, EdgeMask);
|
|
return;
|
|
}
|
|
|
|
if (!BlockMask) { // BlockMask has its initial nullptr value.
|
|
BlockMask = EdgeMask;
|
|
continue;
|
|
}
|
|
|
|
BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
|
|
}
|
|
|
|
setBlockInMask(VPBB, BlockMask);
|
|
}
|
|
|
|
void VPPredicator::createSwitchEdgeMasks(const VPInstruction *SI) {
|
|
const VPBasicBlock *Src = SI->getParent();
|
|
|
|
// Create masks where SI is a switch. We create masks for all edges from SI's
|
|
// parent block at the same time. This is more efficient, as we can create and
|
|
// collect compares for all cases once.
|
|
VPValue *Cond = SI->getOperand(0);
|
|
VPBasicBlock *DefaultDst = cast<VPBasicBlock>(Src->getSuccessors()[0]);
|
|
MapVector<VPBasicBlock *, SmallVector<VPValue *>> Dst2Compares;
|
|
for (const auto &[Idx, Succ] : enumerate(drop_begin(Src->getSuccessors()))) {
|
|
VPBasicBlock *Dst = cast<VPBasicBlock>(Succ);
|
|
assert(!getEdgeMask(Src, Dst) && "Edge masks already created");
|
|
// Cases whose destination is the same as default are redundant and can
|
|
// be ignored - they will get there anyhow.
|
|
if (Dst == DefaultDst)
|
|
continue;
|
|
auto &Compares = Dst2Compares[Dst];
|
|
VPValue *V = SI->getOperand(Idx + 1);
|
|
Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
|
|
}
|
|
|
|
// We need to handle 2 separate cases below for all entries in Dst2Compares,
|
|
// which excludes destinations matching the default destination.
|
|
VPValue *SrcMask = getBlockInMask(Src);
|
|
VPValue *DefaultMask = nullptr;
|
|
for (const auto &[Dst, Conds] : Dst2Compares) {
|
|
// 1. Dst is not the default destination. Dst is reached if any of the
|
|
// cases with destination == Dst are taken. Join the conditions for each
|
|
// case whose destination == Dst using an OR.
|
|
VPValue *Mask = Conds[0];
|
|
for (VPValue *V : drop_begin(Conds))
|
|
Mask = Builder.createOr(Mask, V);
|
|
if (SrcMask)
|
|
Mask = Builder.createLogicalAnd(SrcMask, Mask);
|
|
setEdgeMask(Src, Dst, Mask);
|
|
|
|
// 2. Create the mask for the default destination, which is reached if
|
|
// none of the cases with destination != default destination are taken.
|
|
// Join the conditions for each case where the destination is != Dst using
|
|
// an OR and negate it.
|
|
DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
|
|
}
|
|
|
|
if (DefaultMask) {
|
|
DefaultMask = Builder.createNot(DefaultMask);
|
|
if (SrcMask)
|
|
DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
|
|
} else {
|
|
// There are no destinations other than the default destination, so this is
|
|
// an unconditional branch.
|
|
DefaultMask = SrcMask;
|
|
}
|
|
setEdgeMask(Src, DefaultDst, DefaultMask);
|
|
}
|
|
|
|
void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
|
|
SmallVector<VPPhi *> Phis;
|
|
for (VPRecipeBase &R : VPBB->phis())
|
|
Phis.push_back(cast<VPPhi>(&R));
|
|
for (VPPhi *PhiR : Phis) {
|
|
// The non-header Phi is converted into a Blend recipe below,
|
|
// so we don't have to worry about the insertion order and we can just use
|
|
// the builder. At this point we generate the predication tree. There may
|
|
// be duplications since this is a simple recursive scan, but future
|
|
// optimizations will clean it up.
|
|
|
|
auto NotPoison = make_filter_range(PhiR->incoming_values(), [](VPValue *V) {
|
|
return !match(V, m_Poison());
|
|
});
|
|
if (all_equal(NotPoison)) {
|
|
PhiR->replaceAllUsesWith(NotPoison.empty() ? PhiR->getIncomingValue(0)
|
|
: *NotPoison.begin());
|
|
PhiR->eraseFromParent();
|
|
continue;
|
|
}
|
|
|
|
SmallVector<VPValue *, 2> OperandsWithMask;
|
|
for (const auto &[InVPV, InVPBB] : PhiR->incoming_values_and_blocks()) {
|
|
OperandsWithMask.push_back(InVPV);
|
|
OperandsWithMask.push_back(createEdgeMask(InVPBB, VPBB));
|
|
}
|
|
PHINode *IRPhi = cast_or_null<PHINode>(PhiR->getUnderlyingValue());
|
|
auto *Blend =
|
|
new VPBlendRecipe(IRPhi, OperandsWithMask, *PhiR, PhiR->getDebugLoc());
|
|
Builder.insert(Blend);
|
|
PhiR->replaceAllUsesWith(Blend);
|
|
PhiR->eraseFromParent();
|
|
}
|
|
}
|
|
|
|
void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan) {
|
|
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
|
|
// Scan the body of the loop in a topological order to visit each basic block
|
|
// after having visited its predecessor basic blocks.
|
|
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
|
|
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
|
|
Header);
|
|
VPPredicator Predicator(Plan);
|
|
for (VPBlockBase *VPB : RPOT) {
|
|
// Non-outer regions with VPBBs only are supported at the moment.
|
|
auto *VPBB = cast<VPBasicBlock>(VPB);
|
|
// Introduce the mask for VPBB, which may introduce needed edge masks, and
|
|
// convert all phi recipes of VPBB to blend recipes unless VPBB is the
|
|
// header.
|
|
if (VPBB != Header) {
|
|
Predicator.createBlockInMask(VPBB);
|
|
Predicator.convertPhisToBlends(VPBB);
|
|
}
|
|
|
|
VPValue *BlockMask = Predicator.getBlockInMask(VPBB);
|
|
if (!BlockMask)
|
|
continue;
|
|
|
|
// Mask all VPInstructions in the block.
|
|
for (VPRecipeBase &R : *VPBB) {
|
|
if (auto *VPI = dyn_cast<VPInstruction>(&R))
|
|
VPI->addMask(BlockMask);
|
|
}
|
|
}
|
|
|
|
// Linearize the blocks of the loop into one serial chain.
|
|
VPBlockBase *PrevVPBB = nullptr;
|
|
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
|
|
auto Successors = to_vector(VPBB->getSuccessors());
|
|
if (Successors.size() > 1)
|
|
VPBB->getTerminator()->eraseFromParent();
|
|
|
|
// Flatten the CFG in the loop. To do so, first disconnect VPBB from its
|
|
// successors. Then connect VPBB to the previously visited VPBB.
|
|
for (auto *Succ : Successors)
|
|
VPBlockUtils::disconnectBlocks(VPBB, Succ);
|
|
if (PrevVPBB)
|
|
VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
|
|
|
|
PrevVPBB = VPBB;
|
|
}
|
|
}
|