Split the input vector with an extend_low and extend_high, then split the results again with extend_low and extend_high, for a total of 6 instructions. This removes 3 shuffles and a couple of extends.
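For the v16i8 → v16i32 case, the sign-extending sequence would look something like this (a sketch; the zero-extending variants are analogous):

    t0 = i16x8.extend_low_i8x16_s  v   ;; low 8 bytes  -> 8 x i16
    t1 = i16x8.extend_high_i8x16_s v   ;; high 8 bytes -> 8 x i16
    r0 = i32x4.extend_low_i16x8_s  t0
    r1 = i32x4.extend_high_i16x8_s t0
    r2 = i32x4.extend_low_i16x8_s  t1
    r3 = i32x4.extend_high_i16x8_s t1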
//===-- WebAssemblyTargetTransformInfo.cpp - WebAssembly-specific TTI -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file defines the WebAssembly-specific TargetTransformInfo
/// implementation.
///
//===----------------------------------------------------------------------===//

#include "WebAssemblyTargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsWebAssembly.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

#include "llvm/CodeGen/CostTable.h"
using namespace llvm;

#define DEBUG_TYPE "wasmtti"

TargetTransformInfo::PopcntSupportKind
WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  return TargetTransformInfo::PSK_FastHardware;
}

unsigned WebAssemblyTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  unsigned Result = BaseT::getNumberOfRegisters(ClassID);

  // For SIMD, use at least 16 registers, as a rough guess.
  bool Vector = (ClassID == 1);
  if (Vector)
    Result = std::max(Result, 16u);

  return Result;
}

TypeSize WebAssemblyTTIImpl::getRegisterBitWidth(
    TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(64);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(getST()->hasSIMD128() ? 128 : 64);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  if (ST->hasSIMD128()) {
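    // For example, a v16i8 multiply can be built from widening multiplies
    // plus a shuffle (a sketch, not necessarily the exact selection):
    //   t0 = i16x8.extmul_low_i8x16_s  a, b
    //   t1 = i16x8.extmul_high_i8x16_s a, b
    //   r  = i8x16.shuffle t0, t1 (keeping the low byte of each i16 lane)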
    static const CostTblEntry ArithCostTbl[]{
        // extmul + (maybe awkward) shuffle
        {ISD::MUL, MVT::v8i8, 4},
        // 2x extmul + (okay) shuffle
        {ISD::MUL, MVT::v16i8, 4},
        // extmul
        {ISD::MUL, MVT::v4i16, 1},
        // extmul
        {ISD::MUL, MVT::v2i32, 1},
    };
    EVT DstVT = TLI->getValueType(DL, Ty);
    if (DstVT.isSimple()) {
      int ISD = TLI->InstructionOpcodeToISD(Opcode);
      if (const auto *Entry =
              CostTableLookup(ArithCostTbl, ISD, DstVT.getSimpleVT()))
        return Entry->Cost;
    }
  }

  InstructionCost Cost =
      BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
          Opcode, Ty, CostKind, Op1Info, Op2Info);

  if (auto *VTy = dyn_cast<VectorType>(Ty)) {
    switch (Opcode) {
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::Shl:
      // SIMD128's shifts currently only accept a scalar shift count. For each
      // element, we'll need to extract, op, insert. The following is a rough
      // approximation.
      if (!Op2Info.isUniform())
        Cost =
            cast<FixedVectorType>(VTy)->getNumElements() *
            (TargetTransformInfo::TCC_Basic +
             getArithmeticInstrCost(Opcode, VTy->getElementType(), CostKind) +
             TargetTransformInfo::TCC_Basic);
      break;
    }
  }
  return Cost;
}

InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
    unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH,
    TTI::TargetCostKind CostKind, const Instruction *I) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  auto SrcTy = TLI->getValueType(DL, Src);
  auto DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple()) {
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
  }

  if (!ST->hasSIMD128()) {
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
  }

  auto DstVT = DstTy.getSimpleVT();
  auto SrcVT = SrcTy.getSimpleVT();

  if (I && I->hasOneUser()) {
    auto *SingleUser = cast<Instruction>(*I->user_begin());
    int UserISD = TLI->InstructionOpcodeToISD(SingleUser->getOpcode());

    // extmul_low support
    if (UserISD == ISD::MUL &&
        (ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND)) {
      // Free low extensions.
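      // (A sketch of why: the extend folds into the widening multiply, e.g.
      // sext + mul of v8i8 inputs can select i16x8.extmul_low_i8x16_s, so the
      // extend itself costs nothing.)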
      if ((SrcVT == MVT::v8i8 && DstVT == MVT::v8i16) ||
          (SrcVT == MVT::v4i16 && DstVT == MVT::v4i32) ||
          (SrcVT == MVT::v2i32 && DstVT == MVT::v2i64)) {
        return 0;
      }
      // Will require an additional extlow operation for the intermediate
      // i16/i32 value.
      if ((SrcVT == MVT::v4i8 && DstVT == MVT::v4i32) ||
          (SrcVT == MVT::v2i16 && DstVT == MVT::v2i64)) {
        return 1;
      }
    }
  }

  static constexpr TypeConversionCostTblEntry ConversionTbl[] = {
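      // Many of the costs below correspond to lowerings built from
      // extend_low/extend_high, narrow, v128.and, and the conversion
      // instructions. For example (a sketch, not necessarily the exact
      // selection), a v16i16 -> v16i8 truncate masks off the high byte of
      // each lane in both input registers and packs the results:
      //   a' = v128.and a, (i16x8 0x00ff splat)
      //   b' = v128.and b, (i16x8 0x00ff splat)
      //   r  = i8x16.narrow_i16x8_u a', b'
      // matching the "narrow, 2x and" cost of 3 below (the mask constant is
      // not counted).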
      // extend_low
      {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1},
      {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1},
      {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1},
      {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1},
      {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1},
      {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1},
      // 2 x extend_low
      {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2},
      {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2},
      {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2},
      {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2},
      // extend_low, extend_high
      {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
      {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
      {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
      {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
      {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
      {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
      // 2x extend_low, extend_high
      {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 4},
      {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 4},
      {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4},
      {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4},
      // 6x extend_low, extend_high
      {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
      {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
      // shuffle
      {ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 2},
      {ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 4},
      {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 2},
      {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 4},
      // narrow, and
      {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2},
      {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2},
      // narrow, 2x and
      {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3},
      // 3x narrow, 4x and
      {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 7},
      {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7},
      // 7x narrow, 8x and
      {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 15},
      // convert_i32x4
      {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
      {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
      {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
      {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
      // extend_low, convert
      {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2},
      {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2},
      {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
      {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
      // extend_low x 2, convert
      {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
      {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
      {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
      {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
      // several shuffles
      {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
      {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
      {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 10},
      {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 10},
      // trunc_sat, const, and, 3x narrow
      {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 6},
      {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 6},
      {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 6},
      {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 6},
      // trunc_sat, const, and, narrow
      {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 4},
      {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 4},
      {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4},
      {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4},
      // 2x trunc_sat, const, 2x and, 3x narrow
      {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 8},
      {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 8},
      // 2x trunc_sat, const, 2x and, narrow
      {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 6},
      {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 6},
  };

  if (const auto *Entry =
          ConvertCostTableLookup(ConversionTbl, ISD, DstVT, SrcVT)) {
    return Entry->Cost;
  }

  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

WebAssemblyTTIImpl::TTI::MemCmpExpansionOptions
WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;

  Options.AllowOverlappingLoads = true;

  if (ST->hasSIMD128())
    Options.LoadSizes.push_back(16);

  Options.LoadSizes.append({8, 4, 2, 1});
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  Options.NumLoadsPerBlock = Options.MaxNumLoads;

  return Options;
}

InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
    unsigned Opcode, Type *Ty, Align Alignment, unsigned AddressSpace,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
    const Instruction *I) const {
  if (!ST->hasSIMD128() || !isa<FixedVectorType>(Ty)) {
    return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
                                  CostKind);
  }

  EVT VT = TLI->getValueType(DL, Ty, true);
  // Type legalization can't handle structs
  if (VT == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
                                  CostKind);

  auto LT = getTypeLegalizationCost(Ty);
  if (!LT.first.isValid())
    return InstructionCost::getInvalid();

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  unsigned width = VT.getSizeInBits();
  if (ISD == ISD::LOAD) {
    // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads
    // can be lowered to load32_zero and load64_zero respectively. Assume SIMD
    // loads are twice as expensive as scalar.
    switch (width) {
    default:
      break;
    case 32:
    case 64:
    case 128:
      return 2;
    }
  } else if (ISD == ISD::STORE) {
    // For stores, we can use store lane operations.
    switch (width) {
    default:
      break;
    case 8:
    case 16:
    case 32:
    case 64:
    case 128:
      return 2;
    }
  }

  return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind);
}

InstructionCost WebAssemblyTTIImpl::getShuffleCost(
    TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
    ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
    VectorType *SubTp, ArrayRef<const Value *> Args,
    const Instruction *CxtI) const {
  // Canonicalize the ShuffleKind in case optimizations didn't.
  // Otherwise, we might end up with the wrong ShuffleKind to match against.
  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);

  // Wasm SIMD128 has native splat instructions for all lane types.
  if (ST->hasSIMD128() && Kind == TTI::SK_Broadcast &&
      isa<FixedVectorType>(SrcTy))
    return 1;

  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
                               Args, CxtI);
}

InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) const {
  assert(Factor >= 2 && "Invalid interleave factor");

  auto *VecTy = cast<VectorType>(Ty);
  if (!ST->hasSIMD128() || !isa<FixedVectorType>(VecTy)) {
    return InstructionCost::getInvalid();
  }

  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, Ty, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  constexpr unsigned MaxInterleaveFactor = 4;
  if (Factor <= MaxInterleaveFactor) {
    unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
    // Ensure the number of vector elements is greater than 1.
    if (MinElts < 2 || MinElts % Factor != 0)
      return InstructionCost::getInvalid();

    unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
    // Ensure the element type is legal.
    if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
      return InstructionCost::getInvalid();

    if (Factor != 2 && Factor != 4)
      return InstructionCost::getInvalid();

    auto *SubVecTy =
        VectorType::get(VecTy->getElementType(),
                        VecTy->getElementCount().divideCoefficientBy(Factor));
    InstructionCost MemCost =
        getMemoryOpCost(Opcode, SubVecTy, Alignment, AddressSpace, CostKind);

    unsigned VecSize = DL.getTypeSizeInBits(SubVecTy);
    unsigned MaxVecSize = 128;
    unsigned NumAccesses =
        std::max<unsigned>(1, (MinElts * ElSize + MaxVecSize - 1) / VecSize);
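
    // For example (a worked sketch): interleaving 2 x v16i8 into v32i8 gives
    // MinElts = 32, ElSize = 8, and VecSize = 128, so NumAccesses =
    // (32 * 8 + 127) / 128 = 2 loads of one v16i8 subvector each.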

    // A stride of two is commonly supported via dedicated instructions, so it
    // should be relatively cheap for all element sizes. A stride of four is
    // more expensive as it will likely require more shuffles. Using two
    // simd128 inputs is considered more expensive and we mainly account for
    // shuffling two inputs (32 bytes), but we do model 4 x v4i32 to enable
    // arithmetic kernels.
    static const CostTblEntry ShuffleCostTbl[] = {
        // One reg.
        {2, MVT::v2i8, 1},  // interleave 2 x 2i8 into 4i8
        {2, MVT::v4i8, 1},  // interleave 2 x 4i8 into 8i8
        {2, MVT::v8i8, 1},  // interleave 2 x 8i8 into 16i8
        {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16
        {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16
        {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32

        // Two regs.
        {2, MVT::v16i8, 2}, // interleave 2 x 16i8 into 32i8
        {2, MVT::v8i16, 2}, // interleave 2 x 8i16 into 16i16
        {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32

        // One reg.
        {4, MVT::v2i8, 4},  // interleave 4 x 2i8 into 8i8
        {4, MVT::v4i8, 4},  // interleave 4 x 4i8 into 16i8
        {4, MVT::v2i16, 4}, // interleave 4 x 2i16 into 8i16

        // Two regs.
        {4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8
        {4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16
        {4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32

        // Four regs.
        {4, MVT::v4i32, 16}, // interleave 4 x 4i32 into 16i32
    };

    EVT ETy = TLI->getValueType(DL, SubVecTy);
    if (const auto *Entry =
            CostTableLookup(ShuffleCostTbl, Factor, ETy.getSimpleVT()))
      return Entry->Cost + (NumAccesses * MemCost);
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}

InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
    const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
  InstructionCost Cost = BasicTTIImplBase::getVectorInstrCost(
      Opcode, Val, CostKind, Index, Op0, Op1, VIC);

  // SIMD128's insert/extract currently only take constant indices.
  if (Index == -1u)
    return Cost + 25 * TargetTransformInfo::TCC_Expensive;

  return Cost;
}

InstructionCost WebAssemblyTTIImpl::getPartialReductionCost(
    unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
    ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
    TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
    TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
  InstructionCost Invalid = InstructionCost::getInvalid();
  if (!VF.isFixed() || !ST->hasSIMD128())
    return Invalid;

  if (CostKind != TTI::TCK_RecipThroughput)
    return Invalid;

  if (Opcode != Instruction::Add)
    return Invalid;

  EVT AccumEVT = EVT::getEVT(AccumType);
  // TODO: Add i64 accumulator.
  if (AccumEVT != MVT::i32)
    return Invalid;

  // Possible options:
  // - i16x8.extadd_pairwise_i8x16_sx
  // - i32x4.extadd_pairwise_i16x8_sx
  // - i32x4.dot_i16x8_s
  // Only try to support dot, for now.
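  // For example, the signed i16x8 case can lower to (a sketch):
  //   acc = i32x4.add acc, (i32x4.dot_i16x8_s a, b)
  // which pairwise-multiplies the i16 lanes and accumulates into i32 lanes.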

  EVT InputEVT = EVT::getEVT(InputTypeA);
  if (!((InputEVT == MVT::i16 && VF.getFixedValue() == 8) ||
        (InputEVT == MVT::i8 && VF.getFixedValue() == 16))) {
    return Invalid;
  }

  if (OpAExtend == TTI::PR_None)
    return Invalid;

  InstructionCost Cost(TTI::TCC_Basic);
  if (!BinOp)
    return Cost;

  if (OpAExtend != OpBExtend)
    return Invalid;

  if (*BinOp != Instruction::Mul)
    return Invalid;

  if (InputTypeA != InputTypeB)
    return Invalid;

  // Signed inputs can lower to dot
  if (InputEVT == MVT::i16 && VF.getFixedValue() == 8)
    return OpAExtend == TTI::PR_SignExtend ? Cost : Cost * 2;

  // Double the size of the lowered sequence.
  if (InputEVT == MVT::i8 && VF.getFixedValue() == 16)
    return OpAExtend == TTI::PR_SignExtend ? Cost * 2 : Cost * 4;

  return Invalid;
}

TTI::ReductionShuffle WebAssemblyTTIImpl::getPreferredExpandedReductionShuffle(
    const IntrinsicInst *II) const {

  switch (II->getIntrinsicID()) {
  default:
    break;
  case Intrinsic::vector_reduce_fadd:
    return TTI::ReductionShuffle::Pairwise;
  }
  return TTI::ReductionShuffle::SplitHalf;
}

void WebAssemblyTTIImpl::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
    OptimizationRemarkEmitter *ORE) const {
  // Scan the loop: don't unroll loops with calls. This is a standard approach
  // for most (all?) targets.
  for (BasicBlock *BB : L->blocks())
    for (Instruction &I : *BB)
      if (isa<CallInst>(I) || isa<InvokeInst>(I))
        if (const Function *F = cast<CallBase>(I).getCalledFunction())
          if (isLoweredToCall(F))
            return;

  // The chosen threshold is within the range of 'LoopMicroOpBufferSize' of
  // the various microarchitectures that use the BasicTTI implementation and
  // has been selected through heuristics across multiple cores and runtimes.
  UP.Partial = UP.Runtime = UP.UpperBound = true;
  UP.PartialThreshold = 30;

  // Avoid unrolling when optimizing for size.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;

  // Set the number of instructions optimized away when a "back edge" becomes
  // a "fall through" to the default value of 2.
  UP.BEInsns = 2;
}

bool WebAssemblyTTIImpl::supportsTailCalls() const {
  return getST()->hasTailCall();
}

bool WebAssemblyTTIImpl::isProfitableToSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  using namespace llvm::PatternMatch;

  if (!I->getType()->isVectorTy() || !I->isShift())
    return false;

  Value *V = I->getOperand(1);
  // We don't need to sink a constant splat.
  if (isa<Constant>(V))
    return false;
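
  // A splatted shift amount (insertelement + zero-mask shufflevector) can
  // presumably be selected as SIMD128's scalar shift-count operand, but only
  // if the splat ends up in the same block as the shift, so sink both
  // instructions down to the user.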
  if (match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
                         m_Value(), m_ZeroMask()))) {
    // Sink insert
    Ops.push_back(&cast<Instruction>(V)->getOperandUse(0));
    // Sink shuffle
    Ops.push_back(&I->getOperandUse(1));
    return true;
  }

  return false;
}

/// Attempt to convert [relaxed_]swizzle to shufflevector if the mask is
/// constant.
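/// For example (a sketch): a swizzle with constant mask <0, 1, ..., 14, 255>
/// becomes a shufflevector with mask <0, 1, ..., 14, 16>, where index 16
/// reads the first lane of an all-zeroes second operand.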
static Value *simplifyWasmSwizzle(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder,
                                  bool IsRelaxed) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  assert(NumElts == 16);

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[16];
  bool AnyOutOfBounds = false;

  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    if (IsRelaxed && cast<ConstantInt>(COp)->getSExtValue() >= NumElts) {
      // The relaxed_swizzle operation always returns 0 if the lane index is
      // less than 0 when interpreted as a signed value. For lane indices above
      // 15, however, it can choose between returning 0 or the lane at `Index %
      // 16`. However, the choice must be made consistently. As the WebAssembly
      // spec states:
      //
      // "The result of relaxed operators are implementation-dependent, because
      // the set of possible results may depend on properties of the host
      // environment, such as its hardware. Technically, their behaviour is
      // controlled by a set of global parameters to the semantics that an
      // implementation can instantiate in different ways. These choices are
      // fixed, that is, parameters are constant during the execution of any
      // given program."
      //
      // The WebAssembly runtime may choose differently from us, so we can't
      // optimize a relaxed swizzle with lane indices above 15.
      return nullptr;
    }

    uint64_t Index = cast<ConstantInt>(COp)->getZExtValue();
    if (Index >= NumElts) {
      AnyOutOfBounds = true;
      // If there are out-of-bounds indices, the swizzle instruction returns
      // zeroes in those lanes. We'll provide an all-zeroes vector as the
      // second argument to shufflevector and read the first element from it.
      Indexes[I] = NumElts;
      continue;
    }

    Indexes[I] = Index;
  }

  auto *V1 = II.getArgOperand(0);
  auto *V2 =
      AnyOutOfBounds ? Constant::getNullValue(VecTy) : PoisonValue::get(VecTy);

  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
}

std::optional<Instruction *>
WebAssemblyTTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                         IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::wasm_swizzle:
  case Intrinsic::wasm_relaxed_swizzle:
    if (Value *V = simplifyWasmSwizzle(
            II, IC.Builder, IID == Intrinsic::wasm_relaxed_swizzle)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  }

  return std::nullopt;
}