Fix more typos in the AArch64 codebase using the https://github.com/crate-ci/typos Rust package. commit-id:33a1bb8d Reviewers: davemgreen Reviewed By: davemgreen Pull Request: https://github.com/llvm/llvm-project/pull/183087
407 lines
15 KiB
C++
407 lines
15 KiB
C++
//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the ARMSelectionDAGInfo class.
//
//===----------------------------------------------------------------------===//
|
|
|
|
#include "ARMSelectionDAGInfo.h"
|
|
#include "ARMTargetTransformInfo.h"
|
|
#include "llvm/CodeGen/SelectionDAG.h"
|
|
#include "llvm/Support/CommandLine.h"
|
|
|
|
#define GET_SDNODE_DESC
|
|
#include "ARMGenSDNodeInfo.inc"
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "arm-selectiondag-info"
|
|
|
|
/// Hidden command-line switch `-arm-memtransfer-tploop`. It selects whether
/// memory-transfer lowering in this file may produce tail-predicated (WLSTP)
/// loops. The option is initialised to TPLoop::ForceDisabled and accepts
/// "force-disabled", "force-enabled" and "allow".
static cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
    "arm-memtransfer-tploop", cl::Hidden,
    cl::desc("Control conversion of memcpy to "
             "Tail predicated loops (WLSTP)"),
    cl::init(TPLoop::ForceDisabled),
    cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
                          "Don't convert memcpy to TP loop."),
               clEnumValN(TPLoop::ForceEnabled, "force-enabled",
                          "Always convert memcpy to TP loop."),
               clEnumValN(TPLoop::Allow, "allow",
                          "Allow (may be subject to certain conditions) "
                          "conversion of memcpy to TP loop.")));
|
|
|
|
/// Constructs the ARM SelectionDAG info object, handing the generated
/// ARMGenSDNodeInfo table to the SelectionDAGGenTargetInfo base class.
ARMSelectionDAGInfo::ARMSelectionDAGInfo()
    : SelectionDAGGenTargetInfo(ARMGenSDNodeInfo) {}
|
|
|
|
const char *ARMSelectionDAGInfo::getTargetNodeName(unsigned Opcode) const {
|
|
#define MAKE_CASE(V) \
|
|
case V: \
|
|
return #V;
|
|
|
|
// These nodes don't have corresponding entries in *.td files yet.
|
|
switch (static_cast<ARMISD::NodeType>(Opcode)) {
|
|
MAKE_CASE(ARMISD::DYN_ALLOC)
|
|
MAKE_CASE(ARMISD::MVESEXT)
|
|
MAKE_CASE(ARMISD::MVEZEXT)
|
|
MAKE_CASE(ARMISD::MVETRUNC)
|
|
MAKE_CASE(ARMISD::BUILD_VECTOR)
|
|
MAKE_CASE(ARMISD::VLD1DUP)
|
|
MAKE_CASE(ARMISD::VLD2DUP)
|
|
MAKE_CASE(ARMISD::VLD3DUP)
|
|
MAKE_CASE(ARMISD::VLD4DUP)
|
|
MAKE_CASE(ARMISD::VLD1_UPD)
|
|
MAKE_CASE(ARMISD::VLD2_UPD)
|
|
MAKE_CASE(ARMISD::VLD3_UPD)
|
|
MAKE_CASE(ARMISD::VLD4_UPD)
|
|
MAKE_CASE(ARMISD::VLD1x2_UPD)
|
|
MAKE_CASE(ARMISD::VLD1x3_UPD)
|
|
MAKE_CASE(ARMISD::VLD1x4_UPD)
|
|
MAKE_CASE(ARMISD::VLD2LN_UPD)
|
|
MAKE_CASE(ARMISD::VLD3LN_UPD)
|
|
MAKE_CASE(ARMISD::VLD4LN_UPD)
|
|
MAKE_CASE(ARMISD::VLD1DUP_UPD)
|
|
MAKE_CASE(ARMISD::VLD2DUP_UPD)
|
|
MAKE_CASE(ARMISD::VLD3DUP_UPD)
|
|
MAKE_CASE(ARMISD::VLD4DUP_UPD)
|
|
MAKE_CASE(ARMISD::VST1_UPD)
|
|
MAKE_CASE(ARMISD::VST3_UPD)
|
|
MAKE_CASE(ARMISD::VST1x2_UPD)
|
|
MAKE_CASE(ARMISD::VST1x3_UPD)
|
|
MAKE_CASE(ARMISD::VST1x4_UPD)
|
|
MAKE_CASE(ARMISD::VST2LN_UPD)
|
|
MAKE_CASE(ARMISD::VST3LN_UPD)
|
|
MAKE_CASE(ARMISD::VST4LN_UPD)
|
|
}
|
|
#undef MAKE_CASE
|
|
|
|
return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode);
|
|
}
|
|
|
|
bool ARMSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const {
|
|
// These nodes don't have corresponding entries in *.td files yet.
|
|
if (Opcode >= ARMISD::FIRST_MEMORY_OPCODE &&
|
|
Opcode <= ARMISD::LAST_MEMORY_OPCODE)
|
|
return true;
|
|
|
|
return SelectionDAGGenTargetInfo::isTargetMemoryOpcode(Opcode);
|
|
}
|
|
|
|
void ARMSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
|
|
const SDNode *N) const {
|
|
switch (N->getOpcode()) {
|
|
default:
|
|
break;
|
|
case ARMISD::WIN__DBZCHK:
|
|
// invalid number of results; expected 2, got 1
|
|
case ARMISD::WIN__CHKSTK:
|
|
// invalid number of results; expected 1, got 2
|
|
case ARMISD::COPY_STRUCT_BYVAL:
|
|
// invalid number of operands; expected 6, got 5
|
|
case ARMISD::MEMCPY:
|
|
// invalid number of operands; expected 5, got 4
|
|
case ARMISD::VMOVRRD:
|
|
// operand #0 must have type f64, but has type v1i64/v4f16/v8i8
|
|
case ARMISD::VMOVIMM:
|
|
// operand #0 must have type i32, but has type i16
|
|
return;
|
|
}
|
|
|
|
SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N);
|
|
}
|
|
|
|
// Emit, if possible, a specialized version of the given Libcall. Typically this
|
|
// means selecting the appropriately aligned version, but we also convert memset
|
|
// of 0 into memclr.
|
|
SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
|
|
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
|
|
SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
|
|
const ARMSubtarget &Subtarget =
|
|
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
|
|
const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
|
|
|
|
// Only use a specialized AEABI function if the default version of this
|
|
// Libcall is an AEABI function.
|
|
//
|
|
// Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
|
|
// able to translate memset to memclr and use the value to index the function
|
|
// name array.
|
|
enum {
|
|
AEABI_MEMCPY = 0,
|
|
AEABI_MEMMOVE,
|
|
AEABI_MEMSET,
|
|
AEABI_MEMCLR
|
|
} AEABILibcall;
|
|
switch (LC) {
|
|
case RTLIB::MEMCPY:
|
|
if (DAG.getLibcalls().getLibcallImpl(LC) != RTLIB::impl___aeabi_memcpy)
|
|
return SDValue();
|
|
|
|
AEABILibcall = AEABI_MEMCPY;
|
|
break;
|
|
case RTLIB::MEMMOVE:
|
|
if (DAG.getLibcalls().getLibcallImpl(LC) != RTLIB::impl___aeabi_memmove)
|
|
return SDValue();
|
|
|
|
AEABILibcall = AEABI_MEMMOVE;
|
|
break;
|
|
case RTLIB::MEMSET:
|
|
if (DAG.getLibcalls().getLibcallImpl(LC) != RTLIB::impl___aeabi_memset)
|
|
return SDValue();
|
|
|
|
AEABILibcall = AEABI_MEMSET;
|
|
if (isNullConstant(Src))
|
|
AEABILibcall = AEABI_MEMCLR;
|
|
break;
|
|
default:
|
|
return SDValue();
|
|
}
|
|
|
|
// Choose the most-aligned libcall variant that we can
|
|
enum {
|
|
ALIGN1 = 0,
|
|
ALIGN4,
|
|
ALIGN8
|
|
} AlignVariant;
|
|
if ((Align & 7) == 0)
|
|
AlignVariant = ALIGN8;
|
|
else if ((Align & 3) == 0)
|
|
AlignVariant = ALIGN4;
|
|
else
|
|
AlignVariant = ALIGN1;
|
|
|
|
TargetLowering::ArgListTy Args;
|
|
Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
|
|
Args.emplace_back(Dst, IntPtrTy);
|
|
if (AEABILibcall == AEABI_MEMCLR) {
|
|
Args.emplace_back(Size, IntPtrTy);
|
|
} else if (AEABILibcall == AEABI_MEMSET) {
|
|
// Adjust parameters for memset, EABI uses format (ptr, size, value),
|
|
// GNU library uses (ptr, value, size)
|
|
// See RTABI section 4.3.4
|
|
Args.emplace_back(Size, IntPtrTy);
|
|
|
|
// Extend or truncate the argument to be an i32 value for the call.
|
|
if (Src.getValueType().bitsGT(MVT::i32))
|
|
Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
|
|
else if (Src.getValueType().bitsLT(MVT::i32))
|
|
Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
|
|
|
|
TargetLowering::ArgListEntry Entry(Src,
|
|
Type::getInt32Ty(*DAG.getContext()));
|
|
Entry.IsSExt = false;
|
|
Args.push_back(Entry);
|
|
} else {
|
|
Args.emplace_back(Src, IntPtrTy);
|
|
Args.emplace_back(Size, IntPtrTy);
|
|
}
|
|
|
|
static const RTLIB::Libcall FunctionImpls[4][3] = {
|
|
{RTLIB::MEMCPY, RTLIB::AEABI_MEMCPY4, RTLIB::AEABI_MEMCPY8},
|
|
{RTLIB::MEMMOVE, RTLIB::AEABI_MEMMOVE4, RTLIB::AEABI_MEMMOVE8},
|
|
{RTLIB::MEMSET, RTLIB::AEABI_MEMSET4, RTLIB::AEABI_MEMSET8},
|
|
{RTLIB::AEABI_MEMCLR, RTLIB::AEABI_MEMCLR4, RTLIB::AEABI_MEMCLR8}};
|
|
|
|
RTLIB::Libcall NewLC = FunctionImpls[AEABILibcall][AlignVariant];
|
|
RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(NewLC);
|
|
if (LCImpl == RTLIB::Unsupported)
|
|
return SDValue();
|
|
|
|
TargetLowering::CallLoweringInfo CLI(DAG);
|
|
CLI.setDebugLoc(dl)
|
|
.setChain(Chain)
|
|
.setLibCallee(
|
|
DAG.getLibcalls().getLibcallImplCallingConv(LCImpl),
|
|
Type::getVoidTy(*DAG.getContext()),
|
|
DAG.getExternalSymbol(LCImpl, TLI->getPointerTy(DAG.getDataLayout())),
|
|
std::move(Args))
|
|
.setDiscardResult();
|
|
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
|
|
|
|
return CallResult.second;
|
|
}
|
|
|
|
static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
|
|
const SelectionDAG &DAG,
|
|
ConstantSDNode *ConstantSize,
|
|
Align Alignment, bool IsMemcpy) {
|
|
auto &F = DAG.getMachineFunction().getFunction();
|
|
if (!EnableMemtransferTPLoop)
|
|
return false;
|
|
if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
|
|
return true;
|
|
// Do not generate inline TP loop if optimizations is disabled,
|
|
// or if optimization for size (-Os or -Oz) is on.
|
|
if (F.hasOptNone() || F.hasOptSize())
|
|
return false;
|
|
// If cli option is unset, for memset always generate inline TP.
|
|
// For memcpy, check some conditions
|
|
if (!IsMemcpy)
|
|
return true;
|
|
if (!ConstantSize && Alignment >= Align(4))
|
|
return true;
|
|
if (ConstantSize &&
|
|
ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
|
|
ConstantSize->getZExtValue() <
|
|
Subtarget.getMaxMemcpyTPInlineSizeThreshold())
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
/// Target-specific lowering of memcpy.
///
/// In order, this function:
///  - emits an ARMISD::MEMCPYLOOP node when the subtarget has MVE integer ops
///    and shouldGenerateInlineTPLoop() agrees;
///  - returns an empty SDValue when the alignment is below 4;
///  - falls back to EmitSpecializedLibcall() when the size is not a constant,
///    or is above the inline threshold and \p AlwaysInline is false;
///  - otherwise emits a chain of ARMISD::MEMCPY nodes for the whole words and
///    plain loads/stores for the trailing 1-3 bytes. An empty SDValue is
///    returned instead if more than one MEMCPY node would be needed and the
///    subtarget reports hasMinSize().
///
/// \p isVolatile is not consulted here.
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

  // Tail-predicated loop, if available and considered worthwhile.
  const bool UseTPLoop =
      Subtarget.hasMVEIntegerOps() &&
      shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
                                 /*IsMemcpy=*/true);
  if (UseTPLoop) {
    SDValue Length = DAG.getZExtOrTrunc(Size, dl, MVT::i32);
    return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
                       Length);
  }

  // Do repeated 4-byte loads and stores. To be improved.
  // This requires 4-byte alignment.
  if (Alignment < Align(4))
    return SDValue();

  // This requires the copy size to be a constant, preferably within a
  // subtarget-specific limit. Otherwise hand the copy to the library.
  const bool ExceedsInlineLimit =
      ConstantSize && !AlwaysInline &&
      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold();
  if (!ConstantSize || ExceedsInlineLimit)
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Alignment.value(), RTLIB::MEMCPY);

  const uint64_t SizeVal = ConstantSize->getZExtValue();
  const unsigned WordBytes = 4;
  const unsigned NumWords = SizeVal >> 2; // whole 32-bit words to copy
  const unsigned TailBytes = SizeVal & 3; // 0-3 bytes left after the words

  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
  const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;

  // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
  // VLDM/VSTM and make this code emit it when appropriate. This would reduce
  // pressure on the general purpose registers. However this seems harder to map
  // onto the register allocator's view of the world.

  // The number of MEMCPY pseudo-instructions to emit. We use up to
  // MaxLoadsInLDM registers per MEMCPY, which will get lowered into ldm/stm
  // later on. This is a lower bound on the number of MEMCPY operations we must
  // emit.
  const unsigned NumMEMCPYs = (NumWords + MaxLoadsInLDM - 1) / MaxLoadsInLDM;

  // Code size optimisation: do not inline memcpy if expansion results in
  // more instructions than the library call.
  if (NumMEMCPYs > 1 && Subtarget.hasMinSize())
    return SDValue();

  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);

  unsigned WordsDone = 0;
  for (unsigned I = 0; I != NumMEMCPYs; ++I) {
    // Evenly distribute registers among MEMCPY operations to reduce register
    // pressure.
    const unsigned WordsAfterThis = NumWords * (I + 1) / NumMEMCPYs;
    const unsigned NumRegs = WordsAfterThis - WordsDone;

    // Result 0 replaces Dst, result 1 replaces Src, result 2 is the new chain.
    SDValue RegCount = DAG.getConstant(NumRegs, dl, MVT::i32);
    Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src, RegCount);
    Src = Dst.getValue(1);
    Chain = Dst.getValue(2);

    DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * WordBytes);
    SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * WordBytes);

    WordsDone = WordsAfterThis;
  }

  if (TailBytes == 0)
    return Chain;

  // Issue loads / stores for the trailing (1 - 3) bytes. Each piece is a
  // halfword while at least two bytes remain and a byte otherwise, so there
  // are at most two pieces.
  const unsigned MaxTailPieces = 2;
  SDValue TailLoads[MaxTailPieces];
  SDValue TailChains[MaxTailPieces];
  unsigned PieceBytes[MaxTailPieces];
  unsigned NumPieces = 0;

  // First all the loads, joined by a TokenFactor...
  uint64_t SrcOff = 0;
  unsigned Remaining = TailBytes;
  while (Remaining != 0) {
    const bool IsHalfword = Remaining >= 2;
    const EVT PieceVT = IsHalfword ? MVT::i16 : MVT::i8;
    const unsigned Bytes = IsHalfword ? 2 : 1;

    SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                                  DAG.getConstant(SrcOff, dl, MVT::i32));
    SDValue Load = DAG.getLoad(PieceVT, dl, Chain, SrcAddr,
                               SrcPtrInfo.getWithOffset(SrcOff));
    TailLoads[NumPieces] = Load;
    TailChains[NumPieces] = Load.getValue(1);
    PieceBytes[NumPieces] = Bytes;
    ++NumPieces;

    SrcOff += Bytes;
    Remaining -= Bytes;
  }
  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                      ArrayRef<SDValue>(TailChains, NumPieces));

  // ...then the matching stores, again joined by a TokenFactor.
  uint64_t DstOff = 0;
  for (unsigned Idx = 0; Idx != NumPieces; ++Idx) {
    SDValue DstAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                                  DAG.getConstant(DstOff, dl, MVT::i32));
    TailChains[Idx] = DAG.getStore(Chain, dl, TailLoads[Idx], DstAddr,
                                   DstPtrInfo.getWithOffset(DstOff));
    DstOff += PieceBytes[Idx];
  }
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                     ArrayRef<SDValue>(TailChains, NumPieces));
}
|
|
|
|
/// Target-specific lowering of memmove: always defers to
/// EmitSpecializedLibcall() with RTLIB::MEMMOVE, passing the alignment as a
/// plain integer. \p isVolatile and the pointer infos are not consulted here.
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  const unsigned AlignInBytes = Alignment.value();
  return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, AlignInBytes,
                                RTLIB::MEMMOVE);
}
|
|
|
|
/// Target-specific lowering of memset.
///
/// When the subtarget has MVE integer ops and shouldGenerateInlineTPLoop()
/// agrees, an ARMISD::MEMSETLOOP node is emitted whose value operand is the
/// byte value splatted into a v16i8 vector. Otherwise the specialized libcall
/// is tried, unless \p AlwaysInline is set, in which case an empty SDValue is
/// returned. \p isVolatile and \p DstPtrInfo are not consulted here.
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

  // Generate TP loop for llvm.memset
  const bool UseTPLoop =
      Subtarget.hasMVEIntegerOps() &&
      shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
                                 /*IsMemcpy=*/false);
  if (UseTPLoop) {
    // Truncate the fill value to a byte and broadcast it to all 16 lanes.
    SDValue FillByte = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src);
    SDValue FillVector = DAG.getSplatBuildVector(MVT::v16i8, dl, FillByte);
    SDValue Length = DAG.getZExtOrTrunc(Size, dl, MVT::i32);
    return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst,
                       FillVector, Length);
  }

  // A libcall is not an option when the caller insists on inlining.
  if (AlwaysInline)
    return SDValue();

  return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                Alignment.value(), RTLIB::MEMSET);
}
|