// llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp - llvm-project - Git at Google

 //===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements the ARMSelectionDAGInfo class.
 //
 //===----------------------------------------------------------------------===//

 #include "ARMTargetMachine.h"
 #include "ARMTargetTransformInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/Support/CommandLine.h"
 using namespace llvm;

 #define DEBUG_TYPE "arm-selectiondag-info"

 // Hidden command-line switch (-arm-memtransfer-tploop) consulted by
 // shouldGenerateInlineTPLoop() in this file to decide whether a memory
 // transfer may be lowered to an inline tail-predicated loop. It defaults to
 // TPLoop::ForceDisabled. Although the help strings only mention memcpy, the
 // same setting is also consulted when lowering memset.
 cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
     "arm-memtransfer-tploop", cl::Hidden,
     cl::desc("Control conversion of memcpy to "
              "Tail predicated loops (WLSTP)"),
     cl::init(TPLoop::ForceDisabled),
     cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
                           "Don't convert memcpy to TP loop."),
                clEnumValN(TPLoop::ForceEnabled, "force-enabled",
                           "Always convert memcpy to TP loop."),
                clEnumValN(TPLoop::Allow, "allow",
                           "Allow (may be subject to certain conditions) "
                           "conversion of memcpy to TP loop.")));

 /// Try to replace the generic libcall \p LC with a call to the matching ARM
 /// EABI run-time helper (__aeabi_memcpy, __aeabi_memmove, __aeabi_memset or
 /// __aeabi_memclr).
 ///
 /// The helper flavour is picked from \p Align (in bytes): a multiple of 8
 /// selects the "8" variant, a multiple of 4 the "4" variant, and anything
 /// else the unsuffixed one. A memset whose fill value is the constant zero is
 /// turned into a memclr call.
 ///
 /// \returns the chain of the lowered call, or an empty SDValue when the
 /// default name of \p LC does not start with "__aeabi" or when \p LC is not
 /// one of MEMCPY, MEMMOVE or MEMSET.
 SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
     SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
   const ARMTargetLowering *Lowering = DAG.getMachineFunction()
                                           .getSubtarget<ARMSubtarget>()
                                           .getTargetLowering();

   // The specialized helpers are only used when the default implementation of
   // this libcall is itself an AEABI function.
   const char *DefaultName = Lowering->getLibcallName(LC);
   if (std::strncmp(DefaultName, "__aeabi", 7) != 0)
     return SDValue();

   // Row of the helper-name table below. memclr has no RTLIB counterpart, so
   // the libcall is first mapped onto this local enumeration.
   enum HelperKind { MemcpyRow = 0, MemmoveRow, MemsetRow, MemclrRow };
   HelperKind Row;
   if (LC == RTLIB::MEMCPY) {
     Row = MemcpyRow;
   } else if (LC == RTLIB::MEMMOVE) {
     Row = MemmoveRow;
   } else if (LC == RTLIB::MEMSET) {
     // A fill with constant zero is routed to the dedicated memclr helper.
     auto *FillConst = dyn_cast<ConstantSDNode>(Src);
     Row = (FillConst && FillConst->getZExtValue() == 0) ? MemclrRow
                                                          : MemsetRow;
   } else {
     return SDValue();
   }

   // Column of the helper-name table: the most-aligned variant we may use.
   const unsigned Col = (Align % 8 == 0) ? 2 : ((Align % 4 == 0) ? 1 : 0);

   LLVMContext &Ctx = *DAG.getContext();
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Arg;
   Arg.Ty = DAG.getDataLayout().getIntPtrType(Ctx);

   // Every helper takes the destination pointer first.
   Arg.Node = Dst;
   Args.push_back(Arg);

   switch (Row) {
   case MemclrRow:
     // memclr(dest, size)
     Arg.Node = Size;
     Args.push_back(Arg);
     break;
   case MemsetRow: {
     // EABI memset takes (ptr, size, value) whereas the GNU library uses
     // (ptr, value, size); see RTABI section 4.3.4.
     Arg.Node = Size;
     Args.push_back(Arg);

     // The fill value is passed as an i32: narrow or zero-extend as needed.
     const EVT FillVT = Src.getValueType();
     if (FillVT.bitsGT(MVT::i32))
       Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
     else if (FillVT.bitsLT(MVT::i32))
       Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);

     Arg.Node = Src;
     Arg.Ty = Type::getInt32Ty(Ctx);
     Arg.IsSExt = false;
     Args.push_back(Arg);
     break;
   }
   case MemcpyRow:
   case MemmoveRow:
     // memcpy / memmove(dest, src, size)
     Arg.Node = Src;
     Args.push_back(Arg);

     Arg.Node = Size;
     Args.push_back(Arg);
     break;
   }

   static const char *const HelperNames[4][3] = {
     { "__aeabi_memcpy",  "__aeabi_memcpy4",  "__aeabi_memcpy8"  },
     { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
     { "__aeabi_memset",  "__aeabi_memset4",  "__aeabi_memset8"  },
     { "__aeabi_memclr",  "__aeabi_memclr4",  "__aeabi_memclr8"  }
   };
   SDValue Callee = DAG.getExternalSymbol(
       HelperNames[Row][Col], Lowering->getPointerTy(DAG.getDataLayout()));

   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl)
       .setChain(Chain)
       .setLibCallee(Lowering->getLibcallCallingConv(LC), Type::getVoidTy(Ctx),
                     Callee, std::move(Args))
       .setDiscardResult();

   // The helpers return nothing useful; only the output chain is kept.
   return Lowering->LowerCallTo(CLI).second;
 }

 /// Decide whether a memcpy or memset should be lowered to an inline
 /// tail-predicated loop, based on the -arm-memtransfer-tploop setting, the
 /// attributes of the current function, and the properties of the transfer.
 ///
 /// \param ConstantSize the transfer size when it is a constant node, null
 ///        otherwise.
 /// \param IsMemcpy true for memcpy; false selects the memset policy, which
 ///        only depends on the option and the function attributes.
 static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
                                        const SelectionDAG &DAG,
                                        ConstantSDNode *ConstantSize,
                                        Align Alignment, bool IsMemcpy) {
   // The option converts to false when set to its disabled value.
   if (!EnableMemtransferTPLoop)
     return false;
   if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
     return true;

   // Remaining setting: apply heuristics. Never use an inline TP loop when
   // optimization is disabled or when optimizing for size (-Os or -Oz).
   const auto &Fn = DAG.getMachineFunction().getFunction();
   if (Fn.hasOptNone() || Fn.hasOptSize())
     return false;

   // memset needs no further checks.
   if (!IsMemcpy)
     return true;

   // memcpy of unknown size: only worthwhile with at least 4-byte alignment.
   if (!ConstantSize)
     return Alignment >= Align(4);

   // memcpy of known size: only for sizes strictly between the plain inline
   // threshold and the TP-loop inline threshold.
   const uint64_t Bytes = ConstantSize->getZExtValue();
   return Bytes > Subtarget.getMaxInlineSizeThreshold() &&
          Bytes < Subtarget.getMaxMemcpyTPInlineSizeThreshold();
 }

 /// Custom ARM lowering of a memcpy of \p Size bytes from \p Src to \p Dst.
 ///
 /// Strategies are tried in this order:
 ///  1. On subtargets with MVE integer ops, when shouldGenerateInlineTPLoop()
 ///     agrees, a single ARMISD::MEMCPYLOOP node.
 ///  2. With less than 4-byte alignment, nothing: an empty SDValue is returned.
 ///  3. For a non-constant size, or a constant size above the subtarget's
 ///     inline threshold (unless \p AlwaysInline), the AEABI memcpy helper via
 ///     EmitSpecializedLibcall().
 ///  4. Otherwise a sequence of ARMISD::MEMCPY pseudo nodes copying whole
 ///     words, followed by a halfword and/or byte load/store pair for the
 ///     remaining 1-3 bytes. When the subtarget reports minsize and more than
 ///     one MEMCPY node would be needed, an empty SDValue is returned instead.
 ///
 /// \p isVolatile is not consulted by this implementation.
 ///
 /// \returns the resulting chain, or an empty SDValue when no custom lowering
 /// is produced.
 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
   const ARMSubtarget &ST =
       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
   auto *ConstantSize = dyn_cast<ConstantSDNode>(Size);

   // Strategy 1: MVE tail-predicated copy loop.
   const bool UseTPLoop =
       ST.hasMVEIntegerOps() &&
       shouldGenerateInlineTPLoop(ST, DAG, ConstantSize, Alignment, true);
   if (UseTPLoop) {
     SDValue Len = DAG.getZExtOrTrunc(Size, dl, MVT::i32);
     return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
                        Len);
   }

   // Everything below copies in 4-byte units (to be improved) and therefore
   // requires 4-byte alignment.
   if (Alignment < Align(4))
     return SDValue();

   auto CallAEABIMemcpy = [&]() {
     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                   Alignment.value(), RTLIB::MEMCPY);
   };

   // Inline expansion needs a constant size, preferably within a
   // subtarget-specific limit.
   if (!ConstantSize)
     return CallAEABIMemcpy();
   const uint64_t SizeVal = ConstantSize->getZExtValue();
   if (!AlwaysInline && SizeVal > ST.getMaxInlineSizeThreshold())
     return CallAEABIMemcpy();

   const unsigned WordBytes = 4;
   const unsigned NumWords = static_cast<unsigned>(SizeVal >> 2);
   const unsigned TailBytes = static_cast<unsigned>(SizeVal & 3);

   // Emit a maximum of 4 loads in Thumb1 since we have fewer registers.
   const unsigned MaxLoadsInLDM = ST.isThumb1Only() ? 4 : 6;

   // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
   // VLDM/VSTM and make this code emit it when appropriate. This would reduce
   // pressure on the general purpose registers. However this seems harder to map
   // onto the register allocator's view of the world.

   // Number of MEMCPY pseudo-instructions to emit. Each one moves up to
   // MaxLoadsInLDM registers and is lowered into ldm/stm later on, so this
   // rounded-up quotient is the minimum number of MEMCPY operations needed.
   const unsigned NumMEMCPYs = (NumWords + MaxLoadsInLDM - 1) / MaxLoadsInLDM;

   // Code size optimisation: do not inline the memcpy if the expansion results
   // in more instructions than the library call.
   if (NumMEMCPYs > 1 && ST.hasMinSize())
     return SDValue();

   SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);

   unsigned WordsDone = 0;
   for (unsigned Op = 1; Op <= NumMEMCPYs; ++Op) {
     // Spread the registers evenly over the MEMCPY operations to reduce
     // register pressure.
     const unsigned WordsThroughThisOp = NumWords * Op / NumMEMCPYs;
     const unsigned NumRegs = WordsThroughThisOp - WordsDone;

     // Results: 0 = advanced destination, 1 = advanced source, 2 = chain.
     SDValue Copy = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
                                DAG.getConstant(NumRegs, dl, MVT::i32));
     Dst = Copy;
     Src = Copy.getValue(1);
     Chain = Copy.getValue(2);

     DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * WordBytes);
     SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * WordBytes);

     WordsDone = WordsThroughThisOp;
   }

   if (TailBytes == 0)
     return Chain;

   // Trailing 1-3 bytes: at most one halfword access followed by at most one
   // byte access.
   unsigned TailWidths[2];
   unsigned NumTailOps = 0;
   if (TailBytes & 2)
     TailWidths[NumTailOps++] = 2;
   if (TailBytes & 1)
     TailWidths[NumTailOps++] = 1;

   SDValue TailLoads[2];
   SDValue TailChains[2];

   // Issue all tail loads first, joined by a token factor...
   uint64_t Offset = 0;
   for (unsigned Idx = 0; Idx != NumTailOps; ++Idx) {
     const unsigned Width = TailWidths[Idx];
     const EVT AccessVT = (Width == 2) ? MVT::i16 : MVT::i8;
     SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                                   DAG.getConstant(Offset, dl, MVT::i32));
     TailLoads[Idx] = DAG.getLoad(AccessVT, dl, Chain, SrcAddr,
                                  SrcPtrInfo.getWithOffset(Offset));
     TailChains[Idx] = TailLoads[Idx].getValue(1);
     Offset += Width;
   }
   Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                       makeArrayRef(TailChains, NumTailOps));

   // ...then the matching stores, which all depend on that token factor.
   Offset = 0;
   for (unsigned Idx = 0; Idx != NumTailOps; ++Idx) {
     SDValue DstAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                                   DAG.getConstant(Offset, dl, MVT::i32));
     TailChains[Idx] = DAG.getStore(Chain, dl, TailLoads[Idx], DstAddr,
                                    DstPtrInfo.getWithOffset(Offset));
     Offset += TailWidths[Idx];
   }
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                      makeArrayRef(TailChains, NumTailOps));
 }

 /// Custom ARM lowering of memmove: always defers to the AEABI memmove helper
 /// selected by EmitSpecializedLibcall() from \p Alignment.
 ///
 /// \p isVolatile, \p DstPtrInfo and \p SrcPtrInfo are not consulted.
 ///
 /// \returns whatever EmitSpecializedLibcall() returns, which may be an empty
 /// SDValue when no specialized helper applies.
 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
     SDValue Size, Align Alignment, bool isVolatile,
     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
   // The helper takes the alignment as a plain byte count.
   const unsigned AlignInBytes = Alignment.value();
   return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, AlignInBytes,
                                 RTLIB::MEMMOVE);
 }

 /// Custom ARM lowering of memset.
 ///
 /// On subtargets with MVE integer ops, and when shouldGenerateInlineTPLoop()
 /// agrees, the fill is emitted as an ARMISD::MEMSETLOOP node whose value
 /// operand is a v16i8 splat of the fill byte. Otherwise the AEABI memset /
 /// memclr helper is used via EmitSpecializedLibcall().
 ///
 /// \p isVolatile and \p DstPtrInfo are not consulted.
 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
     SDValue Size, Align Alignment, bool isVolatile,
     MachinePointerInfo DstPtrInfo) const {
   const ARMSubtarget &ST =
       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
   auto *ConstantSize = dyn_cast<ConstantSDNode>(Size);

   const bool UseTPLoop =
       ST.hasMVEIntegerOps() &&
       shouldGenerateInlineTPLoop(ST, DAG, ConstantSize, Alignment, false);

   // No tail-predicated loop: fall back to the run-time helper.
   if (!UseTPLoop)
     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                   Alignment.value(), RTLIB::MEMSET);

   // Generate the TP loop for llvm.memset: replicate the fill byte across a
   // 16-lane byte vector and hand it to the loop pseudo node.
   SDValue FillByte = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src);
   SDValue Splat = DAG.getSplatBuildVector(MVT::v16i8, dl, FillByte);
   SDValue Len = DAG.getZExtOrTrunc(Size, dl, MVT::i32);
   return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Splat,
                      Len);
 }
	//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the ARMSelectionDAGInfo class.
	//
	//===----------------------------------------------------------------------===//

	#include "ARMTargetMachine.h"
	#include "ARMTargetTransformInfo.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/Support/CommandLine.h"
	using namespace llvm;

	#define DEBUG_TYPE "arm-selectiondag-info"

	// Hidden command-line switch (-arm-memtransfer-tploop) consulted by
	// shouldGenerateInlineTPLoop() in this file to decide whether a memory
	// transfer may be lowered to an inline tail-predicated loop. It defaults to
	// TPLoop::ForceDisabled. Although the help strings only mention memcpy, the
	// same setting is also consulted when lowering memset.
	cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
	"arm-memtransfer-tploop", cl::Hidden,
	cl::desc("Control conversion of memcpy to "
	"Tail predicated loops (WLSTP)"),
	cl::init(TPLoop::ForceDisabled),
	cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
	"Don't convert memcpy to TP loop."),
	clEnumValN(TPLoop::ForceEnabled, "force-enabled",
	"Always convert memcpy to TP loop."),
	clEnumValN(TPLoop::Allow, "allow",
	"Allow (may be subject to certain conditions) "
	"conversion of memcpy to TP loop.")));

	/// Try to replace the generic libcall \p LC with a call to the matching ARM
	/// EABI run-time helper (__aeabi_memcpy, __aeabi_memmove, __aeabi_memset or
	/// __aeabi_memclr).
	///
	/// The helper flavour is picked from \p Align (in bytes): a multiple of 8
	/// selects the "8" variant, a multiple of 4 the "4" variant, and anything
	/// else the unsuffixed one. A memset whose fill value is the constant zero is
	/// turned into a memclr call.
	///
	/// \returns the chain of the lowered call, or an empty SDValue when the
	/// default name of \p LC does not start with "__aeabi" or when \p LC is not
	/// one of MEMCPY, MEMMOVE or MEMSET.
	SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
	    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
	    SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
	  const ARMTargetLowering *Lowering = DAG.getMachineFunction()
	                                          .getSubtarget<ARMSubtarget>()
	                                          .getTargetLowering();

	  // The specialized helpers are only used when the default implementation of
	  // this libcall is itself an AEABI function.
	  const char *DefaultName = Lowering->getLibcallName(LC);
	  if (std::strncmp(DefaultName, "__aeabi", 7) != 0)
	    return SDValue();

	  // Row of the helper-name table below. memclr has no RTLIB counterpart, so
	  // the libcall is first mapped onto this local enumeration.
	  enum HelperKind { MemcpyRow = 0, MemmoveRow, MemsetRow, MemclrRow };
	  HelperKind Row;
	  if (LC == RTLIB::MEMCPY) {
	    Row = MemcpyRow;
	  } else if (LC == RTLIB::MEMMOVE) {
	    Row = MemmoveRow;
	  } else if (LC == RTLIB::MEMSET) {
	    // A fill with constant zero is routed to the dedicated memclr helper.
	    auto *FillConst = dyn_cast<ConstantSDNode>(Src);
	    Row = (FillConst && FillConst->getZExtValue() == 0) ? MemclrRow
	                                                         : MemsetRow;
	  } else {
	    return SDValue();
	  }

	  // Column of the helper-name table: the most-aligned variant we may use.
	  const unsigned Col = (Align % 8 == 0) ? 2 : ((Align % 4 == 0) ? 1 : 0);

	  LLVMContext &Ctx = *DAG.getContext();
	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Arg;
	  Arg.Ty = DAG.getDataLayout().getIntPtrType(Ctx);

	  // Every helper takes the destination pointer first.
	  Arg.Node = Dst;
	  Args.push_back(Arg);

	  switch (Row) {
	  case MemclrRow:
	    // memclr(dest, size)
	    Arg.Node = Size;
	    Args.push_back(Arg);
	    break;
	  case MemsetRow: {
	    // EABI memset takes (ptr, size, value) whereas the GNU library uses
	    // (ptr, value, size); see RTABI section 4.3.4.
	    Arg.Node = Size;
	    Args.push_back(Arg);

	    // The fill value is passed as an i32: narrow or zero-extend as needed.
	    const EVT FillVT = Src.getValueType();
	    if (FillVT.bitsGT(MVT::i32))
	      Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
	    else if (FillVT.bitsLT(MVT::i32))
	      Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);

	    Arg.Node = Src;
	    Arg.Ty = Type::getInt32Ty(Ctx);
	    Arg.IsSExt = false;
	    Args.push_back(Arg);
	    break;
	  }
	  case MemcpyRow:
	  case MemmoveRow:
	    // memcpy / memmove(dest, src, size)
	    Arg.Node = Src;
	    Args.push_back(Arg);

	    Arg.Node = Size;
	    Args.push_back(Arg);
	    break;
	  }

	  static const char *const HelperNames[4][3] = {
	    { "__aeabi_memcpy", "__aeabi_memcpy4", "__aeabi_memcpy8" },
	    { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
	    { "__aeabi_memset", "__aeabi_memset4", "__aeabi_memset8" },
	    { "__aeabi_memclr", "__aeabi_memclr4", "__aeabi_memclr8" }
	  };
	  SDValue Callee = DAG.getExternalSymbol(
	      HelperNames[Row][Col], Lowering->getPointerTy(DAG.getDataLayout()));

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(Chain)
	      .setLibCallee(Lowering->getLibcallCallingConv(LC), Type::getVoidTy(Ctx),
	                    Callee, std::move(Args))
	      .setDiscardResult();

	  // The helpers return nothing useful; only the output chain is kept.
	  return Lowering->LowerCallTo(CLI).second;
	}

	/// Decide whether a memcpy or memset should be lowered to an inline
	/// tail-predicated loop, based on the -arm-memtransfer-tploop setting, the
	/// attributes of the current function, and the properties of the transfer.
	///
	/// \param ConstantSize the transfer size when it is a constant node, null
	///        otherwise.
	/// \param IsMemcpy true for memcpy; false selects the memset policy, which
	///        only depends on the option and the function attributes.
	static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
	                                       const SelectionDAG &DAG,
	                                       ConstantSDNode *ConstantSize,
	                                       Align Alignment, bool IsMemcpy) {
	  // The option converts to false when set to its disabled value.
	  if (!EnableMemtransferTPLoop)
	    return false;
	  if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
	    return true;

	  // Remaining setting: apply heuristics. Never use an inline TP loop when
	  // optimization is disabled or when optimizing for size (-Os or -Oz).
	  // (The logical-or here had been corrupted into an escaped "\|\|" sequence,
	  // which is not valid C++.)
	  const auto &Fn = DAG.getMachineFunction().getFunction();
	  if (Fn.hasOptNone() || Fn.hasOptSize())
	    return false;

	  // memset needs no further checks.
	  if (!IsMemcpy)
	    return true;

	  // memcpy of unknown size: only worthwhile with at least 4-byte alignment.
	  if (!ConstantSize)
	    return Alignment >= Align(4);

	  // memcpy of known size: only for sizes strictly between the plain inline
	  // threshold and the TP-loop inline threshold.
	  const uint64_t Bytes = ConstantSize->getZExtValue();
	  return Bytes > Subtarget.getMaxInlineSizeThreshold() &&
	         Bytes < Subtarget.getMaxMemcpyTPInlineSizeThreshold();
	}

	/// Custom ARM lowering of a memcpy of \p Size bytes from \p Src to \p Dst.
	///
	/// Strategies are tried in this order:
	///  1. On subtargets with MVE integer ops, when shouldGenerateInlineTPLoop()
	///     agrees, a single ARMISD::MEMCPYLOOP node.
	///  2. With less than 4-byte alignment, nothing: an empty SDValue is returned.
	///  3. For a non-constant size, or a constant size above the subtarget's
	///     inline threshold (unless \p AlwaysInline), the AEABI memcpy helper via
	///     EmitSpecializedLibcall().
	///  4. Otherwise a sequence of ARMISD::MEMCPY pseudo nodes copying whole
	///     words, followed by a halfword and/or byte load/store pair for the
	///     remaining 1-3 bytes. When the subtarget reports minsize and more than
	///     one MEMCPY node would be needed, an empty SDValue is returned instead.
	///
	/// \p isVolatile is not consulted by this implementation.
	///
	/// \returns the resulting chain, or an empty SDValue when no custom lowering
	/// is produced.
	SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
	    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
	    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
	    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
	  const ARMSubtarget &ST =
	      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
	  auto *ConstantSize = dyn_cast<ConstantSDNode>(Size);

	  // Strategy 1: MVE tail-predicated copy loop.
	  const bool UseTPLoop =
	      ST.hasMVEIntegerOps() &&
	      shouldGenerateInlineTPLoop(ST, DAG, ConstantSize, Alignment, true);
	  if (UseTPLoop) {
	    SDValue Len = DAG.getZExtOrTrunc(Size, dl, MVT::i32);
	    return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
	                       Len);
	  }

	  // Everything below copies in 4-byte units (to be improved) and therefore
	  // requires 4-byte alignment.
	  if (Alignment < Align(4))
	    return SDValue();

	  auto CallAEABIMemcpy = [&]() {
	    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
	                                  Alignment.value(), RTLIB::MEMCPY);
	  };

	  // Inline expansion needs a constant size, preferably within a
	  // subtarget-specific limit.
	  if (!ConstantSize)
	    return CallAEABIMemcpy();
	  const uint64_t SizeVal = ConstantSize->getZExtValue();
	  if (!AlwaysInline && SizeVal > ST.getMaxInlineSizeThreshold())
	    return CallAEABIMemcpy();

	  const unsigned WordBytes = 4;
	  const unsigned NumWords = static_cast<unsigned>(SizeVal >> 2);
	  const unsigned TailBytes = static_cast<unsigned>(SizeVal & 3);

	  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers.
	  const unsigned MaxLoadsInLDM = ST.isThumb1Only() ? 4 : 6;

	  // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
	  // VLDM/VSTM and make this code emit it when appropriate. This would reduce
	  // pressure on the general purpose registers. However this seems harder to map
	  // onto the register allocator's view of the world.

	  // Number of MEMCPY pseudo-instructions to emit. Each one moves up to
	  // MaxLoadsInLDM registers and is lowered into ldm/stm later on, so this
	  // rounded-up quotient is the minimum number of MEMCPY operations needed.
	  const unsigned NumMEMCPYs = (NumWords + MaxLoadsInLDM - 1) / MaxLoadsInLDM;

	  // Code size optimisation: do not inline the memcpy if the expansion results
	  // in more instructions than the library call.
	  if (NumMEMCPYs > 1 && ST.hasMinSize())
	    return SDValue();

	  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);

	  unsigned WordsDone = 0;
	  for (unsigned Op = 1; Op <= NumMEMCPYs; ++Op) {
	    // Spread the registers evenly over the MEMCPY operations to reduce
	    // register pressure.
	    const unsigned WordsThroughThisOp = NumWords * Op / NumMEMCPYs;
	    const unsigned NumRegs = WordsThroughThisOp - WordsDone;

	    // Results: 0 = advanced destination, 1 = advanced source, 2 = chain.
	    SDValue Copy = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
	                               DAG.getConstant(NumRegs, dl, MVT::i32));
	    Dst = Copy;
	    Src = Copy.getValue(1);
	    Chain = Copy.getValue(2);

	    DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * WordBytes);
	    SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * WordBytes);

	    WordsDone = WordsThroughThisOp;
	  }

	  if (TailBytes == 0)
	    return Chain;

	  // Trailing 1-3 bytes: at most one halfword access followed by at most one
	  // byte access.
	  unsigned TailWidths[2];
	  unsigned NumTailOps = 0;
	  if (TailBytes & 2)
	    TailWidths[NumTailOps++] = 2;
	  if (TailBytes & 1)
	    TailWidths[NumTailOps++] = 1;

	  SDValue TailLoads[2];
	  SDValue TailChains[2];

	  // Issue all tail loads first, joined by a token factor...
	  uint64_t Offset = 0;
	  for (unsigned Idx = 0; Idx != NumTailOps; ++Idx) {
	    const unsigned Width = TailWidths[Idx];
	    const EVT AccessVT = (Width == 2) ? MVT::i16 : MVT::i8;
	    SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
	                                  DAG.getConstant(Offset, dl, MVT::i32));
	    TailLoads[Idx] = DAG.getLoad(AccessVT, dl, Chain, SrcAddr,
	                                 SrcPtrInfo.getWithOffset(Offset));
	    TailChains[Idx] = TailLoads[Idx].getValue(1);
	    Offset += Width;
	  }
	  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	                      makeArrayRef(TailChains, NumTailOps));

	  // ...then the matching stores, which all depend on that token factor.
	  Offset = 0;
	  for (unsigned Idx = 0; Idx != NumTailOps; ++Idx) {
	    SDValue DstAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
	                                  DAG.getConstant(Offset, dl, MVT::i32));
	    TailChains[Idx] = DAG.getStore(Chain, dl, TailLoads[Idx], DstAddr,
	                                   DstPtrInfo.getWithOffset(Offset));
	    Offset += TailWidths[Idx];
	  }
	  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	                     makeArrayRef(TailChains, NumTailOps));
	}

	/// Custom ARM lowering of memmove: always defers to the AEABI memmove helper
	/// selected by EmitSpecializedLibcall() from \p Alignment.
	///
	/// \p isVolatile, \p DstPtrInfo and \p SrcPtrInfo are not consulted.
	///
	/// \returns whatever EmitSpecializedLibcall() returns, which may be an empty
	/// SDValue when no specialized helper applies.
	SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
	    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
	    SDValue Size, Align Alignment, bool isVolatile,
	    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
	  // The helper takes the alignment as a plain byte count.
	  const unsigned AlignInBytes = Alignment.value();
	  return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, AlignInBytes,
	                                RTLIB::MEMMOVE);
	}

	/// Custom ARM lowering of memset.
	///
	/// On subtargets with MVE integer ops, and when shouldGenerateInlineTPLoop()
	/// agrees, the fill is emitted as an ARMISD::MEMSETLOOP node whose value
	/// operand is a v16i8 splat of the fill byte. Otherwise the AEABI memset /
	/// memclr helper is used via EmitSpecializedLibcall().
	///
	/// \p isVolatile and \p DstPtrInfo are not consulted.
	SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
	    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
	    SDValue Size, Align Alignment, bool isVolatile,
	    MachinePointerInfo DstPtrInfo) const {
	  const ARMSubtarget &ST =
	      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
	  auto *ConstantSize = dyn_cast<ConstantSDNode>(Size);

	  const bool UseTPLoop =
	      ST.hasMVEIntegerOps() &&
	      shouldGenerateInlineTPLoop(ST, DAG, ConstantSize, Alignment, false);

	  // No tail-predicated loop: fall back to the run-time helper.
	  if (!UseTPLoop)
	    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
	                                  Alignment.value(), RTLIB::MEMSET);

	  // Generate the TP loop for llvm.memset: replicate the fill byte across a
	  // 16-lane byte vector and hand it to the loop pseudo node.
	  SDValue FillByte = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src);
	  SDValue Splat = DAG.getSplatBuildVector(MVT::v16i8, dl, FillByte);
	  SDValue Len = DAG.getZExtOrTrunc(Size, dl, MVT::i32);
	  return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Splat,
	                     Len);
	}