llvm/lib/Target/VE/VEISelLowering.cpp - llvm-project - Git at Google

 //===-- VEISelLowering.cpp - VE DAG Lowering Implementation ---------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements the interfaces that VE uses to lower LLVM code into a
 // selection DAG.
 //
 //===----------------------------------------------------------------------===//

 #include "VEISelLowering.h"
 #include "MCTargetDesc/VEMCExpr.h"
 #include "VEMachineFunctionInfo.h"
 #include "VERegisterInfo.h"
 #include "VETargetMachine.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/KnownBits.h"
 using namespace llvm;

 #define DEBUG_TYPE "ve-lower"

 //===----------------------------------------------------------------------===//
 // Calling Convention Implementation
 //===----------------------------------------------------------------------===//

 /// Calling-convention helper that places an f32 value on the stack.
 ///
 /// The value is given bytes 4-7 of a slot whose start is 8-byte aligned; the
 /// first four bytes of the slot are left empty. Returns true when the value
 /// was assigned a location, false for any type other than f32.
 static bool allocateFloat(unsigned ValNo, MVT ValVT, MVT LocVT,
                           CCValAssign::LocInfo LocInfo,
                           ISD::ArgFlagsTy ArgFlags, CCState &State) {
   // Anything that is not f32 is not handled here.
   if (LocVT.SimpleTy != MVT::f32)
     return false;

   // Slot layout:
   //    0      4
   //    +------+------+
   //    | empty| float|
   //    +------+------+
   // Reserve the empty half with align=8 so the whole slot begins on an
   // 8-byte boundary.
   State.AllocateStack(4, 8);
   // Reserve the value half with align=4 so it sits directly after the empty
   // half.
   const unsigned ValueOffset = State.AllocateStack(4, 4);
   State.addLoc(
       CCValAssign::getMem(ValNo, ValVT, ValueOffset, LocVT, LocInfo));
   return true;
 }

 #include "VEGenCallingConv.inc"

 /// Reports whether the return values described by \p Outs can all be
 /// assigned locations by the RetCC_VE return convention.
 bool VETargetLowering::CanLowerReturn(
     CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
   SmallVector<CCValAssign, 16> ReturnLocs;
   CCState ReturnState(CallConv, IsVarArg, MF, ReturnLocs, Context);
   return ReturnState.CheckReturn(Outs, RetCC_VE);
 }

 /// Lowers a function return.
 ///
 /// Each value in \p OutVals is extended as requested by RetCC_VE, copied into
 /// its assigned register, and the copies are glued together in front of a
 /// VEISD::RET_FLAG node, which is returned.
 SDValue
 VETargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool IsVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &DL, SelectionDAG &DAG) const {
   // Locations chosen for the return values.
   SmallVector<CCValAssign, 16> RVLocs;

   // Register / stack bookkeeping for the analysis below.
   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
                  *DAG.getContext());
   CCInfo.AnalyzeReturn(Outs, RetCC_VE);

   SDValue Glue;
   // Operand 0 is the chain; it is refreshed once all copies are emitted.
   SmallVector<SDValue, 4> RetOps(1, Chain);

   // Move every result into its output register.
   for (unsigned Idx = 0; Idx != RVLocs.size(); ++Idx) {
     CCValAssign &VA = RVLocs[Idx];
     assert(VA.isRegLoc() && "Can only return in registers!");
     SDValue OutVal = OutVals[Idx];

     // Integer return values must be sign or zero extended by the callee.
     // Pick the extension opcode first; DELETED_NODE means "no extension".
     ISD::NodeType ExtOpc = ISD::DELETED_NODE;
     switch (VA.getLocInfo()) {
     case CCValAssign::Full:
       break;
     case CCValAssign::SExt:
       ExtOpc = ISD::SIGN_EXTEND;
       break;
     case CCValAssign::ZExt:
       ExtOpc = ISD::ZERO_EXTEND;
       break;
     case CCValAssign::AExt:
       ExtOpc = ISD::ANY_EXTEND;
       break;
     default:
       llvm_unreachable("Unknown loc info!");
     }
     if (ExtOpc != ISD::DELETED_NODE)
       OutVal = DAG.getNode(ExtOpc, DL, VA.getLocVT(), OutVal);

     assert(!VA.needsCustom() && "Unexpected custom lowering");

     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVal, Glue);

     // Thread the glue so that all emitted copies stay adjacent.
     Glue = Chain.getValue(1);
     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   }

   // Replace the placeholder chain with the one that follows the copies.
   RetOps[0] = Chain;

   // Append the glue only if at least one copy produced it.
   if (Glue.getNode())
     RetOps.push_back(Glue);

   return DAG.getNode(VEISD::RET_FLAG, DL, MVT::Other, RetOps);
 }

 /// Lowers the incoming formal arguments of the current function.
 ///
 /// Locations are assigned by CC_VE after the 64-byte preserved area has been
 /// reserved. Register arguments are read from live-in registers (with
 /// Assert?ext / TRUNCATE nodes as needed); stack arguments are loaded from
 /// fixed frame objects placed at the location offset plus 176. One SDValue
 /// per argument is appended to \p InVals. For variadic functions the offset
 /// of the first variable argument is recorded in VEMachineFunctionInfo.
 /// Returns \p Chain unchanged.
 SDValue VETargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
   MachineFunction &MF = DAG.getMachineFunction();

   // Base offset of the incoming arguments stack space.
   const unsigned ArgsBaseOffset = 176;
   // Size of the preserved arguments area.
   const unsigned ArgsPreserved = 64;

   // Assign locations according to CC_VE. The preserved area is reserved
   // up front so the stack offsets CC_VE computes already account for it.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
   CCInfo.AllocateStack(ArgsPreserved, 8);
   CCInfo.AnalyzeFormalArguments(Ins, CC_VE);

   for (CCValAssign &VA : ArgLocs) {
     if (!VA.isRegLoc()) {
       // The registers are exhausted. This argument was passed on the stack.
       assert(VA.isMemLoc());
       // CC_VE offsets are relative to the start of the arguments area at
       // %fp+176.
       const unsigned FrameOffset = VA.getLocMemOffset() + ArgsBaseOffset;
       const unsigned ValueBytes = VA.getValVT().getSizeInBits() / 8;
       const int FI =
           MF.getFrameInfo().CreateFixedObject(ValueBytes, FrameOffset, true);
       SDValue SlotAddr =
           DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout()));
       InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, SlotAddr,
                                    MachinePointerInfo::getFixedStack(MF, FI)));
       continue;
     }

     // This argument is passed in a register.
     // All integer register arguments are promoted by the caller to i64.

     // Bind the physical register to a fresh virtual register and read it.
     const unsigned VReg =
         MF.addLiveIn(VA.getLocReg(), getRegClassFor(VA.getLocVT()));
     SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());

     // i32 struct elements live in the high bits of the register.
     if (VA.getValVT() == MVT::i32 && VA.needsCustom())
       Arg = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Arg,
                         DAG.getConstant(32, DL, MVT::i32));

     // The caller already promoted the argument; record that with an
     // Assert?ext node so it is not promoted again in this function.
     const CCValAssign::LocInfo Info = VA.getLocInfo();
     if (Info == CCValAssign::SExt)
       Arg = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Arg,
                         DAG.getValueType(VA.getValVT()));
     else if (Info == CCValAssign::ZExt)
       Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg,
                         DAG.getValueType(VA.getValVT()));

     // Narrow the register value back to the declared argument type.
     if (VA.isExtInLoc())
       Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);

     InVals.push_back(Arg);
   }

   if (IsVarArg) {
     // Some of the variable arguments may have been passed in registers.
     //
     // The va_start intrinsic needs to know the offset to the first variable
     // argument.
     // TODO: need to calculate offset correctly once we support f128.
     const unsigned ArgOffset = ArgLocs.size() * 8;
     VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
     // Skip the 176 bytes of register save area.
     FuncInfo->setVarArgsFrameOffset(ArgOffset + ArgsBaseOffset);
   }

   return Chain;
 }

 // FIXME? Maybe this could be a TableGen attribute on some registers and
 // this table could be generated automatically from RegInfo.
 /// Maps a register name to the corresponding VE register.
 ///
 /// Aborts with a fatal error when \p RegName is not one of the names in the
 /// table below.
 Register VETargetLowering::getRegisterByName(const char *RegName, LLT VT,
                                              const MachineFunction &MF) const {
   Register Found = StringSwitch<Register>(RegName)
                        .Case("sp", VE::SX11)    // Stack pointer
                        .Case("fp", VE::SX9)     // Frame pointer
                        .Case("sl", VE::SX8)     // Stack limit
                        .Case("lr", VE::SX10)    // Link register
                        .Case("tp", VE::SX14)    // Thread pointer
                        .Case("outer", VE::SX12) // Outer register
                        .Case("info", VE::SX17)  // Info area register
                        .Case("got", VE::SX15)   // Global offset table register
                        .Case("plt", VE::SX16) // Procedure linkage table register
                        .Default(0);

   // The default (0) marks a name that is not in the table.
   if (!Found)
     report_fatal_error("Invalid register name global variable");

   return Found;
 }

 //===----------------------------------------------------------------------===//
 // TargetLowering Implementation
 //===----------------------------------------------------------------------===//

 /// Lowers an outgoing call described by \p CLI into SelectionDAG nodes.
 ///
 /// Steps performed, in order:
 ///  - tail calls are unconditionally disabled;
 ///  - argument locations are computed with CC_VE (and a second time with
 ///    CC_VE2 for variadic calls, which pass register arguments on the stack
 ///    as well);
 ///  - the callee address is materialized and scheduled to be copied to SX12;
 ///  - stack arguments are stored at %sp + 176 + location offset;
 ///  - register copies are glued to a VEISD::CALL node bracketed by
 ///    CALLSEQ_START / CALLSEQ_END;
 ///  - results are read from the registers chosen by RetCC_VE and appended to
 ///    \p InVals.
 ///
 /// Returns the chain that follows the call and the result copies.
 SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                     SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG = CLI.DAG;
   SDLoc DL = CLI.DL;
   SDValue Chain = CLI.Chain;
   auto PtrVT = getPointerTy(DAG.getDataLayout());

   // VE target does not yet support tail call optimization.
   CLI.IsTailCall = false;

   // Get the base offset of the outgoing arguments stack space.
   unsigned ArgsBaseOffset = 176;
   // Get the size of the preserved arguments area
   unsigned ArgsPreserved = 8 * 8u;

   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), ArgLocs,
                  *DAG.getContext());
   // Allocate the preserved area first.
   CCInfo.AllocateStack(ArgsPreserved, 8);
   // We already allocated the preserved area, so the stack offset computed
   // by CC_VE would be correct now.
   CCInfo.AnalyzeCallOperands(CLI.Outs, CC_VE);

   // VE requires to use both register and stack for varargs or no-prototyped
   // functions.
   bool UseBoth = CLI.IsVarArg;

   // Analyze operands again if it is required to store BOTH.
   // ArgLocs2 stays empty (and is never indexed) when UseBoth is false.
   SmallVector<CCValAssign, 16> ArgLocs2;
   CCState CCInfo2(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
                   ArgLocs2, *DAG.getContext());
   if (UseBoth)
     CCInfo2.AnalyzeCallOperands(CLI.Outs, CC_VE2);

   // Get the size of the outgoing arguments stack space requirement.
   unsigned ArgsSize = CCInfo.getNextStackOffset();

   // Keep stack frames 16-byte aligned.
   ArgsSize = alignTo(ArgsSize, 16);

   // Adjust the stack pointer to make room for the arguments.
   // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
   // with more than 6 arguments.
   Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL);

   // Collect the set of registers to pass to the function and their values.
   // This will be emitted as a sequence of CopyToReg nodes glued to the call
   // instruction.
   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

   // Collect chains from all the memory operations that copy arguments to the
   // stack. They must follow the stack pointer adjustment above and precede the
   // call instruction itself.
   SmallVector<SDValue, 8> MemOpChains;

   // VE needs to get address of callee function in a register
   // So, prepare to copy it to SX12 here.

   // If the callee is a GlobalAddress node (quite common, every direct call is)
   // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
   // Likewise ExternalSymbol -> TargetExternalSymbol.
   SDValue Callee = CLI.Callee;

   bool IsPICCall = isPositionIndependent();

   // PC-relative references to external symbols should go through $stub.
   // If so, we need to prepare GlobalBaseReg first.
   const TargetMachine &TM = DAG.getTarget();
   const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
   // GV stays null unless the callee is a GlobalAddress node.
   const GlobalValue *GV = nullptr;
   auto *CalleeG = dyn_cast<GlobalAddressSDNode>(Callee);
   if (CalleeG)
     GV = CalleeG->getGlobal();
   bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
   bool UsePlt = !Local;
   MachineFunction &MF = DAG.getMachineFunction();

   // Turn GlobalAddress/ExternalSymbol node into a value node
   // containing the address of them here.
   // Any other kind of callee (e.g. an indirect call) is used as-is.
   if (CalleeG) {
     if (IsPICCall) {
       // Only the side effect of getGlobalBaseReg is wanted here; its return
       // value is intentionally unused.
       if (UsePlt)
         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
     } else {
       Callee =
           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
     }
   } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     if (IsPICCall) {
       if (UsePlt)
         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
       Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0);
       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
     } else {
       Callee =
           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
     }
   }

   // The callee address is always the first register copy.
   RegsToPass.push_back(std::make_pair(VE::SX12, Callee));

   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
     SDValue Arg = CLI.OutVals[i];

     // Promote the value if needed.
     switch (VA.getLocInfo()) {
     default:
       llvm_unreachable("Unknown location info!");
     case CCValAssign::Full:
       break;
     case CCValAssign::SExt:
       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
       break;
     case CCValAssign::ZExt:
       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
       break;
     case CCValAssign::AExt:
       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
       break;
     }

     if (VA.isRegLoc()) {
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
       if (!UseBoth)
         continue;
       // Fall through to also store the value on the stack. VA is a reference
       // into ArgLocs, so this assignment overwrites ArgLocs[i] with the
       // location computed by CC_VE2.
       VA = ArgLocs2[i];
     }

     assert(VA.isMemLoc());

     // Create a store off the stack pointer for this argument.
     SDValue StackPtr = DAG.getRegister(VE::SX11, PtrVT);
     // The argument area starts at %fp+176 in the callee frame,
     // %sp+176 in ours.
     SDValue PtrOff =
         DAG.getIntPtrConstant(VA.getLocMemOffset() + ArgsBaseOffset, DL);
     PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
     MemOpChains.push_back(
         DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
   }

   // Emit all stores, make sure they occur before the call.
   if (!MemOpChains.empty())
     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

   // Build a sequence of CopyToReg nodes glued together with token chain and
   // glue operands which copy the outgoing args into registers. The InGlue is
   // necessary since all emitted instructions must be stuck together in order
   // to pass the live physical registers.
   SDValue InGlue;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
     Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
                              RegsToPass[i].second, InGlue);
     InGlue = Chain.getValue(1);
   }

   // Build the operands for the call instruction itself.
   // Operand order: chain, argument registers, register mask, optional glue.
   SmallVector<SDValue, 8> Ops;
   Ops.push_back(Chain);
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                   RegsToPass[i].second.getValueType()));

   // Add a register mask operand representing the call-preserved registers.
   const VERegisterInfo *TRI = Subtarget->getRegisterInfo();
   const uint32_t *Mask =
       TRI->getCallPreservedMask(DAG.getMachineFunction(), CLI.CallConv);
   assert(Mask && "Missing call preserved mask for calling convention");
   Ops.push_back(DAG.getRegisterMask(Mask));

   // Make sure the CopyToReg nodes are glued to the call instruction which
   // consumes the registers.
   if (InGlue.getNode())
     Ops.push_back(InGlue);

   // Now the call itself.
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   Chain = DAG.getNode(VEISD::CALL, DL, NodeTys, Ops);
   InGlue = Chain.getValue(1);

   // Revert the stack pointer immediately after the call.
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
                              DAG.getIntPtrConstant(0, DL, true), InGlue, DL);
   InGlue = Chain.getValue(1);

   // Now extract the return values. This is more or less the same as
   // LowerFormalArguments.

   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
   CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), RVLocs,
                  *DAG.getContext());

   // Set inreg flag manually for codegen generated library calls that
   // return float.
   // A single f32 result with no call site (CLI.CS unset) is treated as such a
   // library call.
   if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && !CLI.CS)
     CLI.Ins[0].Flags.setInReg();

   RVInfo.AnalyzeCallResult(CLI.Ins, RetCC_VE);

   // Copy all of the result registers out of their specified physreg.
   for (unsigned i = 0; i != RVLocs.size(); ++i) {
     CCValAssign &VA = RVLocs[i];
     unsigned Reg = VA.getLocReg();

     // When returning 'inreg {i32, i32 }', two consecutive i32 arguments can
     // reside in the same register in the high and low bits. Reuse the
     // CopyFromReg previous node to avoid duplicate copies.
     SDValue RV;
     if (RegisterSDNode *SrcReg = dyn_cast<RegisterSDNode>(Chain.getOperand(1)))
       if (SrcReg->getReg() == Reg && Chain->getOpcode() == ISD::CopyFromReg)
         RV = Chain.getValue(0);

     // But usually we'll create a new CopyFromReg for a different register.
     if (!RV.getNode()) {
       RV = DAG.getCopyFromReg(Chain, DL, Reg, RVLocs[i].getLocVT(), InGlue);
       // CopyFromReg results: 0 = value, 1 = chain, 2 = glue.
       Chain = RV.getValue(1);
       InGlue = Chain.getValue(2);
     }

     // Get the high bits for i32 struct elements.
     if (VA.getValVT() == MVT::i32 && VA.needsCustom())
       RV = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), RV,
                        DAG.getConstant(32, DL, MVT::i32));

     // The callee promoted the return value, so insert an Assert?ext SDNode so
     // we won't promote the value again in this function.
     switch (VA.getLocInfo()) {
     case CCValAssign::SExt:
       RV = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), RV,
                        DAG.getValueType(VA.getValVT()));
       break;
     case CCValAssign::ZExt:
       RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV,
                        DAG.getValueType(VA.getValVT()));
       break;
     default:
       break;
     }

     // Truncate the register down to the return value type.
     if (VA.isExtInLoc())
       RV = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), RV);

     InVals.push_back(RV);
   }

   return Chain;
 }

 /// isFPImmLegal - Tells the legalizer whether an FP immediate of type \p VT
 /// can be instruction selected natively. When this returns false, the
 /// legalizer materializes the immediate as a load from a constant pool.
 /// On VE every f32 and f64 immediate is accepted, regardless of its value.
 bool VETargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                     bool ForCodeSize) const {
   if (VT == MVT::f32)
     return true;
   return VT == MVT::f64;
 }

 /// Determine if the target supports unaligned memory accesses.
 ///
 /// Returns true when unaligned accesses of the given type in the given
 /// address space are allowed, and additionally reports through \p Fast (when
 /// it is non-null) whether such accesses are "fast". Callers use this, for
 /// example, when an array copy/move/set is turned into a sequence of stores,
 /// so that the replacement does not trap on a misaligned address.
 ///
 /// VE accepts every misaligned access and always reports it as fast.
 bool VETargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                       unsigned AddrSpace,
                                                       unsigned Align,
                                                       MachineMemOperand::Flags,
                                                       bool *Fast) const {
   // The out-parameter is optional; it's fast anytime on VE.
   if (Fast != nullptr)
     *Fast = true;
   return true;
 }

 /// Sets up register classes and legalization actions for the VE target.
 VETargetLowering::VETargetLowering(const TargetMachine &TM,
                                    const VESubtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
   // Instructions which use registers as conditionals examine all the
   // bits (as does the pseudo SELECT_CC expansion). It probably does not
   // matter much whether ZeroOrOneBooleanContent or
   // ZeroOrNegativeOneBooleanContent is used, so the former is chosen
   // arbitrarily.
   setBooleanContents(ZeroOrOneBooleanContent);
   setBooleanVectorContents(ZeroOrOneBooleanContent);

   // Register classes for the legal scalar types. Note that f64 shares the
   // I64 register class.
   addRegisterClass(MVT::i32, &VE::I32RegClass);
   addRegisterClass(MVT::i64, &VE::I64RegClass);
   addRegisterClass(MVT::f32, &VE::F32RegClass);
   addRegisterClass(MVT::f64, &VE::I64RegClass);

   /// Load & Store {
   for (MVT ValVT : MVT::fp_valuetypes()) {
     for (MVT MemVT : MVT::fp_valuetypes()) {
       // FP extload becomes load + fpextend.
       setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);

       // FP truncstore becomes trunc + store.
       setTruncStoreAction(ValVT, MemVT, Expand);
     }
   }

   // VE doesn't have i1 extending loads; promote them. i1 truncating stores
   // are expanded.
   for (MVT VT : MVT::integer_valuetypes()) {
     for (ISD::LoadExtType ExtTy :
          {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD})
       setLoadExtAction(ExtTy, VT, MVT::i1, Promote);
     setTruncStoreAction(VT, MVT::i1, Expand);
   }
   /// } Load & Store

   // Custom legalize address nodes into LO/HI parts.
   MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
   for (unsigned AddrOpc :
        {ISD::BlockAddress, ISD::GlobalAddress, ISD::GlobalTLSAddress})
     setOperationAction(AddrOpc, PtrVT, Custom);

   /// VAARG handling {
   setOperationAction(ISD::VASTART, MVT::Other, Custom);
   // VAARG needs to be lowered to access with 8 bytes alignment.
   setOperationAction(ISD::VAARG, MVT::Other, Custom);
   // VACOPY and VAEND use the default implementation.
   setOperationAction(ISD::VACOPY, MVT::Other, Expand);
   setOperationAction(ISD::VAEND, MVT::Other, Expand);
   /// } VAARG handling

   /// Int Ops {
   for (MVT IntVT : {MVT::i32, MVT::i64}) {
     // VE has no REM or DIVREM operations; CTTZ and rotates are expanded too.
     for (unsigned Opc : {ISD::UREM, ISD::SREM, ISD::SDIVREM, ISD::UDIVREM,
                          ISD::CTTZ, ISD::ROTL, ISD::ROTR})
       setOperationAction(Opc, IntVT, Expand);

     // Use isel patterns for i32 and i64.
     for (unsigned Opc : {ISD::BSWAP, ISD::CTLZ, ISD::CTPOP})
       setOperationAction(Opc, IntVT, Legal);

     // Use isel patterns for i64, promote i32.
     setOperationAction(ISD::BITREVERSE, IntVT,
                        IntVT == MVT::i32 ? Promote : Legal);
   }
   /// } Int Ops

   /// Conversion {
   // VE doesn't have instructions for fp<->uint, so expand them by llvm
   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote); // use i64
   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote); // use i64
   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);

   // fp16 not supported
   for (MVT FPVT : MVT::fp_valuetypes()) {
     setOperationAction(ISD::FP16_TO_FP, FPVT, Expand);
     setOperationAction(ISD::FP_TO_FP16, FPVT, Expand);
   }
   /// } Conversion

   setStackPointerRegisterToSaveRestore(VE::SX11);

   // Functions are aligned to 16 bytes.
   setMinFunctionAlignment(Align(16));

   // VE stores all arguments with 8-byte alignment.
   setMinStackArgumentAlignment(Align(8));

   computeRegisterProperties(Subtarget->getRegisterInfo());
 }

 const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
 #define TARGET_NODE_CASE(NAME)                                                 \
   case VEISD::NAME:                                                            \
     return "VEISD::" #NAME;
   switch ((VEISD::NodeType)Opcode) {
   case VEISD::FIRST_NUMBER:
     break;
     TARGET_NODE_CASE(Lo)
     TARGET_NODE_CASE(Hi)
     TARGET_NODE_CASE(GETFUNPLT)
     TARGET_NODE_CASE(GETTLSADDR)
     TARGET_NODE_CASE(CALL)
     TARGET_NODE_CASE(RET_FLAG)
     TARGET_NODE_CASE(GLOBAL_BASE_REG)
   }
 #undef TARGET_NODE_CASE
   return nullptr;
 }

 /// Returns the value type produced by SETCC; on VE this is always i32,
 /// independent of the compared type \p VT.
 EVT VETargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
                                          EVT VT) const {
   const MVT SetCCResultVT = MVT::i32;
   return SetCCResultVT;
 }

 // Rebuild Op as the matching Target* address node carrying target flags TF.
 // GlobalAddress, BlockAddress and ExternalSymbol nodes are supported; any
 // other node kind is a programming error.
 SDValue VETargetLowering::withTargetFlags(SDValue Op, unsigned TF,
                                           SelectionDAG &DAG) const {
   if (const auto *Global = dyn_cast<GlobalAddressSDNode>(Op))
     return DAG.getTargetGlobalAddress(Global->getGlobal(), SDLoc(Global),
                                       Global->getValueType(0),
                                       Global->getOffset(), TF);

   // Block addresses are rebuilt with a zero offset.
   if (const auto *Block = dyn_cast<BlockAddressSDNode>(Op))
     return DAG.getTargetBlockAddress(Block->getBlockAddress(),
                                      Op.getValueType(), 0, TF);

   if (const auto *Symbol = dyn_cast<ExternalSymbolSDNode>(Op))
     return DAG.getTargetExternalSymbol(Symbol->getSymbol(),
                                        Symbol->getValueType(0), TF);

   llvm_unreachable("Unhandled address SDNode");
 }

 // Build the high and low halves of the address Op, tagged with the target
 // flags HiTF and LoTF respectively, and return an ADD node that joins them.
 SDValue VETargetLowering::makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
                                        SelectionDAG &DAG) const {
   SDLoc DL(Op);
   EVT VT = Op.getValueType();

   SDValue HiOperand = withTargetFlags(Op, HiTF, DAG);
   SDValue HiPart = DAG.getNode(VEISD::Hi, DL, VT, HiOperand);

   SDValue LoOperand = withTargetFlags(Op, LoTF, DAG);
   SDValue LoPart = DAG.getNode(VEISD::Lo, DL, VT, LoOperand);

   return DAG.getNode(ISD::ADD, DL, VT, HiPart, LoPart);
 }

 // Build the SDNodes that compute the address denoted by Op (a GlobalAddress,
 // ConstantPool, or ExternalSymbol SDNode).
 //
 // Non-PIC code uses an absolute hi/lo pair. PIC code adds a GOT-relative
 // hi/lo pair to the global base register; for symbols without local linkage
 // the result is additionally loaded from the GOT.
 SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   EVT PtrVT = Op.getValueType();

   // Absolute code models.
   if (!isPositionIndependent()) {
     switch (getTargetMachine().getCodeModel()) {
     case CodeModel::Small:
     case CodeModel::Medium:
     case CodeModel::Large:
       // abs64.
       return makeHiLoPair(Op, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
     default:
       llvm_unreachable("Unsupported absolute code model");
     }
   }

   // PIC mode. VE needs a got load for every variable!
   // GLOBAL_BASE_REG is codegen'ed with a call, so tell MFI that this
   // function has calls.
   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
   MFI.setHasCalls(true);

   const auto *GlobalN = dyn_cast<GlobalAddressSDNode>(Op);
   const bool IsLocal =
       isa<ConstantPoolSDNode>(Op) ||
       (GlobalN && GlobalN->getGlobal()->hasLocalLinkage());

   // Local linkage:
   //     lea %s35, %gotoff_lo(.LCPI0_0)
   //     and %s35, %s35, (32)0
   //     lea.sl %s35, %gotoff_hi(.LCPI0_0)(%s35)
   //     adds.l %s35, %s15, %s35                  ; %s15 is GOT
   // Non-local linkage uses %got_lo / %got_hi instead and appends:
   //     ld     %s35, (,%s35)
   // FIXME: use lea.sl %s35, %gotoff_hi(.LCPI0_0)(%s35, %s15)
   const unsigned HiTF =
       IsLocal ? VEMCExpr::VK_VE_GOTOFF_HI32 : VEMCExpr::VK_VE_GOT_HI32;
   const unsigned LoTF =
       IsLocal ? VEMCExpr::VK_VE_GOTOFF_LO32 : VEMCExpr::VK_VE_GOT_LO32;

   SDValue HiLo = makeHiLoPair(Op, HiTF, LoTF, DAG);
   SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
   SDValue BasePlusOffset = DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);

   // For local symbols the sum is the address itself.
   if (IsLocal)
     return BasePlusOffset;

   // Otherwise the sum points at the GOT entry that holds the address.
   return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), BasePlusOffset,
                      MachinePointerInfo::getGOT(DAG.getMachineFunction()));
 }

 /// Custom Lower {

 /// Custom lowering for ISD::GlobalAddress; simply forwards to makeAddress.
 SDValue VETargetLowering::LowerGlobalAddress(SDValue Op,
                                              SelectionDAG &DAG) const {
   return makeAddress(Op, DAG);
 }

 /// Custom lowering for ISD::BlockAddress; simply forwards to makeAddress.
 SDValue VETargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
   return makeAddress(Op, DAG);
 }

 /// Lowers a TLS address using the general dynamic model.
 ///
 /// Emits a VEISD::GETTLSADDR pseudo call wrapped in a 64-byte call sequence
 /// and returns the value copied out of SX0 afterwards.
 SDValue
 VETargetLowering::LowerToTLSGeneralDynamicModel(SDValue Op,
                                                 SelectionDAG &DAG) const {
   SDLoc dl(Op);

   // The nodes built below have this shape:
   //   t1: ch,glue = callseq_start t0, 64, 0
   //   t2: ch,glue = VEISD::GETTLSADDR t1, label, regmask, t1:1
   //   t3: ch,glue = callseq_end t2, 64, 0, t2:1
   //   t4: ptr,ch,glue = CopyFromReg t3, Register $sx0, t3:1
   SDValue Label = withTargetFlags(Op, 0, DAG);
   EVT PtrVT = Op.getValueType();

   // Lowering the machine isd will make sure everything is in the right
   // location.
   SDValue Seq = DAG.getEntryNode();
   SDVTList ChainAndGlue = DAG.getVTList(MVT::Other, MVT::Glue);
   const uint32_t *PreservedMask =
       Subtarget->getRegisterInfo()->getCallPreservedMask(
           DAG.getMachineFunction(), CallingConv::C);

   Seq = DAG.getCALLSEQ_START(Seq, 64, 0, dl);
   SDValue CallOps[] = {Seq, Label, DAG.getRegisterMask(PreservedMask),
                        Seq.getValue(1)};
   Seq = DAG.getNode(VEISD::GETTLSADDR, dl, ChainAndGlue, CallOps);
   Seq = DAG.getCALLSEQ_END(Seq, DAG.getIntPtrConstant(64, dl, true),
                            DAG.getIntPtrConstant(0, dl, true),
                            Seq.getValue(1), dl);
   // The value read from SX0 is what this function returns.
   SDValue TLSAddr =
       DAG.getCopyFromReg(Seq, dl, VE::SX0, PtrVT, Seq.getValue(1));

   // GETTLSADDR will be codegen'ed as call. Inform MFI that function has calls.
   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
   MFI.setHasCalls(true);

   // Also generate code to prepare a GOT register if it is PIC.
   if (isPositionIndependent()) {
     MachineFunction &MF = DAG.getMachineFunction();
     Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
   }

   return TLSAddr;
 }

 /// Custom lowering for ISD::GlobalTLSAddress. Every TLS access is lowered
 /// with the general dynamic model, whatever model was requested.
 SDValue VETargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
   // The current implementation of nld (2.26) doesn't allow local exec model
   // code described in VE-tls_v1.1.pdf (*1) as its input. Instead, we always
   // generate the general dynamic model code sequence.
   //
   // *1: https://www.nec.com/en/global/prod/hpc/aurora/document/VE-tls_v1.1.pdf
   return LowerToTLSGeneralDynamicModel(Op, DAG);
 }

 /// Custom lowering for ISD::VASTART.
 ///
 /// Stores the address of the first variable argument (SX9 plus the var-args
 /// frame offset recorded in VEMachineFunctionInfo) into the va_list memory
 /// given by operand 1.
 SDValue VETargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   MachineFunction &MF = DAG.getMachineFunction();
   VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
   auto PtrVT = getPointerTy(DAG.getDataLayout());

   // The address is formed relative to the frame register, so mark the frame
   // address as taken.
   MF.getFrameInfo().setFrameAddressIsTaken(true);

   // Address of the first variable argument.
   SDValue FirstVarArgAddr =
       DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getRegister(VE::SX9, PtrVT),
                   DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL));

   // vastart just stores that address into the memory location argument.
   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   return DAG.getStore(Op.getOperand(0), DL, FirstVarArgAddr, Op.getOperand(1),
                       MachinePointerInfo(SV));
 }

 /// Custom lowering for ISD::VAARG.
 ///
 /// Loads the current va_list pointer, stores it back advanced by 8 bytes,
 /// and loads the argument from the old pointer. An f32 argument is read from
 /// offset 4 within its 8-byte slot.
 SDValue VETargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   SDNode *Node = Op.getNode();
   SDLoc DL(Node);
   EVT VT = Node->getValueType(0);
   SDValue InChain = Node->getOperand(0);
   SDValue VAListPtr = Node->getOperand(1);
   EVT PtrVT = VAListPtr.getValueType();
   const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();

   // Current position in the variable argument area.
   SDValue ArgAddr =
       DAG.getLoad(PtrVT, DL, InChain, VAListPtr, MachinePointerInfo(SV));
   SDValue LoadChain = ArgAddr.getValue(1);

   // Every vaarg occupies 8 bytes, so the next one is always 8 bytes further.
   SDValue NextArgAddr =
       DAG.getNode(ISD::ADD, DL, PtrVT, ArgAddr, DAG.getIntPtrConstant(8, DL));

   if (VT == MVT::f32) {
     // A float sits in the second half of its slot:
     //    0      4
     //    +------+------+
     //    | empty| float|
     //    +------+------+
     const unsigned FloatOffsetInSlot = 4;
     ArgAddr = DAG.getNode(ISD::ADD, DL, PtrVT, ArgAddr,
                           DAG.getConstant(FloatOffsetInSlot, DL, PtrVT));
   }

   // Write the advanced pointer back to the va_list.
   SDValue StoreChain = DAG.getStore(LoadChain, DL, NextArgAddr, VAListPtr,
                                     MachinePointerInfo(SV));

   // Load the actual argument out of the slot.
   // We can't count on greater alignment than the word size.
   return DAG.getLoad(VT, DL, StoreChain, ArgAddr, MachinePointerInfo(),
                      std::min(PtrVT.getSizeInBits(), VT.getSizeInBits()) / 8);
 }

 /// Dispatches a node marked Custom to its dedicated lowering routine.
 /// Reaching this function with any other opcode is a programming error.
 SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   const unsigned Opcode = Op.getOpcode();
   switch (Opcode) {
   case ISD::BlockAddress:
     return LowerBlockAddress(Op, DAG);
   case ISD::GlobalAddress:
     return LowerGlobalAddress(Op, DAG);
   case ISD::GlobalTLSAddress:
     return LowerGlobalTLSAddress(Op, DAG);
   case ISD::VASTART:
     return LowerVASTART(Op, DAG);
   case ISD::VAARG:
     return LowerVAARG(Op, DAG);
   default:
     llvm_unreachable("Should not custom lower this!");
   }
 }
 /// } Custom Lower