llvm/lib/Target/VE/VEISelLowering.cpp - llvm-project - Git at Google

 //===-- VEISelLowering.cpp - VE DAG Lowering Implementation ---------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements the interfaces that VE uses to lower LLVM code into a
 // selection DAG.
 //
 //===----------------------------------------------------------------------===//

 #include "VEISelLowering.h"
 #include "MCTargetDesc/VEMCExpr.h"
 #include "VEInstrBuilder.h"
 #include "VEMachineFunctionInfo.h"
 #include "VERegisterInfo.h"
 #include "VETargetMachine.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/KnownBits.h"
 using namespace llvm;

 #define DEBUG_TYPE "ve-lower"

 //===----------------------------------------------------------------------===//
 // Calling Convention Implementation
 //===----------------------------------------------------------------------===//

 #include "VEGenCallingConv.inc"

 CCAssignFn *getReturnCC(CallingConv::ID CallConv) {
   switch (CallConv) {
   default:
     return RetCC_VE_C;
   case CallingConv::Fast:
     return RetCC_VE_Fast;
   }
 }

 CCAssignFn *getParamCC(CallingConv::ID CallConv, bool IsVarArg) {
   if (IsVarArg)
     return CC_VE2;
   switch (CallConv) {
   default:
     return CC_VE_C;
   case CallingConv::Fast:
     return CC_VE_Fast;
   }
 }

 bool VETargetLowering::CanLowerReturn(
     CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
   CCAssignFn *RetCC = getReturnCC(CallConv);
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
   return CCInfo.CheckReturn(Outs, RetCC);
 }

 static const MVT AllVectorVTs[] = {MVT::v256i32, MVT::v512i32, MVT::v256i64,
                                    MVT::v256f32, MVT::v512f32, MVT::v256f64};

 static const MVT AllPackedVTs[] = {MVT::v512i32, MVT::v512f32};

 void VETargetLowering::initRegisterClasses() {
   // Set up the register classes.
   addRegisterClass(MVT::i32, &VE::I32RegClass);
   addRegisterClass(MVT::i64, &VE::I64RegClass);
   addRegisterClass(MVT::f32, &VE::F32RegClass);
   addRegisterClass(MVT::f64, &VE::I64RegClass);
   addRegisterClass(MVT::f128, &VE::F128RegClass);

   if (Subtarget->enableVPU()) {
     for (MVT VecVT : AllVectorVTs)
       addRegisterClass(VecVT, &VE::V64RegClass);
     addRegisterClass(MVT::v256i1, &VE::VMRegClass);
     addRegisterClass(MVT::v512i1, &VE::VM512RegClass);
   }
 }

 void VETargetLowering::initSPUActions() {
   const auto &TM = getTargetMachine();
   /// Load & Store {

   // VE doesn't have i1 sign extending load.
   for (MVT VT : MVT::integer_valuetypes()) {
     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
     setTruncStoreAction(VT, MVT::i1, Expand);
   }

   // VE doesn't have floating point extload/truncstore, so expand them.
   for (MVT FPVT : MVT::fp_valuetypes()) {
     for (MVT OtherFPVT : MVT::fp_valuetypes()) {
       setLoadExtAction(ISD::EXTLOAD, FPVT, OtherFPVT, Expand);
       setTruncStoreAction(FPVT, OtherFPVT, Expand);
     }
   }

   // VE doesn't have fp128 load/store, so expand them in custom lower.
   setOperationAction(ISD::LOAD, MVT::f128, Custom);
   setOperationAction(ISD::STORE, MVT::f128, Custom);

   /// } Load & Store

   // Custom legalize address nodes into LO/HI parts.
   MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
   setOperationAction(ISD::BlockAddress, PtrVT, Custom);
   setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
   setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
   setOperationAction(ISD::ConstantPool, PtrVT, Custom);
   setOperationAction(ISD::JumpTable, PtrVT, Custom);

   /// VAARG handling {
   setOperationAction(ISD::VASTART, MVT::Other, Custom);
   // VAARG needs to be lowered to access with 8 bytes alignment.
   setOperationAction(ISD::VAARG, MVT::Other, Custom);
   // Use the default implementation.
   setOperationAction(ISD::VACOPY, MVT::Other, Expand);
   setOperationAction(ISD::VAEND, MVT::Other, Expand);
   /// } VAARG handling

   /// Stack {
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);

   // Use the default implementation.
   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
   /// } Stack

   /// Branch {

   // VE doesn't have BRCOND
   setOperationAction(ISD::BRCOND, MVT::Other, Expand);

   // BR_JT is not implemented yet.
   setOperationAction(ISD::BR_JT, MVT::Other, Expand);

   /// } Branch

   /// Int Ops {
   for (MVT IntVT : {MVT::i32, MVT::i64}) {
     // VE has no REM or DIVREM operations.
     setOperationAction(ISD::UREM, IntVT, Expand);
     setOperationAction(ISD::SREM, IntVT, Expand);
     setOperationAction(ISD::SDIVREM, IntVT, Expand);
     setOperationAction(ISD::UDIVREM, IntVT, Expand);

     // VE has no SHL_PARTS/SRA_PARTS/SRL_PARTS operations.
     setOperationAction(ISD::SHL_PARTS, IntVT, Expand);
     setOperationAction(ISD::SRA_PARTS, IntVT, Expand);
     setOperationAction(ISD::SRL_PARTS, IntVT, Expand);

     // VE has no MULHU/S or U/SMUL_LOHI operations.
     // TODO: Use MPD instruction to implement SMUL_LOHI for i32 type.
     setOperationAction(ISD::MULHU, IntVT, Expand);
     setOperationAction(ISD::MULHS, IntVT, Expand);
     setOperationAction(ISD::UMUL_LOHI, IntVT, Expand);
     setOperationAction(ISD::SMUL_LOHI, IntVT, Expand);

     // VE has no CTTZ, ROTL, ROTR operations.
     setOperationAction(ISD::CTTZ, IntVT, Expand);
     setOperationAction(ISD::ROTL, IntVT, Expand);
     setOperationAction(ISD::ROTR, IntVT, Expand);

     // VE has 64 bits instruction which works as i64 BSWAP operation.  This
     // instruction works fine as i32 BSWAP operation with an additional
     // parameter.  Use isel patterns to lower BSWAP.
     setOperationAction(ISD::BSWAP, IntVT, Legal);

     // VE has only 64 bits instructions which work as i64 BITREVERSE/CTLZ/CTPOP
     // operations.  Use isel patterns for i64, promote for i32.
     LegalizeAction Act = (IntVT == MVT::i32) ? Promote : Legal;
     setOperationAction(ISD::BITREVERSE, IntVT, Act);
     setOperationAction(ISD::CTLZ, IntVT, Act);
     setOperationAction(ISD::CTLZ_ZERO_UNDEF, IntVT, Act);
     setOperationAction(ISD::CTPOP, IntVT, Act);

     // VE has only 64 bits instructions which work as i64 AND/OR/XOR operations.
     // Use isel patterns for i64, promote for i32.
     setOperationAction(ISD::AND, IntVT, Act);
     setOperationAction(ISD::OR, IntVT, Act);
     setOperationAction(ISD::XOR, IntVT, Act);
   }
   /// } Int Ops

   /// Conversion {
   // VE doesn't have instructions for fp<->uint, so expand them by llvm
   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote); // use i64
   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote); // use i64
   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);

   // fp16 not supported
   for (MVT FPVT : MVT::fp_valuetypes()) {
     setOperationAction(ISD::FP16_TO_FP, FPVT, Expand);
     setOperationAction(ISD::FP_TO_FP16, FPVT, Expand);
   }
   /// } Conversion

   /// Floating-point Ops {
   /// Note: Floating-point operations are fneg, fadd, fsub, fmul, fdiv, frem,
   ///       and fcmp.

   // VE doesn't have following floating point operations.
   for (MVT VT : MVT::fp_valuetypes()) {
     setOperationAction(ISD::FNEG, VT, Expand);
     setOperationAction(ISD::FREM, VT, Expand);
   }

   // VE doesn't have fdiv of f128.
   setOperationAction(ISD::FDIV, MVT::f128, Expand);

   for (MVT FPVT : {MVT::f32, MVT::f64}) {
     // f32 and f64 uses ConstantFP.  f128 uses ConstantPool.
     setOperationAction(ISD::ConstantFP, FPVT, Legal);
   }
   /// } Floating-point Ops

   /// Floating-point math functions {

   // VE doesn't have following floating point math functions.
   for (MVT VT : MVT::fp_valuetypes()) {
     setOperationAction(ISD::FABS, VT, Expand);
     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
     setOperationAction(ISD::FCOS, VT, Expand);
     setOperationAction(ISD::FSIN, VT, Expand);
     setOperationAction(ISD::FSQRT, VT, Expand);
   }

   /// } Floating-point math functions

   /// Atomic instructions {

   setMaxAtomicSizeInBitsSupported(64);
   setMinCmpXchgSizeInBits(32);
   setSupportsUnalignedAtomics(false);

   // Use custom inserter for ATOMIC_FENCE.
   setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

   // Other atomic instructions.
   for (MVT VT : MVT::integer_valuetypes()) {
     // Support i8/i16 atomic swap.
     setOperationAction(ISD::ATOMIC_SWAP, VT, Custom);

     // FIXME: Support "atmam" instructions.
     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Expand);

     // VE doesn't have follwing instructions.
     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_CLR, VT, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_NAND, VT, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_MIN, VT, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_MAX, VT, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_UMIN, VT, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_UMAX, VT, Expand);
   }

   /// } Atomic instructions

   /// SJLJ instructions {
   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
   if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
   /// } SJLJ instructions

   // Intrinsic instructions
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
 }

 void VETargetLowering::initVPUActions() {
   for (MVT LegalVecVT : AllVectorVTs) {
     setOperationAction(ISD::BUILD_VECTOR, LegalVecVT, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, LegalVecVT, Legal);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalVecVT, Legal);
     // Translate all vector instructions with legal element types to VVP_*
     // nodes.
     // TODO We will custom-widen into VVP_* nodes in the future. While we are
     // buildling the infrastructure for this, we only do this for legal vector
     // VTs.
 #define HANDLE_VP_TO_VVP(VP_OPC, VVP_NAME)                                     \
   setOperationAction(ISD::VP_OPC, LegalVecVT, Custom);
 #define ADD_VVP_OP(VVP_NAME, ISD_NAME)                                         \
   setOperationAction(ISD::ISD_NAME, LegalVecVT, Custom);
 #include "VVPNodes.def"
   }

   for (MVT LegalPackedVT : AllPackedVTs) {
     setOperationAction(ISD::INSERT_VECTOR_ELT, LegalPackedVT, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalPackedVT, Custom);
   }
 }

 SDValue
 VETargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool IsVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &DL, SelectionDAG &DAG) const {
   // CCValAssign - represent the assignment of the return value to locations.
   SmallVector<CCValAssign, 16> RVLocs;

   // CCState - Info about the registers and stack slot.
   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
                  *DAG.getContext());

   // Analyze return values.
   CCInfo.AnalyzeReturn(Outs, getReturnCC(CallConv));

   SDValue Flag;
   SmallVector<SDValue, 4> RetOps(1, Chain);

   // Copy the result values into the output registers.
   for (unsigned i = 0; i != RVLocs.size(); ++i) {
     CCValAssign &VA = RVLocs[i];
     assert(VA.isRegLoc() && "Can only return in registers!");
     assert(!VA.needsCustom() && "Unexpected custom lowering");
     SDValue OutVal = OutVals[i];

     // Integer return values must be sign or zero extended by the callee.
     switch (VA.getLocInfo()) {
     case CCValAssign::Full:
       break;
     case CCValAssign::SExt:
       OutVal = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), OutVal);
       break;
     case CCValAssign::ZExt:
       OutVal = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), OutVal);
       break;
     case CCValAssign::AExt:
       OutVal = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), OutVal);
       break;
     case CCValAssign::BCvt: {
       // Convert a float return value to i64 with padding.
       //     63     31   0
       //    +------+------+
       //    | float|   0  |
       //    +------+------+
       assert(VA.getLocVT() == MVT::i64);
       assert(VA.getValVT() == MVT::f32);
       SDValue Undef = SDValue(
           DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
       OutVal = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
                                           MVT::i64, Undef, OutVal, Sub_f32),
                        0);
       break;
     }
     default:
       llvm_unreachable("Unknown loc info!");
     }

     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVal, Flag);

     // Guarantee that all emitted copies are stuck together with flags.
     Flag = Chain.getValue(1);
     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   }

   RetOps[0] = Chain; // Update chain.

   // Add the flag if we have it.
   if (Flag.getNode())
     RetOps.push_back(Flag);

   return DAG.getNode(VEISD::RET_FLAG, DL, MVT::Other, RetOps);
 }

 SDValue VETargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
   MachineFunction &MF = DAG.getMachineFunction();

   // Get the base offset of the incoming arguments stack space.
   unsigned ArgsBaseOffset = Subtarget->getRsaSize();
   // Get the size of the preserved arguments area
   unsigned ArgsPreserved = 64;

   // Analyze arguments according to CC_VE.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
                  *DAG.getContext());
   // Allocate the preserved area first.
   CCInfo.AllocateStack(ArgsPreserved, Align(8));
   // We already allocated the preserved area, so the stack offset computed
   // by CC_VE would be correct now.
   CCInfo.AnalyzeFormalArguments(Ins, getParamCC(CallConv, false));

   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
     assert(!VA.needsCustom() && "Unexpected custom lowering");
     if (VA.isRegLoc()) {
       // This argument is passed in a register.
       // All integer register arguments are promoted by the caller to i64.

       // Create a virtual register for the promoted live-in value.
       unsigned VReg =
           MF.addLiveIn(VA.getLocReg(), getRegClassFor(VA.getLocVT()));
       SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());

       // The caller promoted the argument, so insert an Assert?ext SDNode so we
       // won't promote the value again in this function.
       switch (VA.getLocInfo()) {
       case CCValAssign::SExt:
         Arg = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Arg,
                           DAG.getValueType(VA.getValVT()));
         break;
       case CCValAssign::ZExt:
         Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg,
                           DAG.getValueType(VA.getValVT()));
         break;
       case CCValAssign::BCvt: {
         // Extract a float argument from i64 with padding.
         //     63     31   0
         //    +------+------+
         //    | float|   0  |
         //    +------+------+
         assert(VA.getLocVT() == MVT::i64);
         assert(VA.getValVT() == MVT::f32);
         SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
         Arg = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
                                          MVT::f32, Arg, Sub_f32),
                       0);
         break;
       }
       default:
         break;
       }

       // Truncate the register down to the argument type.
       if (VA.isExtInLoc())
         Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);

       InVals.push_back(Arg);
       continue;
     }

     // The registers are exhausted. This argument was passed on the stack.
     assert(VA.isMemLoc());
     // The CC_VE_Full/Half functions compute stack offsets relative to the
     // beginning of the arguments area at %fp + the size of reserved area.
     unsigned Offset = VA.getLocMemOffset() + ArgsBaseOffset;
     unsigned ValSize = VA.getValVT().getSizeInBits() / 8;

     // Adjust offset for a float argument by adding 4 since the argument is
     // stored in 8 bytes buffer with offset like below.  LLVM generates
     // 4 bytes load instruction, so need to adjust offset here.  This
     // adjustment is required in only LowerFormalArguments.  In LowerCall,
     // a float argument is converted to i64 first, and stored as 8 bytes
     // data, which is required by ABI, so no need for adjustment.
     //    0      4
     //    +------+------+
     //    | empty| float|
     //    +------+------+
     if (VA.getValVT() == MVT::f32)
       Offset += 4;

     int FI = MF.getFrameInfo().CreateFixedObject(ValSize, Offset, true);
     InVals.push_back(
         DAG.getLoad(VA.getValVT(), DL, Chain,
                     DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())),
                     MachinePointerInfo::getFixedStack(MF, FI)));
   }

   if (!IsVarArg)
     return Chain;

   // This function takes variable arguments, some of which may have been passed
   // in registers %s0-%s8.
   //
   // The va_start intrinsic needs to know the offset to the first variable
   // argument.
   // TODO: need to calculate offset correctly once we support f128.
   unsigned ArgOffset = ArgLocs.size() * 8;
   VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
   // Skip the reserved area at the top of stack.
   FuncInfo->setVarArgsFrameOffset(ArgOffset + ArgsBaseOffset);

   return Chain;
 }

 // FIXME? Maybe this could be a TableGen attribute on some registers and
 // this table could be generated automatically from RegInfo.
 Register VETargetLowering::getRegisterByName(const char *RegName, LLT VT,
                                              const MachineFunction &MF) const {
   Register Reg = StringSwitch<Register>(RegName)
                      .Case("sp", VE::SX11)    // Stack pointer
                      .Case("fp", VE::SX9)     // Frame pointer
                      .Case("sl", VE::SX8)     // Stack limit
                      .Case("lr", VE::SX10)    // Link register
                      .Case("tp", VE::SX14)    // Thread pointer
                      .Case("outer", VE::SX12) // Outer regiser
                      .Case("info", VE::SX17)  // Info area register
                      .Case("got", VE::SX15)   // Global offset table register
                      .Case("plt", VE::SX16) // Procedure linkage table register
                      .Default(0);

   if (Reg)
     return Reg;

   report_fatal_error("Invalid register name global variable");
 }

 //===----------------------------------------------------------------------===//
 // TargetLowering Implementation
 //===----------------------------------------------------------------------===//

 SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                     SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG = CLI.DAG;
   SDLoc DL = CLI.DL;
   SDValue Chain = CLI.Chain;
   auto PtrVT = getPointerTy(DAG.getDataLayout());

   // VE target does not yet support tail call optimization.
   CLI.IsTailCall = false;

   // Get the base offset of the outgoing arguments stack space.
   unsigned ArgsBaseOffset = Subtarget->getRsaSize();
   // Get the size of the preserved arguments area
   unsigned ArgsPreserved = 8 * 8u;

   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), ArgLocs,
                  *DAG.getContext());
   // Allocate the preserved area first.
   CCInfo.AllocateStack(ArgsPreserved, Align(8));
   // We already allocated the preserved area, so the stack offset computed
   // by CC_VE would be correct now.
   CCInfo.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, false));

   // VE requires to use both register and stack for varargs or no-prototyped
   // functions.
   bool UseBoth = CLI.IsVarArg;

   // Analyze operands again if it is required to store BOTH.
   SmallVector<CCValAssign, 16> ArgLocs2;
   CCState CCInfo2(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
                   ArgLocs2, *DAG.getContext());
   if (UseBoth)
     CCInfo2.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, true));

   // Get the size of the outgoing arguments stack space requirement.
   unsigned ArgsSize = CCInfo.getNextStackOffset();

   // Keep stack frames 16-byte aligned.
   ArgsSize = alignTo(ArgsSize, 16);

   // Adjust the stack pointer to make room for the arguments.
   // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
   // with more than 6 arguments.
   Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL);

   // Collect the set of registers to pass to the function and their values.
   // This will be emitted as a sequence of CopyToReg nodes glued to the call
   // instruction.
   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

   // Collect chains from all the memory opeations that copy arguments to the
   // stack. They must follow the stack pointer adjustment above and precede the
   // call instruction itself.
   SmallVector<SDValue, 8> MemOpChains;

   // VE needs to get address of callee function in a register
   // So, prepare to copy it to SX12 here.

   // If the callee is a GlobalAddress node (quite common, every direct call is)
   // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
   // Likewise ExternalSymbol -> TargetExternalSymbol.
   SDValue Callee = CLI.Callee;

   bool IsPICCall = isPositionIndependent();

   // PC-relative references to external symbols should go through $stub.
   // If so, we need to prepare GlobalBaseReg first.
   const TargetMachine &TM = DAG.getTarget();
   const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
   const GlobalValue *GV = nullptr;
   auto *CalleeG = dyn_cast<GlobalAddressSDNode>(Callee);
   if (CalleeG)
     GV = CalleeG->getGlobal();
   bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
   bool UsePlt = !Local;
   MachineFunction &MF = DAG.getMachineFunction();

   // Turn GlobalAddress/ExternalSymbol node into a value node
   // containing the address of them here.
   if (CalleeG) {
     if (IsPICCall) {
       if (UsePlt)
         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
     } else {
       Callee =
           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
     }
   } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     if (IsPICCall) {
       if (UsePlt)
         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
       Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0);
       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
     } else {
       Callee =
           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
     }
   }

   RegsToPass.push_back(std::make_pair(VE::SX12, Callee));

   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
     SDValue Arg = CLI.OutVals[i];

     // Promote the value if needed.
     switch (VA.getLocInfo()) {
     default:
       llvm_unreachable("Unknown location info!");
     case CCValAssign::Full:
       break;
     case CCValAssign::SExt:
       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
       break;
     case CCValAssign::ZExt:
       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
       break;
     case CCValAssign::AExt:
       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
       break;
     case CCValAssign::BCvt: {
       // Convert a float argument to i64 with padding.
       //     63     31   0
       //    +------+------+
       //    | float|   0  |
       //    +------+------+
       assert(VA.getLocVT() == MVT::i64);
       assert(VA.getValVT() == MVT::f32);
       SDValue Undef = SDValue(
           DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
       Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
                                        MVT::i64, Undef, Arg, Sub_f32),
                     0);
       break;
     }
     }

     if (VA.isRegLoc()) {
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
       if (!UseBoth)
         continue;
       VA = ArgLocs2[i];
     }

     assert(VA.isMemLoc());

     // Create a store off the stack pointer for this argument.
     SDValue StackPtr = DAG.getRegister(VE::SX11, PtrVT);
     // The argument area starts at %fp/%sp + the size of reserved area.
     SDValue PtrOff =
         DAG.getIntPtrConstant(VA.getLocMemOffset() + ArgsBaseOffset, DL);
     PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
     MemOpChains.push_back(
         DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
   }

   // Emit all stores, make sure they occur before the call.
   if (!MemOpChains.empty())
     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

   // Build a sequence of CopyToReg nodes glued together with token chain and
   // glue operands which copy the outgoing args into registers. The InGlue is
   // necessary since all emitted instructions must be stuck together in order
   // to pass the live physical registers.
   SDValue InGlue;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
     Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
                              RegsToPass[i].second, InGlue);
     InGlue = Chain.getValue(1);
   }

   // Build the operands for the call instruction itself.
   SmallVector<SDValue, 8> Ops;
   Ops.push_back(Chain);
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                   RegsToPass[i].second.getValueType()));

   // Add a register mask operand representing the call-preserved registers.
   const VERegisterInfo *TRI = Subtarget->getRegisterInfo();
   const uint32_t *Mask =
       TRI->getCallPreservedMask(DAG.getMachineFunction(), CLI.CallConv);
   assert(Mask && "Missing call preserved mask for calling convention");
   Ops.push_back(DAG.getRegisterMask(Mask));

   // Make sure the CopyToReg nodes are glued to the call instruction which
   // consumes the registers.
   if (InGlue.getNode())
     Ops.push_back(InGlue);

   // Now the call itself.
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   Chain = DAG.getNode(VEISD::CALL, DL, NodeTys, Ops);
   InGlue = Chain.getValue(1);

   // Revert the stack pointer immediately after the call.
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
                              DAG.getIntPtrConstant(0, DL, true), InGlue, DL);
   InGlue = Chain.getValue(1);

   // Now extract the return values. This is more or less the same as
   // LowerFormalArguments.

   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
   CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), RVLocs,
                  *DAG.getContext());

   // Set inreg flag manually for codegen generated library calls that
   // return float.
   if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && !CLI.CB)
     CLI.Ins[0].Flags.setInReg();

   RVInfo.AnalyzeCallResult(CLI.Ins, getReturnCC(CLI.CallConv));

   // Copy all of the result registers out of their specified physreg.
   for (unsigned i = 0; i != RVLocs.size(); ++i) {
     CCValAssign &VA = RVLocs[i];
     assert(!VA.needsCustom() && "Unexpected custom lowering");
     unsigned Reg = VA.getLocReg();

     // When returning 'inreg {i32, i32 }', two consecutive i32 arguments can
     // reside in the same register in the high and low bits. Reuse the
     // CopyFromReg previous node to avoid duplicate copies.
     SDValue RV;
     if (RegisterSDNode *SrcReg = dyn_cast<RegisterSDNode>(Chain.getOperand(1)))
       if (SrcReg->getReg() == Reg && Chain->getOpcode() == ISD::CopyFromReg)
         RV = Chain.getValue(0);

     // But usually we'll create a new CopyFromReg for a different register.
     if (!RV.getNode()) {
       RV = DAG.getCopyFromReg(Chain, DL, Reg, RVLocs[i].getLocVT(), InGlue);
       Chain = RV.getValue(1);
       InGlue = Chain.getValue(2);
     }

     // The callee promoted the return value, so insert an Assert?ext SDNode so
     // we won't promote the value again in this function.
     switch (VA.getLocInfo()) {
     case CCValAssign::SExt:
       RV = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), RV,
                        DAG.getValueType(VA.getValVT()));
       break;
     case CCValAssign::ZExt:
       RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV,
                        DAG.getValueType(VA.getValVT()));
       break;
     case CCValAssign::BCvt: {
       // Extract a float return value from i64 with padding.
       //     63     31   0
       //    +------+------+
       //    | float|   0  |
       //    +------+------+
       assert(VA.getLocVT() == MVT::i64);
       assert(VA.getValVT() == MVT::f32);
       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
       RV = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
                                       MVT::f32, RV, Sub_f32),
                    0);
       break;
     }
     default:
       break;
     }

     // Truncate the register down to the return value type.
     if (VA.isExtInLoc())
       RV = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), RV);

     InVals.push_back(RV);
   }

   return Chain;
 }

 bool VETargetLowering::isOffsetFoldingLegal(
     const GlobalAddressSDNode *GA) const {
   // VE uses 64 bit addressing, so we need multiple instructions to generate
   // an address.  Folding address with offset increases the number of
   // instructions, so that we disable it here.  Offsets will be folded in
   // the DAG combine later if it worth to do so.
   return false;
 }

 /// isFPImmLegal - Returns true if the target can instruction select the
 /// specified FP immediate natively. If false, the legalizer will
 /// materialize the FP immediate as a load from a constant pool.
 bool VETargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                     bool ForCodeSize) const {
   return VT == MVT::f32 || VT == MVT::f64;
 }

 /// Determine if the target supports unaligned memory accesses.
 ///
 /// This function returns true if the target allows unaligned memory accesses
 /// of the specified type in the given address space. If true, it also returns
 /// whether the unaligned memory access is "fast" in the last argument by
 /// reference. This is used, for example, in situations where an array
 /// copy/move/set is converted to a sequence of store operations. Its use
 /// helps to ensure that such replacements don't generate code that causes an
 /// alignment error (trap) on the target machine.
 bool VETargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                       unsigned AddrSpace,
                                                       Align A,
                                                       MachineMemOperand::Flags,
                                                       bool *Fast) const {
   if (Fast) {
     // It's fast anytime on VE
     *Fast = true;
   }
   return true;
 }

 VETargetLowering::VETargetLowering(const TargetMachine &TM,
                                    const VESubtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
   // Instructions which use registers as conditionals examine all the
   // bits (as does the pseudo SELECT_CC expansion). I don't think it
   // matters much whether it's ZeroOrOneBooleanContent, or
   // ZeroOrNegativeOneBooleanContent, so, arbitrarily choose the
   // former.
   setBooleanContents(ZeroOrOneBooleanContent);
   setBooleanVectorContents(ZeroOrOneBooleanContent);

   initRegisterClasses();
   initSPUActions();
   initVPUActions();

   setStackPointerRegisterToSaveRestore(VE::SX11);

   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::TRUNCATE);

   // Set function alignment to 16 bytes
   setMinFunctionAlignment(Align(16));

   // VE stores all argument by 8 bytes alignment
   setMinStackArgumentAlignment(Align(8));

   computeRegisterProperties(Subtarget->getRegisterInfo());
 }

 const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
 #define TARGET_NODE_CASE(NAME)                                                 \
   case VEISD::NAME:                                                            \
     return "VEISD::" #NAME;
   switch ((VEISD::NodeType)Opcode) {
   case VEISD::FIRST_NUMBER:
     break;
     TARGET_NODE_CASE(CALL)
     TARGET_NODE_CASE(EH_SJLJ_LONGJMP)
     TARGET_NODE_CASE(EH_SJLJ_SETJMP)
     TARGET_NODE_CASE(EH_SJLJ_SETUP_DISPATCH)
     TARGET_NODE_CASE(GETFUNPLT)
     TARGET_NODE_CASE(GETSTACKTOP)
     TARGET_NODE_CASE(GETTLSADDR)
     TARGET_NODE_CASE(GLOBAL_BASE_REG)
     TARGET_NODE_CASE(Hi)
     TARGET_NODE_CASE(Lo)
     TARGET_NODE_CASE(MEMBARRIER)
     TARGET_NODE_CASE(RET_FLAG)
     TARGET_NODE_CASE(TS1AM)
     TARGET_NODE_CASE(VEC_BROADCAST)

     // Register the VVP_* SDNodes.
 #define ADD_VVP_OP(VVP_NAME, ...) TARGET_NODE_CASE(VVP_NAME)
 #include "VVPNodes.def"
   }
 #undef TARGET_NODE_CASE
   return nullptr;
 }

 EVT VETargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
                                          EVT VT) const {
   return MVT::i32;
 }

 // Convert to a target node and set target flags.
 SDValue VETargetLowering::withTargetFlags(SDValue Op, unsigned TF,
                                           SelectionDAG &DAG) const {
   if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op))
     return DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA),
                                       GA->getValueType(0), GA->getOffset(), TF);

   if (const BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op))
     return DAG.getTargetBlockAddress(BA->getBlockAddress(), Op.getValueType(),
                                      0, TF);

   if (const ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op))
     return DAG.getTargetConstantPool(CP->getConstVal(), CP->getValueType(0),
                                      CP->getAlign(), CP->getOffset(), TF);

   if (const ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op))
     return DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0),
                                        TF);

   if (const JumpTableSDNode *JT = dyn_cast<JumpTableSDNode>(Op))
     return DAG.getTargetJumpTable(JT->getIndex(), JT->getValueType(0), TF);

   llvm_unreachable("Unhandled address SDNode");
 }

 // Split Op into high and low parts according to HiTF and LoTF.
 // Return an ADD node combining the parts.
 SDValue VETargetLowering::makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
                                        SelectionDAG &DAG) const {
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
   SDValue Hi = DAG.getNode(VEISD::Hi, DL, VT, withTargetFlags(Op, HiTF, DAG));
   SDValue Lo = DAG.getNode(VEISD::Lo, DL, VT, withTargetFlags(Op, LoTF, DAG));
   return DAG.getNode(ISD::ADD, DL, VT, Hi, Lo);
 }

 // Build SDNodes for producing an address from a GlobalAddress, ConstantPool,
 // or ExternalSymbol SDNode.
 SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   EVT PtrVT = Op.getValueType();

   // Handle PIC mode first. VE needs a got load for every variable!
   if (isPositionIndependent()) {
     auto GlobalN = dyn_cast<GlobalAddressSDNode>(Op);

     if (isa<ConstantPoolSDNode>(Op) || isa<JumpTableSDNode>(Op) ||
         (GlobalN && GlobalN->getGlobal()->hasLocalLinkage())) {
       // Create following instructions for local linkage PIC code.
       //     lea %reg, label@gotoff_lo
       //     and %reg, %reg, (32)0
       //     lea.sl %reg, label@gotoff_hi(%reg, %got)
       SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32,
                                   VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
       SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
       return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
     }
     // Create following instructions for not local linkage PIC code.
     //     lea %reg, label@got_lo
     //     and %reg, %reg, (32)0
     //     lea.sl %reg, label@got_hi(%reg)
     //     ld %reg, (%reg, %got)
     SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOT_HI32,
                                 VEMCExpr::VK_VE_GOT_LO32, DAG);
     SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
     SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
     return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), AbsAddr,
                        MachinePointerInfo::getGOT(DAG.getMachineFunction()));
   }

   // This is one of the absolute code models.
   switch (getTargetMachine().getCodeModel()) {
   default:
     llvm_unreachable("Unsupported absolute code model");
   case CodeModel::Small:
   case CodeModel::Medium:
   case CodeModel::Large:
     // abs64.
     return makeHiLoPair(Op, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
   }
 }

 /// Custom Lower {

 // The mappings for emitLeading/TrailingFence for VE is designed by following
 // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
 Instruction *VETargetLowering::emitLeadingFence(IRBuilderBase &Builder,
                                                 Instruction *Inst,
                                                 AtomicOrdering Ord) const {
   switch (Ord) {
   case AtomicOrdering::NotAtomic:
   case AtomicOrdering::Unordered:
     llvm_unreachable("Invalid fence: unordered/non-atomic");
   case AtomicOrdering::Monotonic:
   case AtomicOrdering::Acquire:
     return nullptr; // Nothing to do
   case AtomicOrdering::Release:
   case AtomicOrdering::AcquireRelease:
     return Builder.CreateFence(AtomicOrdering::Release);
   case AtomicOrdering::SequentiallyConsistent:
     if (!Inst->hasAtomicStore())
       return nullptr; // Nothing to do
     return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
   }
   llvm_unreachable("Unknown fence ordering in emitLeadingFence");
 }

 Instruction *VETargetLowering::emitTrailingFence(IRBuilderBase &Builder,
                                                  Instruction *Inst,
                                                  AtomicOrdering Ord) const {
   switch (Ord) {
   case AtomicOrdering::NotAtomic:
   case AtomicOrdering::Unordered:
     llvm_unreachable("Invalid fence: unordered/not-atomic");
   case AtomicOrdering::Monotonic:
   case AtomicOrdering::Release:
     return nullptr; // Nothing to do
   case AtomicOrdering::Acquire:
   case AtomicOrdering::AcquireRelease:
     return Builder.CreateFence(AtomicOrdering::Acquire);
   case AtomicOrdering::SequentiallyConsistent:
     return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
   }
   llvm_unreachable("Unknown fence ordering in emitTrailingFence");
 }

 SDValue VETargetLowering::lowerATOMIC_FENCE(SDValue Op,
                                             SelectionDAG &DAG) const {
   SDLoc DL(Op);
   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
       cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
   SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
       cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

   // VE uses Release consistency, so need a fence instruction if it is a
   // cross-thread fence.
   if (FenceSSID == SyncScope::System) {
     switch (FenceOrdering) {
     case AtomicOrdering::NotAtomic:
     case AtomicOrdering::Unordered:
     case AtomicOrdering::Monotonic:
       // No need to generate fencem instruction here.
       break;
     case AtomicOrdering::Acquire:
       // Generate "fencem 2" as acquire fence.
       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
                                         DAG.getTargetConstant(2, DL, MVT::i32),
                                         Op.getOperand(0)),
                      0);
     case AtomicOrdering::Release:
       // Generate "fencem 1" as release fence.
       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
                                         DAG.getTargetConstant(1, DL, MVT::i32),
                                         Op.getOperand(0)),
                      0);
     case AtomicOrdering::AcquireRelease:
     case AtomicOrdering::SequentiallyConsistent:
       // Generate "fencem 3" as acq_rel and seq_cst fence.
       // FIXME: "fencem 3" doesn't wait for for PCIe deveices accesses,
       //        so  seq_cst may require more instruction for them.
       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
                                         DAG.getTargetConstant(3, DL, MVT::i32),
                                         Op.getOperand(0)),
                      0);
     }
   }

   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
   return DAG.getNode(VEISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
 }

 TargetLowering::AtomicExpansionKind
 VETargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   // We have TS1AM implementation for i8/i16/i32/i64, so use it.
   if (AI->getOperation() == AtomicRMWInst::Xchg) {
     return AtomicExpansionKind::None;
   }
   // FIXME: Support "ATMAM" instruction for LOAD_ADD/SUB/AND/OR.

   // Otherwise, expand it using compare and exchange instruction to not call
   // __sync_fetch_and_* functions.
   return AtomicExpansionKind::CmpXChg;
 }

 static SDValue prepareTS1AM(SDValue Op, SelectionDAG &DAG, SDValue &Flag,
                             SDValue &Bits) {
   SDLoc DL(Op);
   AtomicSDNode *N = cast<AtomicSDNode>(Op);
   SDValue Ptr = N->getOperand(1);
   SDValue Val = N->getOperand(2);
   EVT PtrVT = Ptr.getValueType();
   bool Byte = N->getMemoryVT() == MVT::i8;
   //   Remainder = AND Ptr, 3
   //   Flag = 1 << Remainder  ; If Byte is true (1 byte swap flag)
   //   Flag = 3 << Remainder  ; If Byte is false (2 bytes swap flag)
   //   Bits = Remainder << 3
   //   NewVal = Val << Bits
   SDValue Const3 = DAG.getConstant(3, DL, PtrVT);
   SDValue Remainder = DAG.getNode(ISD::AND, DL, PtrVT, {Ptr, Const3});
   SDValue Mask = Byte ? DAG.getConstant(1, DL, MVT::i32)
                       : DAG.getConstant(3, DL, MVT::i32);
   Flag = DAG.getNode(ISD::SHL, DL, MVT::i32, {Mask, Remainder});
   Bits = DAG.getNode(ISD::SHL, DL, PtrVT, {Remainder, Const3});
   return DAG.getNode(ISD::SHL, DL, Val.getValueType(), {Val, Bits});
 }

 static SDValue finalizeTS1AM(SDValue Op, SelectionDAG &DAG, SDValue Data,
                              SDValue Bits) {
   SDLoc DL(Op);
   EVT VT = Data.getValueType();
   bool Byte = cast<AtomicSDNode>(Op)->getMemoryVT() == MVT::i8;
   //   NewData = Data >> Bits
   //   Result = NewData & 0xff   ; If Byte is true (1 byte)
   //   Result = NewData & 0xffff ; If Byte is false (2 bytes)

   SDValue NewData = DAG.getNode(ISD::SRL, DL, VT, Data, Bits);
   return DAG.getNode(ISD::AND, DL, VT,
                      {NewData, DAG.getConstant(Byte ? 0xff : 0xffff, DL, VT)});
 }

 SDValue VETargetLowering::lowerATOMIC_SWAP(SDValue Op,
                                            SelectionDAG &DAG) const {
   SDLoc DL(Op);
   AtomicSDNode *N = cast<AtomicSDNode>(Op);

   if (N->getMemoryVT() == MVT::i8) {
     // For i8, use "ts1am"
     //   Input:
     //     ATOMIC_SWAP Ptr, Val, Order
     //
     //   Output:
     //     Remainder = AND Ptr, 3
     //     Flag = 1 << Remainder   ; 1 byte swap flag for TS1AM inst.
     //     Bits = Remainder << 3
     //     NewVal = Val << Bits
     //
     //     Aligned = AND Ptr, -4
     //     Data = TS1AM Aligned, Flag, NewVal
     //
     //     NewData = Data >> Bits
     //     Result = NewData & 0xff ; 1 byte result
     SDValue Flag;
     SDValue Bits;
     SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits);

     SDValue Ptr = N->getOperand(1);
     SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(),
                                   {Ptr, DAG.getConstant(-4, DL, MVT::i64)});
     SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(),
                                   DAG.getVTList(Op.getNode()->getValueType(0),
                                                 Op.getNode()->getValueType(1)),
                                   {N->getChain(), Aligned, Flag, NewVal},
                                   N->getMemOperand());

     SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits);
     SDValue Chain = TS1AM.getValue(1);
     return DAG.getMergeValues({Result, Chain}, DL);
   }
   if (N->getMemoryVT() == MVT::i16) {
     // For i16, use "ts1am"
     SDValue Flag;
     SDValue Bits;
     SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits);

     SDValue Ptr = N->getOperand(1);
     SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(),
                                   {Ptr, DAG.getConstant(-4, DL, MVT::i64)});
     SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(),
                                   DAG.getVTList(Op.getNode()->getValueType(0),
                                                 Op.getNode()->getValueType(1)),
                                   {N->getChain(), Aligned, Flag, NewVal},
                                   N->getMemOperand());

     SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits);
     SDValue Chain = TS1AM.getValue(1);
     return DAG.getMergeValues({Result, Chain}, DL);
   }
   // Otherwise, let llvm legalize it.
   return Op;
 }

 SDValue VETargetLowering::lowerGlobalAddress(SDValue Op,
                                              SelectionDAG &DAG) const {
   return makeAddress(Op, DAG);
 }

 SDValue VETargetLowering::lowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
   return makeAddress(Op, DAG);
 }

 SDValue VETargetLowering::lowerConstantPool(SDValue Op,
                                             SelectionDAG &DAG) const {
   return makeAddress(Op, DAG);
 }

 SDValue
 VETargetLowering::lowerToTLSGeneralDynamicModel(SDValue Op,
                                                 SelectionDAG &DAG) const {
   SDLoc DL(Op);

   // Generate the following code:
   //   t1: ch,glue = callseq_start t0, 0, 0
   //   t2: i64,ch,glue = VEISD::GETTLSADDR t1, label, t1:1
   //   t3: ch,glue = callseq_end t2, 0, 0, t2:2
   //   t4: i64,ch,glue = CopyFromReg t3, Register:i64 $sx0, t3:1
   SDValue Label = withTargetFlags(Op, 0, DAG);
   EVT PtrVT = Op.getValueType();

   // Lowering the machine isd will make sure everything is in the right
   // location.
   SDValue Chain = DAG.getEntryNode();
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   const uint32_t *Mask = Subtarget->getRegisterInfo()->getCallPreservedMask(
       DAG.getMachineFunction(), CallingConv::C);
   Chain = DAG.getCALLSEQ_START(Chain, 64, 0, DL);
   SDValue Args[] = {Chain, Label, DAG.getRegisterMask(Mask), Chain.getValue(1)};
   Chain = DAG.getNode(VEISD::GETTLSADDR, DL, NodeTys, Args);
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(64, DL, true),
                              DAG.getIntPtrConstant(0, DL, true),
                              Chain.getValue(1), DL);
   Chain = DAG.getCopyFromReg(Chain, DL, VE::SX0, PtrVT, Chain.getValue(1));

   // GETTLSADDR will be codegen'ed as call. Inform MFI that function has calls.
   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
   MFI.setHasCalls(true);

   // Also generate code to prepare a GOT register if it is PIC.
   if (isPositionIndependent()) {
     MachineFunction &MF = DAG.getMachineFunction();
     Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
   }

   return Chain;
 }

 SDValue VETargetLowering::lowerGlobalTLSAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
   // The current implementation of nld (2.26) doesn't allow local exec model
   // code described in VE-tls_v1.1.pdf (*1) as its input. Instead, we always
   // generate the general dynamic model code sequence.
   //
   // *1: https://www.nec.com/en/global/prod/hpc/aurora/document/VE-tls_v1.1.pdf
   return lowerToTLSGeneralDynamicModel(Op, DAG);
 }

 SDValue VETargetLowering::lowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
   return makeAddress(Op, DAG);
 }

 // Lower a f128 load into two f64 loads.
 static SDValue lowerLoadF128(SDValue Op, SelectionDAG &DAG) {
   SDLoc DL(Op);
   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
   assert(LdNode && LdNode->getOffset().isUndef() && "Unexpected node type");
   unsigned Alignment = LdNode->getAlign().value();
   if (Alignment > 8)
     Alignment = 8;

   SDValue Lo64 =
       DAG.getLoad(MVT::f64, DL, LdNode->getChain(), LdNode->getBasePtr(),
                   LdNode->getPointerInfo(), Alignment,
                   LdNode->isVolatile() ? MachineMemOperand::MOVolatile
                                        : MachineMemOperand::MONone);
   EVT AddrVT = LdNode->getBasePtr().getValueType();
   SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, LdNode->getBasePtr(),
                               DAG.getConstant(8, DL, AddrVT));
   SDValue Hi64 =
       DAG.getLoad(MVT::f64, DL, LdNode->getChain(), HiPtr,
                   LdNode->getPointerInfo(), Alignment,
                   LdNode->isVolatile() ? MachineMemOperand::MOVolatile
                                        : MachineMemOperand::MONone);

   SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32);
   SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32);

   // VE stores Hi64 to 8(addr) and Lo64 to 0(addr)
   SDNode *InFP128 =
       DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f128);
   InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128,
                                SDValue(InFP128, 0), Hi64, SubRegEven);
   InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128,
                                SDValue(InFP128, 0), Lo64, SubRegOdd);
   SDValue OutChains[2] = {SDValue(Lo64.getNode(), 1),
                           SDValue(Hi64.getNode(), 1)};
   SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
   SDValue Ops[2] = {SDValue(InFP128, 0), OutChain};
   return DAG.getMergeValues(Ops, DL);
 }

 SDValue VETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode());

   SDValue BasePtr = LdNode->getBasePtr();
   if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
     // Do not expand store instruction with frame index here because of
     // dependency problems.  We expand it later in eliminateFrameIndex().
     return Op;
   }

   EVT MemVT = LdNode->getMemoryVT();
   if (MemVT == MVT::f128)
     return lowerLoadF128(Op, DAG);

   return Op;
 }

 // Lower a f128 store into two f64 stores.
 static SDValue lowerStoreF128(SDValue Op, SelectionDAG &DAG) {
   SDLoc DL(Op);
   StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");

   SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32);
   SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32);

   SDNode *Hi64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64,
                                     StNode->getValue(), SubRegEven);
   SDNode *Lo64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64,
                                     StNode->getValue(), SubRegOdd);

   unsigned Alignment = StNode->getAlign().value();
   if (Alignment > 8)
     Alignment = 8;

   // VE stores Hi64 to 8(addr) and Lo64 to 0(addr)
   SDValue OutChains[2];
   OutChains[0] =
       DAG.getStore(StNode->getChain(), DL, SDValue(Lo64, 0),
                    StNode->getBasePtr(), MachinePointerInfo(), Alignment,
                    StNode->isVolatile() ? MachineMemOperand::MOVolatile
                                         : MachineMemOperand::MONone);
   EVT AddrVT = StNode->getBasePtr().getValueType();
   SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, StNode->getBasePtr(),
                               DAG.getConstant(8, DL, AddrVT));
   OutChains[1] =
       DAG.getStore(StNode->getChain(), DL, SDValue(Hi64, 0), HiPtr,
                    MachinePointerInfo(), Alignment,
                    StNode->isVolatile() ? MachineMemOperand::MOVolatile
                                         : MachineMemOperand::MONone);
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
 }

 SDValue VETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   StoreSDNode *StNode = cast<StoreSDNode>(Op.getNode());
   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");

   SDValue BasePtr = StNode->getBasePtr();
   if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
     // Do not expand store instruction with frame index here because of
     // dependency problems.  We expand it later in eliminateFrameIndex().
     return Op;
   }

   EVT MemVT = StNode->getMemoryVT();
   if (MemVT == MVT::f128)
     return lowerStoreF128(Op, DAG);

   // Otherwise, ask llvm to expand it.
   return SDValue();
 }

 SDValue VETargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
   auto PtrVT = getPointerTy(DAG.getDataLayout());

   // Need frame address to find the address of VarArgsFrameIndex.
   MF.getFrameInfo().setFrameAddressIsTaken(true);

   // vastart just stores the address of the VarArgsFrameIndex slot into the
   // memory location argument.
   SDLoc DL(Op);
   SDValue Offset =
       DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getRegister(VE::SX9, PtrVT),
                   DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL));
   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   return DAG.getStore(Op.getOperand(0), DL, Offset, Op.getOperand(1),
                       MachinePointerInfo(SV));
 }

 SDValue VETargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   SDNode *Node = Op.getNode();
   EVT VT = Node->getValueType(0);
   SDValue InChain = Node->getOperand(0);
   SDValue VAListPtr = Node->getOperand(1);
   EVT PtrVT = VAListPtr.getValueType();
   const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
   SDLoc DL(Node);
   SDValue VAList =
       DAG.getLoad(PtrVT, DL, InChain, VAListPtr, MachinePointerInfo(SV));
   SDValue Chain = VAList.getValue(1);
   SDValue NextPtr;

   if (VT == MVT::f128) {
     // VE f128 values must be stored with 16 bytes alignment.  We doesn't
     // know the actual alignment of VAList, so we take alignment of it
     // dyanmically.
     int Align = 16;
     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                          DAG.getConstant(Align - 1, DL, PtrVT));
     VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
                          DAG.getConstant(-Align, DL, PtrVT));
     // Increment the pointer, VAList, by 16 to the next vaarg.
     NextPtr =
         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(16, DL));
   } else if (VT == MVT::f32) {
     // float --> need special handling like below.
     //    0      4
     //    +------+------+
     //    | empty| float|
     //    +------+------+
     // Increment the pointer, VAList, by 8 to the next vaarg.
     NextPtr =
         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
     // Then, adjust VAList.
     unsigned InternalOffset = 4;
     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                          DAG.getConstant(InternalOffset, DL, PtrVT));
   } else {
     // Increment the pointer, VAList, by 8 to the next vaarg.
     NextPtr =
         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
   }

   // Store the incremented VAList to the legalized pointer.
   InChain = DAG.getStore(Chain, DL, NextPtr, VAListPtr, MachinePointerInfo(SV));

   // Load the actual argument out of the pointer VAList.
   // We can't count on greater alignment than the word size.
   return DAG.getLoad(VT, DL, InChain, VAList, MachinePointerInfo(),
                      std::min(PtrVT.getSizeInBits(), VT.getSizeInBits()) / 8);
 }

 SDValue VETargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
                                                   SelectionDAG &DAG) const {
   // Generate following code.
   //   (void)__llvm_grow_stack(size);
   //   ret = GETSTACKTOP;        // pseudo instruction
   SDLoc DL(Op);

   // Get the inputs.
   SDNode *Node = Op.getNode();
   SDValue Chain = Op.getOperand(0);
   SDValue Size = Op.getOperand(1);
   MaybeAlign Alignment(Op.getConstantOperandVal(2));
   EVT VT = Node->getValueType(0);

   // Chain the dynamic stack allocation so that it doesn't modify the stack
   // pointer when other instructions are using the stack.
   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);

   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
   Align StackAlign = TFI.getStackAlign();
   bool NeedsAlign = Alignment.valueOrOne() > StackAlign;

   // Prepare arguments
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
   Entry.Node = Size;
   Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
   Args.push_back(Entry);
   if (NeedsAlign) {
     Entry.Node = DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT);
     Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
     Args.push_back(Entry);
   }
   Type *RetTy = Type::getVoidTy(*DAG.getContext());

   EVT PtrVT = Op.getValueType();
   SDValue Callee;
   if (NeedsAlign) {
     Callee = DAG.getTargetExternalSymbol("__ve_grow_stack_align", PtrVT, 0);
   } else {
     Callee = DAG.getTargetExternalSymbol("__ve_grow_stack", PtrVT, 0);
   }

   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(DL)
       .setChain(Chain)
       .setCallee(CallingConv::PreserveAll, RetTy, Callee, std::move(Args))
       .setDiscardResult(true);
   std::pair<SDValue, SDValue> pair = LowerCallTo(CLI);
   Chain = pair.second;
   SDValue Result = DAG.getNode(VEISD::GETSTACKTOP, DL, VT, Chain);
   if (NeedsAlign) {
     Result = DAG.getNode(ISD::ADD, DL, VT, Result,
                          DAG.getConstant((Alignment->value() - 1ULL), DL, VT));
     Result = DAG.getNode(ISD::AND, DL, VT, Result,
                          DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT));
   }
   //  Chain = Result.getValue(1);
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
                              DAG.getIntPtrConstant(0, DL, true), SDValue(), DL);

   SDValue Ops[2] = {Result, Chain};
   return DAG.getMergeValues(Ops, DL);
 }

 SDValue VETargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
   SDLoc DL(Op);
   return DAG.getNode(VEISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0),
                      Op.getOperand(1));
 }

 SDValue VETargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
   SDLoc DL(Op);
   return DAG.getNode(VEISD::EH_SJLJ_SETJMP, DL,
                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
                      Op.getOperand(1));
 }

 SDValue VETargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
                                                       SelectionDAG &DAG) const {
   SDLoc DL(Op);
   return DAG.getNode(VEISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
                      Op.getOperand(0));
 }

 static SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG,
                               const VETargetLowering &TLI,
                               const VESubtarget *Subtarget) {
   SDLoc DL(Op);
   MachineFunction &MF = DAG.getMachineFunction();
   EVT PtrVT = TLI.getPointerTy(MF.getDataLayout());

   MachineFrameInfo &MFI = MF.getFrameInfo();
   MFI.setFrameAddressIsTaken(true);

   unsigned Depth = Op.getConstantOperandVal(0);
   const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   unsigned FrameReg = RegInfo->getFrameRegister(MF);
   SDValue FrameAddr =
       DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, PtrVT);
   while (Depth--)
     FrameAddr = DAG.getLoad(Op.getValueType(), DL, DAG.getEntryNode(),
                             FrameAddr, MachinePointerInfo());
   return FrameAddr;
 }

 static SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
                                const VETargetLowering &TLI,
                                const VESubtarget *Subtarget) {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   MFI.setReturnAddressIsTaken(true);

   if (TLI.verifyReturnAddressArgumentIsConstant(Op, DAG))
     return SDValue();

   SDValue FrameAddr = lowerFRAMEADDR(Op, DAG, TLI, Subtarget);

   SDLoc DL(Op);
   EVT VT = Op.getValueType();
   SDValue Offset = DAG.getConstant(8, DL, VT);
   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                      DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
                      MachinePointerInfo());
 }

 SDValue VETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
   SDLoc DL(Op);
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   switch (IntNo) {
   default: // Don't custom lower most intrinsics.
     return SDValue();
   case Intrinsic::eh_sjlj_lsda: {
     MachineFunction &MF = DAG.getMachineFunction();
     MVT VT = Op.getSimpleValueType();
     const VETargetMachine *TM =
         static_cast<const VETargetMachine *>(&DAG.getTarget());

     // Create GCC_except_tableXX string.  The real symbol for that will be
     // generated in EHStreamer::emitExceptionTable() later.  So, we just
     // borrow it's name here.
     TM->getStrList()->push_back(std::string(
         (Twine("GCC_except_table") + Twine(MF.getFunctionNumber())).str()));
     SDValue Addr =
         DAG.getTargetExternalSymbol(TM->getStrList()->back().c_str(), VT, 0);
     if (isPositionIndependent()) {
       Addr = makeHiLoPair(Addr, VEMCExpr::VK_VE_GOTOFF_HI32,
                           VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
       SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, VT);
       return DAG.getNode(ISD::ADD, DL, VT, GlobalBase, Addr);
     }
     return makeHiLoPair(Addr, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
   }
   }
 }

 static bool getUniqueInsertion(SDNode *N, unsigned &UniqueIdx) {
   if (!isa<BuildVectorSDNode>(N))
     return false;
   const auto *BVN = cast<BuildVectorSDNode>(N);

   // Find first non-undef insertion.
   unsigned Idx;
   for (Idx = 0; Idx < BVN->getNumOperands(); ++Idx) {
     auto ElemV = BVN->getOperand(Idx);
     if (!ElemV->isUndef())
       break;
   }
   // Catch the (hypothetical) all-undef case.
   if (Idx == BVN->getNumOperands())
     return false;
   // Remember insertion.
   UniqueIdx = Idx++;
   // Verify that all other insertions are undef.
   for (; Idx < BVN->getNumOperands(); ++Idx) {
     auto ElemV = BVN->getOperand(Idx);
     if (!ElemV->isUndef())
       return false;
   }
   return true;
 }

 static SDValue getSplatValue(SDNode *N) {
   if (auto *BuildVec = dyn_cast<BuildVectorSDNode>(N)) {
     return BuildVec->getSplatValue();
   }
   return SDValue();
 }

 SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                             SelectionDAG &DAG) const {
   SDLoc DL(Op);
   unsigned NumEls = Op.getValueType().getVectorNumElements();
   MVT ElemVT = Op.getSimpleValueType().getVectorElementType();

   // If there is just one element, expand to INSERT_VECTOR_ELT.
   unsigned UniqueIdx;
   if (getUniqueInsertion(Op.getNode(), UniqueIdx)) {
     SDValue AccuV = DAG.getUNDEF(Op.getValueType());
     auto ElemV = Op->getOperand(UniqueIdx);
     SDValue IdxV = DAG.getConstant(UniqueIdx, DL, MVT::i64);
     return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), AccuV,
                        ElemV, IdxV);
   }

   // Else emit a broadcast.
   if (SDValue ScalarV = getSplatValue(Op.getNode())) {
     // lower to VEC_BROADCAST
     MVT LegalResVT = MVT::getVectorVT(ElemVT, 256);

     auto AVL = DAG.getConstant(NumEls, DL, MVT::i32);
     return DAG.getNode(VEISD::VEC_BROADCAST, DL, LegalResVT, Op.getOperand(0),
                        AVL);
   }

   // Expand
   return SDValue();
 }

 SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   unsigned Opcode = Op.getOpcode();
   if (ISD::isVPOpcode(Opcode))
     return lowerToVVP(Op, DAG);

   switch (Opcode) {
   default:
     llvm_unreachable("Should not custom lower this!");
   case ISD::ATOMIC_FENCE:
     return lowerATOMIC_FENCE(Op, DAG);
   case ISD::ATOMIC_SWAP:
     return lowerATOMIC_SWAP(Op, DAG);
   case ISD::BlockAddress:
     return lowerBlockAddress(Op, DAG);
   case ISD::ConstantPool:
     return lowerConstantPool(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
     return lowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::EH_SJLJ_LONGJMP:
     return lowerEH_SJLJ_LONGJMP(Op, DAG);
   case ISD::EH_SJLJ_SETJMP:
     return lowerEH_SJLJ_SETJMP(Op, DAG);
   case ISD::EH_SJLJ_SETUP_DISPATCH:
     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
   case ISD::FRAMEADDR:
     return lowerFRAMEADDR(Op, DAG, *this, Subtarget);
   case ISD::GlobalAddress:
     return lowerGlobalAddress(Op, DAG);
   case ISD::GlobalTLSAddress:
     return lowerGlobalTLSAddress(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
     return lowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::JumpTable:
     return lowerJumpTable(Op, DAG);
   case ISD::LOAD:
     return lowerLOAD(Op, DAG);
   case ISD::RETURNADDR:
     return lowerRETURNADDR(Op, DAG, *this, Subtarget);
   case ISD::BUILD_VECTOR:
     return lowerBUILD_VECTOR(Op, DAG);
   case ISD::STORE:
     return lowerSTORE(Op, DAG);
   case ISD::VASTART:
     return lowerVASTART(Op, DAG);
   case ISD::VAARG:
     return lowerVAARG(Op, DAG);

   case ISD::INSERT_VECTOR_ELT:
     return lowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT:
     return lowerEXTRACT_VECTOR_ELT(Op, DAG);

 #define ADD_BINARY_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME:
 #include "VVPNodes.def"
     return lowerToVVP(Op, DAG);
   }
 }
 /// } Custom Lower

 void VETargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
   switch (N->getOpcode()) {
   case ISD::ATOMIC_SWAP:
     // Let LLVM expand atomic swap instruction through LowerOperation.
     return;
   default:
     LLVM_DEBUG(N->dumpr(&DAG));
     llvm_unreachable("Do not know how to custom type legalize this operation!");
   }
 }

 /// JumpTable for VE.
 ///
 ///   VE cannot generate relocatable symbol in jump table.  VE cannot
 ///   generate expressions using symbols in both text segment and data
 ///   segment like below.
 ///             .4byte  .LBB0_2-.LJTI0_0
 ///   So, we generate offset from the top of function like below as
 ///   a custom label.
 ///             .4byte  .LBB0_2-<function name>

 unsigned VETargetLowering::getJumpTableEncoding() const {
   // Use custom label for PIC.
   if (isPositionIndependent())
     return MachineJumpTableInfo::EK_Custom32;

   // Otherwise, use the normal jump table encoding heuristics.
   return TargetLowering::getJumpTableEncoding();
 }

 const MCExpr *VETargetLowering::LowerCustomJumpTableEntry(
     const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB,
     unsigned Uid, MCContext &Ctx) const {
   assert(isPositionIndependent());

   // Generate custom label for PIC like below.
   //    .4bytes  .LBB0_2-<function name>
   const auto *Value = MCSymbolRefExpr::create(MBB->getSymbol(), Ctx);
   MCSymbol *Sym = Ctx.getOrCreateSymbol(MBB->getParent()->getName().data());
   const auto *Base = MCSymbolRefExpr::create(Sym, Ctx);
   return MCBinaryExpr::createSub(Value, Base, Ctx);
 }

 SDValue VETargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
   assert(isPositionIndependent());
   SDLoc DL(Table);
   Function *Function = &DAG.getMachineFunction().getFunction();
   assert(Function != nullptr);
   auto PtrTy = getPointerTy(DAG.getDataLayout(), Function->getAddressSpace());

   // In the jump table, we have following values in PIC mode.
   //    .4bytes  .LBB0_2-<function name>
   // We need to add this value and the address of this function to generate
   // .LBB0_2 label correctly under PIC mode.  So, we want to generate following
   // instructions:
   //     lea %reg, fun@gotoff_lo
   //     and %reg, %reg, (32)0
   //     lea.sl %reg, fun@gotoff_hi(%reg, %got)
   // In order to do so, we need to genarate correctly marked DAG node using
   // makeHiLoPair.
   SDValue Op = DAG.getGlobalAddress(Function, DL, PtrTy);
   SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32,
                               VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
   SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrTy);
   return DAG.getNode(ISD::ADD, DL, PtrTy, GlobalBase, HiLo);
 }

 Register VETargetLowering::prepareMBB(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator I,
                                       MachineBasicBlock *TargetBB,
                                       const DebugLoc &DL) const {
   MachineFunction *MF = MBB.getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
   const VEInstrInfo *TII = Subtarget->getInstrInfo();

   const TargetRegisterClass *RC = &VE::I64RegClass;
   Register Tmp1 = MRI.createVirtualRegister(RC);
   Register Tmp2 = MRI.createVirtualRegister(RC);
   Register Result = MRI.createVirtualRegister(RC);

   if (isPositionIndependent()) {
     // Create following instructions for local linkage PIC code.
     //     lea %Tmp1, TargetBB@gotoff_lo
     //     and %Tmp2, %Tmp1, (32)0
     //     lea.sl %Result, TargetBB@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
         .addImm(0)
         .addImm(0)
         .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_LO32);
     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
         .addReg(Tmp1, getKillRegState(true))
         .addImm(M0(32));
     BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
         .addReg(VE::SX15)
         .addReg(Tmp2, getKillRegState(true))
         .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_HI32);
   } else {
     // Create following instructions for non-PIC code.
     //     lea     %Tmp1, TargetBB@lo
     //     and     %Tmp2, %Tmp1, (32)0
     //     lea.sl  %Result, TargetBB@hi(%Tmp2)
     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
         .addImm(0)
         .addImm(0)
         .addMBB(TargetBB, VEMCExpr::VK_VE_LO32);
     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
         .addReg(Tmp1, getKillRegState(true))
         .addImm(M0(32));
     BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
         .addReg(Tmp2, getKillRegState(true))
         .addImm(0)
         .addMBB(TargetBB, VEMCExpr::VK_VE_HI32);
   }
   return Result;
 }

 Register VETargetLowering::prepareSymbol(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator I,
                                          StringRef Symbol, const DebugLoc &DL,
                                          bool IsLocal = false,
                                          bool IsCall = false) const {
   MachineFunction *MF = MBB.getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
   const VEInstrInfo *TII = Subtarget->getInstrInfo();

   const TargetRegisterClass *RC = &VE::I64RegClass;
   Register Result = MRI.createVirtualRegister(RC);

   if (isPositionIndependent()) {
     if (IsCall && !IsLocal) {
       // Create following instructions for non-local linkage PIC code function
       // calls.  These instructions uses IC and magic number -24, so we expand
       // them in VEAsmPrinter.cpp from GETFUNPLT pseudo instruction.
       //     lea %Reg, Symbol@plt_lo(-24)
       //     and %Reg, %Reg, (32)0
       //     sic %s16
       //     lea.sl %Result, Symbol@plt_hi(%Reg, %s16) ; %s16 is PLT
       BuildMI(MBB, I, DL, TII->get(VE::GETFUNPLT), Result)
           .addExternalSymbol("abort");
     } else if (IsLocal) {
       Register Tmp1 = MRI.createVirtualRegister(RC);
       Register Tmp2 = MRI.createVirtualRegister(RC);
       // Create following instructions for local linkage PIC code.
       //     lea %Tmp1, Symbol@gotoff_lo
       //     and %Tmp2, %Tmp1, (32)0
       //     lea.sl %Result, Symbol@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
       BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
           .addImm(0)
           .addImm(0)
           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_LO32);
       BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
           .addReg(Tmp1, getKillRegState(true))
           .addImm(M0(32));
       BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
           .addReg(VE::SX15)
           .addReg(Tmp2, getKillRegState(true))
           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_HI32);
     } else {
       Register Tmp1 = MRI.createVirtualRegister(RC);
       Register Tmp2 = MRI.createVirtualRegister(RC);
       // Create following instructions for not local linkage PIC code.
       //     lea %Tmp1, Symbol@got_lo
       //     and %Tmp2, %Tmp1, (32)0
       //     lea.sl %Tmp3, Symbol@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
       //     ld %Result, 0(%Tmp3)
       Register Tmp3 = MRI.createVirtualRegister(RC);
       BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
           .addImm(0)
           .addImm(0)
           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_LO32);
       BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
           .addReg(Tmp1, getKillRegState(true))
           .addImm(M0(32));
       BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Tmp3)
           .addReg(VE::SX15)
           .addReg(Tmp2, getKillRegState(true))
           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_HI32);
       BuildMI(MBB, I, DL, TII->get(VE::LDrii), Result)
           .addReg(Tmp3, getKillRegState(true))
           .addImm(0)
           .addImm(0);
     }
   } else {
     Register Tmp1 = MRI.createVirtualRegister(RC);
     Register Tmp2 = MRI.createVirtualRegister(RC);
     // Create following instructions for non-PIC code.
     //     lea     %Tmp1, Symbol@lo
     //     and     %Tmp2, %Tmp1, (32)0
     //     lea.sl  %Result, Symbol@hi(%Tmp2)
     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
         .addImm(0)
         .addImm(0)
         .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_LO32);
     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
         .addReg(Tmp1, getKillRegState(true))
         .addImm(M0(32));
     BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
         .addReg(Tmp2, getKillRegState(true))
         .addImm(0)
         .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_HI32);
   }
   return Result;
 }

 void VETargetLowering::setupEntryBlockForSjLj(MachineInstr &MI,
                                               MachineBasicBlock *MBB,
                                               MachineBasicBlock *DispatchBB,
                                               int FI, int Offset) const {
   DebugLoc DL = MI.getDebugLoc();
   const VEInstrInfo *TII = Subtarget->getInstrInfo();

   Register LabelReg =
       prepareMBB(*MBB, MachineBasicBlock::iterator(MI), DispatchBB, DL);

   // Store an address of DispatchBB to a given jmpbuf[1] where has next IC
   // referenced by longjmp (throw) later.
   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
   addFrameReference(MIB, FI, Offset); // jmpbuf[1]
   MIB.addReg(LabelReg, getKillRegState(true));
 }

 MachineBasicBlock *
 VETargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
   DebugLoc DL = MI.getDebugLoc();
   MachineFunction *MF = MBB->getParent();
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
   MachineRegisterInfo &MRI = MF->getRegInfo();

   const BasicBlock *BB = MBB->getBasicBlock();
   MachineFunction::iterator I = ++MBB->getIterator();

   // Memory Reference.
   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
                                            MI.memoperands_end());
   Register BufReg = MI.getOperand(1).getReg();

   Register DstReg;

   DstReg = MI.getOperand(0).getReg();
   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
   assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
   (void)TRI;
   Register MainDestReg = MRI.createVirtualRegister(RC);
   Register RestoreDestReg = MRI.createVirtualRegister(RC);

   // For `v = call @llvm.eh.sjlj.setjmp(buf)`, we generate following
   // instructions.  SP/FP must be saved in jmpbuf before `llvm.eh.sjlj.setjmp`.
   //
   // ThisMBB:
   //   buf[3] = %s17 iff %s17 is used as BP
   //   buf[1] = RestoreMBB as IC after longjmp
   //   # SjLjSetup RestoreMBB
   //
   // MainMBB:
   //   v_main = 0
   //
   // SinkMBB:
   //   v = phi(v_main, MainMBB, v_restore, RestoreMBB)
   //   ...
   //
   // RestoreMBB:
   //   %s17 = buf[3] = iff %s17 is used as BP
   //   v_restore = 1
   //   goto SinkMBB

   MachineBasicBlock *ThisMBB = MBB;
   MachineBasicBlock *MainMBB = MF->CreateMachineBasicBlock(BB);
   MachineBasicBlock *SinkMBB = MF->CreateMachineBasicBlock(BB);
   MachineBasicBlock *RestoreMBB = MF->CreateMachineBasicBlock(BB);
   MF->insert(I, MainMBB);
   MF->insert(I, SinkMBB);
   MF->push_back(RestoreMBB);
   RestoreMBB->setHasAddressTaken();

   // Transfer the remainder of BB and its successor edges to SinkMBB.
   SinkMBB->splice(SinkMBB->begin(), MBB,
                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

   // ThisMBB:
   Register LabelReg =
       prepareMBB(*MBB, MachineBasicBlock::iterator(MI), RestoreMBB, DL);

   // Store BP in buf[3] iff this function is using BP.
   const VEFrameLowering *TFI = Subtarget->getFrameLowering();
   if (TFI->hasBP(*MF)) {
     MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
     MIB.addReg(BufReg);
     MIB.addImm(0);
     MIB.addImm(24);
     MIB.addReg(VE::SX17);
     MIB.setMemRefs(MMOs);
   }

   // Store IP in buf[1].
   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
   MIB.add(MI.getOperand(1)); // we can preserve the kill flags here.
   MIB.addImm(0);
   MIB.addImm(8);
   MIB.addReg(LabelReg, getKillRegState(true));
   MIB.setMemRefs(MMOs);

   // SP/FP are already stored in jmpbuf before `llvm.eh.sjlj.setjmp`.

   // Insert setup.
   MIB =
       BuildMI(*ThisMBB, MI, DL, TII->get(VE::EH_SjLj_Setup)).addMBB(RestoreMBB);

   const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   MIB.addRegMask(RegInfo->getNoPreservedMask());
   ThisMBB->addSuccessor(MainMBB);
   ThisMBB->addSuccessor(RestoreMBB);

   // MainMBB:
   BuildMI(MainMBB, DL, TII->get(VE::LEAzii), MainDestReg)
       .addImm(0)
       .addImm(0)
       .addImm(0);
   MainMBB->addSuccessor(SinkMBB);

   // SinkMBB:
   BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(VE::PHI), DstReg)
       .addReg(MainDestReg)
       .addMBB(MainMBB)
       .addReg(RestoreDestReg)
       .addMBB(RestoreMBB);

   // RestoreMBB:
   // Restore BP from buf[3] iff this function is using BP.  The address of
   // buf is in SX10.
   // FIXME: Better to not use SX10 here
   if (TFI->hasBP(*MF)) {
     MachineInstrBuilder MIB =
         BuildMI(RestoreMBB, DL, TII->get(VE::LDrii), VE::SX17);
     MIB.addReg(VE::SX10);
     MIB.addImm(0);
     MIB.addImm(24);
     MIB.setMemRefs(MMOs);
   }
   BuildMI(RestoreMBB, DL, TII->get(VE::LEAzii), RestoreDestReg)
       .addImm(0)
       .addImm(0)
       .addImm(1);
   BuildMI(RestoreMBB, DL, TII->get(VE::BRCFLa_t)).addMBB(SinkMBB);
   RestoreMBB->addSuccessor(SinkMBB);

   MI.eraseFromParent();
   return SinkMBB;
 }

 MachineBasicBlock *
 VETargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
                                     MachineBasicBlock *MBB) const {
   DebugLoc DL = MI.getDebugLoc();
   MachineFunction *MF = MBB->getParent();
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   MachineRegisterInfo &MRI = MF->getRegInfo();

   // Memory Reference.
   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
                                            MI.memoperands_end());
   Register BufReg = MI.getOperand(0).getReg();

   Register Tmp = MRI.createVirtualRegister(&VE::I64RegClass);
   // Since FP is only updated here but NOT referenced, it's treated as GPR.
   Register FP = VE::SX9;
   Register SP = VE::SX11;

   MachineInstrBuilder MIB;

   MachineBasicBlock *ThisMBB = MBB;

   // For `call @llvm.eh.sjlj.longjmp(buf)`, we generate following instructions.
   //
   // ThisMBB:
   //   %fp = load buf[0]
   //   %jmp = load buf[1]
   //   %s10 = buf        ; Store an address of buf to SX10 for RestoreMBB
   //   %sp = load buf[2] ; generated by llvm.eh.sjlj.setjmp.
   //   jmp %jmp

   // Reload FP.
   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), FP);
   MIB.addReg(BufReg);
   MIB.addImm(0);
   MIB.addImm(0);
   MIB.setMemRefs(MMOs);

   // Reload IP.
   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), Tmp);
   MIB.addReg(BufReg);
   MIB.addImm(0);
   MIB.addImm(8);
   MIB.setMemRefs(MMOs);

   // Copy BufReg to SX10 for later use in setjmp.
   // FIXME: Better to not use SX10 here
   BuildMI(*ThisMBB, MI, DL, TII->get(VE::ORri), VE::SX10)
       .addReg(BufReg)
       .addImm(0);

   // Reload SP.
   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), SP);
   MIB.add(MI.getOperand(0)); // we can preserve the kill flags here.
   MIB.addImm(0);
   MIB.addImm(16);
   MIB.setMemRefs(MMOs);

   // Jump.
   BuildMI(*ThisMBB, MI, DL, TII->get(VE::BCFLari_t))
       .addReg(Tmp, getKillRegState(true))
       .addImm(0);

   MI.eraseFromParent();
   return ThisMBB;
 }

 MachineBasicBlock *
 VETargetLowering::emitSjLjDispatchBlock(MachineInstr &MI,
                                         MachineBasicBlock *BB) const {
   DebugLoc DL = MI.getDebugLoc();
   MachineFunction *MF = BB->getParent();
   MachineFrameInfo &MFI = MF->getFrameInfo();
   MachineRegisterInfo &MRI = MF->getRegInfo();
   const VEInstrInfo *TII = Subtarget->getInstrInfo();
   int FI = MFI.getFunctionContextIndex();

   // Get a mapping of the call site numbers to all of the landing pads they're
   // associated with.
   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
   unsigned MaxCSNum = 0;
   for (auto &MBB : *MF) {
     if (!MBB.isEHPad())
       continue;

     MCSymbol *Sym = nullptr;
     for (const auto &MI : MBB) {
       if (MI.isDebugInstr())
         continue;

       assert(MI.isEHLabel() && "expected EH_LABEL");
       Sym = MI.getOperand(0).getMCSymbol();
       break;
     }

     if (!MF->hasCallSiteLandingPad(Sym))
       continue;

     for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
       CallSiteNumToLPad[CSI].push_back(&MBB);
       MaxCSNum = std::max(MaxCSNum, CSI);
     }
   }

   // Get an ordered list of the machine basic blocks for the jump table.
   std::vector<MachineBasicBlock *> LPadList;
   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
   LPadList.reserve(CallSiteNumToLPad.size());

   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
     for (auto &LP : CallSiteNumToLPad[CSI]) {
       LPadList.push_back(LP);
       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
     }
   }

   assert(!LPadList.empty() &&
          "No landing pad destinations for the dispatch jump table!");

   // The %fn_context is allocated like below (from --print-after=sjljehprepare):
   //   %fn_context = alloca { i8*, i64, [4 x i64], i8*, i8*, [5 x i8*] }
   //
   // This `[5 x i8*]` is jmpbuf, so jmpbuf[1] is FI+72.
   // First `i64` is callsite, so callsite is FI+8.
   static const int OffsetIC = 72;
   static const int OffsetCS = 8;

   // Create the MBBs for the dispatch code like following:
   //
   // ThisMBB:
   //   Prepare DispatchBB address and store it to buf[1].
   //   ...
   //
   // DispatchBB:
   //   %s15 = GETGOT iff isPositionIndependent
   //   %callsite = load callsite
   //   brgt.l.t #size of callsites, %callsite, DispContBB
   //
   // TrapBB:
   //   Call abort.
   //
   // DispContBB:
   //   %breg = address of jump table
   //   %pc = load and calculate next pc from %breg and %callsite
   //   jmp %pc

   // Shove the dispatch's address into the return slot in the function context.
   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
   DispatchBB->setIsEHPad(true);

   // Trap BB will causes trap like `assert(0)`.
   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
   DispatchBB->addSuccessor(TrapBB);

   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
   DispatchBB->addSuccessor(DispContBB);

   // Insert MBBs.
   MF->push_back(DispatchBB);
   MF->push_back(DispContBB);
   MF->push_back(TrapBB);

   // Insert code to call abort in the TrapBB.
   Register Abort = prepareSymbol(*TrapBB, TrapBB->end(), "abort", DL,
                                  /* Local */ false, /* Call */ true);
   BuildMI(TrapBB, DL, TII->get(VE::BSICrii), VE::SX10)
       .addReg(Abort, getKillRegState(true))
       .addImm(0)
       .addImm(0);

   // Insert code into the entry block that creates and registers the function
   // context.
   setupEntryBlockForSjLj(MI, BB, DispatchBB, FI, OffsetIC);

   // Create the jump table and associated information
   unsigned JTE = getJumpTableEncoding();
   MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
   unsigned MJTI = JTI->createJumpTableIndex(LPadList);

   const VERegisterInfo &RI = TII->getRegisterInfo();
   // Add a register mask with no preserved registers.  This results in all
   // registers being marked as clobbered.
   BuildMI(DispatchBB, DL, TII->get(VE::NOP))
       .addRegMask(RI.getNoPreservedMask());

   if (isPositionIndependent()) {
     // Force to generate GETGOT, since current implementation doesn't store GOT
     // register.
     BuildMI(DispatchBB, DL, TII->get(VE::GETGOT), VE::SX15);
   }

   // IReg is used as an index in a memory operand and therefore can't be SP
   const TargetRegisterClass *RC = &VE::I64RegClass;
   Register IReg = MRI.createVirtualRegister(RC);
   addFrameReference(BuildMI(DispatchBB, DL, TII->get(VE::LDLZXrii), IReg), FI,
                     OffsetCS);
   if (LPadList.size() < 64) {
     BuildMI(DispatchBB, DL, TII->get(VE::BRCFLir_t))
         .addImm(VECC::CC_ILE)
         .addImm(LPadList.size())
         .addReg(IReg)
         .addMBB(TrapBB);
   } else {
     assert(LPadList.size() <= 0x7FFFFFFF && "Too large Landing Pad!");
     Register TmpReg = MRI.createVirtualRegister(RC);
     BuildMI(DispatchBB, DL, TII->get(VE::LEAzii), TmpReg)
         .addImm(0)
         .addImm(0)
         .addImm(LPadList.size());
     BuildMI(DispatchBB, DL, TII->get(VE::BRCFLrr_t))
         .addImm(VECC::CC_ILE)
         .addReg(TmpReg, getKillRegState(true))
         .addReg(IReg)
         .addMBB(TrapBB);
   }

   Register BReg = MRI.createVirtualRegister(RC);
   Register Tmp1 = MRI.createVirtualRegister(RC);
   Register Tmp2 = MRI.createVirtualRegister(RC);

   if (isPositionIndependent()) {
     // Create following instructions for local linkage PIC code.
     //     lea    %Tmp1, .LJTI0_0@gotoff_lo
     //     and    %Tmp2, %Tmp1, (32)0
     //     lea.sl %BReg, .LJTI0_0@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
         .addImm(0)
         .addImm(0)
         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_LO32);
     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
         .addReg(Tmp1, getKillRegState(true))
         .addImm(M0(32));
     BuildMI(DispContBB, DL, TII->get(VE::LEASLrri), BReg)
         .addReg(VE::SX15)
         .addReg(Tmp2, getKillRegState(true))
         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_HI32);
   } else {
     // Create following instructions for non-PIC code.
     //     lea     %Tmp1, .LJTI0_0@lo
     //     and     %Tmp2, %Tmp1, (32)0
     //     lea.sl  %BReg, .LJTI0_0@hi(%Tmp2)
     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
         .addImm(0)
         .addImm(0)
         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_LO32);
     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
         .addReg(Tmp1, getKillRegState(true))
         .addImm(M0(32));
     BuildMI(DispContBB, DL, TII->get(VE::LEASLrii), BReg)
         .addReg(Tmp2, getKillRegState(true))
         .addImm(0)
         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_HI32);
   }

   switch (JTE) {
   case MachineJumpTableInfo::EK_BlockAddress: {
     // Generate simple block address code for no-PIC model.
     //     sll %Tmp1, %IReg, 3
     //     lds %TReg, 0(%Tmp1, %BReg)
     //     bcfla %TReg

     Register TReg = MRI.createVirtualRegister(RC);
     Register Tmp1 = MRI.createVirtualRegister(RC);

     BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1)
         .addReg(IReg, getKillRegState(true))
         .addImm(3);
     BuildMI(DispContBB, DL, TII->get(VE::LDrri), TReg)
         .addReg(BReg, getKillRegState(true))
         .addReg(Tmp1, getKillRegState(true))
         .addImm(0);
     BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t))
         .addReg(TReg, getKillRegState(true))
         .addImm(0);
     break;
   }
   case MachineJumpTableInfo::EK_Custom32: {
     // Generate block address code using differences from the function pointer
     // for PIC model.
     //     sll %Tmp1, %IReg, 2
     //     ldl.zx %OReg, 0(%Tmp1, %BReg)
     //     Prepare function address in BReg2.
     //     adds.l %TReg, %BReg2, %OReg
     //     bcfla %TReg

     assert(isPositionIndependent());
     Register OReg = MRI.createVirtualRegister(RC);
     Register TReg = MRI.createVirtualRegister(RC);
     Register Tmp1 = MRI.createVirtualRegister(RC);

     BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1)
         .addReg(IReg, getKillRegState(true))
         .addImm(2);
     BuildMI(DispContBB, DL, TII->get(VE::LDLZXrri), OReg)
         .addReg(BReg, getKillRegState(true))
         .addReg(Tmp1, getKillRegState(true))
         .addImm(0);
     Register BReg2 =
         prepareSymbol(*DispContBB, DispContBB->end(),
                       DispContBB->getParent()->getName(), DL, /* Local */ true);
     BuildMI(DispContBB, DL, TII->get(VE::ADDSLrr), TReg)
         .addReg(OReg, getKillRegState(true))
         .addReg(BReg2, getKillRegState(true));
     BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t))
         .addReg(TReg, getKillRegState(true))
         .addImm(0);
     break;
   }
   default:
     llvm_unreachable("Unexpected jump table encoding");
   }

   // Add the jump table entries as successors to the MBB.
   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
   for (auto &LP : LPadList)
     if (SeenMBBs.insert(LP).second)
       DispContBB->addSuccessor(LP);

   // N.B. the order the invoke BBs are processed in doesn't matter here.
   SmallVector<MachineBasicBlock *, 64> MBBLPads;
   const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
   for (MachineBasicBlock *MBB : InvokeBBs) {
     // Remove the landing pad successor from the invoke block and replace it
     // with the new dispatch block.
     // Keep a copy of Successors since it's modified inside the loop.
     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
                                                    MBB->succ_rend());
     // FIXME: Avoid quadratic complexity.
     for (auto MBBS : Successors) {
       if (MBBS->isEHPad()) {
         MBB->removeSuccessor(MBBS);
         MBBLPads.push_back(MBBS);
       }
     }

     MBB->addSuccessor(DispatchBB);

     // Find the invoke call and mark all of the callee-saved registers as
     // 'implicit defined' so that they're spilled.  This prevents code from
     // moving instructions to before the EH block, where they will never be
     // executed.
     for (auto &II : reverse(*MBB)) {
       if (!II.isCall())
         continue;

       DenseMap<Register, bool> DefRegs;
       for (auto &MOp : II.operands())
         if (MOp.isReg())
           DefRegs[MOp.getReg()] = true;

       MachineInstrBuilder MIB(*MF, &II);
       for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
         Register Reg = SavedRegs[RI];
         if (!DefRegs[Reg])
           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
       }

       break;
     }
   }

   // Mark all former landing pads as non-landing pads.  The dispatch is the only
   // landing pad now.
   for (auto &LP : MBBLPads)
     LP->setIsEHPad(false);

   // The instruction is gone now.
   MI.eraseFromParent();
   return BB;
 }

 MachineBasicBlock *
 VETargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                               MachineBasicBlock *BB) const {
   switch (MI.getOpcode()) {
   default:
     llvm_unreachable("Unknown Custom Instruction!");
   case VE::EH_SjLj_LongJmp:
     return emitEHSjLjLongJmp(MI, BB);
   case VE::EH_SjLj_SetJmp:
     return emitEHSjLjSetJmp(MI, BB);
   case VE::EH_SjLj_Setup_Dispatch:
     return emitSjLjDispatchBlock(MI, BB);
   }
 }

 static bool isI32Insn(const SDNode *User, const SDNode *N) {
   switch (User->getOpcode()) {
   default:
     return false;
   case ISD::ADD:
   case ISD::SUB:
   case ISD::MUL:
   case ISD::SDIV:
   case ISD::UDIV:
   case ISD::SETCC:
   case ISD::SMIN:
   case ISD::SMAX:
   case ISD::SHL:
   case ISD::SRA:
   case ISD::BSWAP:
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
   case ISD::BR_CC:
   case ISD::BITCAST:
   case ISD::ATOMIC_CMP_SWAP:
   case ISD::ATOMIC_SWAP:
     return true;
   case ISD::SRL:
     if (N->getOperand(0).getOpcode() != ISD::SRL)
       return true;
     // (srl (trunc (srl ...))) may be optimized by combining srl, so
     // doesn't optimize trunc now.
     return false;
   case ISD::SELECT_CC:
     if (User->getOperand(2).getNode() != N &&
         User->getOperand(3).getNode() != N)
       return true;
     LLVM_FALLTHROUGH;
   case ISD::AND:
   case ISD::OR:
   case ISD::XOR:
   case ISD::SELECT:
   case ISD::CopyToReg:
     // Check all use of selections, bit operations, and copies.  If all of them
     // are safe, optimize truncate to extract_subreg.
     for (const SDNode *U : User->uses()) {
       switch (U->getOpcode()) {
       default:
         // If the use is an instruction which treats the source operand as i32,
         // it is safe to avoid truncate here.
         if (isI32Insn(U, N))
           continue;
         break;
       case ISD::ANY_EXTEND:
       case ISD::SIGN_EXTEND:
       case ISD::ZERO_EXTEND: {
         // Special optimizations to the combination of ext and trunc.
         // (ext ... (select ... (trunc ...))) is safe to avoid truncate here
         // since this truncate instruction clears higher 32 bits which is filled
         // by one of ext instructions later.
         assert(N->getValueType(0) == MVT::i32 &&
                "find truncate to not i32 integer");
         if (User->getOpcode() == ISD::SELECT_CC ||
             User->getOpcode() == ISD::SELECT)
           continue;
         break;
       }
       }
       return false;
     }
     return true;
   }
 }

 // Optimize TRUNCATE in DAG combining.  Optimizing it in CUSTOM lower is
 // sometime too early.  Optimizing it in DAG pattern matching in VEInstrInfo.td
 // is sometime too late.  So, doing it at here.
 SDValue VETargetLowering::combineTRUNCATE(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
   assert(N->getOpcode() == ISD::TRUNCATE &&
          "Should be called with a TRUNCATE node");

   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
   EVT VT = N->getValueType(0);

   // We prefer to do this when all types are legal.
   if (!DCI.isAfterLegalizeDAG())
     return SDValue();

   // Skip combine TRUNCATE atm if the operand of TRUNCATE might be a constant.
   if (N->getOperand(0)->getOpcode() == ISD::SELECT_CC &&
       isa<ConstantSDNode>(N->getOperand(0)->getOperand(0)) &&
       isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
     return SDValue();

   // Check all use of this TRUNCATE.
   for (const SDNode *User : N->uses()) {
     // Make sure that we're not going to replace TRUNCATE for non i32
     // instructions.
     //
     // FIXME: Although we could sometimes handle this, and it does occur in
     // practice that one of the condition inputs to the select is also one of
     // the outputs, we currently can't deal with this.
     if (isI32Insn(User, N))
       continue;

     return SDValue();
   }

   SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
   return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT,
                                     N->getOperand(0), SubI32),
                  0);
 }

 SDValue VETargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   switch (N->getOpcode()) {
   default:
     break;
   case ISD::TRUNCATE:
     return combineTRUNCATE(N, DCI);
   }

   return SDValue();
 }

 //===----------------------------------------------------------------------===//
 // VE Inline Assembly Support
 //===----------------------------------------------------------------------===//

 VETargetLowering::ConstraintType
 VETargetLowering::getConstraintType(StringRef Constraint) const {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
     default:
       break;
     case 'v': // vector registers
       return C_RegisterClass;
     }
   }
   return TargetLowering::getConstraintType(Constraint);
 }

 std::pair<unsigned, const TargetRegisterClass *>
 VETargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
   const TargetRegisterClass *RC = nullptr;
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
     default:
       return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
     case 'r':
       RC = &VE::I64RegClass;
       break;
     case 'v':
       RC = &VE::V64RegClass;
       break;
     }
     return std::make_pair(0U, RC);
   }

   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 }

 //===----------------------------------------------------------------------===//
 // VE Target Optimization Support
 //===----------------------------------------------------------------------===//

 unsigned VETargetLowering::getMinimumJumpTableEntries() const {
   // Specify 8 for PIC model to relieve the impact of PIC load instructions.
   if (isJumpTableRelative())
     return 8;

   return TargetLowering::getMinimumJumpTableEntries();
 }

 bool VETargetLowering::hasAndNot(SDValue Y) const {
   EVT VT = Y.getValueType();

   // VE doesn't have vector and not instruction.
   if (VT.isVector())
     return false;

   // VE allows different immediate values for X and Y where ~X & Y.
   // Only simm7 works for X, and only mimm works for Y on VE.  However, this
   // function is used to check whether an immediate value is OK for and-not
   // instruction as both X and Y.  Generating additional instruction to
   // retrieve an immediate value is no good since the purpose of this
   // function is to convert a series of 3 instructions to another series of
   // 3 instructions with better parallelism.  Therefore, we return false
   // for all immediate values now.
   // FIXME: Change hasAndNot function to have two operands to make it work
   //        correctly with Aurora VE.
   if (isa<ConstantSDNode>(Y))
     return false;

   // It's ok for generic registers.
   return true;
 }

 /// \returns the VVP_* SDNode opcode corresponsing to \p OC.
 static Optional<unsigned> getVVPOpcode(unsigned Opcode) {
   switch (Opcode) {
 #define HANDLE_VP_TO_VVP(VPOPC, VVPNAME)                                       \
   case ISD::VPOPC:                                                             \
     return VEISD::VVPNAME;
 #define ADD_VVP_OP(VVPNAME, SDNAME)                                            \
   case VEISD::VVPNAME:                                                         \
   case ISD::SDNAME:                                                            \
     return VEISD::VVPNAME;
 #include "VVPNodes.def"
   }
   return None;
 }

 SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
   // Can we represent this as a VVP node.
   const unsigned Opcode = Op->getOpcode();
   auto VVPOpcodeOpt = getVVPOpcode(Opcode);
   if (!VVPOpcodeOpt.hasValue())
     return SDValue();
   unsigned VVPOpcode = VVPOpcodeOpt.getValue();
   const bool FromVP = ISD::isVPOpcode(Opcode);

   // The representative and legalized vector type of this operation.
   SDLoc DL(Op);
   MVT MaskVT = MVT::v256i1; // TODO: packed mode.
   EVT OpVecVT = Op.getValueType();
   EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT);

   SDValue AVL;
   SDValue Mask;

   if (FromVP) {
     // All upstream VP SDNodes always have a mask and avl.
     auto MaskIdx = ISD::getVPMaskIdx(Opcode).getValue();
     auto AVLIdx = ISD::getVPExplicitVectorLengthIdx(Opcode).getValue();
     Mask = Op->getOperand(MaskIdx);
     AVL = Op->getOperand(AVLIdx);

   } else {
     // Materialize the VL parameter.
     AVL = DAG.getConstant(OpVecVT.getVectorNumElements(), DL, MVT::i32);
     SDValue ConstTrue = DAG.getConstant(1, DL, MVT::i32);
     Mask = DAG.getNode(VEISD::VEC_BROADCAST, DL, MaskVT,
                        ConstTrue); // emit a VEISD::VEC_BROADCAST here.
   }

   // Categories we are interested in.
   bool IsBinaryOp = false;

   switch (VVPOpcode) {
 #define ADD_BINARY_VVP_OP(VVPNAME, ...)                                        \
   case VEISD::VVPNAME:                                                         \
     IsBinaryOp = true;                                                         \
     break;
 #include "VVPNodes.def"
   }

   if (IsBinaryOp) {
     assert(LegalVecVT.isSimple());
     return DAG.getNode(VVPOpcode, DL, LegalVecVT, Op->getOperand(0),
                        Op->getOperand(1), Mask, AVL);
   }
   llvm_unreachable("lowerToVVP called for unexpected SDNode.");
 }

 SDValue VETargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                   SelectionDAG &DAG) const {
   assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
   MVT VT = Op.getOperand(0).getSimpleValueType();

   // Special treatment for packed V64 types.
   assert(VT == MVT::v512i32 || VT == MVT::v512f32);
   (void)VT;
   // Example of codes:
   //   %packed_v = extractelt %vr, %idx / 2
   //   %v = %packed_v >> (%idx % 2 * 32)
   //   %res = %v & 0xffffffff

   SDValue Vec = Op.getOperand(0);
   SDValue Idx = Op.getOperand(1);
   SDLoc DL(Op);
   SDValue Result = Op;
   if (0 /* Idx->isConstant() */) {
     // TODO: optimized implementation using constant values
   } else {
     SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
     SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
     SDValue PackedElt =
         SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
     SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
     SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
     SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
     Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
     PackedElt = DAG.getNode(ISD::SRL, DL, MVT::i64, {PackedElt, Shift});
     SDValue Mask = DAG.getConstant(0xFFFFFFFFL, DL, MVT::i64);
     PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
     SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
     Result = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
                                         MVT::i32, PackedElt, SubI32),
                      0);

     if (Op.getSimpleValueType() == MVT::f32) {
       Result = DAG.getBitcast(MVT::f32, Result);
     } else {
       assert(Op.getSimpleValueType() == MVT::i32);
     }
   }
   return Result;
 }

 SDValue VETargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
   assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
   MVT VT = Op.getOperand(0).getSimpleValueType();

   // Special treatment for packed V64 types.
   assert(VT == MVT::v512i32 || VT == MVT::v512f32);
   (void)VT;
   // The v512i32 and v512f32 starts from upper bits (0..31).  This "upper
   // bits" required `val << 32` from C implementation's point of view.
   //
   // Example of codes:
   //   %packed_elt = extractelt %vr, (%idx >> 1)
   //   %shift = ((%idx & 1) ^ 1) << 5
   //   %packed_elt &= 0xffffffff00000000 >> shift
   //   %packed_elt |= (zext %val) << shift
   //   %vr = insertelt %vr, %packed_elt, (%idx >> 1)

   SDLoc DL(Op);
   SDValue Vec = Op.getOperand(0);
   SDValue Val = Op.getOperand(1);
   SDValue Idx = Op.getOperand(2);
   if (Idx.getSimpleValueType() == MVT::i32)
     Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
   if (Val.getSimpleValueType() == MVT::f32)
     Val = DAG.getBitcast(MVT::i32, Val);
   assert(Val.getSimpleValueType() == MVT::i32);
   Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);

   SDValue Result = Op;
   if (0 /* Idx->isConstant()*/) {
     // TODO: optimized implementation using constant values
   } else {
     SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
     SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
     SDValue PackedElt =
         SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
     SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
     SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
     SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
     Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
     SDValue Mask = DAG.getConstant(0xFFFFFFFF00000000L, DL, MVT::i64);
     Mask = DAG.getNode(ISD::SRL, DL, MVT::i64, {Mask, Shift});
     PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
     Val = DAG.getNode(ISD::SHL, DL, MVT::i64, {Val, Shift});
     PackedElt = DAG.getNode(ISD::OR, DL, MVT::i64, {PackedElt, Val});
     Result =
         SDValue(DAG.getMachineNode(VE::LSVrr_v, DL, Vec.getSimpleValueType(),
                                    {HalfIdx, PackedElt, Vec}),
                 0);
   }
   return Result;
 }