| //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements the AArch64TargetLowering class. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "AArch64ISelLowering.h" |
| #include "AArch64CallingConvention.h" |
| #include "AArch64ExpandImm.h" |
| #include "AArch64MachineFunctionInfo.h" |
| #include "AArch64PerfectShuffle.h" |
| #include "AArch64RegisterInfo.h" |
| #include "AArch64Subtarget.h" |
| #include "MCTargetDesc/AArch64AddressingModes.h" |
| #include "Utils/AArch64BaseInfo.h" |
| #include "llvm/ADT/APFloat.h" |
| #include "llvm/ADT/APInt.h" |
| #include "llvm/ADT/ArrayRef.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/SmallSet.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/ADT/Twine.h" |
| #include "llvm/Analysis/LoopInfo.h" |
| #include "llvm/Analysis/MemoryLocation.h" |
| #include "llvm/Analysis/ObjCARCUtil.h" |
| #include "llvm/Analysis/TargetTransformInfo.h" |
| #include "llvm/Analysis/ValueTracking.h" |
| #include "llvm/Analysis/VectorUtils.h" |
| #include "llvm/CodeGen/Analysis.h" |
| #include "llvm/CodeGen/CallingConvLower.h" |
| #include "llvm/CodeGen/ISDOpcodes.h" |
| #include "llvm/CodeGen/MachineBasicBlock.h" |
| #include "llvm/CodeGen/MachineFrameInfo.h" |
| #include "llvm/CodeGen/MachineFunction.h" |
| #include "llvm/CodeGen/MachineInstr.h" |
| #include "llvm/CodeGen/MachineInstrBuilder.h" |
| #include "llvm/CodeGen/MachineMemOperand.h" |
| #include "llvm/CodeGen/MachineRegisterInfo.h" |
| #include "llvm/CodeGen/MachineValueType.h" |
| #include "llvm/CodeGen/RuntimeLibcalls.h" |
| #include "llvm/CodeGen/SelectionDAG.h" |
| #include "llvm/CodeGen/SelectionDAGNodes.h" |
| #include "llvm/CodeGen/TargetCallingConv.h" |
| #include "llvm/CodeGen/TargetInstrInfo.h" |
| #include "llvm/CodeGen/TargetOpcodes.h" |
| #include "llvm/CodeGen/ValueTypes.h" |
| #include "llvm/IR/Attributes.h" |
| #include "llvm/IR/Constants.h" |
| #include "llvm/IR/DataLayout.h" |
| #include "llvm/IR/DebugLoc.h" |
| #include "llvm/IR/DerivedTypes.h" |
| #include "llvm/IR/Function.h" |
| #include "llvm/IR/GetElementPtrTypeIterator.h" |
| #include "llvm/IR/GlobalValue.h" |
| #include "llvm/IR/IRBuilder.h" |
| #include "llvm/IR/Instruction.h" |
| #include "llvm/IR/Instructions.h" |
| #include "llvm/IR/IntrinsicInst.h" |
| #include "llvm/IR/Intrinsics.h" |
| #include "llvm/IR/IntrinsicsAArch64.h" |
| #include "llvm/IR/Module.h" |
| #include "llvm/IR/PatternMatch.h" |
| #include "llvm/IR/Type.h" |
| #include "llvm/IR/Use.h" |
| #include "llvm/IR/Value.h" |
| #include "llvm/MC/MCRegisterInfo.h" |
| #include "llvm/Support/AtomicOrdering.h" |
| #include "llvm/Support/Casting.h" |
| #include "llvm/Support/CodeGen.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/ErrorHandling.h" |
| #include "llvm/Support/InstructionCost.h" |
| #include "llvm/Support/KnownBits.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Target/TargetMachine.h" |
| #include "llvm/Target/TargetOptions.h" |
| #include "llvm/TargetParser/Triple.h" |
| #include <algorithm> |
| #include <bitset> |
| #include <cassert> |
| #include <cctype> |
| #include <cstdint> |
| #include <cstdlib> |
| #include <iterator> |
| #include <limits> |
| #include <optional> |
| #include <tuple> |
| #include <utility> |
| #include <vector> |
| |
| using namespace llvm; |
| using namespace llvm::PatternMatch; |
| |
| #define DEBUG_TYPE "aarch64-lower" |
| |
| STATISTIC(NumTailCalls, "Number of tail calls"); |
| STATISTIC(NumShiftInserts, "Number of vector shift inserts"); |
| STATISTIC(NumOptimizedImms, "Number of times immediates were optimized"); |
| |
| // FIXME: The necessary dtprel relocations don't seem to be supported |
| // well in the GNU bfd and gold linkers at the moment. Therefore, by |
| // default, for now, fall back to GeneralDynamic code generation. |
| cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( |
| "aarch64-elf-ldtls-generation", cl::Hidden, |
| cl::desc("Allow AArch64 Local Dynamic TLS code generation"), |
| cl::init(false)); |
| |
| static cl::opt<bool> |
| EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, |
| cl::desc("Enable AArch64 logical imm instruction " |
| "optimization"), |
| cl::init(true)); |
| |
// Temporary option added to test the functionality added to DAGCombiner.cpp
// in D92230. It is expected that this can be removed in the future once both
// implementations are based on MGATHER rather than the GLD1 nodes added for
// the SVE gather load intrinsics.
| static cl::opt<bool> |
| EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, |
| cl::desc("Combine extends of AArch64 masked " |
| "gather intrinsics"), |
| cl::init(true)); |
| |
| static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden, |
| cl::desc("Combine ext and trunc to TBL"), |
| cl::init(true)); |
| |
// XOR, OR and CMP all use ALU ports, so the data dependency chain becomes the
// bottleneck on high-end CPUs after this transform. This maximum leaf-node
// limit guards that the cmp+ccmp combine remains profitable.
static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
                                 cl::desc("Maximum number of xors"));
| |
| /// Value type used for condition codes. |
| static const MVT MVT_CC = MVT::i32; |
| |
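// Registers used for argument passing under the AArch64 calling conventions:
// the first eight integer arguments go in X0-X7 and the first eight FP/vector
// arguments in Q0-Q7.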
| static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2, |
| AArch64::X3, AArch64::X4, AArch64::X5, |
| AArch64::X6, AArch64::X7}; |
| static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, |
| AArch64::Q3, AArch64::Q4, AArch64::Q5, |
| AArch64::Q6, AArch64::Q7}; |
| |
| ArrayRef<MCPhysReg> llvm::AArch64::getGPRArgRegs() { return GPRArgRegs; } |
| |
| ArrayRef<MCPhysReg> llvm::AArch64::getFPRArgRegs() { return FPRArgRegs; } |
| |
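/// Returns the packed (fully utilised) scalable vector container for the given
/// element type, e.g. getPackedSVEVectorVT(MVT::f32) == MVT::nxv4f32.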
| static inline EVT getPackedSVEVectorVT(EVT VT) { |
| switch (VT.getSimpleVT().SimpleTy) { |
| default: |
| llvm_unreachable("unexpected element type for vector"); |
| case MVT::i8: |
| return MVT::nxv16i8; |
| case MVT::i16: |
| return MVT::nxv8i16; |
| case MVT::i32: |
| return MVT::nxv4i32; |
| case MVT::i64: |
| return MVT::nxv2i64; |
| case MVT::f16: |
| return MVT::nxv8f16; |
| case MVT::f32: |
| return MVT::nxv4f32; |
| case MVT::f64: |
| return MVT::nxv2f64; |
| case MVT::bf16: |
| return MVT::nxv8bf16; |
| } |
| } |
| |
| // NOTE: Currently there's only a need to return integer vector types. If this |
| // changes then just add an extra "type" parameter. |
| static inline EVT getPackedSVEVectorVT(ElementCount EC) { |
| switch (EC.getKnownMinValue()) { |
| default: |
| llvm_unreachable("unexpected element count for vector"); |
| case 16: |
| return MVT::nxv16i8; |
| case 8: |
| return MVT::nxv8i16; |
| case 4: |
| return MVT::nxv4i32; |
| case 2: |
| return MVT::nxv2i64; |
| } |
| } |
| |
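/// Returns the widest integer data vector with the same element count as the
/// given scalable predicate type, e.g. nxv4i1 -> nxv4i32.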
| static inline EVT getPromotedVTForPredicate(EVT VT) { |
| assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) && |
| "Expected scalable predicate vector type!"); |
| switch (VT.getVectorMinNumElements()) { |
| default: |
| llvm_unreachable("unexpected element count for vector"); |
| case 2: |
| return MVT::nxv2i64; |
| case 4: |
| return MVT::nxv4i32; |
| case 8: |
| return MVT::nxv8i16; |
| case 16: |
| return MVT::nxv16i8; |
| } |
| } |
| |
| /// Returns true if VT's elements occupy the lowest bit positions of its |
| /// associated register class without any intervening space. |
| /// |
| /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the |
| /// same register class, but only nxv8f16 can be treated as a packed vector. |
| static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) { |
| assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && |
| "Expected legal vector type!"); |
| return VT.isFixedLengthVector() || |
| VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock; |
| } |
| |
| // Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading |
| // predicate and end with a passthru value matching the result type. |
| static bool isMergePassthruOpcode(unsigned Opc) { |
| switch (Opc) { |
| default: |
| return false; |
| case AArch64ISD::BITREVERSE_MERGE_PASSTHRU: |
| case AArch64ISD::BSWAP_MERGE_PASSTHRU: |
| case AArch64ISD::REVH_MERGE_PASSTHRU: |
| case AArch64ISD::REVW_MERGE_PASSTHRU: |
| case AArch64ISD::REVD_MERGE_PASSTHRU: |
| case AArch64ISD::CTLZ_MERGE_PASSTHRU: |
| case AArch64ISD::CTPOP_MERGE_PASSTHRU: |
| case AArch64ISD::DUP_MERGE_PASSTHRU: |
| case AArch64ISD::ABS_MERGE_PASSTHRU: |
| case AArch64ISD::NEG_MERGE_PASSTHRU: |
| case AArch64ISD::FNEG_MERGE_PASSTHRU: |
| case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU: |
| case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU: |
| case AArch64ISD::FCEIL_MERGE_PASSTHRU: |
| case AArch64ISD::FFLOOR_MERGE_PASSTHRU: |
| case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU: |
| case AArch64ISD::FRINT_MERGE_PASSTHRU: |
| case AArch64ISD::FROUND_MERGE_PASSTHRU: |
| case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU: |
| case AArch64ISD::FTRUNC_MERGE_PASSTHRU: |
| case AArch64ISD::FP_ROUND_MERGE_PASSTHRU: |
| case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU: |
| case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU: |
| case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU: |
| case AArch64ISD::FCVTZU_MERGE_PASSTHRU: |
| case AArch64ISD::FCVTZS_MERGE_PASSTHRU: |
| case AArch64ISD::FSQRT_MERGE_PASSTHRU: |
| case AArch64ISD::FRECPX_MERGE_PASSTHRU: |
| case AArch64ISD::FABS_MERGE_PASSTHRU: |
| return true; |
| } |
| } |
| |
| // Returns true if inactive lanes are known to be zeroed by construction. |
| static bool isZeroingInactiveLanes(SDValue Op) { |
| switch (Op.getOpcode()) { |
| default: |
| // We guarantee i1 splat_vectors to zero the other lanes by |
| // implementing it with ptrue and possibly a punpklo for nxv1i1. |
| if (ISD::isConstantSplatVectorAllOnes(Op.getNode())) |
| return true; |
| return false; |
| case AArch64ISD::PTRUE: |
| case AArch64ISD::SETCC_MERGE_ZERO: |
| return true; |
| case ISD::INTRINSIC_WO_CHAIN: |
| switch (Op.getConstantOperandVal(0)) { |
| default: |
| return false; |
| case Intrinsic::aarch64_sve_ptrue: |
| case Intrinsic::aarch64_sve_pnext: |
| case Intrinsic::aarch64_sve_cmpeq: |
| case Intrinsic::aarch64_sve_cmpne: |
| case Intrinsic::aarch64_sve_cmpge: |
| case Intrinsic::aarch64_sve_cmpgt: |
| case Intrinsic::aarch64_sve_cmphs: |
| case Intrinsic::aarch64_sve_cmphi: |
| case Intrinsic::aarch64_sve_cmpeq_wide: |
| case Intrinsic::aarch64_sve_cmpne_wide: |
| case Intrinsic::aarch64_sve_cmpge_wide: |
| case Intrinsic::aarch64_sve_cmpgt_wide: |
| case Intrinsic::aarch64_sve_cmplt_wide: |
| case Intrinsic::aarch64_sve_cmple_wide: |
| case Intrinsic::aarch64_sve_cmphs_wide: |
| case Intrinsic::aarch64_sve_cmphi_wide: |
| case Intrinsic::aarch64_sve_cmplo_wide: |
| case Intrinsic::aarch64_sve_cmpls_wide: |
| case Intrinsic::aarch64_sve_fcmpeq: |
| case Intrinsic::aarch64_sve_fcmpne: |
| case Intrinsic::aarch64_sve_fcmpge: |
| case Intrinsic::aarch64_sve_fcmpgt: |
| case Intrinsic::aarch64_sve_fcmpuo: |
| case Intrinsic::aarch64_sve_facgt: |
| case Intrinsic::aarch64_sve_facge: |
| case Intrinsic::aarch64_sve_whilege: |
| case Intrinsic::aarch64_sve_whilegt: |
| case Intrinsic::aarch64_sve_whilehi: |
| case Intrinsic::aarch64_sve_whilehs: |
| case Intrinsic::aarch64_sve_whilele: |
| case Intrinsic::aarch64_sve_whilelo: |
| case Intrinsic::aarch64_sve_whilels: |
| case Intrinsic::aarch64_sve_whilelt: |
| case Intrinsic::aarch64_sve_match: |
| case Intrinsic::aarch64_sve_nmatch: |
| case Intrinsic::aarch64_sve_whilege_x2: |
| case Intrinsic::aarch64_sve_whilegt_x2: |
| case Intrinsic::aarch64_sve_whilehi_x2: |
| case Intrinsic::aarch64_sve_whilehs_x2: |
| case Intrinsic::aarch64_sve_whilele_x2: |
| case Intrinsic::aarch64_sve_whilelo_x2: |
| case Intrinsic::aarch64_sve_whilels_x2: |
| case Intrinsic::aarch64_sve_whilelt_x2: |
| return true; |
| } |
| } |
| } |
| |
| AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, |
| const AArch64Subtarget &STI) |
| : TargetLowering(TM), Subtarget(&STI) { |
| // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so |
| // we have to make something up. Arbitrarily, choose ZeroOrOne. |
| setBooleanContents(ZeroOrOneBooleanContent); |
  // When comparing vectors, the result sets each element of the result vector
  // to all-ones or all-zeros.
| setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); |
| |
| // Set up the register classes. |
| addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass); |
| addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass); |
| |
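  // FEAT_LS64 provides 64-byte single-copy atomic loads/stores (LD64B/ST64B)
  // that transfer eight consecutive 64-bit GPRs; the i64x8 type models that
  // register group.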
| if (Subtarget->hasLS64()) { |
| addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass); |
| setOperationAction(ISD::LOAD, MVT::i64x8, Custom); |
| setOperationAction(ISD::STORE, MVT::i64x8, Custom); |
| } |
| |
| if (Subtarget->hasFPARMv8()) { |
| addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); |
| addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass); |
| addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); |
| addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); |
| addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); |
| } |
| |
| if (Subtarget->hasNEON()) { |
| addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); |
| addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); |
| // Someone set us up the NEON. |
| addDRTypeForNEON(MVT::v2f32); |
| addDRTypeForNEON(MVT::v8i8); |
| addDRTypeForNEON(MVT::v4i16); |
| addDRTypeForNEON(MVT::v2i32); |
| addDRTypeForNEON(MVT::v1i64); |
| addDRTypeForNEON(MVT::v1f64); |
| addDRTypeForNEON(MVT::v4f16); |
| if (Subtarget->hasBF16()) |
| addDRTypeForNEON(MVT::v4bf16); |
| |
| addQRTypeForNEON(MVT::v4f32); |
| addQRTypeForNEON(MVT::v2f64); |
| addQRTypeForNEON(MVT::v16i8); |
| addQRTypeForNEON(MVT::v8i16); |
| addQRTypeForNEON(MVT::v4i32); |
| addQRTypeForNEON(MVT::v2i64); |
| addQRTypeForNEON(MVT::v8f16); |
| if (Subtarget->hasBF16()) |
| addQRTypeForNEON(MVT::v8bf16); |
| } |
| |
| if (Subtarget->hasSVEorSME()) { |
    // Add legal SVE predicate types.
| addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass); |
| addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass); |
| addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass); |
| addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass); |
| addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass); |
| |
    // Add legal SVE data types.
| addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass); |
| |
| addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass); |
| |
| if (Subtarget->hasBF16()) { |
| addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass); |
| } |
| |
| if (Subtarget->useSVEForFixedLengthVectors()) { |
| for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) |
| if (useSVEForFixedLengthVectorVT(VT)) |
| addRegisterClass(VT, &AArch64::ZPRRegClass); |
| |
| for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) |
| if (useSVEForFixedLengthVectorVT(VT)) |
| addRegisterClass(VT, &AArch64::ZPRRegClass); |
| } |
| } |
| |
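  // aarch64svcount is the SVE2.1/SME2 predicate-as-counter type. It lives in
  // the predicate register file, so its loads and stores are promoted to
  // nxv16i1.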
| if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) { |
| addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass); |
| setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1); |
| setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1); |
| |
| setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom); |
| setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand); |
| } |
| |
| // Compute derived properties from the register classes |
| computeRegisterProperties(Subtarget->getRegisterInfo()); |
| |
| // Provide all sorts of operation actions |
| setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); |
| setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); |
| setOperationAction(ISD::SETCC, MVT::i32, Custom); |
| setOperationAction(ISD::SETCC, MVT::i64, Custom); |
| setOperationAction(ISD::SETCC, MVT::f16, Custom); |
| setOperationAction(ISD::SETCC, MVT::f32, Custom); |
| setOperationAction(ISD::SETCC, MVT::f64, Custom); |
| setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); |
| setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom); |
| setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom); |
| setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); |
| setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom); |
| setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); |
| setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); |
| setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); |
| setOperationAction(ISD::BRCOND, MVT::Other, Custom); |
| setOperationAction(ISD::BR_CC, MVT::i32, Custom); |
| setOperationAction(ISD::BR_CC, MVT::i64, Custom); |
| setOperationAction(ISD::BR_CC, MVT::f16, Custom); |
| setOperationAction(ISD::BR_CC, MVT::f32, Custom); |
| setOperationAction(ISD::BR_CC, MVT::f64, Custom); |
| setOperationAction(ISD::SELECT, MVT::i32, Custom); |
| setOperationAction(ISD::SELECT, MVT::i64, Custom); |
| setOperationAction(ISD::SELECT, MVT::f16, Custom); |
| setOperationAction(ISD::SELECT, MVT::bf16, Custom); |
| setOperationAction(ISD::SELECT, MVT::f32, Custom); |
| setOperationAction(ISD::SELECT, MVT::f64, Custom); |
| setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); |
| setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); |
| setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); |
| setOperationAction(ISD::SELECT_CC, MVT::bf16, Expand); |
| setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); |
| setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); |
| setOperationAction(ISD::BR_JT, MVT::Other, Custom); |
| setOperationAction(ISD::JumpTable, MVT::i64, Custom); |
| setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom); |
| |
| setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); |
| setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); |
| setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); |
| |
| setOperationAction(ISD::FREM, MVT::f32, Expand); |
| setOperationAction(ISD::FREM, MVT::f64, Expand); |
| setOperationAction(ISD::FREM, MVT::f80, Expand); |
| |
| setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); |
| |
| // Custom lowering hooks are needed for XOR |
| // to fold it into CSINC/CSINV. |
| setOperationAction(ISD::XOR, MVT::i32, Custom); |
| setOperationAction(ISD::XOR, MVT::i64, Custom); |
| |
  // Virtually no operations on f128 are legal, but LLVM can't expand them when
  // there's a valid register class, so we need custom operations in most cases.
| setOperationAction(ISD::FABS, MVT::f128, Expand); |
| setOperationAction(ISD::FADD, MVT::f128, LibCall); |
| setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); |
| setOperationAction(ISD::FCOS, MVT::f128, Expand); |
| setOperationAction(ISD::FDIV, MVT::f128, LibCall); |
| setOperationAction(ISD::FMA, MVT::f128, Expand); |
| setOperationAction(ISD::FMUL, MVT::f128, LibCall); |
| setOperationAction(ISD::FNEG, MVT::f128, Expand); |
| setOperationAction(ISD::FPOW, MVT::f128, Expand); |
| setOperationAction(ISD::FREM, MVT::f128, Expand); |
| setOperationAction(ISD::FRINT, MVT::f128, Expand); |
| setOperationAction(ISD::FSIN, MVT::f128, Expand); |
| setOperationAction(ISD::FSINCOS, MVT::f128, Expand); |
| setOperationAction(ISD::FSQRT, MVT::f128, Expand); |
| setOperationAction(ISD::FSUB, MVT::f128, LibCall); |
| setOperationAction(ISD::FTRUNC, MVT::f128, Expand); |
| setOperationAction(ISD::SETCC, MVT::f128, Custom); |
| setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom); |
| setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom); |
| setOperationAction(ISD::BR_CC, MVT::f128, Custom); |
| setOperationAction(ISD::SELECT, MVT::f128, Custom); |
| setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); |
| setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); |
| // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently |
| // aren't handled. |
| |
| // Lowering for many of the conversions is actually specified by the non-f128 |
| // type. The LowerXXX function will be trivial when f128 isn't involved. |
| setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); |
| setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); |
| setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom); |
| setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); |
| setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); |
| setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom); |
| setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); |
| setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); |
| setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); |
| setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); |
| setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); |
| setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); |
| setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); |
| setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); |
| setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); |
| |
| setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom); |
| setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); |
| setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom); |
| setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); |
| |
| // Variable arguments. |
| setOperationAction(ISD::VASTART, MVT::Other, Custom); |
| setOperationAction(ISD::VAARG, MVT::Other, Custom); |
| setOperationAction(ISD::VACOPY, MVT::Other, Custom); |
| setOperationAction(ISD::VAEND, MVT::Other, Expand); |
| |
| // Variable-sized objects. |
| setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); |
| setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); |
| |
| // Lowering Funnel Shifts to EXTR |
| setOperationAction(ISD::FSHR, MVT::i32, Custom); |
| setOperationAction(ISD::FSHR, MVT::i64, Custom); |
| setOperationAction(ISD::FSHL, MVT::i32, Custom); |
| setOperationAction(ISD::FSHL, MVT::i64, Custom); |
| |
| if (Subtarget->isTargetWindows()) |
| setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); |
| else |
| setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); |
| |
| // Constant pool entries |
| setOperationAction(ISD::ConstantPool, MVT::i64, Custom); |
| |
| // BlockAddress |
| setOperationAction(ISD::BlockAddress, MVT::i64, Custom); |
| |
| // AArch64 lacks both left-rotate and popcount instructions. |
| setOperationAction(ISD::ROTL, MVT::i32, Expand); |
| setOperationAction(ISD::ROTL, MVT::i64, Expand); |
| for (MVT VT : MVT::fixedlen_vector_valuetypes()) { |
| setOperationAction(ISD::ROTL, VT, Expand); |
| setOperationAction(ISD::ROTR, VT, Expand); |
| } |
| |
| // AArch64 doesn't have i32 MULH{S|U}. |
| setOperationAction(ISD::MULHU, MVT::i32, Expand); |
| setOperationAction(ISD::MULHS, MVT::i32, Expand); |
| |
| // AArch64 doesn't have {U|S}MUL_LOHI. |
| setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); |
| setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); |
| setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); |
| setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); |
| |
| if (Subtarget->hasCSSC()) { |
| setOperationAction(ISD::CTPOP, MVT::i32, Legal); |
| setOperationAction(ISD::CTPOP, MVT::i64, Legal); |
| setOperationAction(ISD::CTPOP, MVT::i128, Expand); |
| |
| setOperationAction(ISD::PARITY, MVT::i128, Expand); |
| |
| setOperationAction(ISD::CTTZ, MVT::i32, Legal); |
| setOperationAction(ISD::CTTZ, MVT::i64, Legal); |
| setOperationAction(ISD::CTTZ, MVT::i128, Expand); |
| |
| setOperationAction(ISD::ABS, MVT::i32, Legal); |
| setOperationAction(ISD::ABS, MVT::i64, Legal); |
| |
| setOperationAction(ISD::SMAX, MVT::i32, Legal); |
| setOperationAction(ISD::SMAX, MVT::i64, Legal); |
| setOperationAction(ISD::UMAX, MVT::i32, Legal); |
| setOperationAction(ISD::UMAX, MVT::i64, Legal); |
| |
| setOperationAction(ISD::SMIN, MVT::i32, Legal); |
| setOperationAction(ISD::SMIN, MVT::i64, Legal); |
| setOperationAction(ISD::UMIN, MVT::i32, Legal); |
| setOperationAction(ISD::UMIN, MVT::i64, Legal); |
| } else { |
| setOperationAction(ISD::CTPOP, MVT::i32, Custom); |
| setOperationAction(ISD::CTPOP, MVT::i64, Custom); |
| setOperationAction(ISD::CTPOP, MVT::i128, Custom); |
| |
| setOperationAction(ISD::PARITY, MVT::i64, Custom); |
| setOperationAction(ISD::PARITY, MVT::i128, Custom); |
| |
| setOperationAction(ISD::ABS, MVT::i32, Custom); |
| setOperationAction(ISD::ABS, MVT::i64, Custom); |
| } |
| |
| setOperationAction(ISD::SDIVREM, MVT::i32, Expand); |
| setOperationAction(ISD::SDIVREM, MVT::i64, Expand); |
| for (MVT VT : MVT::fixedlen_vector_valuetypes()) { |
| setOperationAction(ISD::SDIVREM, VT, Expand); |
| setOperationAction(ISD::UDIVREM, VT, Expand); |
| } |
| setOperationAction(ISD::SREM, MVT::i32, Expand); |
| setOperationAction(ISD::SREM, MVT::i64, Expand); |
| setOperationAction(ISD::UDIVREM, MVT::i32, Expand); |
| setOperationAction(ISD::UDIVREM, MVT::i64, Expand); |
| setOperationAction(ISD::UREM, MVT::i32, Expand); |
| setOperationAction(ISD::UREM, MVT::i64, Expand); |
| |
| // Custom lower Add/Sub/Mul with overflow. |
| setOperationAction(ISD::SADDO, MVT::i32, Custom); |
| setOperationAction(ISD::SADDO, MVT::i64, Custom); |
| setOperationAction(ISD::UADDO, MVT::i32, Custom); |
| setOperationAction(ISD::UADDO, MVT::i64, Custom); |
| setOperationAction(ISD::SSUBO, MVT::i32, Custom); |
| setOperationAction(ISD::SSUBO, MVT::i64, Custom); |
| setOperationAction(ISD::USUBO, MVT::i32, Custom); |
| setOperationAction(ISD::USUBO, MVT::i64, Custom); |
| setOperationAction(ISD::SMULO, MVT::i32, Custom); |
| setOperationAction(ISD::SMULO, MVT::i64, Custom); |
| setOperationAction(ISD::UMULO, MVT::i32, Custom); |
| setOperationAction(ISD::UMULO, MVT::i64, Custom); |
| |
| setOperationAction(ISD::UADDO_CARRY, MVT::i32, Custom); |
| setOperationAction(ISD::UADDO_CARRY, MVT::i64, Custom); |
| setOperationAction(ISD::USUBO_CARRY, MVT::i32, Custom); |
| setOperationAction(ISD::USUBO_CARRY, MVT::i64, Custom); |
| setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom); |
| setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom); |
| setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom); |
| setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom); |
| |
| setOperationAction(ISD::FSIN, MVT::f32, Expand); |
| setOperationAction(ISD::FSIN, MVT::f64, Expand); |
| setOperationAction(ISD::FCOS, MVT::f32, Expand); |
| setOperationAction(ISD::FCOS, MVT::f64, Expand); |
| setOperationAction(ISD::FPOW, MVT::f32, Expand); |
| setOperationAction(ISD::FPOW, MVT::f64, Expand); |
| setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); |
| setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); |
| if (Subtarget->hasFullFP16()) |
| setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom); |
| else |
| setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); |
| |
| for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI, |
| ISD::FCOS, ISD::FSIN, ISD::FSINCOS, |
| ISD::FEXP, ISD::FEXP2, ISD::FEXP10, |
| ISD::FLOG, ISD::FLOG2, ISD::FLOG10, |
| ISD::STRICT_FREM, |
| ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS, |
| ISD::STRICT_FSIN, ISD::STRICT_FEXP, ISD::STRICT_FEXP2, |
| ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) { |
| setOperationAction(Op, MVT::f16, Promote); |
| setOperationAction(Op, MVT::v4f16, Expand); |
| setOperationAction(Op, MVT::v8f16, Expand); |
| } |
| |
| if (!Subtarget->hasFullFP16()) { |
| for (auto Op : |
| {ISD::SETCC, ISD::SELECT_CC, |
| ISD::BR_CC, ISD::FADD, ISD::FSUB, |
| ISD::FMUL, ISD::FDIV, ISD::FMA, |
| ISD::FNEG, ISD::FABS, ISD::FCEIL, |
| ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT, |
| ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, |
| ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM, |
| ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD, |
| ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, |
| ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR, |
| ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT, |
| ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, |
| ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM, |
| ISD::STRICT_FMAXIMUM}) |
| setOperationAction(Op, MVT::f16, Promote); |
| |
    // Round-to-integer operations need custom lowering for fp16, as Promote
    // doesn't work because the result type is an integer.
| for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT, |
| ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT, |
| ISD::STRICT_LLRINT}) |
| setOperationAction(Op, MVT::f16, Custom); |
| |
    // Promote v4f16 to v4f32 when that is known to be safe.
| setOperationPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32); |
| setOperationPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32); |
| setOperationPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32); |
| setOperationPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32); |
| |
| setOperationAction(ISD::FABS, MVT::v4f16, Expand); |
| setOperationAction(ISD::FNEG, MVT::v4f16, Expand); |
| setOperationAction(ISD::FROUND, MVT::v4f16, Expand); |
| setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Expand); |
| setOperationAction(ISD::FMA, MVT::v4f16, Expand); |
| setOperationAction(ISD::SETCC, MVT::v4f16, Custom); |
| setOperationAction(ISD::BR_CC, MVT::v4f16, Expand); |
| setOperationAction(ISD::SELECT, MVT::v4f16, Expand); |
| setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand); |
| setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand); |
| setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand); |
| setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand); |
| setOperationAction(ISD::FCEIL, MVT::v4f16, Expand); |
| setOperationAction(ISD::FRINT, MVT::v4f16, Expand); |
| setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); |
| setOperationAction(ISD::FSQRT, MVT::v4f16, Expand); |
| |
| setOperationAction(ISD::FABS, MVT::v8f16, Expand); |
| setOperationAction(ISD::FADD, MVT::v8f16, Expand); |
| setOperationAction(ISD::FCEIL, MVT::v8f16, Expand); |
| setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand); |
| setOperationAction(ISD::FDIV, MVT::v8f16, Expand); |
| setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand); |
| setOperationAction(ISD::FMA, MVT::v8f16, Expand); |
| setOperationAction(ISD::FMUL, MVT::v8f16, Expand); |
| setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); |
| setOperationAction(ISD::FNEG, MVT::v8f16, Expand); |
| setOperationAction(ISD::FROUND, MVT::v8f16, Expand); |
| setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Expand); |
| setOperationAction(ISD::FRINT, MVT::v8f16, Expand); |
| setOperationAction(ISD::FSQRT, MVT::v8f16, Expand); |
| setOperationAction(ISD::FSUB, MVT::v8f16, Expand); |
| setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand); |
| setOperationAction(ISD::SETCC, MVT::v8f16, Expand); |
| setOperationAction(ISD::BR_CC, MVT::v8f16, Expand); |
| setOperationAction(ISD::SELECT, MVT::v8f16, Expand); |
| setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand); |
| setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand); |
| } |
| |
| // AArch64 has implementations of a lot of rounding-like FP operations. |
| for (auto Op : |
| {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, |
| ISD::FRINT, ISD::FTRUNC, ISD::FROUND, |
| ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM, |
| ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND, |
| ISD::LLROUND, ISD::LRINT, ISD::LLRINT, |
| ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL, ISD::STRICT_FNEARBYINT, |
| ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, |
| ISD::STRICT_FROUND, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, |
| ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND, |
| ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) { |
| for (MVT Ty : {MVT::f32, MVT::f64}) |
| setOperationAction(Op, Ty, Legal); |
| if (Subtarget->hasFullFP16()) |
| setOperationAction(Op, MVT::f16, Legal); |
| } |
| |
| // Basic strict FP operations are legal |
| for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, |
| ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) { |
| for (MVT Ty : {MVT::f32, MVT::f64}) |
| setOperationAction(Op, Ty, Legal); |
| if (Subtarget->hasFullFP16()) |
| setOperationAction(Op, MVT::f16, Legal); |
| } |
| |
| // Strict conversion to a larger type is legal |
| for (auto VT : {MVT::f32, MVT::f64}) |
| setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal); |
| |
| setOperationAction(ISD::PREFETCH, MVT::Other, Custom); |
| |
| setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom); |
| setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom); |
| |
| setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); |
| if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) { |
| setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, LibCall); |
| } else { |
| setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); |
| setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand); |
| } |
| setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom); |
| setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); |
| |
  // Generate outline atomics library calls only if LSE was not specified for
  // the subtarget.
| if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) { |
| setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall); |
| setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall); |
| setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall); |
| setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall); |
| setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall); |
| setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall); |
| setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall); |
| setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall); |
| setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall); |
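    // Register the names of the __aarch64_* outline-atomic helpers for each
    // operation, access size and memory ordering
    // (relaxed/acquire/release/acq_rel).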
| #define LCALLNAMES(A, B, N) \ |
| setLibcallName(A##N##_RELAX, #B #N "_relax"); \ |
| setLibcallName(A##N##_ACQ, #B #N "_acq"); \ |
| setLibcallName(A##N##_REL, #B #N "_rel"); \ |
| setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel"); |
| #define LCALLNAME4(A, B) \ |
| LCALLNAMES(A, B, 1) \ |
| LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) |
| #define LCALLNAME5(A, B) \ |
| LCALLNAMES(A, B, 1) \ |
| LCALLNAMES(A, B, 2) \ |
| LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16) |
| LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas) |
| LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp) |
| LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd) |
| LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset) |
| LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr) |
| LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor) |
| #undef LCALLNAMES |
| #undef LCALLNAME4 |
| #undef LCALLNAME5 |
| } |
| |
| if (Subtarget->hasLSE128()) { |
| // Custom lowering because i128 is not legal. Must be replaced by 2x64 |
| // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP. |
| setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom); |
| setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom); |
| setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom); |
| } |
| |
| // 128-bit loads and stores can be done without expanding |
| setOperationAction(ISD::LOAD, MVT::i128, Custom); |
| setOperationAction(ISD::STORE, MVT::i128, Custom); |
| |
| // Aligned 128-bit loads and stores are single-copy atomic according to the |
| // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2. |
| if (Subtarget->hasLSE2()) { |
| setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom); |
| setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom); |
| } |
| |
  // 256-bit non-temporal stores can be lowered to STNP. Do this as part of the
  // custom lowering, as there are no unpaired non-temporal stores and
  // legalization will break up 256-bit inputs.
| setOperationAction(ISD::STORE, MVT::v32i8, Custom); |
| setOperationAction(ISD::STORE, MVT::v16i16, Custom); |
| setOperationAction(ISD::STORE, MVT::v16f16, Custom); |
| setOperationAction(ISD::STORE, MVT::v8i32, Custom); |
| setOperationAction(ISD::STORE, MVT::v8f32, Custom); |
| setOperationAction(ISD::STORE, MVT::v4f64, Custom); |
| setOperationAction(ISD::STORE, MVT::v4i64, Custom); |
| |
  // 256-bit non-temporal loads can be lowered to LDNP. This is done using
  // custom lowering, as there are no unpaired non-temporal loads and
  // legalization would break up 256-bit inputs.
| setOperationAction(ISD::LOAD, MVT::v32i8, Custom); |
| setOperationAction(ISD::LOAD, MVT::v16i16, Custom); |
| setOperationAction(ISD::LOAD, MVT::v16f16, Custom); |
| setOperationAction(ISD::LOAD, MVT::v8i32, Custom); |
| setOperationAction(ISD::LOAD, MVT::v8f32, Custom); |
| setOperationAction(ISD::LOAD, MVT::v4f64, Custom); |
| setOperationAction(ISD::LOAD, MVT::v4i64, Custom); |
| |
| // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0. |
| setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); |
| |
| if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && |
| getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { |
| // Issue __sincos_stret if available. |
| setOperationAction(ISD::FSINCOS, MVT::f64, Custom); |
| setOperationAction(ISD::FSINCOS, MVT::f32, Custom); |
| } else { |
| setOperationAction(ISD::FSINCOS, MVT::f64, Expand); |
| setOperationAction(ISD::FSINCOS, MVT::f32, Expand); |
| } |
| |
| if (Subtarget->getTargetTriple().isOSMSVCRT()) { |
| // MSVCRT doesn't have powi; fall back to pow |
| setLibcallName(RTLIB::POWI_F32, nullptr); |
| setLibcallName(RTLIB::POWI_F64, nullptr); |
| } |
| |
| // Make floating-point constants legal for the large code model, so they don't |
| // become loads from the constant pool. |
| if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { |
| setOperationAction(ISD::ConstantFP, MVT::f32, Legal); |
| setOperationAction(ISD::ConstantFP, MVT::f64, Legal); |
| } |
| |
  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
| for (MVT VT : MVT::fp_valuetypes()) { |
| setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); |
| setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); |
| setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand); |
| setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); |
| } |
| for (MVT VT : MVT::integer_valuetypes()) |
| setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand); |
| |
| setTruncStoreAction(MVT::f32, MVT::f16, Expand); |
| setTruncStoreAction(MVT::f64, MVT::f32, Expand); |
| setTruncStoreAction(MVT::f64, MVT::f16, Expand); |
| setTruncStoreAction(MVT::f128, MVT::f80, Expand); |
| setTruncStoreAction(MVT::f128, MVT::f64, Expand); |
| setTruncStoreAction(MVT::f128, MVT::f32, Expand); |
| setTruncStoreAction(MVT::f128, MVT::f16, Expand); |
| |
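  // i16 is not a legal type, so bitcasts between i16 and the f16/bf16 FPR
  // types are custom lowered (typically through a wider legal type).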
| setOperationAction(ISD::BITCAST, MVT::i16, Custom); |
| setOperationAction(ISD::BITCAST, MVT::f16, Custom); |
| setOperationAction(ISD::BITCAST, MVT::bf16, Custom); |
| |
| // Indexed loads and stores are supported. |
| for (unsigned im = (unsigned)ISD::PRE_INC; |
| im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { |
| setIndexedLoadAction(im, MVT::i8, Legal); |
| setIndexedLoadAction(im, MVT::i16, Legal); |
| setIndexedLoadAction(im, MVT::i32, Legal); |
| setIndexedLoadAction(im, MVT::i64, Legal); |
| setIndexedLoadAction(im, MVT::f64, Legal); |
| setIndexedLoadAction(im, MVT::f32, Legal); |
| setIndexedLoadAction(im, MVT::f16, Legal); |
| setIndexedLoadAction(im, MVT::bf16, Legal); |
| setIndexedStoreAction(im, MVT::i8, Legal); |
| setIndexedStoreAction(im, MVT::i16, Legal); |
| setIndexedStoreAction(im, MVT::i32, Legal); |
| setIndexedStoreAction(im, MVT::i64, Legal); |
| setIndexedStoreAction(im, MVT::f64, Legal); |
| setIndexedStoreAction(im, MVT::f32, Legal); |
| setIndexedStoreAction(im, MVT::f16, Legal); |
| setIndexedStoreAction(im, MVT::bf16, Legal); |
| } |
| |
| // Trap. |
| setOperationAction(ISD::TRAP, MVT::Other, Legal); |
| setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); |
| setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal); |
| |
| // We combine OR nodes for bitfield operations. |
| setTargetDAGCombine(ISD::OR); |
| // Try to create BICs for vector ANDs. |
| setTargetDAGCombine(ISD::AND); |
| |
  // Vector add and sub nodes may conceal a high-half opportunity.
  // Also, try to fold ADD into CSINC/CSINV.
| setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP, |
| ISD::UINT_TO_FP}); |
| |
| setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, |
| ISD::FP_TO_UINT_SAT, ISD::FADD, ISD::FDIV}); |
| |
| // Try and combine setcc with csel |
| setTargetDAGCombine(ISD::SETCC); |
| |
| setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); |
| |
| setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND, |
| ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG, |
| ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR, |
| ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR}); |
| setTargetDAGCombine(ISD::TRUNCATE); |
| setTargetDAGCombine(ISD::LOAD); |
| |
| setTargetDAGCombine(ISD::MSTORE); |
| |
| setTargetDAGCombine(ISD::MUL); |
| |
| setTargetDAGCombine({ISD::SELECT, ISD::VSELECT}); |
| |
| setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN, |
| ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, |
| ISD::VECREDUCE_ADD, ISD::STEP_VECTOR}); |
| |
| setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER}); |
| |
| setTargetDAGCombine(ISD::FP_EXTEND); |
| |
| setTargetDAGCombine(ISD::GlobalAddress); |
| |
| setTargetDAGCombine(ISD::CTLZ); |
| |
| setTargetDAGCombine(ISD::VECREDUCE_AND); |
| setTargetDAGCombine(ISD::VECREDUCE_OR); |
| setTargetDAGCombine(ISD::VECREDUCE_XOR); |
| |
| setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); |
| |
  // In case of strict alignment, avoid an excessive number of byte-wide stores.
| MaxStoresPerMemsetOptSize = 8; |
| MaxStoresPerMemset = |
| Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32; |
| |
| MaxGluedStoresPerMemcpy = 4; |
| MaxStoresPerMemcpyOptSize = 4; |
| MaxStoresPerMemcpy = |
| Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16; |
| |
| MaxStoresPerMemmoveOptSize = 4; |
| MaxStoresPerMemmove = 4; |
| |
| MaxLoadsPerMemcmpOptSize = 4; |
| MaxLoadsPerMemcmp = |
| Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8; |
| |
| setStackPointerRegisterToSaveRestore(AArch64::SP); |
| |
| setSchedulingPreference(Sched::Hybrid); |
| |
| EnableExtLdPromotion = true; |
| |
| // Set required alignment. |
| setMinFunctionAlignment(Align(4)); |
| // Set preferred alignments. |
| setPrefLoopAlignment(STI.getPrefLoopAlignment()); |
| setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment()); |
| setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); |
| |
  // Only change the limit for entries in a jump table if specified by
  // the subtarget, and not overridden on the command line.
| unsigned MaxJT = STI.getMaximumJumpTableSize(); |
| if (MaxJT && getMaximumJumpTableSize() == UINT_MAX) |
| setMaximumJumpTableSize(MaxJT); |
| |
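  // AArch64 has bitfield-extract instructions (SBFX/UBFX), so keep shift+mask
  // patterns in a form that can fold into them.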
| setHasExtractBitsInsn(true); |
| |
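  // Integer division and remainder are handled up to 128 bits; wider
  // operations are expanded before selection.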
| setMaxDivRemBitWidthSupported(128); |
| |
| setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); |
| |
| if (Subtarget->hasNEON()) { |
| // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to |
| // silliness like this: |
| for (auto Op : |
| {ISD::SELECT, ISD::SELECT_CC, |
| ISD::BR_CC, ISD::FADD, ISD::FSUB, |
| ISD::FMUL, ISD::FDIV, ISD::FMA, |
| ISD::FNEG, ISD::FABS, ISD::FCEIL, |
| ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT, |
| ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, |
| ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM, |
| ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD, |
| ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, |
| ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR, |
| ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT, |
| ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, |
| ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM, |
| ISD::STRICT_FMAXIMUM}) |
| setOperationAction(Op, MVT::v1f64, Expand); |
| |
| for (auto Op : |
| {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP, |
| ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL, |
| ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT, |
| ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND}) |
| setOperationAction(Op, MVT::v1i64, Expand); |
| |
    // AArch64 doesn't have direct vector->f32 conversion instructions for
    // elements smaller than i32, so promote the input to i32 first.
| setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32); |
| setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32); |
| |
    // Similarly, there is no direct i32 -> f64 vector conversion instruction,
    // nor a direct i32 -> f16 vector conversion. Set these to Custom so the
    // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
| for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, |
| ISD::STRICT_UINT_TO_FP}) |
| for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32}) |
| setOperationAction(Op, VT, Custom); |
| |
| if (Subtarget->hasFullFP16()) { |
| setOperationAction(ISD::ConstantFP, MVT::f16, Legal); |
| setOperationAction(ISD::ConstantFP, MVT::bf16, Legal); |
| |
| setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); |
| setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom); |
| setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); |
| setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); |
| } else { |
      // When AArch64 doesn't have full fp16 support, promote the input
      // to i32 first.
| setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32); |
| setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32); |
| setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32); |
| setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32); |
| setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32); |
| setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32); |
| setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32); |
| setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32); |
| } |
| |
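    // NEON CLZ only supports 8/16/32-bit elements, so the 64-bit element
    // variants must be expanded.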
| setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); |
| setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); |
| setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal); |
| setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal); |
| setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom); |
| setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom); |
| setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom); |
| setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom); |
| for (auto VT : {MVT::v1i64, MVT::v2i64}) { |
| setOperationAction(ISD::UMAX, VT, Custom); |
| setOperationAction(ISD::SMAX, VT, Custom); |
| setOperationAction(ISD::UMIN, VT, Custom); |
| setOperationAction(ISD::SMIN, VT, Custom); |
| } |
| |
| // Custom handling for some quad-vector types to detect MULL. |
| setOperationAction(ISD::MUL, MVT::v8i16, Custom); |
| setOperationAction(ISD::MUL, MVT::v4i32, Custom); |
| setOperationAction(ISD::MUL, MVT::v2i64, Custom); |
| setOperationAction(ISD::MUL, MVT::v4i16, Custom); |
| setOperationAction(ISD::MUL, MVT::v2i32, Custom); |
| setOperationAction(ISD::MUL, MVT::v1i64, Custom); |
| |
| // Saturates |
| for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, |
| MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { |
| setOperationAction(ISD::SADDSAT, VT, Legal); |
| setOperationAction(ISD::UADDSAT, VT, Legal); |
| setOperationAction(ISD::SSUBSAT, VT, Legal); |
| setOperationAction(ISD::USUBSAT, VT, Legal); |
| } |
| |
| for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16, |
| MVT::v4i32}) { |
| setOperationAction(ISD::AVGFLOORS, VT, Legal); |
| setOperationAction(ISD::AVGFLOORU, VT, Legal); |
| setOperationAction(ISD::AVGCEILS, VT, Legal); |
| setOperationAction(ISD::AVGCEILU, VT, Legal); |
| setOperationAction(ISD::ABDS, VT, Legal); |
| setOperationAction(ISD::ABDU, VT, Legal); |
| } |
| |
| // Vector reductions |
| for (MVT VT : { MVT::v4f16, MVT::v2f32, |
| MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { |
| if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) { |
| setOperationAction(ISD::VECREDUCE_FMAX, VT, Legal); |
| setOperationAction(ISD::VECREDUCE_FMIN, VT, Legal); |
| setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Legal); |
| setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Legal); |
| |
| setOperationAction(ISD::VECREDUCE_FADD, VT, Legal); |
| } |
| } |
| for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, |
| MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { |
| setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_AND, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_OR, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); |
| } |
| setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom); |
| setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Custom); |
| setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Custom); |
| setOperationAction(ISD::VECREDUCE_XOR, MVT::v2i64, Custom); |
| |
| setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); |
| setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); |
| // Likewise, narrowing and extending vector loads/stores aren't handled |
| // directly. |
| for (MVT VT : MVT::fixedlen_vector_valuetypes()) { |
| setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); |
| |
| if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) { |
| setOperationAction(ISD::MULHS, VT, Legal); |
| setOperationAction(ISD::MULHU, VT, Legal); |
| } else { |
| setOperationAction(ISD::MULHS, VT, Expand); |
| setOperationAction(ISD::MULHU, VT, Expand); |
| } |
| setOperationAction(ISD::SMUL_LOHI, VT, Expand); |
| setOperationAction(ISD::UMUL_LOHI, VT, Expand); |
| |
| setOperationAction(ISD::BSWAP, VT, Expand); |
| setOperationAction(ISD::CTTZ, VT, Expand); |
| |
| for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { |
| setTruncStoreAction(VT, InnerVT, Expand); |
| setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); |
| setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); |
| setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); |
| } |
| } |
| |
| // AArch64 has implementations of a lot of rounding-like FP operations. |
| for (auto Op : |
| {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, |
| ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR, |
| ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT, |
| ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) { |
| for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) |
| setOperationAction(Op, Ty, Legal); |
| if (Subtarget->hasFullFP16()) |
| for (MVT Ty : {MVT::v4f16, MVT::v8f16}) |
| setOperationAction(Op, Ty, Legal); |
| } |
| |
| setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); |
| |
| setOperationAction(ISD::BITCAST, MVT::i2, Custom); |
| setOperationAction(ISD::BITCAST, MVT::i4, Custom); |
| setOperationAction(ISD::BITCAST, MVT::i8, Custom); |
| setOperationAction(ISD::BITCAST, MVT::i16, Custom); |
| |
| setOperationAction(ISD::BITCAST, MVT::v2i8, Custom); |
| setOperationAction(ISD::BITCAST, MVT::v2i16, Custom); |
| setOperationAction(ISD::BITCAST, MVT::v4i8, Custom); |
| |
| setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom); |
| setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); |
| setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); |
| setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom); |
| setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); |
| setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); |
| |
| // ADDP custom lowering |
| for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) |
| setOperationAction(ISD::ADD, VT, Custom); |
| // FADDP custom lowering |
| for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 }) |
| setOperationAction(ISD::FADD, VT, Custom); |
| } |
| |
| if (Subtarget->hasSME()) { |
| setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); |
| } |
| |
| // FIXME: Move lowering for more nodes here if those are common between |
| // SVE and SME. |
| if (Subtarget->hasSVEorSME()) { |
| for (auto VT : |
| {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) { |
| setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); |
| setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); |
| } |
| } |
| |
| if (Subtarget->hasSVEorSME()) { |
| for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) { |
| setOperationAction(ISD::BITREVERSE, VT, Custom); |
| setOperationAction(ISD::BSWAP, VT, Custom); |
| setOperationAction(ISD::CTLZ, VT, Custom); |
| setOperationAction(ISD::CTPOP, VT, Custom); |
| setOperationAction(ISD::CTTZ, VT, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); |
| setOperationAction(ISD::UINT_TO_FP, VT, Custom); |
| setOperationAction(ISD::SINT_TO_FP, VT, Custom); |
| setOperationAction(ISD::FP_TO_UINT, VT, Custom); |
| setOperationAction(ISD::FP_TO_SINT, VT, Custom); |
| setOperationAction(ISD::MGATHER, VT, Custom); |
| setOperationAction(ISD::MSCATTER, VT, Custom); |
| setOperationAction(ISD::MLOAD, VT, Custom); |
| setOperationAction(ISD::MUL, VT, Custom); |
| setOperationAction(ISD::MULHS, VT, Custom); |
| setOperationAction(ISD::MULHU, VT, Custom); |
| setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); |
| setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); |
| setOperationAction(ISD::SELECT, VT, Custom); |
| setOperationAction(ISD::SETCC, VT, Custom); |
| setOperationAction(ISD::SDIV, VT, Custom); |
| setOperationAction(ISD::UDIV, VT, Custom); |
| setOperationAction(ISD::SMIN, VT, Custom); |
| setOperationAction(ISD::UMIN, VT, Custom); |
| setOperationAction(ISD::SMAX, VT, Custom); |
| setOperationAction(ISD::UMAX, VT, Custom); |
| setOperationAction(ISD::SHL, VT, Custom); |
| setOperationAction(ISD::SRL, VT, Custom); |
| setOperationAction(ISD::SRA, VT, Custom); |
| setOperationAction(ISD::ABS, VT, Custom); |
| setOperationAction(ISD::ABDS, VT, Custom); |
| setOperationAction(ISD::ABDU, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_AND, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_OR, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); |
| setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); |
| setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); |
| |
| setOperationAction(ISD::UMUL_LOHI, VT, Expand); |
| setOperationAction(ISD::SMUL_LOHI, VT, Expand); |
| setOperationAction(ISD::SELECT_CC, VT, Expand); |
| setOperationAction(ISD::ROTL, VT, Expand); |
| setOperationAction(ISD::ROTR, VT, Expand); |
| |
| setOperationAction(ISD::SADDSAT, VT, Legal); |
| setOperationAction(ISD::UADDSAT, VT, Legal); |
| setOperationAction(ISD::SSUBSAT, VT, Legal); |
| setOperationAction(ISD::USUBSAT, VT, Legal); |
| setOperationAction(ISD::UREM, VT, Expand); |
| setOperationAction(ISD::SREM, VT, Expand); |
| setOperationAction(ISD::SDIVREM, VT, Expand); |
| setOperationAction(ISD::UDIVREM, VT, Expand); |
| |
| setOperationAction(ISD::AVGFLOORS, VT, Custom); |
| setOperationAction(ISD::AVGFLOORU, VT, Custom); |
| setOperationAction(ISD::AVGCEILS, VT, Custom); |
| setOperationAction(ISD::AVGCEILU, VT, Custom); |
| |
| if (!Subtarget->isLittleEndian()) |
| setOperationAction(ISD::BITCAST, VT, Expand); |
| } |
| |
| // Illegal unpacked integer vector types. |
| for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) { |
| setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); |
| } |
| |
| // Legalize unpacked bitcasts to REINTERPRET_CAST. |
| for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16, |
| MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32}) |
| setOperationAction(ISD::BITCAST, VT, Custom); |
| |
| for (auto VT : |
| { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8, |
| MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 }) |
| setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal); |
| |
| for (auto VT : |
| {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) { |
| setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); |
| setOperationAction(ISD::SELECT, VT, Custom); |
| setOperationAction(ISD::SETCC, VT, Custom); |
| setOperationAction(ISD::TRUNCATE, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_AND, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_OR, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); |
| |
| setOperationAction(ISD::SELECT_CC, VT, Expand); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); |
| |
| // There are no legal MVT::nxv16f## based types. |
| if (VT != MVT::nxv16i1) { |
| setOperationAction(ISD::SINT_TO_FP, VT, Custom); |
| setOperationAction(ISD::UINT_TO_FP, VT, Custom); |
| } |
| } |
| |
| // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does |
| for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64, |
| MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, |
| MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) { |
| setOperationAction(ISD::MLOAD, VT, Custom); |
| setOperationAction(ISD::MSTORE, VT, Custom); |
| setOperationAction(ISD::MGATHER, VT, Custom); |
| setOperationAction(ISD::MSCATTER, VT, Custom); |
| } |
| |
| // First, mark all scalable vector extending loads and truncating stores as |
| // Expand; this covers both integer and floating-point scalable vectors. |
| for (MVT VT : MVT::scalable_vector_valuetypes()) { |
| for (MVT InnerVT : MVT::scalable_vector_valuetypes()) { |
| setTruncStoreAction(VT, InnerVT, Expand); |
| setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); |
| setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); |
| setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); |
| } |
| } |
| |
| // Then, selectively enable those which we directly support. |
| setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal); |
| setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal); |
| setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal); |
| setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal); |
| setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal); |
| setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal); |
| for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) { |
| setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal); |
| setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal); |
| setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal); |
| setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal); |
| setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal); |
| setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal); |
| } |
| |
| // SVE supports truncating stores of 64- and 128-bit vectors |
| setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom); |
| setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom); |
| setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom); |
| setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); |
| setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); |
| |
| for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, |
| MVT::nxv4f32, MVT::nxv2f64}) { |
| setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); |
| setOperationAction(ISD::MGATHER, VT, Custom); |
| setOperationAction(ISD::MSCATTER, VT, Custom); |
| setOperationAction(ISD::MLOAD, VT, Custom); |
| setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); |
| setOperationAction(ISD::SELECT, VT, Custom); |
| setOperationAction(ISD::SETCC, VT, Custom); |
| setOperationAction(ISD::FADD, VT, Custom); |
| setOperationAction(ISD::FCOPYSIGN, VT, Custom); |
| setOperationAction(ISD::FDIV, VT, Custom); |
| setOperationAction(ISD::FMA, VT, Custom); |
| setOperationAction(ISD::FMAXIMUM, VT, Custom); |
| setOperationAction(ISD::FMAXNUM, VT, Custom); |
| setOperationAction(ISD::FMINIMUM, VT, Custom); |
| setOperationAction(ISD::FMINNUM, VT, Custom); |
| setOperationAction(ISD::FMUL, VT, Custom); |
| setOperationAction(ISD::FNEG, VT, Custom); |
| setOperationAction(ISD::FSUB, VT, Custom); |
| setOperationAction(ISD::FCEIL, VT, Custom); |
| setOperationAction(ISD::FFLOOR, VT, Custom); |
| setOperationAction(ISD::FNEARBYINT, VT, Custom); |
| setOperationAction(ISD::FRINT, VT, Custom); |
| setOperationAction(ISD::FROUND, VT, Custom); |
| setOperationAction(ISD::FROUNDEVEN, VT, Custom); |
| setOperationAction(ISD::FTRUNC, VT, Custom); |
| setOperationAction(ISD::FSQRT, VT, Custom); |
| setOperationAction(ISD::FABS, VT, Custom); |
| setOperationAction(ISD::FP_EXTEND, VT, Custom); |
| setOperationAction(ISD::FP_ROUND, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom); |
| if (Subtarget->isSVEAvailable()) |
| setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); |
| setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); |
| setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); |
| setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); |
| |
| setOperationAction(ISD::SELECT_CC, VT, Expand); |
| setOperationAction(ISD::FREM, VT, Expand); |
| setOperationAction(ISD::FPOW, VT, Expand); |
| setOperationAction(ISD::FPOWI, VT, Expand); |
| setOperationAction(ISD::FCOS, VT, Expand); |
| setOperationAction(ISD::FSIN, VT, Expand); |
| setOperationAction(ISD::FSINCOS, VT, Expand); |
| setOperationAction(ISD::FEXP, VT, Expand); |
| setOperationAction(ISD::FEXP2, VT, Expand); |
| setOperationAction(ISD::FEXP10, VT, Expand); |
| setOperationAction(ISD::FLOG, VT, Expand); |
| setOperationAction(ISD::FLOG2, VT, Expand); |
| setOperationAction(ISD::FLOG10, VT, Expand); |
| |
| setCondCodeAction(ISD::SETO, VT, Expand); |
| setCondCodeAction(ISD::SETOLT, VT, Expand); |
| setCondCodeAction(ISD::SETLT, VT, Expand); |
| setCondCodeAction(ISD::SETOLE, VT, Expand); |
| setCondCodeAction(ISD::SETLE, VT, Expand); |
| setCondCodeAction(ISD::SETULT, VT, Expand); |
| setCondCodeAction(ISD::SETULE, VT, Expand); |
| setCondCodeAction(ISD::SETUGE, VT, Expand); |
| setCondCodeAction(ISD::SETUGT, VT, Expand); |
| setCondCodeAction(ISD::SETUEQ, VT, Expand); |
| setCondCodeAction(ISD::SETONE, VT, Expand); |
| |
| if (!Subtarget->isLittleEndian()) |
| setOperationAction(ISD::BITCAST, VT, Expand); |
| } |
| |
| for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) { |
| setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); |
| setOperationAction(ISD::MGATHER, VT, Custom); |
| setOperationAction(ISD::MSCATTER, VT, Custom); |
| setOperationAction(ISD::MLOAD, VT, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); |
| setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); |
| |
| if (!Subtarget->isLittleEndian()) |
| setOperationAction(ISD::BITCAST, VT, Expand); |
| } |
| |
| setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); |
| setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); |
| |
| // NEON doesn't support integer divides, but SVE does |
| for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, |
| MVT::v4i32, MVT::v1i64, MVT::v2i64}) { |
| setOperationAction(ISD::SDIV, VT, Custom); |
| setOperationAction(ISD::UDIV, VT, Custom); |
| } |
| |
| // NEON doesn't support 64-bit vector integer muls, but SVE does. |
| setOperationAction(ISD::MUL, MVT::v1i64, Custom); |
| setOperationAction(ISD::MUL, MVT::v2i64, Custom); |
| |
| if (Subtarget->isSVEAvailable()) { |
| // NEON doesn't support across-vector reductions, but SVE does. |
| for (auto VT : |
| {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64}) |
| setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); |
| } |
| |
| if (!Subtarget->isNeonAvailable()) { |
| setTruncStoreAction(MVT::v2f32, MVT::v2f16, Custom); |
| setTruncStoreAction(MVT::v4f32, MVT::v4f16, Custom); |
| setTruncStoreAction(MVT::v8f32, MVT::v8f16, Custom); |
| setTruncStoreAction(MVT::v1f64, MVT::v1f16, Custom); |
| setTruncStoreAction(MVT::v2f64, MVT::v2f16, Custom); |
| setTruncStoreAction(MVT::v4f64, MVT::v4f16, Custom); |
| setTruncStoreAction(MVT::v1f64, MVT::v1f32, Custom); |
| setTruncStoreAction(MVT::v2f64, MVT::v2f32, Custom); |
| setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom); |
| for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, |
| MVT::v4i32, MVT::v1i64, MVT::v2i64}) |
| addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true); |
| |
| for (MVT VT : |
| {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64}) |
| addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true); |
| } |
| |
| // NOTE: Currently this has to happen after computeRegisterProperties rather |
| // than the preferred option of combining it with the addRegisterClass call. |
| if (Subtarget->useSVEForFixedLengthVectors()) { |
| for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) |
| if (useSVEForFixedLengthVectorVT(VT)) |
| addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false); |
| for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) |
| if (useSVEForFixedLengthVectorVT(VT)) |
| addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false); |
| |
| // 64-bit results can come from an input that is wider than NEON allows. |
| for (auto VT : {MVT::v8i8, MVT::v4i16}) |
| setOperationAction(ISD::TRUNCATE, VT, Custom); |
| setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom); |
| |
| // 128-bit results imply an input that is wider than NEON allows. |
| for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) |
| setOperationAction(ISD::TRUNCATE, VT, Custom); |
| for (auto VT : {MVT::v8f16, MVT::v4f32}) |
| setOperationAction(ISD::FP_ROUND, VT, Custom); |
| |
| // These operations are not supported on NEON but SVE can do them. |
| setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom); |
| setOperationAction(ISD::CTLZ, MVT::v1i64, Custom); |
| setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); |
| setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); |
| setOperationAction(ISD::MULHS, MVT::v1i64, Custom); |
| setOperationAction(ISD::MULHS, MVT::v2i64, Custom); |
| setOperationAction(ISD::MULHU, MVT::v1i64, Custom); |
| setOperationAction(ISD::MULHU, MVT::v2i64, Custom); |
| setOperationAction(ISD::SMAX, MVT::v1i64, Custom); |
| setOperationAction(ISD::SMAX, MVT::v2i64, Custom); |
| setOperationAction(ISD::SMIN, MVT::v1i64, Custom); |
| setOperationAction(ISD::SMIN, MVT::v2i64, Custom); |
| setOperationAction(ISD::UMAX, MVT::v1i64, Custom); |
| setOperationAction(ISD::UMAX, MVT::v2i64, Custom); |
| setOperationAction(ISD::UMIN, MVT::v1i64, Custom); |
| setOperationAction(ISD::UMIN, MVT::v2i64, Custom); |
| setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom); |
| setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom); |
| setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom); |
| setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom); |
| |
| // Int operations with no NEON support. |
| for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, |
| MVT::v2i32, MVT::v4i32, MVT::v2i64}) { |
| setOperationAction(ISD::BITREVERSE, VT, Custom); |
| setOperationAction(ISD::CTTZ, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_AND, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_OR, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); |
| setOperationAction(ISD::MULHS, VT, Custom); |
| setOperationAction(ISD::MULHU, VT, Custom); |
| } |
| |
| // Use SVE for vectors with more than 2 elements. |
| for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32}) |
| setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); |
| } |
| |
| setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64); |
| setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32); |
| setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16); |
| setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8); |
| |
| setOperationAction(ISD::VSCALE, MVT::i32, Custom); |
| } |
| |
| if (Subtarget->hasMOPS() && Subtarget->hasMTE()) { |
| // Only required for llvm.aarch64.mops.memset.tag |
| setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); |
| } |
| |
| setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); |
| |
| PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); |
| |
| IsStrictFPEnabled = true; |
| } |
| |
| void AArch64TargetLowering::addTypeForNEON(MVT VT) { |
| assert(VT.isVector() && "VT should be a vector type"); |
| |
| if (VT.isFloatingPoint()) { |
| MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT(); |
| setOperationPromotedToType(ISD::LOAD, VT, PromoteTo); |
| setOperationPromotedToType(ISD::STORE, VT, PromoteTo); |
| } |
| |
| // Mark vector float intrinsics as expand. |
| if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { |
| setOperationAction(ISD::FSIN, VT, Expand); |
| setOperationAction(ISD::FCOS, VT, Expand); |
| setOperationAction(ISD::FPOW, VT, Expand); |
| setOperationAction(ISD::FLOG, VT, Expand); |
| setOperationAction(ISD::FLOG2, VT, Expand); |
| setOperationAction(ISD::FLOG10, VT, Expand); |
| setOperationAction(ISD::FEXP, VT, Expand); |
| setOperationAction(ISD::FEXP2, VT, Expand); |
| setOperationAction(ISD::FEXP10, VT, Expand); |
| } |
| |
| // But we do support custom-lowering for FCOPYSIGN. |
| if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 || |
| ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16())) |
| setOperationAction(ISD::FCOPYSIGN, VT, Custom); |
| |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::BUILD_VECTOR, VT, Custom); |
| setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); |
| setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); |
| setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); |
| setOperationAction(ISD::SRA, VT, Custom); |
| setOperationAction(ISD::SRL, VT, Custom); |
| setOperationAction(ISD::SHL, VT, Custom); |
| setOperationAction(ISD::OR, VT, Custom); |
| setOperationAction(ISD::SETCC, VT, Custom); |
| setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); |
| |
| setOperationAction(ISD::SELECT, VT, Expand); |
| setOperationAction(ISD::SELECT_CC, VT, Expand); |
| setOperationAction(ISD::VSELECT, VT, Expand); |
| for (MVT InnerVT : MVT::all_valuetypes()) |
| setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); |
| |
| // CNT supports only the B element size; wider element types are custom |
| // lowered using CNT followed by UADDLP to widen. |
| if (VT != MVT::v8i8 && VT != MVT::v16i8) |
| setOperationAction(ISD::CTPOP, VT, Custom); |
| |
| setOperationAction(ISD::UDIV, VT, Expand); |
| setOperationAction(ISD::SDIV, VT, Expand); |
| setOperationAction(ISD::UREM, VT, Expand); |
| setOperationAction(ISD::SREM, VT, Expand); |
| setOperationAction(ISD::FREM, VT, Expand); |
| |
| for (unsigned Opcode : |
| {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, |
| ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) |
| setOperationAction(Opcode, VT, Custom); |
| |
| if (!VT.isFloatingPoint()) |
| setOperationAction(ISD::ABS, VT, Legal); |
| |
| // [SU][MIN|MAX] are available for all NEON types apart from i64. |
| if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) |
| for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) |
| setOperationAction(Opcode, VT, Legal); |
| |
| // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP |
| // NEON types. |
| if (VT.isFloatingPoint() && |
| VT.getVectorElementType() != MVT::bf16 && |
| (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())) |
| for (unsigned Opcode : |
| {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM, |
| ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM, |
| ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB, |
| ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA, |
| ISD::STRICT_FSQRT}) |
| setOperationAction(Opcode, VT, Legal); |
| |
| // Strict fp extend and trunc are legal |
| if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16) |
| setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal); |
| if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64) |
| setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal); |
| |
| // FIXME: We could potentially make use of the vector comparison instructions |
| // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of |
| // complications: |
| // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons, |
| // so we would need to expand when the condition code doesn't match the |
| // kind of comparison. |
| // * Some kinds of comparison require more than one FCMXY instruction so |
| // would need to be expanded instead. |
| // * The lowering of the non-strict versions involves target-specific ISD |
| // nodes so we would likely need to add strict versions of all of them and |
| // handle them appropriately. |
| setOperationAction(ISD::STRICT_FSETCC, VT, Expand); |
| setOperationAction(ISD::STRICT_FSETCCS, VT, Expand); |
| |
| if (Subtarget->isLittleEndian()) { |
| for (unsigned im = (unsigned)ISD::PRE_INC; |
| im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { |
| setIndexedLoadAction(im, VT, Legal); |
| setIndexedStoreAction(im, VT, Legal); |
| } |
| } |
| |
| if (Subtarget->hasD128()) { |
| setOperationAction(ISD::READ_REGISTER, MVT::i128, Custom); |
| setOperationAction(ISD::WRITE_REGISTER, MVT::i128, Custom); |
| } |
| } |
| |
| bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT, |
| EVT OpVT) const { |
| // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo). |
| if (!Subtarget->hasSVE()) |
| return true; |
| |
| // We can only support legal predicate result types. We can use the SVE |
| // whilelo instruction for generating fixed-width predicates too. |
| if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 && |
| ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 && |
| ResVT != MVT::v8i1 && ResVT != MVT::v16i1) |
| return true; |
| |
| // The whilelo instruction only works with i32 or i64 scalar inputs. |
| if (OpVT != MVT::i32 && OpVT != MVT::i64) |
| return true; |
| |
| return false; |
| } |
| |
| void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT, |
| bool StreamingSVE) { |
| assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); |
| |
| // By default everything must be expanded. |
| for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) |
| setOperationAction(Op, VT, Expand); |
| |
| if (VT.isFloatingPoint()) { |
| setCondCodeAction(ISD::SETO, VT, Expand); |
| setCondCodeAction(ISD::SETOLT, VT, Expand); |
| setCondCodeAction(ISD::SETOLE, VT, Expand); |
| setCondCodeAction(ISD::SETULT, VT, Expand); |
| setCondCodeAction(ISD::SETULE, VT, Expand); |
| setCondCodeAction(ISD::SETUGE, VT, Expand); |
| setCondCodeAction(ISD::SETUGT, VT, Expand); |
| setCondCodeAction(ISD::SETUEQ, VT, Expand); |
| setCondCodeAction(ISD::SETONE, VT, Expand); |
| } |
| |
| // Mark integer truncating stores/extending loads as having custom lowering |
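| // (e.g. for a fixed-length v4i32 the loop below covers the v4i8 and v4i16 |
| // memory types). |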
| if (VT.isInteger()) { |
| MVT InnerVT = VT.changeVectorElementType(MVT::i8); |
| while (InnerVT != VT) { |
| setTruncStoreAction(VT, InnerVT, Custom); |
| setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom); |
| setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom); |
| InnerVT = InnerVT.changeVectorElementType( |
| MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits())); |
| } |
| } |
| |
| // Mark floating-point truncating stores/extending loads as having custom |
| // lowering |
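| // (e.g. for a fixed-length v4f64 the loop below covers the v4f16 and v4f32 |
| // memory types). |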
| if (VT.isFloatingPoint()) { |
| MVT InnerVT = VT.changeVectorElementType(MVT::f16); |
| while (InnerVT != VT) { |
| setTruncStoreAction(VT, InnerVT, Custom); |
| setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom); |
| InnerVT = InnerVT.changeVectorElementType( |
| MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits())); |
| } |
| } |
| |
| // Lower fixed length vector operations to scalable equivalents. |
| setOperationAction(ISD::ABS, VT, Custom); |
| setOperationAction(ISD::ADD, VT, Custom); |
| setOperationAction(ISD::AND, VT, Custom); |
| setOperationAction(ISD::ANY_EXTEND, VT, Custom); |
| setOperationAction(ISD::BITCAST, VT, StreamingSVE ? Legal : Custom); |
| setOperationAction(ISD::BITREVERSE, VT, Custom); |
| setOperationAction(ISD::BSWAP, VT, Custom); |
| setOperationAction(ISD::BUILD_VECTOR, VT, Custom); |
| setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); |
| setOperationAction(ISD::CTLZ, VT, Custom); |
| setOperationAction(ISD::CTPOP, VT, Custom); |
| setOperationAction(ISD::CTTZ, VT, Custom); |
| setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::FABS, VT, Custom); |
| setOperationAction(ISD::FADD, VT, Custom); |
| setOperationAction(ISD::FCEIL, VT, Custom); |
| setOperationAction(ISD::FCOPYSIGN, VT, Custom); |
| setOperationAction(ISD::FDIV, VT, Custom); |
| setOperationAction(ISD::FFLOOR, VT, Custom); |
| setOperationAction(ISD::FMA, VT, Custom); |
| setOperationAction(ISD::FMAXIMUM, VT, Custom); |
| setOperationAction(ISD::FMAXNUM, VT, Custom); |
| setOperationAction(ISD::FMINIMUM, VT, Custom); |
| setOperationAction(ISD::FMINNUM, VT, Custom); |
| setOperationAction(ISD::FMUL, VT, Custom); |
| setOperationAction(ISD::FNEARBYINT, VT, Custom); |
| setOperationAction(ISD::FNEG, VT, Custom); |
| setOperationAction(ISD::FP_EXTEND, VT, Custom); |
| setOperationAction(ISD::FP_ROUND, VT, Custom); |
| setOperationAction(ISD::FP_TO_SINT, VT, Custom); |
| setOperationAction(ISD::FP_TO_UINT, VT, Custom); |
| setOperationAction(ISD::FRINT, VT, Custom); |
| setOperationAction(ISD::FROUND, VT, Custom); |
| setOperationAction(ISD::FROUNDEVEN, VT, Custom); |
| setOperationAction(ISD::FSQRT, VT, Custom); |
| setOperationAction(ISD::FSUB, VT, Custom); |
| setOperationAction(ISD::FTRUNC, VT, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::LOAD, VT, StreamingSVE ? Legal : Custom); |
| setOperationAction(ISD::MGATHER, VT, StreamingSVE ? Expand : Custom); |
| setOperationAction(ISD::MLOAD, VT, Custom); |
| setOperationAction(ISD::MSCATTER, VT, StreamingSVE ? Expand : Custom); |
| setOperationAction(ISD::MSTORE, VT, Custom); |
| setOperationAction(ISD::MUL, VT, Custom); |
| setOperationAction(ISD::MULHS, VT, Custom); |
| setOperationAction(ISD::MULHU, VT, Custom); |
| setOperationAction(ISD::OR, VT, Custom); |
| setOperationAction(ISD::SCALAR_TO_VECTOR, VT, StreamingSVE ? Legal : Expand); |
| setOperationAction(ISD::SDIV, VT, Custom); |
| setOperationAction(ISD::SELECT, VT, Custom); |
| setOperationAction(ISD::SETCC, VT, Custom); |
| setOperationAction(ISD::SHL, VT, Custom); |
| setOperationAction(ISD::SIGN_EXTEND, VT, Custom); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom); |
| setOperationAction(ISD::SINT_TO_FP, VT, Custom); |
| setOperationAction(ISD::SMAX, VT, Custom); |
| setOperationAction(ISD::SMIN, VT, Custom); |
| setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); |
| setOperationAction(ISD::SRA, VT, Custom); |
| setOperationAction(ISD::SRL, VT, Custom); |
| setOperationAction(ISD::STORE, VT, StreamingSVE ? Legal : Custom); |
| setOperationAction(ISD::SUB, VT, Custom); |
| setOperationAction(ISD::TRUNCATE, VT, Custom); |
| setOperationAction(ISD::UDIV, VT, Custom); |
| setOperationAction(ISD::UINT_TO_FP, VT, Custom); |
| setOperationAction(ISD::UMAX, VT, Custom); |
| setOperationAction(ISD::UMIN, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_AND, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_OR, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, |
| StreamingSVE ? Expand : Custom); |
| setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); |
| setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); |
| setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); |
| setOperationAction(ISD::VSELECT, VT, Custom); |
| setOperationAction(ISD::XOR, VT, Custom); |
| setOperationAction(ISD::ZERO_EXTEND, VT, Custom); |
| } |
| |
| void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { |
| addRegisterClass(VT, &AArch64::FPR64RegClass); |
| addTypeForNEON(VT); |
| } |
| |
| void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { |
| addRegisterClass(VT, &AArch64::FPR128RegClass); |
| addTypeForNEON(VT); |
| } |
| |
| EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, |
| LLVMContext &C, EVT VT) const { |
| if (!VT.isVector()) |
| return MVT::i32; |
| if (VT.isScalableVector()) |
| return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount()); |
| return VT.changeVectorElementTypeToInteger(); |
| } |
| |
| // isIntImmediate - This method tests to see if the node is a constant |
| // operand. If so, Imm will receive the value. |
| static bool isIntImmediate(const SDNode *N, uint64_t &Imm) { |
| if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) { |
| Imm = C->getZExtValue(); |
| return true; |
| } |
| return false; |
| } |
| |
| // isOpcWithIntImmediate - This method tests to see if the node is a specific |
| // opcode and that it has an immediate integer right operand. |
| // If so, Imm will receive the value. |
| static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, |
| uint64_t &Imm) { |
| return N->getOpcode() == Opc && |
| isIntImmediate(N->getOperand(1).getNode(), Imm); |
| } |
| |
| static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, |
| const APInt &Demanded, |
| TargetLowering::TargetLoweringOpt &TLO, |
| unsigned NewOpc) { |
| uint64_t OldImm = Imm, NewImm, Enc; |
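| // Mask has only the low Size bits set. |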
| uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask; |
| |
| // Return if the immediate is already all zeros, all ones, a bimm32 or a |
| // bimm64. |
| if (Imm == 0 || Imm == Mask || |
| AArch64_AM::isLogicalImmediate(Imm & Mask, Size)) |
| return false; |
| |
| unsigned EltSize = Size; |
| uint64_t DemandedBits = Demanded.getZExtValue(); |
| |
| // Clear bits that are not demanded. |
| Imm &= DemandedBits; |
| |
| while (true) { |
| // The goal here is to set the non-demanded bits in a way that minimizes |
| // the number of switching between 0 and 1. In order to achieve this goal, |
| // we set the non-demanded bits to the value of the preceding demanded bits. |
| // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a |
| // non-demanded bit), we copy bit0 (1) to the least significant 'x', |
| // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'. |
| // The final result is 0b11000011. |
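| // The arithmetic below implements that copy: RotatedImm sets the bottom |
| // bit of each non-demanded run whose preceding demanded bit is zero, so |
| // adding it to the NonDemandedBits mask ripples a carry that clears those |
| // runs, leaving ones only in runs preceded by a demanded one; those ones |
| // are then ORed into Imm (Carry handles a run that wraps past the top bit). |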
| uint64_t NonDemandedBits = ~DemandedBits; |
| uint64_t InvertedImm = ~Imm & DemandedBits; |
| uint64_t RotatedImm = |
| ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) & |
| NonDemandedBits; |
| uint64_t Sum = RotatedImm + NonDemandedBits; |
| bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1)); |
| uint64_t Ones = (Sum + Carry) & NonDemandedBits; |
| NewImm = (Imm | Ones) & Mask; |
| |
| // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate |
| // or all-ones or all-zeros, in which case we can stop searching. Otherwise, |
| // we halve the element size and continue the search. |
| if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask))) |
| break; |
| |
| // We cannot shrink the element size any further if it is already 2 bits. |
| if (EltSize == 2) |
| return false; |
| |
| EltSize /= 2; |
| Mask >>= EltSize; |
| uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize; |
| |
| // Return if there is a mismatch in any of the demanded bits of Imm and Hi. |
| if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0) |
| return false; |
| |
| // Merge the upper and lower halves of Imm and DemandedBits. |
| Imm |= Hi; |
| DemandedBits |= DemandedBitsHi; |
| } |
| |
| ++NumOptimizedImms; |
| |
| // Replicate the element across the register width. |
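| // For example, with a 16-bit element the 16-bit pattern is repeated to |
| // fill the full 32- or 64-bit immediate. |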
| while (EltSize < Size) { |
| NewImm |= NewImm << EltSize; |
| EltSize *= 2; |
| } |
| |
| (void)OldImm; |
| assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 && |
| "demanded bits should never be altered"); |
| assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm"); |
| |
| // Create the new constant immediate node. |
| EVT VT = Op.getValueType(); |
| SDLoc DL(Op); |
| SDValue New; |
| |
| // If the new constant immediate is all-zeros or all-ones, let the target |
| // independent DAG combine optimize this node. |
| if (NewImm == 0 || NewImm == OrigMask) { |
| New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), |
| TLO.DAG.getConstant(NewImm, DL, VT)); |
| // Otherwise, create a machine node so that target independent DAG combine |
| // doesn't undo this optimization. |
| } else { |
| Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); |
| SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); |
| New = SDValue( |
| TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); |
| } |
| |
| return TLO.CombineTo(Op, New); |
| } |
| |
| bool AArch64TargetLowering::targetShrinkDemandedConstant( |
| SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, |
| TargetLoweringOpt &TLO) const { |
| // Delay this optimization to as late as possible. |
| if (!TLO.LegalOps) |
| return false; |
| |
| if (!EnableOptimizeLogicalImm) |
| return false; |
| |
| EVT VT = Op.getValueType(); |
| if (VT.isVector()) |
| return false; |
| |
| unsigned Size = VT.getSizeInBits(); |
| assert((Size == 32 || Size == 64) && |
| "i32 or i64 is expected after legalization."); |
| |
| // Exit early if we demand all bits. |
| if (DemandedBits.popcount() == Size) |
| return false; |
| |
| unsigned NewOpc; |
| switch (Op.getOpcode()) { |
| default: |
| return false; |
| case ISD::AND: |
| NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri; |
| break; |
| case ISD::OR: |
| NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri; |
| break; |
| case ISD::XOR: |
| NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri; |
| break; |
| } |
| ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); |
| if (!C) |
| return false; |
| uint64_t Imm = C->getZExtValue(); |
| return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc); |
| } |
| |
| /// computeKnownBitsForTargetNode - Determine which of the bits specified in |
| /// Mask are known to be either zero or one and return them in Known. |
| void AArch64TargetLowering::computeKnownBitsForTargetNode( |
| const SDValue Op, KnownBits &Known, const APInt &DemandedElts, |
| const SelectionDAG &DAG, unsigned Depth) const { |
| switch (Op.getOpcode()) { |
| default: |
| break; |
| case AArch64ISD::DUP: { |
| SDValue SrcOp = Op.getOperand(0); |
| Known = DAG.computeKnownBits(SrcOp, Depth + 1); |
| if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) { |
| assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() && |
| "Expected DUP implicit truncation"); |
| Known = Known.trunc(Op.getScalarValueSizeInBits()); |
| } |
| break; |
| } |
| case AArch64ISD::CSEL: { |
| KnownBits Known2; |
| Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); |
| Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); |
| Known = Known.intersectWith(Known2); |
| break; |
| } |
| case AArch64ISD::BICi: { |
| // Compute a mask that is zero in the bits cleared by the immediate. |
| uint64_t Mask = |
| ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2)); |
| Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); |
| Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask)); |
| break; |
| } |
| case AArch64ISD::VLSHR: { |
| KnownBits Known2; |
| Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); |
| Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); |
| Known = KnownBits::lshr(Known, Known2); |
| break; |
| } |
| case AArch64ISD::VASHR: { |
| KnownBits Known2; |
| Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); |
| Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); |
| Known = KnownBits::ashr(Known, Known2); |
| break; |
| } |
| case AArch64ISD::VSHL: { |
| KnownBits Known2; |
| Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); |
| Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); |
| Known = KnownBits::shl(Known, Known2); |
| break; |
| } |
| case AArch64ISD::MOVI: { |
| ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(0)); |
| Known = |
| KnownBits::makeConstant(APInt(Known.getBitWidth(), CN->getZExtValue())); |
| break; |
| } |
| case AArch64ISD::LOADgot: |
| case AArch64ISD::ADDlow: { |
| if (!Subtarget->isTargetILP32()) |
| break; |
| // In ILP32 mode all valid pointers are in the low 4GB of the address-space. |
| Known.Zero = APInt::getHighBitsSet(64, 32); |
| break; |
| } |
| case AArch64ISD::ASSERT_ZEXT_BOOL: { |
| Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); |
| Known.Zero |= APInt(Known.getBitWidth(), 0xFE); |
| break; |
| } |
| case ISD::INTRINSIC_W_CHAIN: { |
| ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); |
| Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); |
| switch (IntID) { |
| default: return; |
| case Intrinsic::aarch64_ldaxr: |
| case Intrinsic::aarch64_ldxr: { |
| unsigned BitWidth = Known.getBitWidth(); |
| EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); |
| unsigned MemBits = VT.getScalarSizeInBits(); |
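| // The exclusive load zero-extends the loaded value to the register width, |
| // so bits above the memory width are known to be zero. |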
| Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); |
| return; |
| } |
| } |
| break; |
| } |
| case ISD::INTRINSIC_WO_CHAIN: |
| case ISD::INTRINSIC_VOID: { |
| unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); |
| switch (IntNo) { |
| default: |
| break; |
| case Intrinsic::aarch64_neon_uaddlv: { |
| MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); |
| unsigned BitWidth = Known.getBitWidth(); |
| if (VT == MVT::v8i8 || VT == MVT::v16i8) { |
| unsigned Bound = (VT == MVT::v8i8) ? 11 : 12; |
| assert(BitWidth >= Bound && "Unexpected width!"); |
| APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - Bound); |
| Known.Zero |= Mask; |
| } |
| break; |
| } |
| case Intrinsic::aarch64_neon_umaxv: |
| case Intrinsic::aarch64_neon_uminv: { |
| // Figure out the datatype of the vector operand. The UMINV instruction |
| // will zero extend the result, so we can mark as known zero all the |
| // bits above the element datatype. 32-bit elements or larger don't need |
| // this, as those are legal types and will be handled by isel directly. |
| MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); |
| unsigned BitWidth = Known.getBitWidth(); |
| if (VT == MVT::v8i8 || VT == MVT::v16i8) { |
| assert(BitWidth >= 8 && "Unexpected width!"); |
| APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); |
| Known.Zero |= Mask; |
| } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { |
| assert(BitWidth >= 16 && "Unexpected width!"); |
| APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); |
| Known.Zero |= Mask; |
| } |
| break; |
| } |
| } |
| } |
| } |
| } |
| |
| unsigned AArch64TargetLowering::ComputeNumSignBitsForTargetNode( |
| SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, |
| unsigned Depth) const { |
| EVT VT = Op.getValueType(); |
| unsigned VTBits = VT.getScalarSizeInBits(); |
| unsigned Opcode = Op.getOpcode(); |
| switch (Opcode) { |
| case AArch64ISD::CMEQ: |
| case AArch64ISD::CMGE: |
| case AArch64ISD::CMGT: |
| case AArch64ISD::CMHI: |
| case AArch64ISD::CMHS: |
| case AArch64ISD::FCMEQ: |
| case AArch64ISD::FCMGE: |
| case AArch64ISD::FCMGT: |
| case AArch64ISD::CMEQz: |
| case AArch64ISD::CMGEz: |
| case AArch64ISD::CMGTz: |
| case AArch64ISD::CMLEz: |
| case AArch64ISD::CMLTz: |
| case AArch64ISD::FCMEQz: |
| case AArch64ISD::FCMGEz: |
| case AArch64ISD::FCMGTz: |
| case AArch64ISD::FCMLEz: |
| case AArch64ISD::FCMLTz: |
| // Compares return either 0 or all-ones |
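| // i.e. the result consists entirely of copies of the sign bit. |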
| return VTBits; |
| } |
| |
| return 1; |
| } |
| |
| MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, |
| EVT) const { |
| return MVT::i64; |
| } |
| |
| bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( |
| EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, |
| unsigned *Fast) const { |
| if (Subtarget->requiresStrictAlign()) |
| return false; |
| |
| if (Fast) { |
| // Some CPUs are fine with unaligned stores except for 128-bit ones. |
| *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 || |
| // See comments in performSTORECombine() for more details about |
| // these conditions. |
| |
| // Code that uses clang vector extensions can mark that it |
| // wants unaligned accesses to be treated as fast by |
| // underspecifying alignment to be 1 or 2. |
| Alignment <= 2 || |
| |
| // Disregard v2i64. Memcpy lowering produces those and splitting |
| // them regresses performance on micro-benchmarks and olden/bh. |
| VT == MVT::v2i64; |
| } |
| return true; |
| } |
| |
| // Same as above but handling LLTs instead. |
| bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( |
| LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, |
| unsigned *Fast) const { |
| if (Subtarget->requiresStrictAlign()) |
| return false; |
| |
| if (Fast) { |
| // Some CPUs are fine with unaligned stores except for 128-bit ones. |
| *Fast = !Subtarget->isMisaligned128StoreSlow() || |
| Ty.getSizeInBytes() != 16 || |
| // See comments in performSTORECombine() for more details about |
| // these conditions. |
| |
| // Code that uses clang vector extensions can mark that it |
| // wants unaligned accesses to be treated as fast by |
| // underspecifying alignment to be 1 or 2. |
| Alignment <= 2 || |
| |
| // Disregard v2i64. Memcpy lowering produces those and splitting |
| // them regresses performance on micro-benchmarks and olden/bh. |
| Ty == LLT::fixed_vector(2, 64); |
| } |
| return true; |
| } |
| |
| FastISel * |
| AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, |
| const TargetLibraryInfo *libInfo) const { |
| return AArch64::createFastISel(funcInfo, libInfo); |
| } |
| |
| const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { |
| #define MAKE_CASE(V) \ |
| case V: \ |
| return #V; |
| switch ((AArch64ISD::NodeType)Opcode) { |
| case AArch64ISD::FIRST_NUMBER: |
| break; |
| MAKE_CASE(AArch64ISD::SMSTART) |
| MAKE_CASE(AArch64ISD::SMSTOP) |
| MAKE_CASE(AArch64ISD::RESTORE_ZA) |
| MAKE_CASE(AArch64ISD::CALL) |
| MAKE_CASE(AArch64ISD::ADRP) |
| MAKE_CASE(AArch64ISD::ADR) |
| MAKE_CASE(AArch64ISD::ADDlow) |
| MAKE_CASE(AArch64ISD::LOADgot) |
| MAKE_CASE(AArch64ISD::RET_GLUE) |
| MAKE_CASE(AArch64ISD::BRCOND) |
| MAKE_CASE(AArch64ISD::CSEL) |
| MAKE_CASE(AArch64ISD::CSINV) |
| MAKE_CASE(AArch64ISD::CSNEG) |
| MAKE_CASE(AArch64ISD::CSINC) |
| MAKE_CASE(AArch64ISD::THREAD_POINTER) |
| MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) |
| MAKE_CASE(AArch64ISD::ABDS_PRED) |
| MAKE_CASE(AArch64ISD::ABDU_PRED) |
| MAKE_CASE(AArch64ISD::HADDS_PRED) |
| MAKE_CASE(AArch64ISD::HADDU_PRED) |
| MAKE_CASE(AArch64ISD::MUL_PRED) |
| MAKE_CASE(AArch64ISD::MULHS_PRED) |
| MAKE_CASE(AArch64ISD::MULHU_PRED) |
| MAKE_CASE(AArch64ISD::RHADDS_PRED) |
| MAKE_CASE(AArch64ISD::RHADDU_PRED) |
| MAKE_CASE(AArch64ISD::SDIV_PRED) |
| MAKE_CASE(AArch64ISD::SHL_PRED) |
| MAKE_CASE(AArch64ISD::SMAX_PRED) |
| MAKE_CASE(AArch64ISD::SMIN_PRED) |
| MAKE_CASE(AArch64ISD::SRA_PRED) |
| MAKE_CASE(AArch64ISD::SRL_PRED) |
| MAKE_CASE(AArch64ISD::UDIV_PRED) |
| MAKE_CASE(AArch64ISD::UMAX_PRED) |
| MAKE_CASE(AArch64ISD::UMIN_PRED) |
| MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1) |
| MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::ADC) |
| MAKE_CASE(AArch64ISD::SBC) |
| MAKE_CASE(AArch64ISD::ADDS) |
| MAKE_CASE(AArch64ISD::SUBS) |
| MAKE_CASE(AArch64ISD::ADCS) |
| MAKE_CASE(AArch64ISD::SBCS) |
| MAKE_CASE(AArch64ISD::ANDS) |
| MAKE_CASE(AArch64ISD::CCMP) |
| MAKE_CASE(AArch64ISD::CCMN) |
| MAKE_CASE(AArch64ISD::FCCMP) |
| MAKE_CASE(AArch64ISD::FCMP) |
| MAKE_CASE(AArch64ISD::STRICT_FCMP) |
| MAKE_CASE(AArch64ISD::STRICT_FCMPE) |
| MAKE_CASE(AArch64ISD::DUP) |
| MAKE_CASE(AArch64ISD::DUPLANE8) |
| MAKE_CASE(AArch64ISD::DUPLANE16) |
| MAKE_CASE(AArch64ISD::DUPLANE32) |
| MAKE_CASE(AArch64ISD::DUPLANE64) |
| MAKE_CASE(AArch64ISD::DUPLANE128) |
| MAKE_CASE(AArch64ISD::MOVI) |
| MAKE_CASE(AArch64ISD::MOVIshift) |
| MAKE_CASE(AArch64ISD::MOVIedit) |
| MAKE_CASE(AArch64ISD::MOVImsl) |
| MAKE_CASE(AArch64ISD::FMOV) |
| MAKE_CASE(AArch64ISD::MVNIshift) |
| MAKE_CASE(AArch64ISD::MVNImsl) |
| MAKE_CASE(AArch64ISD::BICi) |
| MAKE_CASE(AArch64ISD::ORRi) |
| MAKE_CASE(AArch64ISD::BSP) |
| MAKE_CASE(AArch64ISD::ZIP1) |
| MAKE_CASE(AArch64ISD::ZIP2) |
| MAKE_CASE(AArch64ISD::UZP1) |
| MAKE_CASE(AArch64ISD::UZP2) |
| MAKE_CASE(AArch64ISD::TRN1) |
| MAKE_CASE(AArch64ISD::TRN2) |
| MAKE_CASE(AArch64ISD::REV16) |
| MAKE_CASE(AArch64ISD::REV32) |
| MAKE_CASE(AArch64ISD::REV64) |
| MAKE_CASE(AArch64ISD::EXT) |
| MAKE_CASE(AArch64ISD::SPLICE) |
| MAKE_CASE(AArch64ISD::VSHL) |
| MAKE_CASE(AArch64ISD::VLSHR) |
| MAKE_CASE(AArch64ISD::VASHR) |
| MAKE_CASE(AArch64ISD::VSLI) |
| MAKE_CASE(AArch64ISD::VSRI) |
| MAKE_CASE(AArch64ISD::CMEQ) |
| MAKE_CASE(AArch64ISD::CMGE) |
| MAKE_CASE(AArch64ISD::CMGT) |
| MAKE_CASE(AArch64ISD::CMHI) |
| MAKE_CASE(AArch64ISD::CMHS) |
| MAKE_CASE(AArch64ISD::FCMEQ) |
| MAKE_CASE(AArch64ISD::FCMGE) |
| MAKE_CASE(AArch64ISD::FCMGT) |
| MAKE_CASE(AArch64ISD::CMEQz) |
| MAKE_CASE(AArch64ISD::CMGEz) |
| MAKE_CASE(AArch64ISD::CMGTz) |
| MAKE_CASE(AArch64ISD::CMLEz) |
| MAKE_CASE(AArch64ISD::CMLTz) |
| MAKE_CASE(AArch64ISD::FCMEQz) |
| MAKE_CASE(AArch64ISD::FCMGEz) |
| MAKE_CASE(AArch64ISD::FCMGTz) |
| MAKE_CASE(AArch64ISD::FCMLEz) |
| MAKE_CASE(AArch64ISD::FCMLTz) |
| MAKE_CASE(AArch64ISD::SADDV) |
| MAKE_CASE(AArch64ISD::UADDV) |
| MAKE_CASE(AArch64ISD::UADDLV) |
| MAKE_CASE(AArch64ISD::SDOT) |
| MAKE_CASE(AArch64ISD::UDOT) |
| MAKE_CASE(AArch64ISD::SMINV) |
| MAKE_CASE(AArch64ISD::UMINV) |
| MAKE_CASE(AArch64ISD::SMAXV) |
| MAKE_CASE(AArch64ISD::UMAXV) |
| MAKE_CASE(AArch64ISD::SADDV_PRED) |
| MAKE_CASE(AArch64ISD::UADDV_PRED) |
| MAKE_CASE(AArch64ISD::SMAXV_PRED) |
| MAKE_CASE(AArch64ISD::UMAXV_PRED) |
| MAKE_CASE(AArch64ISD::SMINV_PRED) |
| MAKE_CASE(AArch64ISD::UMINV_PRED) |
| MAKE_CASE(AArch64ISD::ORV_PRED) |
| MAKE_CASE(AArch64ISD::EORV_PRED) |
| MAKE_CASE(AArch64ISD::ANDV_PRED) |
| MAKE_CASE(AArch64ISD::CLASTA_N) |
| MAKE_CASE(AArch64ISD::CLASTB_N) |
| MAKE_CASE(AArch64ISD::LASTA) |
| MAKE_CASE(AArch64ISD::LASTB) |
| MAKE_CASE(AArch64ISD::REINTERPRET_CAST) |
| MAKE_CASE(AArch64ISD::LS64_BUILD) |
| MAKE_CASE(AArch64ISD::LS64_EXTRACT) |
| MAKE_CASE(AArch64ISD::TBL) |
| MAKE_CASE(AArch64ISD::FADD_PRED) |
| MAKE_CASE(AArch64ISD::FADDA_PRED) |
| MAKE_CASE(AArch64ISD::FADDV_PRED) |
| MAKE_CASE(AArch64ISD::FDIV_PRED) |
| MAKE_CASE(AArch64ISD::FMA_PRED) |
| MAKE_CASE(AArch64ISD::FMAX_PRED) |
| MAKE_CASE(AArch64ISD::FMAXV_PRED) |
| MAKE_CASE(AArch64ISD::FMAXNM_PRED) |
| MAKE_CASE(AArch64ISD::FMAXNMV_PRED) |
| MAKE_CASE(AArch64ISD::FMIN_PRED) |
| MAKE_CASE(AArch64ISD::FMINV_PRED) |
| MAKE_CASE(AArch64ISD::FMINNM_PRED) |
| MAKE_CASE(AArch64ISD::FMINNMV_PRED) |
| MAKE_CASE(AArch64ISD::FMUL_PRED) |
| MAKE_CASE(AArch64ISD::FSUB_PRED) |
| MAKE_CASE(AArch64ISD::RDSVL) |
| MAKE_CASE(AArch64ISD::BIC) |
| MAKE_CASE(AArch64ISD::BIT) |
| MAKE_CASE(AArch64ISD::CBZ) |
| MAKE_CASE(AArch64ISD::CBNZ) |
| MAKE_CASE(AArch64ISD::TBZ) |
| MAKE_CASE(AArch64ISD::TBNZ) |
| MAKE_CASE(AArch64ISD::TC_RETURN) |
| MAKE_CASE(AArch64ISD::PREFETCH) |
| MAKE_CASE(AArch64ISD::SITOF) |
| MAKE_CASE(AArch64ISD::UITOF) |
| MAKE_CASE(AArch64ISD::NVCAST) |
| MAKE_CASE(AArch64ISD::MRS) |
| MAKE_CASE(AArch64ISD::SQSHL_I) |
| MAKE_CASE(AArch64ISD::UQSHL_I) |
| MAKE_CASE(AArch64ISD::SRSHR_I) |
| MAKE_CASE(AArch64ISD::URSHR_I) |
| MAKE_CASE(AArch64ISD::SQSHLU_I) |
| MAKE_CASE(AArch64ISD::WrapperLarge) |
| MAKE_CASE(AArch64ISD::LD2post) |
| MAKE_CASE(AArch64ISD::LD3post) |
| MAKE_CASE(AArch64ISD::LD4post) |
| MAKE_CASE(AArch64ISD::ST2post) |
| MAKE_CASE(AArch64ISD::ST3post) |
| MAKE_CASE(AArch64ISD::ST4post) |
| MAKE_CASE(AArch64ISD::LD1x2post) |
| MAKE_CASE(AArch64ISD::LD1x3post) |
| MAKE_CASE(AArch64ISD::LD1x4post) |
| MAKE_CASE(AArch64ISD::ST1x2post) |
| MAKE_CASE(AArch64ISD::ST1x3post) |
| MAKE_CASE(AArch64ISD::ST1x4post) |
| MAKE_CASE(AArch64ISD::LD1DUPpost) |
| MAKE_CASE(AArch64ISD::LD2DUPpost) |
| MAKE_CASE(AArch64ISD::LD3DUPpost) |
| MAKE_CASE(AArch64ISD::LD4DUPpost) |
| MAKE_CASE(AArch64ISD::LD1LANEpost) |
| MAKE_CASE(AArch64ISD::LD2LANEpost) |
| MAKE_CASE(AArch64ISD::LD3LANEpost) |
| MAKE_CASE(AArch64ISD::LD4LANEpost) |
| MAKE_CASE(AArch64ISD::ST2LANEpost) |
| MAKE_CASE(AArch64ISD::ST3LANEpost) |
| MAKE_CASE(AArch64ISD::ST4LANEpost) |
| MAKE_CASE(AArch64ISD::SMULL) |
| MAKE_CASE(AArch64ISD::UMULL) |
| MAKE_CASE(AArch64ISD::PMULL) |
| MAKE_CASE(AArch64ISD::FRECPE) |
| MAKE_CASE(AArch64ISD::FRECPS) |
| MAKE_CASE(AArch64ISD::FRSQRTE) |
| MAKE_CASE(AArch64ISD::FRSQRTS) |
| MAKE_CASE(AArch64ISD::STG) |
| MAKE_CASE(AArch64ISD::STZG) |
| MAKE_CASE(AArch64ISD::ST2G) |
| MAKE_CASE(AArch64ISD::STZ2G) |
| MAKE_CASE(AArch64ISD::SUNPKHI) |
| MAKE_CASE(AArch64ISD::SUNPKLO) |
| MAKE_CASE(AArch64ISD::UUNPKHI) |
| MAKE_CASE(AArch64ISD::UUNPKLO) |
| MAKE_CASE(AArch64ISD::INSR) |
| MAKE_CASE(AArch64ISD::PTEST) |
| MAKE_CASE(AArch64ISD::PTEST_ANY) |
| MAKE_CASE(AArch64ISD::PTRUE) |
| MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::ST1_PRED) |
| MAKE_CASE(AArch64ISD::SST1_PRED) |
| MAKE_CASE(AArch64ISD::SST1_SCALED_PRED) |
| MAKE_CASE(AArch64ISD::SST1_SXTW_PRED) |
| MAKE_CASE(AArch64ISD::SST1_UXTW_PRED) |
| MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED) |
| MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED) |
| MAKE_CASE(AArch64ISD::SST1_IMM_PRED) |
| MAKE_CASE(AArch64ISD::SSTNT1_PRED) |
| MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED) |
| MAKE_CASE(AArch64ISD::LDP) |
| MAKE_CASE(AArch64ISD::LDIAPP) |
| MAKE_CASE(AArch64ISD::LDNP) |
| MAKE_CASE(AArch64ISD::STP) |
| MAKE_CASE(AArch64ISD::STILP) |
| MAKE_CASE(AArch64ISD::STNP) |
| MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::INDEX_VECTOR) |
| MAKE_CASE(AArch64ISD::ADDP) |
| MAKE_CASE(AArch64ISD::SADDLP) |
| MAKE_CASE(AArch64ISD::UADDLP) |
| MAKE_CASE(AArch64ISD::CALL_RVMARKER) |
| MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL) |
| MAKE_CASE(AArch64ISD::MOPS_MEMSET) |
| MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING) |
| MAKE_CASE(AArch64ISD::MOPS_MEMCOPY) |
| MAKE_CASE(AArch64ISD::MOPS_MEMMOVE) |
| MAKE_CASE(AArch64ISD::CALL_BTI) |
| MAKE_CASE(AArch64ISD::MRRS) |
| MAKE_CASE(AArch64ISD::MSRR) |
| MAKE_CASE(AArch64ISD::RSHRNB_I) |
| } |
| #undef MAKE_CASE |
| return nullptr; |
| } |
| |
| MachineBasicBlock * |
| AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, |
| MachineBasicBlock *MBB) const { |
| // We materialise the F128CSEL pseudo-instruction as some control flow and a |
| // phi node: |
| |
| // OrigBB: |
| // [... previous instrs leading to comparison ...] |
| // b.ne TrueBB |
| // b EndBB |
| // TrueBB: |
| // ; Fallthrough |
| // EndBB: |
| // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] |
| |
| MachineFunction *MF = MBB->getParent(); |
| const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
| const BasicBlock *LLVM_BB = MBB->getBasicBlock(); |
| DebugLoc DL = MI.getDebugLoc(); |
| MachineFunction::iterator It = ++MBB->getIterator(); |
| |
| Register DestReg = MI.getOperand(0).getReg(); |
| Register IfTrueReg = MI.getOperand(1).getReg(); |
| Register IfFalseReg = MI.getOperand(2).getReg(); |
| unsigned CondCode = MI.getOperand(3).getImm(); |
| bool NZCVKilled = MI.getOperand(4).isKill(); |
| |
| MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); |
| MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); |
| MF->insert(It, TrueBB); |
| MF->insert(It, EndBB); |
| |
| // Transfer the rest of the current basic block to EndBB. |
| EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), |
| MBB->end()); |
| EndBB->transferSuccessorsAndUpdatePHIs(MBB); |
| |
| BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); |
| BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); |
| MBB->addSuccessor(TrueBB); |
| MBB->addSuccessor(EndBB); |
| |
| // TrueBB falls through to the end. |
| TrueBB->addSuccessor(EndBB); |
| |
| if (!NZCVKilled) { |
| TrueBB->addLiveIn(AArch64::NZCV); |
| EndBB->addLiveIn(AArch64::NZCV); |
| } |
| |
| BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) |
| .addReg(IfTrueReg) |
| .addMBB(TrueBB) |
| .addReg(IfFalseReg) |
| .addMBB(MBB); |
| |
| MI.eraseFromParent(); |
| return EndBB; |
| } |
| |
| MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( |
| MachineInstr &MI, MachineBasicBlock *BB) const { |
| assert(!isAsynchronousEHPersonality(classifyEHPersonality( |
| BB->getParent()->getFunction().getPersonalityFn())) && |
| "SEH does not use catchret!"); |
| return BB; |
| } |
| |
| MachineBasicBlock * |
| AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg, |
| MachineInstr &MI, |
| MachineBasicBlock *BB) const { |
| const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
| MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); |
| |
| MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); |
| MIB.add(MI.getOperand(1)); // slice index register |
| MIB.add(MI.getOperand(2)); // slice index offset |
| MIB.add(MI.getOperand(3)); // pg |
| MIB.add(MI.getOperand(4)); // base |
| MIB.add(MI.getOperand(5)); // offset |
| |
| MI.eraseFromParent(); // The pseudo is gone now. |
| return BB; |
| } |
| |
| MachineBasicBlock * |
| AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const { |
| const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
| MachineInstrBuilder MIB = |
| BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA)); |
| |
| MIB.addReg(AArch64::ZA, RegState::Define); |
| MIB.add(MI.getOperand(0)); // Vector select register |
| MIB.add(MI.getOperand(1)); // Vector select offset |
| MIB.add(MI.getOperand(2)); // Base |
| MIB.add(MI.getOperand(1)); // Offset, same as vector select offset |
| |
| MI.eraseFromParent(); // The pseudo is gone now. |
| return BB; |
| } |
| |
| MachineBasicBlock * |
| AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg, |
| MachineInstr &MI, |
| MachineBasicBlock *BB, bool HasTile) const { |
| const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
| MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); |
| unsigned StartIdx = 0; |
| |
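| // Tile-based instructions take the selected tile register (BaseReg plus the |
| // immediate tile index) as both a def and a use; array-based instructions |
| // operate on ZA as a whole. |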
| if (HasTile) { |
| MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); |
| MIB.addReg(BaseReg + MI.getOperand(0).getImm()); |
| StartIdx = 1; |
| } else |
| MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg); |
| |
| for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I) |
| MIB.add(MI.getOperand(I)); |
| |
| MI.eraseFromParent(); // The pseudo is gone now. |
| return BB; |
| } |
| |
| MachineBasicBlock * |
| AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const { |
| const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
| MachineInstrBuilder MIB = |
| BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M)); |
| MIB.add(MI.getOperand(0)); // Mask |
| |
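| // Each set bit in the 8-bit mask selects one of the 64-bit tiles ZAD0-ZAD7; |
| // mark every selected tile as implicitly defined by the ZERO instruction. |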
| unsigned Mask = MI.getOperand(0).getImm(); |
| for (unsigned I = 0; I < 8; I++) { |
| if (Mask & (1 << I)) |
| MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine); |
| } |
| |
| MI.eraseFromParent(); // The pseudo is gone now. |
| return BB; |
| } |
| |
| MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( |
| MachineInstr &MI, MachineBasicBlock *BB) const { |
| |
| int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode()); |
| if (SMEOrigInstr != -1) { |
| const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
| uint64_t SMEMatrixType = |
| TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask; |
| switch (SMEMatrixType) { |
| case (AArch64::SMEMatrixArray): |
| return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false); |
| case (AArch64::SMEMatrixTileB): |
| return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true); |
| case (AArch64::SMEMatrixTileH): |
| return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true); |
| case (AArch64::SMEMatrixTileS): |
| return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true); |
| case (AArch64::SMEMatrixTileD): |
| return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true); |
| case (AArch64::SMEMatrixTileQ): |
| return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true); |
| } |
| } |
| |
| switch (MI.getOpcode()) { |
| default: |
| #ifndef NDEBUG |
| MI.dump(); |
| #endif |
| llvm_unreachable("Unexpected instruction for custom inserter!"); |
| |
| case AArch64::F128CSEL: |
| return EmitF128CSEL(MI, BB); |
| case TargetOpcode::STATEPOINT: |
| // STATEPOINT is a pseudo instruction which has no implicit defs/uses, |
| // while the BL call instruction (to which the statepoint is lowered at the |
| // end) has an implicit def of LR. This def is early-clobber as it is |
| // written at the moment of the call, before any use is read. |
| // Add this implicit dead def here as a workaround. |
| MI.addOperand(*MI.getMF(), |
| MachineOperand::CreateReg( |
| AArch64::LR, /*isDef*/ true, |
| /*isImp*/ true, /*isKill*/ false, /*isDead*/ true, |
| /*isUndef*/ false, /*isEarlyClobber*/ true)); |
| [[fallthrough]]; |
| case TargetOpcode::STACKMAP: |
| case TargetOpcode::PATCHPOINT: |
| return emitPatchPoint(MI, BB); |
| |
| case TargetOpcode::PATCHABLE_EVENT_CALL: |
| case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL: |
| return BB; |
| |
| case AArch64::CATCHRET: |
| return EmitLoweredCatchRet(MI, BB); |
| case AArch64::LD1_MXIPXX_H_PSEUDO_B: |
| return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB); |
| case AArch64::LD1_MXIPXX_H_PSEUDO_H: |
| return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB); |
| case AArch64::LD1_MXIPXX_H_PSEUDO_S: |
| return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB); |
| case AArch64::LD1_MXIPXX_H_PSEUDO_D: |
| return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB); |
| case AArch64::LD1_MXIPXX_H_PSEUDO_Q: |
| return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB); |
| case AArch64::LD1_MXIPXX_V_PSEUDO_B: |
| return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB); |
| case AArch64::LD1_MXIPXX_V_PSEUDO_H: |
| return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB); |
| case AArch64::LD1_MXIPXX_V_PSEUDO_S: |
| return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB); |
| case AArch64::LD1_MXIPXX_V_PSEUDO_D: |
| return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB); |
| case AArch64::LD1_MXIPXX_V_PSEUDO_Q: |
| return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB); |
| case AArch64::LDR_ZA_PSEUDO: |
| return EmitFill(MI, BB); |
| case AArch64::ZERO_M_PSEUDO: |
| return EmitZero(MI, BB); |
| } |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // AArch64 Lowering private implementation. |
| //===----------------------------------------------------------------------===// |
| |
| //===----------------------------------------------------------------------===// |
| // Lowering Code |
| //===----------------------------------------------------------------------===// |
| |
| // Forward declarations of SVE fixed length lowering helpers |
| static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT); |
| static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V); |
| static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V); |
| static SDValue convertFixedMaskToScalableVector(SDValue Mask, |
| SelectionDAG &DAG); |
| static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, |
| EVT VT); |
| |
| /// isZerosVector - Check whether SDNode N is a zero-filled vector. |
| static bool isZerosVector(const SDNode *N) { |
| // Look through a bit convert. |
| while (N->getOpcode() == ISD::BITCAST) |
| N = N->getOperand(0).getNode(); |
| |
| if (ISD::isConstantSplatVectorAllZeros(N)) |
| return true; |
| |
| if (N->getOpcode() != AArch64ISD::DUP) |
| return false; |
| |
| auto Opnd0 = N->getOperand(0); |
| return isNullConstant(Opnd0) || isNullFPConstant(Opnd0); |
| } |
| |
| /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 |
| /// CC |
| static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { |
| switch (CC) { |
| default: |
| llvm_unreachable("Unknown condition code!"); |
| case ISD::SETNE: |
| return AArch64CC::NE; |
| case ISD::SETEQ: |
| return AArch64CC::EQ; |
| case ISD::SETGT: |
| return AArch64CC::GT; |
| case ISD::SETGE: |
| return AArch64CC::GE; |
| case ISD::SETLT: |
| return AArch64CC::LT; |
| case ISD::SETLE: |
| return AArch64CC::LE; |
| case ISD::SETUGT: |
| return AArch64CC::HI; |
| case ISD::SETUGE: |
| return AArch64CC::HS; |
| case ISD::SETULT: |
| return AArch64CC::LO; |
| case ISD::SETULE: |
| return AArch64CC::LS; |
| } |
| } |
| |
| /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. |
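| /// Some FP conditions (e.g. ONE, UEQ) cannot be tested with a single AArch64 |
| /// condition code; for those, CondCode2 is set to a second code that must be |
| /// OR'ed with the first, otherwise it is left as AL. |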
| static void changeFPCCToAArch64CC(ISD::CondCode CC, |
| AArch64CC::CondCode &CondCode, |
| AArch64CC::CondCode &CondCode2) { |
| CondCode2 = AArch64CC::AL; |
| switch (CC) { |
| default: |
| llvm_unreachable("Unknown FP condition!"); |
| case ISD::SETEQ: |
| case ISD::SETOEQ: |
| CondCode = AArch64CC::EQ; |
| break; |
| case ISD::SETGT: |
| case ISD::SETOGT: |
| CondCode = AArch64CC::GT; |
| break; |
| case ISD::SETGE: |
| case ISD::SETOGE: |
| CondCode = AArch64CC::GE; |
| break; |
| case ISD::SETOLT: |
| CondCode = AArch64CC::MI; |
| break; |
| case ISD::SETOLE: |
| CondCode = AArch64CC::LS; |
| break; |
| case ISD::SETONE: |
| CondCode = AArch64CC::MI; |
| CondCode2 = AArch64CC::GT; |
| break; |
| case ISD::SETO: |
| CondCode = AArch64CC::VC; |
| break; |
| case ISD::SETUO: |
| CondCode = AArch64CC::VS; |
| break; |
| case ISD::SETUEQ: |
| CondCode = AArch64CC::EQ; |
| CondCode2 = AArch64CC::VS; |
| break; |
| case ISD::SETUGT: |
| CondCode = AArch64CC::HI; |
| break; |
| case ISD::SETUGE: |
| CondCode = AArch64CC::PL; |
| break; |
| case ISD::SETLT: |
| case ISD::SETULT: |
| CondCode = AArch64CC::LT; |
| break; |
| case ISD::SETLE: |
| case ISD::SETULE: |
| CondCode = AArch64CC::LE; |
| break; |
| case ISD::SETNE: |
| case ISD::SETUNE: |
| CondCode = AArch64CC::NE; |
| break; |
| } |
| } |
| |
| /// Convert a DAG fp condition code to an AArch64 CC. |
| /// This differs from changeFPCCToAArch64CC in that it returns cond codes that |
| /// should be AND'ed instead of OR'ed. |
| static void changeFPCCToANDAArch64CC(ISD::CondCode CC, |
| AArch64CC::CondCode &CondCode, |
| AArch64CC::CondCode &CondCode2) { |
| CondCode2 = AArch64CC::AL; |
| switch (CC) { |
| default: |
| changeFPCCToAArch64CC(CC, CondCode, CondCode2); |
| assert(CondCode2 == AArch64CC::AL); |
| break; |
| case ISD::SETONE: |
| // (a one b) |
| // == ((a olt b) || (a ogt b)) |
| // == ((a ord b) && (a une b)) |
| CondCode = AArch64CC::VC; |
| CondCode2 = AArch64CC::NE; |
| break; |
| case ISD::SETUEQ: |
| // (a ueq b) |
| // == ((a uno b) || (a oeq b)) |
| // == ((a ule b) && (a uge b)) |
| CondCode = AArch64CC::PL; |
| CondCode2 = AArch64CC::LE; |
| break; |
| } |
| } |
| |
| /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 |
| /// CC usable with the vector instructions. Fewer operations are available |
| /// without a real NZCV register, so we have to use less efficient combinations |
| /// to get the same effect. |
| static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, |
| AArch64CC::CondCode &CondCode, |
| AArch64CC::CondCode &CondCode2, |
| bool &Invert) { |
| Invert = false; |
| switch (CC) { |
| default: |
| // Mostly the scalar mappings work fine. |
| changeFPCCToAArch64CC(CC, CondCode, CondCode2); |
| break; |
| case ISD::SETUO: |
| Invert = true; |
| [[fallthrough]]; |
| case ISD::SETO: |
| CondCode = AArch64CC::MI; |
| CondCode2 = AArch64CC::GE; |
| break; |
| case ISD::SETUEQ: |
| case ISD::SETULT: |
| case ISD::SETULE: |
| case ISD::SETUGT: |
| case ISD::SETUGE: |
| // All of the compare-mask comparisons are ordered, but we can switch |
| // between the two by a double inversion. E.g. ULE == !OGT. |
| Invert = true; |
| changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32), |
| CondCode, CondCode2); |
| break; |
| } |
| } |
| |
| static bool isLegalArithImmed(uint64_t C) { |
| // Matches AArch64DAGToDAGISel::SelectArithImmed(). |
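| // A legal arithmetic immediate is an unsigned 12-bit value, optionally |
| // shifted left by 12 bits; e.g. 0xFFF and 0x123000 are legal, 0x1001 is not. |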
| bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); |
| LLVM_DEBUG(dbgs() << "Is imm " << C |
| << " legal: " << (IsLegal ? "yes\n" : "no\n")); |
| return IsLegal; |
| } |
| |
| // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on |
| // the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags |
| // can be set differently by this operation. It comes down to whether |
| // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are equal |
| // then everything is fine; if not then the optimization is wrong. Thus general |
| // comparisons are only valid if op2 != 0. |
| // |
| // So, finally, the only LLVM-native comparisons that don't mention C and V |
| // are SETEQ and SETNE. They're the only ones we can safely use CMN for in |
| // the absence of information about op2. |
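| // |
| // For example, with op2 == 0 an unsigned comparison would go wrong: |
| // "cmp x0, #0" always sets C (no borrow), while "cmn x0, #0" never sets C |
| // (no carry out of x0 + 0), so a HS/LO test would give different answers; |
| // the Z flag used by SETEQ/SETNE is unaffected. |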
| static bool isCMN(SDValue Op, ISD::CondCode CC) { |
| return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) && |
| (CC == ISD::SETEQ || CC == ISD::SETNE); |
| } |
| |
| static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, |
| SelectionDAG &DAG, SDValue Chain, |
| bool IsSignaling) { |
| EVT VT = LHS.getValueType(); |
| assert(VT != MVT::f128); |
| |
| const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); |
| |
| if (VT == MVT::f16 && !FullFP16) { |
| LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other}, |
| {Chain, LHS}); |
| RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other}, |
| {LHS.getValue(1), RHS}); |
| Chain = RHS.getValue(1); |
| VT = MVT::f32; |
| } |
| unsigned Opcode = |
| IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP; |
| return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS}); |
| } |
| |
| static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, |
| const SDLoc &dl, SelectionDAG &DAG) { |
| EVT VT = LHS.getValueType(); |
| const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); |
| |
| if (VT.isFloatingPoint()) { |
| assert(VT != MVT::f128); |
| if (VT == MVT::f16 && !FullFP16) { |
| LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); |
| RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); |
| VT = MVT::f32; |
| } |
| return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); |
| } |
| |
| // The CMP instruction is just an alias for SUBS, and representing it as |
| // SUBS means that it's possible to get CSE with subtract operations. |
| // A later phase can perform the optimization of setting the destination |
| // register to WZR/XZR if it ends up being unused. |
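| // For example, "subs wzr, w0, w1" is exactly "cmp w0, w1". |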
| unsigned Opcode = AArch64ISD::SUBS; |
| |
| if (isCMN(RHS, CC)) { |
| // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction? |
| Opcode = AArch64ISD::ADDS; |
| RHS = RHS.getOperand(1); |
| } else if (isCMN(LHS, CC)) { |
| // As we are looking for EQ/NE compares, the operands can be commuted; can |
| // we combine a (CMP (sub 0, op1), op2) into a CMN instruction? |
| Opcode = AArch64ISD::ADDS; |
| LHS = LHS.getOperand(1); |
| } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) { |
| if (LHS.getOpcode() == ISD::AND) { |
| // Similarly, (CMP (and X, Y), 0) can be implemented with a TST |
| // (a.k.a. ANDS) except that the flags are only guaranteed to work for one |
| // of the signed comparisons. |
| const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl, |
| DAG.getVTList(VT, MVT_CC), |
| LHS.getOperand(0), |
| LHS.getOperand(1)); |
| // Replace all users of (and X, Y) with newly generated (ands X, Y) |
| DAG.ReplaceAllUsesWith(LHS, ANDSNode); |
| return ANDSNode.getValue(1); |
| } else if (LHS.getOpcode() == AArch64ISD::ANDS) { |
| // Use result of ANDS |
| return LHS.getValue(1); |
| } |
| } |
| |
| return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) |
| .getValue(1); |
| } |
| |
| /// \defgroup AArch64CCMP CMP;CCMP matching |
| /// |
| /// These functions deal with the formation of CMP;CCMP;... sequences. |
| /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of |
| /// a comparison. They set the NZCV flags to a predefined value if their |
| /// predicate is false. This allows us to express arbitrary conjunctions, for |
| /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))" |
| /// expressed as: |
| /// cmp A |
| /// ccmp B, inv(CB), CA |
| /// check for CB flags |
| /// |
| /// This naturally lets us implement chains of AND operations with SETCC |
| /// operands. And we can even implement some other situations by transforming |
| /// them: |
| /// - We can implement (NEG SETCC) i.e. negating a single comparison by |
| /// negating the flags used in a CCMP/FCCMP operation. |
| /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations |
| /// by negating the flags we test for afterwards. i.e. |
| /// NEG (CMP CCMP CCCMP ...) can be implemented. |
| /// - Note that we can only ever negate all previously processed results. |
| /// What we can not implement by flipping the flags to test is a negation |
| /// of two sub-trees (because the negation affects all sub-trees emitted so |
| /// far, so the 2nd sub-tree we emit would also affect the first). |
| /// With those tools we can implement some OR operations: |
| /// - (OR (SETCC A) (SETCC B)) can be implemented via: |
| /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B))) |
| /// - After transforming OR to NEG/AND combinations we may be able to use NEG |
| /// elimination rules from earlier to implement the whole thing as a |
| /// CCMP/FCCMP chain. |
| /// |
| /// As a complete example: |
| /// or (or (setCA (cmp A)) (setCB (cmp B))) |
| /// (and (setCC (cmp C)) (setCD (cmp D))) |
| /// can be reassociated to: |
| /// or (and (setCC (cmp C)) (setCD (cmp D))) |
| /// (or (setCA (cmp A)) (setCB (cmp B))) |
| /// can be transformed to: |
| /// not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) |
| /// (and (not (setCA (cmp A))) (not (setCB (cmp B))))) |
| /// which can be implemented as: |
| /// cmp C |
| /// ccmp D, inv(CD), CC |
| /// ccmp A, CA, inv(CD) |
| /// ccmp B, CB, inv(CA) |
| /// check for CB flags |
| /// |
| /// A counterexample is "or (and A B) (and C D)" which translates to |
| /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we |
| /// can only implement 1 of the inner (not) operations, but not both! |
| /// @{ |
| |
| /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. |
| static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, |
| ISD::CondCode CC, SDValue CCOp, |
| AArch64CC::CondCode Predicate, |
| AArch64CC::CondCode OutCC, |
| const SDLoc &DL, SelectionDAG &DAG) { |
| unsigned Opcode = 0; |
| const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); |
| |
| if (LHS.getValueType().isFloatingPoint()) { |
| assert(LHS.getValueType() != MVT::f128); |
| if (LHS.getValueType() == MVT::f16 && !FullFP16) { |
| LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS); |
| RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS); |
| } |
| Opcode = AArch64ISD::FCCMP; |
| } else if (RHS.getOpcode() == ISD::SUB) { |
| SDValue SubOp0 = RHS.getOperand(0); |
| if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { |
| // See emitComparison() on why we can only do this for SETEQ and SETNE. |
| Opcode = AArch64ISD::CCMN; |
| RHS = RHS.getOperand(1); |
| } |
| } |
| if (Opcode == 0) |
| Opcode = AArch64ISD::CCMP; |
| |
| SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC); |
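| // If Predicate does not hold, the conditional compare sets NZCV to the |
| // immediate below, which is chosen to satisfy the inverse of OutCC so that |
| // the final OutCC test then fails. |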
| AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); |
| unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); |
| SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); |
| return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); |
| } |
| |
| /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be |
| /// expressed as a conjunction. See \ref AArch64CCMP. |
| /// \param CanNegate Set to true if we can negate the whole sub-tree just by |
| /// changing the conditions on the SETCC tests. |
| /// (this means we can call emitConjunctionRec() with |
| /// Negate==true on this sub-tree) |
| /// \param MustBeFirst Set to true if this subtree needs to be negated and we |
| /// cannot do the negation naturally. We are required to |
| /// emit the subtree first in this case. |
| /// \param WillNegate Is true if we are called when the result of this |
| /// subexpression must be negated. This happens when the |
| /// outer expression is an OR. We can use this fact to know |
| /// that we have a double negation (or (or ...) ...) that |
| /// can be implemented for free. |
| static bool canEmitConjunction(const SDValue Val, bool &CanNegate, |
| bool &MustBeFirst, bool WillNegate, |
| unsigned Depth = 0) { |
| if (!Val.hasOneUse()) |
| return false; |
| unsigned Opcode = Val->getOpcode(); |
| if (Opcode == ISD::SETCC) { |
| if (Val->getOperand(0).getValueType() == MVT::f128) |
| return false; |
| CanNegate = true; |
| MustBeFirst = false; |
| return true; |
| } |
| // Protect against exponential runtime and stack overflow. |
| if (Depth > 6) |
| return false; |
| if (Opcode == ISD::AND || Opcode == ISD::OR) { |
| bool IsOR = Opcode == ISD::OR; |
| SDValue O0 = Val->getOperand(0); |
| SDValue O1 = Val->getOperand(1); |
| bool CanNegateL; |
| bool MustBeFirstL; |
| if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1)) |
| return false; |
| bool CanNegateR; |
| bool MustBeFirstR; |
| if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1)) |
| return false; |
| |
| if (MustBeFirstL && MustBeFirstR) |
| return false; |
| |
| if (IsOR) { |
| // For an OR expression we need to be able to naturally negate at least |
| // one side or we cannot do the transformation at all. |
| if (!CanNegateL && !CanNegateR) |
| return false; |
| // If the result of the OR will be negated and we can naturally negate |
| // the leaves, then this sub-tree as a whole negates naturally. |
| CanNegate = WillNegate && CanNegateL && CanNegateR; |
| // If we cannot naturally negate the whole sub-tree, then this must be |
| // emitted first. |
| MustBeFirst = !CanNegate; |
| } else { |
| assert(Opcode == ISD::AND && "Must be OR or AND"); |
| // We cannot naturally negate an AND operation. |
| CanNegate = false; |
| MustBeFirst = MustBeFirstL || MustBeFirstR; |
| } |
| return true; |
| } |
| return false; |
| } |
| |
| /// Emit a conjunction or disjunction tree with the CMP/FCMP followed by a |
| /// chain of CCMP/FCCMP ops. See @ref AArch64CCMP. |
| /// Tries to transform the given i1 producing node @p Val into a series of |
| /// compare and conditional compare operations. @returns an NZCV flags |
| /// producing node and sets @p OutCC to the flags that should be tested, or |
| /// returns SDValue() if the transformation was not possible. |
| /// \p Negate is true if we want this sub-tree to be negated just by changing |
| /// SETCC conditions. |
| static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, |
| AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, |
| AArch64CC::CondCode Predicate) { |
| // We're at a tree leaf, produce a conditional comparison operation. |
| unsigned Opcode = Val->getOpcode(); |
| if (Opcode == ISD::SETCC) { |
| SDValue LHS = Val->getOperand(0); |
| SDValue RHS = Val->getOperand(1); |
| ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get(); |
| bool isInteger = LHS.getValueType().isInteger(); |
| if (Negate) |
| CC = getSetCCInverse(CC, LHS.getValueType()); |
| SDLoc DL(Val); |
| // Determine OutCC and handle FP special case. |
| if (isInteger) { |
| OutCC = changeIntCCToAArch64CC(CC); |
| } else { |
| assert(LHS.getValueType().isFloatingPoint()); |
| AArch64CC::CondCode ExtraCC; |
| changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); |
| // Some floating point conditions can't be tested with a single condition |
| // code. Construct an additional comparison in this case. |
| if (ExtraCC != AArch64CC::AL) { |
| SDValue ExtraCmp; |
| if (!CCOp.getNode()) |
| ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); |
| else |
| ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, |
| ExtraCC, DL, DAG); |
| CCOp = ExtraCmp; |
| Predicate = ExtraCC; |
| } |
| } |
| |
| // Produce a normal comparison if we are first in the chain |
| if (!CCOp) |
| return emitComparison(LHS, RHS, CC, DL, DAG); |
| // Otherwise produce a ccmp. |
| return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, |
| DAG); |
| } |
| assert(Val->hasOneUse() && "Valid conjunction/disjunction tree"); |
| |
| bool IsOR = Opcode == ISD::OR; |
| |
| SDValue LHS = Val->getOperand(0); |
| bool CanNegateL; |
| bool MustBeFirstL; |
| bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR); |
| assert(ValidL && "Valid conjunction/disjunction tree"); |
| (void)ValidL; |
| |
| SDValue RHS = Val->getOperand(1); |
| bool CanNegateR; |
| bool MustBeFirstR; |
| bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR); |
| assert(ValidR && "Valid conjunction/disjunction tree"); |
| (void)ValidR; |
| |
| // Swap sub-tree that must come first to the right side. |
| if (MustBeFirstL) { |
| assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); |
| std::swap(LHS, RHS); |
| std::swap(CanNegateL, CanNegateR); |
| std::swap(MustBeFirstL, MustBeFirstR); |
| } |
| |
| bool NegateR; |
| bool NegateAfterR; |
| bool NegateL; |
| bool NegateAfterAll; |
| if (Opcode == ISD::OR) { |
| // Swap the sub-tree that we can negate naturally to the left. |
| if (!CanNegateL) { |
| assert(CanNegateR && "at least one side must be negatable"); |
| assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); |
| assert(!Negate); |
| std::swap(LHS, RHS); |
| NegateR = false; |
| NegateAfterR = true; |
| } else { |
| // Negate the right sub-tree if possible, otherwise negate its result. |
| NegateR = CanNegateR; |
| NegateAfterR = !CanNegateR; |
| } |
| NegateL = true; |
| NegateAfterAll = !Negate; |
| } else { |
| assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree"); |
| assert(!Negate && "Valid conjunction/disjunction tree"); |
| |
| NegateL = false; |
| NegateR = false; |
| NegateAfterR = false; |
| NegateAfterAll = false; |
| } |
| |
| // Emit sub-trees. |
| AArch64CC::CondCode RHSCC; |
| SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate); |
| if (NegateAfterR) |
| RHSCC = AArch64CC::getInvertedCondCode(RHSCC); |
| SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC); |
| if (NegateAfterAll) |
| OutCC = AArch64CC::getInvertedCondCode(OutCC); |
| return CmpL; |
| } |
| |
| /// Emit expression as a conjunction (a series of CCMP/FCCMP ops). |
| /// In some cases this is even possible with OR operations in the expression. |
| /// See \ref AArch64CCMP. |
| /// \see emitConjunctionRec(). |
| static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, |
| AArch64CC::CondCode &OutCC) { |
| bool DummyCanNegate; |
| bool DummyMustBeFirst; |
| if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false)) |
| return SDValue(); |
| |
| return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL); |
| } |
| |
| /// @} |
| |
| /// Returns how profitable it is to fold a comparison's operand's shift and/or |
| /// extension operations. |
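| /// For example, (and x, 0xff) can be folded as the extended-register form of |
| /// a compare (cmp w1, w0, uxtb), and a small shift (<= 4) on top of such an |
| /// extend folds as well, so those patterns score higher than a plain shift. |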
| static unsigned getCmpOperandFoldingProfit(SDValue Op) { |
| auto isSupportedExtend = [&](SDValue V) { |
| if (V.getOpcode() == ISD::SIGN_EXTEND_INREG) |
| return true; |
| |
| if (V.getOpcode() == ISD::AND) |
| if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) { |
| uint64_t Mask = MaskCst->getZExtValue(); |
| return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF); |
| } |
| |
| return false; |
| }; |
| |
| if (!Op.hasOneUse()) |
| return 0; |
| |
| if (isSupportedExtend(Op)) |
| return 1; |
| |
| unsigned Opc = Op.getOpcode(); |
| if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) |
| if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { |
| uint64_t Shift = ShiftCst->getZExtValue(); |
| if (isSupportedExtend(Op.getOperand(0))) |
| return (Shift <= 4) ? 2 : 1; |
| EVT VT = Op.getValueType(); |
| if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63)) |
| return 1; |
| } |
| |
| return 0; |
| } |
| |
| static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, |
| SDValue &AArch64cc, SelectionDAG &DAG, |
| const SDLoc &dl) { |
| if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { |
| EVT VT = RHS.getValueType(); |
| uint64_t C = RHSC->getZExtValue(); |
| if (!isLegalArithImmed(C)) { |
| // Constant does not fit, try adjusting it by one? |
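| // For example, (x < 0x1001) cannot be encoded directly, but it is |
| // equivalent to (x <= 0x1000), and 0x1000 is a legal (shifted) immediate. |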
| switch (CC) { |
| default: |
| break; |
| case ISD::SETLT: |
| case ISD::SETGE: |
| if ((VT == MVT::i32 && C != 0x80000000 && |
| isLegalArithImmed((uint32_t)(C - 1))) || |
| (VT == MVT::i64 && C != 0x80000000ULL && |
| isLegalArithImmed(C - 1ULL))) { |
| CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; |
| C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; |
| RHS = DAG.getConstant(C, dl, VT); |
| } |
| break; |
| case ISD::SETULT: |
| case ISD::SETUGE: |
| if ((VT == MVT::i32 && C != 0 && |
| isLegalArithImmed((uint32_t)(C - 1))) || |
| (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { |
| CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; |
| C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; |
| RHS = DAG.getConstant(C, dl, VT); |
| } |
| break; |
| case ISD::SETLE: |
| case ISD::SETGT: |
| if ((VT == MVT::i32 && C != INT32_MAX && |
| isLegalArithImmed((uint32_t)(C + 1))) || |
| (VT == MVT::i64 && C != INT64_MAX && |
| isLegalArithImmed(C + 1ULL))) { |
| CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; |
| C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; |
| RHS = DAG.getConstant(C, dl, VT); |
| } |
| break; |
| case ISD::SETULE: |
| case ISD::SETUGT: |
| if ((VT == MVT::i32 && C != UINT32_MAX && |
| isLegalArithImmed((uint32_t)(C + 1))) || |
| (VT == MVT::i64 && C != UINT64_MAX && |
| isLegalArithImmed(C + 1ULL))) { |
| CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; |
| C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; |
| RHS = DAG.getConstant(C, dl, VT); |
| } |
| break; |
| } |
| } |
| } |
| |
| // Comparisons are canonicalized so that the RHS operand is simpler than the |
| // LHS one, the extreme case being when RHS is an immediate. However, AArch64 |
| // can fold some shift+extend operations on the RHS operand, so swap the |
| // operands if that can be done. |
| // |
| // For example: |
| // lsl w13, w11, #1 |
| // cmp w13, w12 |
| // can be turned into: |
| // cmp w12, w11, lsl #1 |
| if (!isa<ConstantSDNode>(RHS) || |
| !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) { |
| SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS; |
| |
| if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) { |
| std::swap(LHS, RHS); |
| CC = ISD::getSetCCSwappedOperands(CC); |
| } |
| } |
| |
| SDValue Cmp; |
| AArch64CC::CondCode AArch64CC; |
| if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { |
| const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS); |
| |
| // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. |
| // For the i8 operand, the largest immediate is 255, so this can be easily |
| // encoded in the compare instruction. For the i16 operand, however, the |
| // largest immediate cannot be encoded in the compare. |
| // Therefore, use a sign extending load and cmn to avoid materializing the |
| // -1 constant. For example, |
| // movz w1, #65535 |
| // ldrh w0, [x0, #0] |
| // cmp w0, w1 |
| // > |
| // ldrsh w0, [x0, #0] |
| // cmn w0, #1 |
| // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS) |
| // if and only if (sext LHS) == (sext RHS). The checks are in place to |
| // ensure both the LHS and RHS are truly zero extended and to make sure the |
| // transformation is profitable. |
| if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && |
| cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && |
| cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && |
| LHS.getNode()->hasNUsesOfValue(1, 0)) { |
| int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); |
| if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { |
| SDValue SExt = |
| DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, |
| DAG.getValueType(MVT::i16)); |
| Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, |
| RHS.getValueType()), |
| CC, dl, DAG); |
| AArch64CC = changeIntCCToAArch64CC(CC); |
| } |
| } |
| |
| if (!Cmp && (RHSC->isZero() || RHSC->isOne())) { |
| if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) { |
| if ((CC == ISD::SETNE) ^ RHSC->isZero()) |
| AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); |
| } |
| } |
| } |
| |
| if (!Cmp) { |
| Cmp = emitComparison(LHS, RHS, CC, dl, DAG); |
| AArch64CC = changeIntCCToAArch64CC(CC); |
| } |
| AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); |
| return Cmp; |
| } |
| |
| static std::pair<SDValue, SDValue> |
| getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { |
| assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && |
| "Unsupported value type"); |
| SDValue Value, Overflow; |
| SDLoc DL(Op); |
| SDValue LHS = Op.getOperand(0); |
| SDValue RHS = Op.getOperand(1); |
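| // For plain add/sub the overflow flag falls directly out of ADDS/SUBS: |
| // signed overflow sets V (tested with VS), unsigned add overflow sets C |
| // (HS), and unsigned sub borrow clears C (LO). |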
| unsigned Opc = 0; |
| switch (Op.getOpcode()) { |
| default: |
| llvm_unreachable("Unknown overflow instruction!"); |
| case ISD::SADDO: |
| Opc = AArch64ISD::ADDS; |
| CC = AArch64CC::VS; |
| break; |
| case ISD::UADDO: |
| Opc = AArch64ISD::ADDS; |
| CC = AArch64CC::HS; |
| break; |
| case ISD::SSUBO: |
| Opc = AArch64ISD::SUBS; |
| CC = AArch64CC::VS; |
| break; |
| case ISD::USUBO: |
| Opc = AArch64ISD::SUBS; |
| CC = AArch64CC::LO; |
| break; |
| // Multiply needs a little bit of extra work. |
| case ISD::SMULO: |
| case ISD::UMULO: { |
| CC = AArch64CC::NE; |
| bool IsSigned = Op.getOpcode() == ISD::SMULO; |
| if (Op.getValueType() == MVT::i32) { |
| // Extend to 64-bits, then perform a 64-bit multiply. |
| unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
| LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); |
| RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); |
| SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); |
| Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul); |
| |
| // Check that the result fits into a 32-bit integer. |
| SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC); |
| if (IsSigned) { |
| // cmp xreg, wreg, sxtw |
| SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value); |
| Overflow = |
| DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1); |
| } else { |
| // tst xreg, #0xffffffff00000000 |
| SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64); |
| Overflow = |
| DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1); |
| } |
| break; |
| } |
| assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); |
| // For the 64-bit multiply, check overflow via the high half of the result. |
| Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); |
| if (IsSigned) { |
| SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); |
| SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, |
| DAG.getConstant(63, DL, MVT::i64)); |
| // It is important that LowerBits is last, otherwise the arithmetic |
| // shift will not be folded into the compare (SUBS). |
| SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); |
| Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) |
| .getValue(1); |
| } else { |
| SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); |
| SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); |
| Overflow = |
| DAG.getNode(AArch64ISD::SUBS, DL, VTs, |
| DAG.getConstant(0, DL, MVT::i64), |
| UpperBits).getValue(1); |
| } |
| break; |
| } |
| } // switch (...) |
| |
| if (Opc) { |
| SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); |
| |
| // Emit the AArch64 operation with overflow check. |
| Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); |
| Overflow = Value.getValue(1); |
| } |
| return std::make_pair(Value, Overflow); |
| } |
| |
| SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const { |
| if (useSVEForFixedLengthVectorVT(Op.getValueType(), |
| !Subtarget->isNeonAvailable())) |
| return LowerToScalableOp(Op, DAG); |
| |
| SDValue Sel = Op.getOperand(0); |
| SDValue Other = Op.getOperand(1); |
| SDLoc dl(Sel); |
| |
| // If the operand is an overflow checking operation, invert the condition |
| // code and kill the Not operation. I.e., transform: |
| // (xor (overflow_op_bool, 1)) |
| // --> |
| // (csel 1, 0, invert(cc), overflow_op_bool) |
| // ... which later gets transformed to just a cset instruction with an |
| // inverted condition code, rather than a cset + eor sequence. |
| if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) { |
| // Only lower legal XALUO ops. |
| if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0))) |
| return SDValue(); |
| |
| SDValue TVal = DAG.getConstant(1, dl, MVT::i32); |
| SDValue FVal = DAG.getConstant(0, dl, MVT::i32); |
| AArch64CC::CondCode CC; |
| SDValue Value, Overflow; |
| std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG); |
| SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); |
| return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal, |
| CCVal, Overflow); |
| } |
| // If neither operand is a SELECT_CC, give up. |
| if (Sel.getOpcode() != ISD::SELECT_CC) |
| std::swap(Sel, Other); |
| if (Sel.getOpcode() != ISD::SELECT_CC) |
| return Op; |
| |
| // The folding we want to perform is: |
| // (xor x, (select_cc a, b, cc, 0, -1) ) |
| // --> |
| // (csel x, (xor x, -1), cc ...) |
| // |
| // The latter will get matched to a CSINV instruction. |
| |
| ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get(); |
| SDValue LHS = Sel.getOperand(0); |
| SDValue RHS = Sel.getOperand(1); |
| SDValue TVal = Sel.getOperand(2); |
| SDValue FVal = Sel.getOperand(3); |
| |
| // FIXME: This could be generalized to non-integer comparisons. |
| if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) |
| return Op; |
| |
| ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); |
| ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); |
| |
| // The values aren't constants, this isn't the pattern we're looking for. |
| if (!CFVal || !CTVal) |
| return Op; |
| |
| // We can commute the SELECT_CC by inverting the condition. This |
| // might be needed to make this fit into a CSINV pattern. |
| if (CTVal->isAllOnes() && CFVal->isZero()) { |
| std::swap(TVal, FVal); |
| std::swap(CTVal, CFVal); |
| CC = ISD::getSetCCInverse(CC, LHS.getValueType()); |
| } |
| |
| // If the constants line up, perform the transform! |
| if (CTVal->isZero() && CFVal->isAllOnes()) { |
| SDValue CCVal; |
| SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); |
| |
| FVal = Other; |
| TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, |
| DAG.getConstant(-1ULL, dl, Other.getValueType())); |
| |
| return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, |
| CCVal, Cmp); |
| } |
| |
| return Op; |
| } |
| |
| // If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C' |
| // bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else |
| // sets 'C' bit to 0. |
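| // This is done with a SUBS: "Value - 1" produces a carry exactly when |
| // Value != 0, and "0 - Value" produces a carry exactly when Value == 0. |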
| static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) { |
| SDLoc DL(Value); |
| EVT VT = Value.getValueType(); |
| SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value; |
| SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT); |
| SDValue Cmp = |
| DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1); |
| return Cmp.getValue(1); |
| } |
| |
| // If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0. |
| // If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1. |
| static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, |
| bool Invert) { |
| assert(Glue.getResNo() == 1); |
| SDLoc DL(Glue); |
| SDValue Zero = DAG.getConstant(0, DL, VT); |
| SDValue One = DAG.getConstant(1, DL, VT); |
| unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS; |
| SDValue CC = DAG.getConstant(Cond, DL, MVT::i32); |
| return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue); |
| } |
| |
| // Value is 1 if 'V' bit of NZCV is 1, else 0 |
| static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) { |
| assert(Glue.getResNo() == 1); |
| SDLoc DL(Glue); |
| SDValue Zero = DAG.getConstant(0, DL, VT); |
| SDValue One = DAG.getConstant(1, DL, VT); |
| SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32); |
| return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue); |
| } |
| |
| // This lowering is inefficient, but it will get cleaned up by |
| // `foldOverflowCheck` |
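| // The incoming carry value is first turned back into the C flag, the |
| // carry-using add/sub (e.g. ADCS/SBCS) is emitted, and the resulting |
| // carry or overflow flag is materialised again as a value. |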
| static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, |
| unsigned Opcode, bool IsSigned) { |
| EVT VT0 = Op.getValue(0).getValueType(); |
| EVT VT1 = Op.getValue(1).getValueType(); |
| |
| if (VT0 != MVT::i32 && VT0 != MVT::i64) |
| return SDValue(); |
| |
| bool InvertCarry = Opcode == AArch64ISD::SBCS; |
| SDValue OpLHS = Op.getOperand(0); |
| SDValue OpRHS = Op.getOperand(1); |
| SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry); |
| |
| SDLoc DL(Op); |
| SDVTList VTs = DAG.getVTList(VT0, VT1); |
| |
| SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS, |
| OpRHS, OpCarryIn); |
| |
| SDValue OutFlag = |
| IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG) |
| : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry); |
| |
| return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag); |
| } |
| |
| static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { |
| // Let legalize expand this if it isn't a legal type yet. |
| if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) |
| return SDValue(); |
| |
| SDLoc dl(Op); |
| AArch64CC::CondCode CC; |
| // The actual operation that sets the overflow or carry flag. |
| SDValue Value, Overflow; |
| std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); |
| |
| // We use 0 and 1 as false and true values. |
| SDValue TVal = DAG.getConstant(1, dl, MVT::i32); |
| SDValue FVal = DAG.getConstant(0, dl, MVT::i32); |
| |
| // We use an inverted condition, because the conditional select is inverted |
| // too. This will allow it to be selected to a single instruction: |
| // CSINC Wd, WZR, WZR, invert(cond). |
| SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); |
| Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, |
| CCVal, Overflow); |
| |
| SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); |
| return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); |
| } |
| |
| // Prefetch operands are: |
| // 1: Address to prefetch |
| // 2: bool isWrite |
| // 3: int locality (0 = no locality ... 3 = extreme locality) |
| // 4: bool isDataCache |
| static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { |
| SDLoc DL(Op); |
| unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); |
| unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); |
| unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); |
| |
| bool IsStream = !Locality; |
| // When the locality number is set |
| if (Locality) { |
| // The front-end should have filtered out the out-of-range values |
| assert(Locality <= 3 && "Prefetch locality out-of-range"); |
| // The locality degree runs in the opposite direction to the PRFM |
| // cache-level encoding, which starts at 0 for L1, so invert the number. |
| Locality = 3 - Locality; |
| } |
| |
| // Build the mask value encoding the expected behavior. |
| unsigned PrfOp = (IsWrite << 4) | // Load/Store bit |
| (!IsData << 3) | // IsDataCache bit |
| (Locality << 1) | // Cache level bits |
| (unsigned)IsStream; // Stream bit |
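| // For example, a read data prefetch with maximum locality (IsWrite=0, |
| // Locality=3, IsData=1) encodes to 0, i.e. PLDL1KEEP. |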
| return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), |
| DAG.getTargetConstant(PrfOp, DL, MVT::i32), |
| Op.getOperand(1)); |
| } |
| |
| SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, |
| SelectionDAG &DAG) const { |
| EVT VT = Op.getValueType(); |
| if (VT.isScalableVector()) |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU); |
| |
| if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) |
| return LowerFixedLengthFPExtendToSVE(Op, DAG); |
| |
| assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, |
| SelectionDAG &DAG) const { |
| if (Op.getValueType().isScalableVector()) |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU); |
| |
| bool IsStrict = Op->isStrictFPOpcode(); |
| SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); |
| EVT SrcVT = SrcVal.getValueType(); |
| |
| if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable())) |
| return LowerFixedLengthFPRoundToSVE(Op, DAG); |
| |
| if (SrcVT != MVT::f128) { |
| // Expand cases where the input is a vector bigger than NEON. |
| if (useSVEForFixedLengthVectorVT(SrcVT)) |
| return SDValue(); |
| |
| // It's legal except when f128 is involved |
| return Op; |
| } |
| |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, |
| SelectionDAG &DAG) const { |
| // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. |
| // Any additional optimization in this function should be recorded |
| // in the cost tables. |
| bool IsStrict = Op->isStrictFPOpcode(); |
| EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType(); |
| EVT VT = Op.getValueType(); |
| |
| if (VT.isScalableVector()) { |
| unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT |
| ? AArch64ISD::FCVTZU_MERGE_PASSTHRU |
| : AArch64ISD::FCVTZS_MERGE_PASSTHRU; |
| return LowerToPredicatedOp(Op, DAG, Opcode); |
| } |
| |
| if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) || |
| useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) |
| return LowerFixedLengthFPToIntToSVE(Op, DAG); |
| |
| unsigned NumElts = InVT.getVectorNumElements(); |
| |
| // f16 conversions are promoted to f32 when full fp16 is not supported. |
| if (InVT.getVectorElementType() == MVT::f16 && |
| !Subtarget->hasFullFP16()) { |
| MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); |
| SDLoc dl(Op); |
| if (IsStrict) { |
| SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other}, |
| {Op.getOperand(0), Op.getOperand(1)}); |
| return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other}, |
| {Ext.getValue(1), Ext.getValue(0)}); |
| } |
| return DAG.getNode( |
| Op.getOpcode(), dl, Op.getValueType(), |
| DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); |
| } |
| |
| uint64_t VTSize = VT.getFixedSizeInBits(); |
| uint64_t InVTSize = InVT.getFixedSizeInBits(); |
| if (VTSize < InVTSize) { |
| SDLoc dl(Op); |
| if (IsStrict) { |
| InVT = InVT.changeVectorElementTypeToInteger(); |
| SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other}, |
| {Op.getOperand(0), Op.getOperand(1)}); |
| SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); |
| return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl); |
| } |
| SDValue Cv = |
| DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), |
| Op.getOperand(0)); |
| return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); |
| } |
| |
| if (VTSize > InVTSize) { |
| SDLoc dl(Op); |
| MVT ExtVT = |
| MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), |
| VT.getVectorNumElements()); |
| if (IsStrict) { |
| SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other}, |
| {Op.getOperand(0), Op.getOperand(1)}); |
| return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other}, |
| {Ext.getValue(1), Ext.getValue(0)}); |
| } |
| SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); |
| return DAG.getNode(Op.getOpcode(), dl, VT, Ext); |
| } |
| |
| // Use a scalar operation for conversions between single-element vectors of |
| // the same size. |
| if (NumElts == 1) { |
| SDLoc dl(Op); |
| SDValue Extract = DAG.getNode( |
| ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(), |
| Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64)); |
| EVT ScalarVT = VT.getScalarType(); |
| if (IsStrict) |
| return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other}, |
| {Op.getOperand(0), Extract}); |
| return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract); |
| } |
| |
| // Type changing conversions are illegal. |
| return Op; |
| } |
| |
| SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, |
| SelectionDAG &DAG) const { |
| bool IsStrict = Op->isStrictFPOpcode(); |
| SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); |
| |
| if (SrcVal.getValueType().isVector()) |
| return LowerVectorFP_TO_INT(Op, DAG); |
| |
| // f16 conversions are promoted to f32 when full fp16 is not supported. |
| if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { |
| SDLoc dl(Op); |
| if (IsStrict) { |
| SDValue Ext = |
| DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other}, |
| {Op.getOperand(0), SrcVal}); |
| return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other}, |
| {Ext.getValue(1), Ext.getValue(0)}); |
| } |
| return DAG.getNode( |
| Op.getOpcode(), dl, Op.getValueType(), |
| DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal)); |
| } |
| |
| if (SrcVal.getValueType() != MVT::f128) { |
| // It's legal except when f128 is involved |
| return Op; |
| } |
| |
| return SDValue(); |
| } |
| |
| SDValue |
| AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op, |
| SelectionDAG &DAG) const { |
| // AArch64 FP-to-int conversions saturate to the destination element size, so |
| // we can lower common saturating conversions to simple instructions. |
| SDValue SrcVal = Op.getOperand(0); |
| EVT SrcVT = SrcVal.getValueType(); |
| EVT DstVT = Op.getValueType(); |
| EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); |
| |
| uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits(); |
| uint64_t DstElementWidth = DstVT.getScalarSizeInBits(); |
| uint64_t SatWidth = SatVT.getScalarSizeInBits(); |
| assert(SatWidth <= DstElementWidth && |
| "Saturation width cannot exceed result width"); |
| |
| // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT. |
| // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable |
| // types, so this is hard to reach. |
| if (DstVT.isScalableVector()) |
| return SDValue(); |
| |
| EVT SrcElementVT = SrcVT.getVectorElementType(); |
| |
| // In the absence of FP16 support, promote f16 to f32 and saturate the result. |
| if (SrcElementVT == MVT::f16 && |
| (!Subtarget->hasFullFP16() || DstElementWidth > 16)) { |
| MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements()); |
| SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal); |
| SrcVT = F32VT; |
| SrcElementVT = MVT::f32; |
| SrcElementWidth = 32; |
| } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 && |
| SrcElementVT != MVT::f16) |
| return SDValue(); |
| |
| SDLoc DL(Op); |
| // Cases that we can emit directly. |
| if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) |
| return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, |
| DAG.getValueType(DstVT.getScalarType())); |
| |
| // Otherwise we emit a cvt that saturates to a higher BW, and saturate the |
| // result. This is only valid if the legal cvt is larger than the saturate |
| // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize |
| // (at least until sqxtn is selected). |
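| // For example, a v4f32 -> v4i16 saturating conversion becomes an |
| // FCVTZS/FCVTZU to v4i32, an SMIN/SMAX (or UMIN) clamp to the i16 range, |
| // and a truncate. |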
| if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64) |
| return SDValue(); |
| |
| EVT IntVT = SrcVT.changeVectorElementTypeToInteger(); |
| SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal, |
| DAG.getValueType(IntVT.getScalarType())); |
| SDValue Sat; |
| if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { |
| SDValue MinC = DAG.getConstant( |
| APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT); |
| SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC); |
| SDValue MaxC = DAG.getConstant( |
| APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT); |
| Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC); |
| } else { |
| SDValue MinC = DAG.getConstant( |
| APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT); |
| Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC); |
| } |
| |
| return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat); |
| } |
| |
| SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, |
| SelectionDAG &DAG) const { |
| // AArch64 FP-to-int conversions saturate to the destination register size, so |
| // we can lower common saturating conversions to simple instructions. |
| SDValue SrcVal = Op.getOperand(0); |
| EVT SrcVT = SrcVal.getValueType(); |
| |
| if (SrcVT.isVector()) |
| return LowerVectorFP_TO_INT_SAT(Op, DAG); |
| |
| EVT DstVT = Op.getValueType(); |
| EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); |
| uint64_t SatWidth = SatVT.getScalarSizeInBits(); |
| uint64_t DstWidth = DstVT.getScalarSizeInBits(); |
| assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width"); |
| |
| // In the absence of FP16 support, promote f16 to f32 and saturate the result. |
| if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) { |
| SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal); |
| SrcVT = MVT::f32; |
| } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16) |
| return SDValue(); |
| |
| SDLoc DL(Op); |
| // Cases that we can emit directly. |
| if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 || |
| (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) && |
| DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32)) |
| return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, |
| DAG.getValueType(DstVT)); |
| |
| // Otherwise we emit a cvt that saturates to a higher BW, and saturate the |
| // result. This is only valid if the legal cvt is larger than the saturate |
| // width. |
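| // For example, an i16-saturating conversion from f32 whose result has been |
| // promoted to i32 becomes a plain FCVTZS/FCVTZU followed by an SMIN/SMAX |
| // (or UMIN) clamp to the i16 range. |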
| if (DstWidth < SatWidth) |
| return SDValue(); |
| |
| SDValue NativeCvt = |
| DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT)); |
| SDValue Sat; |
| if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { |
| SDValue MinC = DAG.getConstant( |
| APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT); |
| SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC); |
| SDValue MaxC = DAG.getConstant( |
| APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT); |
| Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC); |
| } else { |
| SDValue MinC = DAG.getConstant( |
| APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT); |
| Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC); |
| } |
| |
| return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat); |
| } |
| |
| SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, |
| SelectionDAG &DAG) const { |
| // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. |
| // Any additional optimization in this function should be recorded |
| // in the cost tables. |
| bool IsStrict = Op->isStrictFPOpcode(); |
| EVT VT = Op.getValueType(); |
| SDLoc dl(Op); |
| SDValue In = Op.getOperand(IsStrict ? 1 : 0); |
| EVT InVT = In.getValueType(); |
| unsigned Opc = Op.getOpcode(); |
| bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP; |
| |
| if (VT.isScalableVector()) { |
| if (InVT.getVectorElementType() == MVT::i1) { |
| // We can't directly extend an SVE predicate; extend it first. |
| unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
| EVT CastVT = getPromotedVTForPredicate(InVT); |
| In = DAG.getNode(CastOpc, dl, CastVT, In); |
| return DAG.getNode(Opc, dl, VT, In); |
| } |
| |
| unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU |
| : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU; |
| return LowerToPredicatedOp(Op, DAG, Opcode); |
| } |
| |
| if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) || |
| useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) |
| return LowerFixedLengthIntToFPToSVE(Op, DAG); |
| |
| uint64_t VTSize = VT.getFixedSizeInBits(); |
| uint64_t InVTSize = InVT.getFixedSizeInBits(); |
| if (VTSize < InVTSize) { |
| MVT CastVT = |
| MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), |
| InVT.getVectorNumElements()); |
| if (IsStrict) { |
| In = DAG.getNode(Opc, dl, {CastVT, MVT::Other}, |
| {Op.getOperand(0), In}); |
| return DAG.getNode( |
| ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other}, |
| {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)}); |
| } |
| In = DAG.getNode(Opc, dl, CastVT, In); |
| return DAG.getNode(ISD::FP_ROUND, dl, VT, In, |
| DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)); |
| } |
| |
| if (VTSize > InVTSize) { |
| unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
| EVT CastVT = VT.changeVectorElementTypeToInteger(); |
| In = DAG.getNode(CastOpc, dl, CastVT, In); |
| if (IsStrict) |
| return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In}); |
| return DAG.getNode(Opc, dl, VT, In); |
| } |
| |
| // Use a scalar operation for conversions between single-element vectors of |
| // the same size. |
| if (VT.getVectorNumElements() == 1) { |
| SDValue Extract = DAG.getNode( |
| ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(), |
| In, DAG.getConstant(0, dl, MVT::i64)); |
| EVT ScalarVT = VT.getScalarType(); |
| if (IsStrict) |
| return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other}, |
| {Op.getOperand(0), Extract}); |
| return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract); |
| } |
| |
| return Op; |
| } |
| |
| SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, |
| SelectionDAG &DAG) const { |
| if (Op.getValueType().isVector()) |
| return LowerVectorINT_TO_FP(Op, DAG); |
| |
| bool IsStrict = Op->isStrictFPOpcode(); |
| SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); |
| |
| // f16 conversions are promoted to f32 when full fp16 is not supported. |
| if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { |
| SDLoc dl(Op); |
| if (IsStrict) { |
| SDValue Val = DAG.getNode(Op.getOpcode(), dl, {MVT::f32, MVT::Other}, |
| {Op.getOperand(0), SrcVal}); |
| return DAG.getNode( |
| ISD::STRICT_FP_ROUND, dl, {MVT::f16, MVT::Other}, |
| {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)}); |
| } |
| return DAG.getNode( |
| ISD::FP_ROUND, dl, MVT::f16, |
| DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal), |
| DAG.getIntPtrConstant(0, dl)); |
| } |
| |
| // i128 conversions are libcalls. |
| if (SrcVal.getValueType() == MVT::i128) |
| return SDValue(); |
| |
| // Other conversions are legal, unless they are to the completely |
| // software-based fp128. |
| if (Op.getValueType() != MVT::f128) |
| return Op; |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, |
| SelectionDAG &DAG) const { |
| // For iOS, we want to call an alternative entry point: __sincos_stret, |
| // which returns the values in two S / D registers. |
| SDLoc dl(Op); |
| SDValue Arg = Op.getOperand(0); |
| EVT ArgVT = Arg.getValueType(); |
| Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); |
| |
| ArgListTy Args; |
| ArgListEntry Entry; |
| |
| Entry.Node = Arg; |
| Entry.Ty = ArgTy; |
| Entry.IsSExt = false; |
| Entry.IsZExt = false; |
| Args.push_back(Entry); |
| |
| RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64 |
| : RTLIB::SINCOS_STRET_F32; |
| const char *LibcallName = getLibcallName(LC); |
| SDValue Callee = |
| DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); |
| |
| StructType *RetTy = StructType::get(ArgTy, ArgTy); |
| TargetLowering::CallLoweringInfo CLI(DAG); |
| CLI.setDebugLoc(dl) |
| .setChain(DAG.getEntryNode()) |
| .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); |
| |
| std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); |
| return CallResult.first; |
| } |
| |
| static MVT getSVEContainerType(EVT ContentTy); |
| |
| SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op, |
| SelectionDAG &DAG) const { |
| EVT OpVT = Op.getValueType(); |
| EVT ArgVT = Op.getOperand(0).getValueType(); |
| |
| if (useSVEForFixedLengthVectorVT(OpVT)) |
| return LowerFixedLengthBitcastToSVE(Op, DAG); |
| |
| if (OpVT.isScalableVector()) { |
| // Bitcasting between unpacked vector types of different element counts is |
| // not a NOP because the live elements are laid out differently. |
| // 01234567 |
| // e.g. nxv2i32 = XX??XX?? |
| // nxv4f16 = X?X?X?X? |
| if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount()) |
| return SDValue(); |
| |
| if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) { |
| assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() && |
| "Expected int->fp bitcast!"); |
| SDValue ExtResult = |
| DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT), |
| Op.getOperand(0)); |
| return getSVESafeBitCast(OpVT, ExtResult, DAG); |
| } |
| return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG); |
| } |
| |
| if (OpVT != MVT::f16 && OpVT != MVT::bf16) |
| return SDValue(); |
| |
| // Bitcasts between f16 and bf16 are legal. |
| if (ArgVT == MVT::f16 || ArgVT == MVT::bf16) |
| return Op; |
| |
| assert(ArgVT == MVT::i16); |
| SDLoc DL(Op); |
| |
| Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); |
| Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); |
| return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op); |
| } |
| |
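| // Return a vector type of at least 64 bits with the same element count as |
| // OrigVT, widening the element type as required. |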
| static EVT getExtensionTo64Bits(const EVT &OrigVT) { |
| if (OrigVT.getSizeInBits() >= 64) |
| return OrigVT; |
| |
| assert(OrigVT.isSimple() && "Expecting a simple value type"); |
| |
| MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; |
| switch (OrigSimpleTy) { |
| default: llvm_unreachable("Unexpected Vector Type"); |
| case MVT::v2i8: |
| case MVT::v2i16: |
| return MVT::v2i32; |
| case MVT::v4i8: |
| return MVT::v4i16; |
| } |
| } |
| |
| static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, |
| const EVT &OrigTy, |
| const EVT &ExtTy, |
| unsigned ExtOpcode) { |
| // The vector originally had a size of OrigTy. It was then extended to ExtTy. |
| // We expect the ExtTy to be 128-bits total. If the OrigTy is less than |
| // 64-bits we need to insert a new extension so that it will be 64-bits. |
| assert(ExtTy.is128BitVector() && "Unexpected extension size"); |
| if (OrigTy.getSizeInBits() >= 64) |
| return N; |
| |
| // Must extend size to at least 64 bits to be used as an operand for VMULL. |
| EVT NewVT = getExtensionTo64Bits(OrigTy); |
| |
| return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); |
| } |
| |
| // Returns lane if Op extracts from a two-element vector and lane is constant |
| // (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise. |
| static std::optional<uint64_t> |
| getConstantLaneNumOfExtractHalfOperand(SDValue &Op) { |
| SDNode *OpNode = Op.getNode(); |
| if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT) |
| return std::nullopt; |
| |
| EVT VT = OpNode->getOperand(0).getValueType(); |
| ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1)); |
| if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C) |
| return std::nullopt; |
| |
| return C->getZExtValue(); |
| } |
| |
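| // Returns true if N is a BUILD_VECTOR of constants that all fit in half the |
| // element width (signed or unsigned as requested), i.e. it behaves like an |
| // implicitly extended vector. |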
| static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, |
| bool isSigned) { |
| EVT VT = N.getValueType(); |
| |
| if (N.getOpcode() != ISD::BUILD_VECTOR) |
| return false; |
| |
| for (const SDValue &Elt : N->op_values()) { |
| if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { |
| unsigned EltSize = VT.getScalarSizeInBits(); |
| unsigned HalfSize = EltSize / 2; |
| if (isSigned) { |
| if (!isIntN(HalfSize, C->getSExtValue())) |
| return false; |
| } else { |
| if (!isUIntN(HalfSize, C->getZExtValue())) |
| return false; |
| } |
| continue; |
| } |
| return false; |
| } |
| |
| return true; |
| } |
| |
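| // Given a 128-bit operand that is known to act as an extension (an explicit |
| // extend, a value with zero high bits, or a narrow constant BUILD_VECTOR), |
| // return the equivalent 64-bit half-element-width value suitable as a direct |
| // SMULL/UMULL input. |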
| static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) { |
| EVT VT = N.getValueType(); |
| assert(VT.is128BitVector() && "Unexpected vector MULL size"); |
| |
| unsigned NumElts = VT.getVectorNumElements(); |
| unsigned OrigEltSize = VT.getScalarSizeInBits(); |
| unsigned EltSize = OrigEltSize / 2; |
| MVT TruncVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); |
| |
| APInt HiBits = APInt::getHighBitsSet(OrigEltSize, EltSize); |
| if (DAG.MaskedValueIsZero(N, HiBits)) |
| return DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, N); |
| |
| if (ISD::isExtOpcode(N.getOpcode())) |
| return addRequiredExtensionForVectorMULL(N.getOperand(0), DAG, |
| N.getOperand(0).getValueType(), VT, |
| N.getOpcode()); |
| |
| assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); |
| SDLoc dl(N); |
| SmallVector<SDValue, 8> Ops; |
| for (unsigned i = 0; i != NumElts; ++i) { |
| ConstantSDNode *C = cast<ConstantSDNode>(N.getOperand(i)); |
| const APInt &CInt = C->getAPIntValue(); |
| // Element types smaller than 32 bits are not legal, so use i32 elements. |
| // The values are implicitly truncated so sext vs. zext doesn't matter. |
| Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); |
| } |
| return DAG.getBuildVector(TruncVT, dl, Ops); |
| } |
| |
| static bool isSignExtended(SDValue N, SelectionDAG &DAG) { |
| return N.getOpcode() == ISD::SIGN_EXTEND || |
| N.getOpcode() == ISD::ANY_EXTEND || |
| isExtendedBUILD_VECTOR(N, DAG, true); |
| } |
| |
| static bool isZeroExtended(SDValue N, SelectionDAG &DAG) { |
| return N.getOpcode() == ISD::ZERO_EXTEND || |
| N.getOpcode() == ISD::ANY_EXTEND || |
| isExtendedBUILD_VECTOR(N, DAG, false); |
| } |
| |
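| // Returns true if N is an ADD or SUB whose single-use operands are both |
| // sign-extended (isAddSubZExt below is the zero-extended equivalent). |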
| static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) { |
| unsigned Opcode = N.getOpcode(); |
| if (Opcode == ISD::ADD || Opcode == ISD::SUB) { |
| SDValue N0 = N.getOperand(0); |
| SDValue N1 = N.getOperand(1); |
| return N0->hasOneUse() && N1->hasOneUse() && |
| isSignExtended(N0, DAG) && isSignExtended(N1, DAG); |
| } |
| return false; |
| } |
| |
| static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) { |
| unsigned Opcode = N.getOpcode(); |
| if (Opcode == ISD::ADD || Opcode == ISD::SUB) { |
| SDValue N0 = N.getOperand(0); |
| SDValue N1 = N.getOperand(1); |
| return N0->hasOneUse() && N1->hasOneUse() && |
| isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); |
| } |
| return false; |
| } |
| |
| SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op, |
| SelectionDAG &DAG) const { |
| // The rounding mode is in bits 23:22 of the FPCR. |
| // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0. |
| // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3, |
| // so that the shift and the AND get folded into a bitfield extract. |
| SDLoc dl(Op); |
| |
| SDValue Chain = Op.getOperand(0); |
| SDValue FPCR_64 = DAG.getNode( |
| ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other}, |
| {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)}); |
| Chain = FPCR_64.getValue(1); |
| SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64); |
| SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32, |
| DAG.getConstant(1U << 22, dl, MVT::i32)); |
| SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, |
| DAG.getConstant(22, dl, MVT::i32)); |
| SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, |
| DAG.getConstant(3, dl, MVT::i32)); |
| return DAG.getMergeValues({AND, Chain}, dl); |
| } |
| |
| SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| SDValue Chain = Op->getOperand(0); |
| SDValue RMValue = Op->getOperand(1); |
| |
| // The rounding mode is in bits 23:22 of the FPCR. |
| // The llvm.set.rounding argument value to the rounding mode in FPCR mapping |
| // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is |
| // (((arg - 1) & 3) << 22). |
| // |
| // The argument of llvm.set.rounding must be within the range [0, 3], so |
| // NearestTiesToAway (4) is not handled here. It is the responsibility of the |
| // code that generates llvm.set.rounding to ensure this condition. |
| |
| // Calculate new value of FPCR[23:22]. |
| RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue, |
| DAG.getConstant(1, DL, MVT::i32)); |
| RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue, |
| DAG.getConstant(0x3, DL, MVT::i32)); |
| RMValue = |
| DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue, |
| DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32)); |
| RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue); |
| |
| // Get current value of FPCR. |
| SDValue Ops[] = { |
| Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)}; |
| SDValue FPCR = |
| DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops); |
| Chain = FPCR.getValue(1); |
| FPCR = FPCR.getValue(0); |
| |
| // Put the new rounding mode into FPCR[23:22]. |
| const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos); |
| FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR, |
| DAG.getConstant(RMMask, DL, MVT::i64)); |
| FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue); |
| SDValue Ops2[] = { |
| Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), |
| FPCR}; |
| return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); |
| } |
| |
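| // Select the extending-multiply opcode (AArch64ISD::SMULL or UMULL) that can |
| // implement N0 * N1, possibly rewriting or swapping the operands in place. |
| // Returns 0 if no such opcode applies. IsMLA is set when the multiply should |
| // instead be expanded into a pair of multiplies feeding an add/sub. |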
| static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG, |
| SDLoc DL, bool &IsMLA) { |
| bool IsN0SExt = isSignExtended(N0, DAG); |
| bool IsN1SExt = isSignExtended(N1, DAG); |
| if (IsN0SExt && IsN1SExt) |
| return AArch64ISD::SMULL; |
| |
| bool IsN0ZExt = isZeroExtended(N0, DAG); |
| bool IsN1ZExt = isZeroExtended(N1, DAG); |
| |
| if (IsN0ZExt && IsN1ZExt) |
| return AArch64ISD::UMULL; |
| |
| // Select SMULL if we can replace zext with sext. |
| if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) && |
| !isExtendedBUILD_VECTOR(N0, DAG, false) && |
| !isExtendedBUILD_VECTOR(N1, DAG, false)) { |
| SDValue ZextOperand; |
| if (IsN0ZExt) |
| ZextOperand = N0.getOperand(0); |
| else |
| ZextOperand = N1.getOperand(0); |
| if (DAG.SignBitIsZero(ZextOperand)) { |
| SDValue NewSext = |
| DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType()); |
| if (IsN0ZExt) |
| N0 = NewSext; |
| else |
| N1 = NewSext; |
| return AArch64ISD::SMULL; |
| } |
| } |
| |
| // Select UMULL if we can replace the other operand with an extend. |
| if (IsN0ZExt || IsN1ZExt) { |
| EVT VT = N0.getValueType(); |
| APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(), |
| VT.getScalarSizeInBits() / 2); |
| if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask)) |
| return AArch64ISD::UMULL; |
| } |
| |
| if (!IsN1SExt && !IsN1ZExt) |
| return 0; |
| |
| // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these |
| // into (s/zext A * s/zext C) + (s/zext B * s/zext C) |
| if (IsN1SExt && isAddSubSExt(N0, DAG)) { |
| IsMLA = true; |
| return AArch64ISD::SMULL; |
| } |
| if (IsN1ZExt && isAddSubZExt(N0, DAG)) { |
| IsMLA = true; |
| return AArch64ISD::UMULL; |
| } |
| if (IsN0ZExt && isAddSubZExt(N1, DAG)) { |
| std::swap(N0, N1); |
| IsMLA = true; |
| return AArch64ISD::UMULL; |
| } |
| return 0; |
| } |
| |
| SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { |
| EVT VT = Op.getValueType(); |
| |
| bool OverrideNEON = !Subtarget->isNeonAvailable(); |
| if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON)) |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED); |
| |
| // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so |
| // that VMULL can be detected. Otherwise v2i64 multiplications are not legal. |
| assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() && |
| "unexpected type for custom-lowering ISD::MUL"); |
| SDValue N0 = Op.getOperand(0); |
| SDValue N1 = Op.getOperand(1); |
| bool isMLA = false; |
| EVT OVT = VT; |
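| // For 64-bit results, look through extract_subvector at index 0 on both |
| // operands so the extending-multiply pattern is matched on the full 128-bit |
| // values; the result is extracted back down to OVT below. |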
| if (VT.is64BitVector()) { |
| if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && |
| isNullConstant(N0.getOperand(1)) && |
| N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && |
| isNullConstant(N1.getOperand(1))) { |
| N0 = N0.getOperand(0); |
| N1 = N1.getOperand(0); |
| VT = N0.getValueType(); |
| } else { |
| if (VT == MVT::v1i64) { |
| if (Subtarget->hasSVE()) |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED); |
| // Fall through to expand this. It is not legal. |
| return SDValue(); |
| } else |
| // Other vector multiplications are legal. |
| return Op; |
| } |
| } |
| |
| SDLoc DL(Op); |
| unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA); |
| |
| if (!NewOpc) { |
| if (VT.getVectorElementType() == MVT::i64) { |
| // If SVE is available then i64 vector multiplications can also be made |
| // legal. |
| if (Subtarget->hasSVE()) |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED); |
| // Fall through to expand this. It is not legal. |
| return SDValue(); |
| } else |
| // Other vector multiplications are legal. |
| return Op; |
| } |
| |
| // Legalize to an S/UMULL instruction. |
| SDValue Op0; |
| SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); |
| if (!isMLA) { |
| Op0 = skipExtensionForVectorMULL(N0, DAG); |
| assert(Op0.getValueType().is64BitVector() && |
| Op1.getValueType().is64BitVector() && |
| "unexpected types for extended operands to VMULL"); |
| return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT, |
| DAG.getNode(NewOpc, DL, VT, Op0, Op1), |
| DAG.getConstant(0, DL, MVT::i64)); |
| } |
| // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during |
| // isel lowering to take advantage of no-stall back-to-back S/UMUL + S/UMLA. |
| // This applies to CPUs with accumulate forwarding such as Cortex-A53/A57. |
| SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG); |
| SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG); |
| EVT Op1VT = Op1.getValueType(); |
| return DAG.getNode( |
| ISD::EXTRACT_SUBVECTOR, DL, OVT, |
| DAG.getNode(N0.getOpcode(), DL, VT, |
| DAG.getNode(NewOpc, DL, VT, |
| DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), |
| DAG.getNode(NewOpc, DL, VT, |
| DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)), |
| DAG.getConstant(0, DL, MVT::i64)); |
| } |
| |
| static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, |
| int Pattern) { |
| if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all) |
| return DAG.getConstant(1, DL, MVT::nxv1i1); |
| return DAG.getNode(AArch64ISD::PTRUE, DL, VT, |
| DAG.getTargetConstant(Pattern, DL, MVT::i32)); |
| } |
| |
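| // Attempt to fold an SVE while-intrinsic with constant bounds into a PTRUE |
| // with a fixed pattern, when the number of active elements is known to fit |
| // within the minimum SVE vector length. |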
| static SDValue optimizeWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned, |
| bool IsLess, bool IsEqual) { |
| if (!isa<ConstantSDNode>(Op.getOperand(1)) || |
| !isa<ConstantSDNode>(Op.getOperand(2))) |
| return SDValue(); |
| |
| SDLoc dl(Op); |
| APInt X = Op.getConstantOperandAPInt(1); |
| APInt Y = Op.getConstantOperandAPInt(2); |
| APInt NumActiveElems; |
| bool Overflow; |
| if (IsLess) |
| NumActiveElems = IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow); |
| else |
| NumActiveElems = IsSigned ? X.ssub_ov(Y, Overflow) : X.usub_ov(Y, Overflow); |
| |
| if (Overflow) |
| return SDValue(); |
| |
| if (IsEqual) { |
| APInt One(NumActiveElems.getBitWidth(), 1, IsSigned); |
| NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow) |
| : NumActiveElems.uadd_ov(One, Overflow); |
| if (Overflow) |
| return SDValue(); |
| } |
| |
| std::optional<unsigned> PredPattern = |
| getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue()); |
| unsigned MinSVEVectorSize = std::max( |
| DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u); |
| unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements(); |
| if (PredPattern != std::nullopt && |
| NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize)) |
| return getPTrue(DAG, dl, Op.getValueType(), *PredPattern); |
| |
| return SDValue(); |
| } |
| |
| // Returns a safe bitcast between two scalable vector predicates, where |
| // any newly created lanes from a widening bitcast are defined as zero. |
| static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) { |
| SDLoc DL(Op); |
| EVT InVT = Op.getValueType(); |
| |
| assert(InVT.getVectorElementType() == MVT::i1 && |
| VT.getVectorElementType() == MVT::i1 && |
| "Expected a predicate-to-predicate bitcast"); |
| assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && |
| InVT.isScalableVector() && |
| DAG.getTargetLoweringInfo().isTypeLegal(InVT) && |
| "Only expect to cast between legal scalable predicate types!"); |
| |
| // Return the operand if the cast isn't changing type, |
| // e.g. <n x 16 x i1> -> <n x 16 x i1> |
| if (InVT == VT) |
| return Op; |
| |
| SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op); |
| |
| // We only have to zero the lanes if new lanes are being defined, e.g. when |
| // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the |
| // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then |
| // we can return here. |
| if (InVT.bitsGT(VT)) |
| return Reinterpret; |
| |
| // Check if the other lanes are already known to be zeroed by |
| // construction. |
| if (isZeroingInactiveLanes(Op)) |
| return Reinterpret; |
| |
| // Zero the newly introduced lanes. |
| SDValue Mask = DAG.getConstant(1, DL, InVT); |
| Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask); |
| return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask); |
| } |
| |
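| // Materialise the current value of PSTATE.SM: a constant when the function's |
| // streaming mode is known statically, otherwise the result of a call to |
| // __arm_sme_state masked down to the PSTATE.SM bit. |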
| SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain, |
| SMEAttrs Attrs, SDLoc DL, |
| EVT VT) const { |
| if (Attrs.hasStreamingInterfaceOrBody()) |
| return DAG.getConstant(1, DL, VT); |
| |
| if (Attrs.hasNonStreamingInterfaceAndBody()) |
| return DAG.getConstant(0, DL, VT); |
| |
| assert(Attrs.hasStreamingCompatibleInterface() && "Unexpected interface"); |
| |
| SDValue Callee = DAG.getExternalSymbol("__arm_sme_state", |
| getPointerTy(DAG.getDataLayout())); |
| Type *Int64Ty = Type::getInt64Ty(*DAG.getContext()); |
| Type *RetTy = StructType::get(Int64Ty, Int64Ty); |
| TargetLowering::CallLoweringInfo CLI(DAG); |
| ArgListTy Args; |
| CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( |
| CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2, |
| RetTy, Callee, std::move(Args)); |
| std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); |
| SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64); |
| return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0), |
| Mask); |
| } |
| |
| SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op, |
| SelectionDAG &DAG) const { |
| unsigned IntNo = Op.getConstantOperandVal(1); |
| SDLoc DL(Op); |
| switch (IntNo) { |
| default: |
| return SDValue(); // Don't custom lower most intrinsics. |
| case Intrinsic::aarch64_prefetch: { |
| SDValue Chain = Op.getOperand(0); |
| SDValue Addr = Op.getOperand(2); |
| |
| unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); |
| unsigned Locality = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); |
| unsigned IsStream = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); |
| unsigned IsData = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); |
| unsigned PrfOp = (IsWrite << 4) | // Load/Store bit |
| (!IsData << 3) | // IsDataCache bit |
| (Locality << 1) | // Cache level bits |
| (unsigned)IsStream; // Stream bit |
| |
| return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain, |
| DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr); |
| } |
| case Intrinsic::aarch64_sme_za_enable: |
| return DAG.getNode( |
| AArch64ISD::SMSTART, DL, MVT::Other, |
| Op->getOperand(0), // Chain |
| DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), |
| DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); |
| case Intrinsic::aarch64_sme_za_disable: |
| return DAG.getNode( |
| AArch64ISD::SMSTOP, DL, MVT::Other, |
| Op->getOperand(0), // Chain |
| DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), |
| DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); |
| } |
| } |
| |
| SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, |
| SelectionDAG &DAG) const { |
| unsigned IntNo = Op.getConstantOperandVal(1); |
| SDLoc DL(Op); |
| switch (IntNo) { |
| default: |
| return SDValue(); // Don't custom lower most intrinsics. |
| case Intrinsic::aarch64_mops_memset_tag: { |
| auto Node = cast<MemIntrinsicSDNode>(Op.getNode()); |
| SDValue Chain = Node->getChain(); |
| SDValue Dst = Op.getOperand(2); |
| SDValue Val = Op.getOperand(3); |
| Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64); |
| SDValue Size = Op.getOperand(4); |
| auto Alignment = Node->getMemOperand()->getAlign(); |
| bool IsVol = Node->isVolatile(); |
| auto DstPtrInfo = Node->getPointerInfo(); |
| |
| const auto &SDI = |
| static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo()); |
| SDValue MS = |
| SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val, |
| Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{}); |
| |
| // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the |
| // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise |
| // LowerOperationWrapper will complain that the number of results has |
| // changed. |
| return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL); |
| } |
| } |
| } |
| |
| SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, |
| SelectionDAG &DAG) const { |
| unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); |
| SDLoc dl(Op); |
| switch (IntNo) { |
| default: return SDValue(); // Don't custom lower most intrinsics. |
| case Intrinsic::thread_pointer: { |
| EVT PtrVT = getPointerTy(DAG.getDataLayout()); |
| return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); |
| } |
| case Intrinsic::aarch64_neon_abs: { |
| EVT Ty = Op.getValueType(); |
| if (Ty == MVT::i64) { |
| SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, |
| Op.getOperand(1)); |
| Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result); |
| return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result); |
| } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) { |
| return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1)); |
| } else { |
| report_fatal_error("Unexpected type for AArch64 NEON intrinsic"); |
| } |
| } |
| case Intrinsic::aarch64_neon_pmull64: { |
| SDValue LHS = Op.getOperand(1); |
| SDValue RHS = Op.getOperand(2); |
| |
| std::optional<uint64_t> LHSLane = |
| getConstantLaneNumOfExtractHalfOperand(LHS); |
| std::optional<uint64_t> RHSLane = |
| getConstantLaneNumOfExtractHalfOperand(RHS); |
| |
| assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1"); |
| assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1"); |
| |
| // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2 |
| // instructions execute on SIMD registers. So canonicalize i64 to v1i64, |
| // which ISel recognizes better; for example, this generates an ldr into d* |
| // registers instead of a GPR load followed by a fmov. |
| auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane, |
| std::optional<uint64_t> OtherLane, |
| const SDLoc &dl, |
| SelectionDAG &DAG) -> SDValue { |
| // If the operand is a higher half itself, rewrite it to |
| // extract_high_v2i64; this way aarch64_neon_pmull64 can |
| // reuse the dag-combiner function with aarch64_neon_{pmull,smull,umull}. |
| if (NLane && *NLane == 1) |
| return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64, |
| N.getOperand(0), DAG.getConstant(1, dl, MVT::i64)); |
| |
| // Operand N is not a higher half but the other operand is. |
| if (OtherLane && *OtherLane == 1) { |
| // If this operand is a lower half, rewrite it to |
| // extract_high_v2i64(duplane(<2 x Ty>, 0)). This avoids the roundtrip that |
| // would otherwise be needed to align the lanes of the two operands. Such a |
| // roundtrip (moving from lane 1 to lane 0) looks like this: |
| // mov x8, v0.d[1] |
| // fmov d0, x8 |
| if (NLane && *NLane == 0) |
| return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64, |
| DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64, |
| N.getOperand(0), |
| DAG.getConstant(0, dl, MVT::i64)), |
| DAG.getConstant(1, dl, MVT::i64)); |
| |
| // Otherwise just duplicate the scalar operand into all lanes. |
| return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N); |
| } |
| |
| // Neither operand is an extract of the higher half, so codegen may just use |
| // the non-high version of the PMULL instruction. Use v1i64 to represent i64. |
| assert(N.getValueType() == MVT::i64 && |
| "Intrinsic aarch64_neon_pmull64 requires i64 parameters"); |
| return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N); |
| }; |
| |
| LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG); |
| RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG); |
| |
| return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS); |
| } |
| case Intrinsic::aarch64_neon_smax: |
| return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_neon_umax: |
| return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_neon_smin: |
| return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_neon_umin: |
| return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_neon_scalar_sqxtn: |
| case Intrinsic::aarch64_neon_scalar_sqxtun: |
| case Intrinsic::aarch64_neon_scalar_uqxtn: { |
| assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32); |
| if (Op.getValueType() == MVT::i32) |
| return DAG.getNode(ISD::BITCAST, dl, MVT::i32, |
| DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32, |
| Op.getOperand(0), |
| DAG.getNode(ISD::BITCAST, dl, MVT::f64, |
| Op.getOperand(1)))); |
| return SDValue(); |
| } |
| case Intrinsic::aarch64_sve_whilelo: |
| return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true, |
| /*IsEqual=*/false); |
| case Intrinsic::aarch64_sve_whilelt: |
| return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true, |
| /*IsEqual=*/false); |
| case Intrinsic::aarch64_sve_whilels: |
| return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true, |
| /*IsEqual=*/true); |
| case Intrinsic::aarch64_sve_whilele: |
| return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true, |
| /*IsEqual=*/true); |
| case Intrinsic::aarch64_sve_whilege: |
| return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false, |
| /*IsEqual=*/true); |
| case Intrinsic::aarch64_sve_whilegt: |
| return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false, |
| /*IsEqual=*/false); |
| case Intrinsic::aarch64_sve_whilehs: |
| return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false, |
| /*IsEqual=*/true); |
| case Intrinsic::aarch64_sve_whilehi: |
| return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false, |
| /*IsEqual=*/false); |
| case Intrinsic::aarch64_sve_sunpkhi: |
| return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_sunpklo: |
| return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_uunpkhi: |
| return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_uunpklo: |
| return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_clasta_n: |
| return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); |
| case Intrinsic::aarch64_sve_clastb_n: |
| return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); |
| case Intrinsic::aarch64_sve_lasta: |
| return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_lastb: |
| return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_rev: |
| return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_tbl: |
| return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_trn1: |
| return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_trn2: |
| return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_uzp1: |
| return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_uzp2: |
| return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_zip1: |
| return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_zip2: |
| return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_splice: |
| return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); |
| case Intrinsic::aarch64_sve_ptrue: |
| return getPTrue(DAG, dl, Op.getValueType(), |
| cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); |
| case Intrinsic::aarch64_sve_clz: |
| return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sme_cntsb: |
| return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), |
| DAG.getConstant(1, dl, MVT::i32)); |
| case Intrinsic::aarch64_sme_cntsh: { |
| SDValue One = DAG.getConstant(1, dl, MVT::i32); |
| SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One); |
| return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One); |
| } |
| case Intrinsic::aarch64_sme_cntsw: { |
| SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), |
| DAG.getConstant(1, dl, MVT::i32)); |
| return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, |
| DAG.getConstant(2, dl, MVT::i32)); |
| } |
| case Intrinsic::aarch64_sme_cntsd: { |
| SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), |
| DAG.getConstant(1, dl, MVT::i32)); |
| return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, |
| DAG.getConstant(3, dl, MVT::i32)); |
| } |
| case Intrinsic::aarch64_sve_cnt: { |
| SDValue Data = Op.getOperand(3); |
| // CTPOP only supports integer operands. |
| if (Data.getValueType().isFloatingPoint()) |
| Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data); |
| return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Data, Op.getOperand(1)); |
| } |
| case Intrinsic::aarch64_sve_dupq_lane: |
| return LowerDUPQLane(Op, DAG); |
| case Intrinsic::aarch64_sve_convert_from_svbool: |
| if (Op.getValueType() == MVT::aarch64svcount) |
| return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1)); |
| return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG); |
| case Intrinsic::aarch64_sve_convert_to_svbool: |
| if (Op.getOperand(1).getValueType() == MVT::aarch64svcount) |
| return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1)); |
| return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG); |
| case Intrinsic::aarch64_sve_fneg: |
| return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frintp: |
| return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frintm: |
| return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frinti: |
| return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frintx: |
| return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frinta: |
| return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frintn: |
| return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frintz: |
| return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_ucvtf: |
| return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl, |
| Op.getValueType(), Op.getOperand(2), Op.getOperand(3), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_scvtf: |
| return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl, |
| Op.getValueType(), Op.getOperand(2), Op.getOperand(3), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_fcvtzu: |
| return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl, |
| Op.getValueType(), Op.getOperand(2), Op.getOperand(3), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_fcvtzs: |
| return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl, |
| Op.getValueType(), Op.getOperand(2), Op.getOperand(3), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_fsqrt: |
| return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frecpx: |
| return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frecpe_x: |
| return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frecps_x: |
| return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_frsqrte_x: |
| return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frsqrts_x: |
| return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_fabs: |
| return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_abs: |
| return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_neg: |
| return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_insr: { |
| SDValue Scalar = Op.getOperand(2); |
| EVT ScalarTy = Scalar.getValueType(); |
| if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) |
| Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar); |
| |
| return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(), |
| Op.getOperand(1), Scalar); |
| } |
| case Intrinsic::aarch64_sve_rbit: |
| return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl, |
| Op.getValueType(), Op.getOperand(2), Op.getOperand(3), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_revb: |
| return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_revh: |
| return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_revw: |
| return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_revd: |
| return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_sxtb: |
| return DAG.getNode( |
| AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), |
| DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_sxth: |
| return DAG.getNode( |
| AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), |
| DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_sxtw: |
| return DAG.getNode( |
| AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), |
| DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_uxtb: |
| return DAG.getNode( |
| AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), |
| DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_uxth: |
| return DAG.getNode( |
| AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), |
| DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_uxtw: |
| return DAG.getNode( |
| AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), |
| DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)), |
| Op.getOperand(1)); |
| case Intrinsic::localaddress: { |
| const auto &MF = DAG.getMachineFunction(); |
| const auto *RegInfo = Subtarget->getRegisterInfo(); |
| unsigned Reg = RegInfo->getLocalAddressRegister(MF); |
| return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, |
| Op.getSimpleValueType()); |
| } |
| |
| case Intrinsic::eh_recoverfp: { |
| // FIXME: This needs to be implemented to correctly handle highly aligned |
| // stack objects. For now we simply return the incoming FP. Refer to D53541 |
| // for more details. |
| SDValue FnOp = Op.getOperand(1); |
| SDValue IncomingFPOp = Op.getOperand(2); |
| GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp); |
| auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr); |
| if (!Fn) |
| report_fatal_error( |
| "llvm.eh.recoverfp must take a function as the first argument"); |
| return IncomingFPOp; |
| } |
| |
| case Intrinsic::aarch64_neon_vsri: |
| case Intrinsic::aarch64_neon_vsli: { |
| EVT Ty = Op.getValueType(); |
| |
| if (!Ty.isVector()) |
| report_fatal_error("Unexpected type for aarch64_neon_vsli"); |
| |
| assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits()); |
| |
| bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri; |
| unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; |
| return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2), |
| Op.getOperand(3)); |
| } |
| |
| case Intrinsic::aarch64_neon_srhadd: |
| case Intrinsic::aarch64_neon_urhadd: |
| case Intrinsic::aarch64_neon_shadd: |
| case Intrinsic::aarch64_neon_uhadd: { |
| bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || |
| IntNo == Intrinsic::aarch64_neon_shadd); |
| bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || |
| IntNo == Intrinsic::aarch64_neon_urhadd); |
| unsigned Opcode = IsSignedAdd |
| ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS) |
| : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU); |
| return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), |
| Op.getOperand(2)); |
| } |
| case Intrinsic::aarch64_neon_saddlp: |
| case Intrinsic::aarch64_neon_uaddlp: { |
| unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp |
| ? AArch64ISD::UADDLP |
| : AArch64ISD::SADDLP; |
| return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1)); |
| } |
| case Intrinsic::aarch64_neon_sdot: |
| case Intrinsic::aarch64_neon_udot: |
| case Intrinsic::aarch64_sve_sdot: |
| case Intrinsic::aarch64_sve_udot: { |
| unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot || |
| IntNo == Intrinsic::aarch64_sve_udot) |
| ? AArch64ISD::UDOT |
| : AArch64ISD::SDOT; |
| return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), |
| Op.getOperand(2), Op.getOperand(3)); |
| } |
| case Intrinsic::get_active_lane_mask: { |
| SDValue ID = |
| DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64); |
| return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID, |
| Op.getOperand(1), Op.getOperand(2)); |
| } |
| case Intrinsic::aarch64_neon_uaddlv: { |
| EVT OpVT = Op.getOperand(1).getValueType(); |
| EVT ResVT = Op.getValueType(); |
| if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 || |
| OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) { |
| // To avoid an insert_subvector, use v4i32 rather than v2i32. |
| SDValue UADDLV = |
| DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1)); |
| SDValue EXTRACT_VEC_ELT = |
| DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV, |
| DAG.getConstant(0, dl, MVT::i64)); |
| return EXTRACT_VEC_ELT; |
| } |
| return SDValue(); |
| } |
| } |
| } |
| |
| bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const { |
| if (VT.getVectorElementType() == MVT::i8 || |
| VT.getVectorElementType() == MVT::i16) { |
| EltTy = MVT::i32; |
| return true; |
| } |
| return false; |
| } |
| |
| bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend, |
| EVT DataVT) const { |
| const EVT IndexVT = Extend.getOperand(0).getValueType(); |
| // SVE only supports implicit extension of 32-bit indices. |
| if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32) |
| return false; |
| |
| // Indices cannot be smaller than the main data type. |
| if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits()) |
| return false; |
| |
| // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit |
| // element container type, which would violate the previous clause. |
| return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2; |
| } |
| |
| bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { |
| EVT ExtVT = ExtVal.getValueType(); |
| if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors()) |
| return false; |
| |
| // It may be worth creating extending masked loads if there are multiple |
| // masked loads using the same predicate. That way we'll end up creating |
| // extending masked loads that may then get split by the legaliser. This |
| // results in just one set of predicate unpacks at the start, instead of |
| // multiple sets of vector unpacks after each load. |
| if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) { |
| if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) { |
| // Disable extending masked loads for fixed-width for now, since the code |
| // quality doesn't look great. |
| if (!ExtVT.isScalableVector()) |
| return false; |
| |
| unsigned NumExtMaskedLoads = 0; |
| for (auto *U : Ld->getMask()->uses()) |
| if (isa<MaskedLoadSDNode>(U)) |
| NumExtMaskedLoads++; |
| |
| if (NumExtMaskedLoads <= 1) |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| |
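| // Map the (scaled, signed, extending) index description onto the matching |
| // SVE gather-load (GLD1*) opcode. |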
| unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { |
| std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = { |
| {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false), |
| AArch64ISD::GLD1_MERGE_ZERO}, |
| {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true), |
| AArch64ISD::GLD1_UXTW_MERGE_ZERO}, |
| {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false), |
| AArch64ISD::GLD1_MERGE_ZERO}, |
| {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true), |
| AArch64ISD::GLD1_SXTW_MERGE_ZERO}, |
| {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false), |
| AArch64ISD::GLD1_SCALED_MERGE_ZERO}, |
| {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true), |
| AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO}, |
| {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false), |
| AArch64ISD::GLD1_SCALED_MERGE_ZERO}, |
| {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true), |
| AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO}, |
| }; |
| auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend); |
| return AddrModes.find(Key)->second; |
| } |
| |
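| // Convert a zero/any-extending gather-load opcode into its sign-extending |
| // (GLD1S*) equivalent. |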
| unsigned getSignExtendedGatherOpcode(unsigned Opcode) { |
| switch (Opcode) { |
| default: |
| llvm_unreachable("unimplemented opcode"); |
| return Opcode; |
| case AArch64ISD::GLD1_MERGE_ZERO: |
| return AArch64ISD::GLD1S_MERGE_ZERO; |
| case AArch64ISD::GLD1_IMM_MERGE_ZERO: |
| return AArch64ISD::GLD1S_IMM_MERGE_ZERO; |
| case AArch64ISD::GLD1_UXTW_MERGE_ZERO: |
| return AArch64ISD::GLD1S_UXTW_MERGE_ZERO; |
| case AArch64ISD::GLD1_SXTW_MERGE_ZERO: |
| return AArch64ISD::GLD1S_SXTW_MERGE_ZERO; |
| case AArch64ISD::GLD1_SCALED_MERGE_ZERO: |
| return AArch64ISD::GLD1S_SCALED_MERGE_ZERO; |
| case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: |
| return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO; |
| case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: |
| return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO; |
| } |
| } |
| |
| SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, |
| SelectionDAG &DAG) const { |
| MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op); |
| |
| SDLoc DL(Op); |
| SDValue Chain = MGT->getChain(); |
| SDValue PassThru = MGT->getPassThru(); |
| SDValue Mask = MGT->getMask(); |
| SDValue BasePtr = MGT->getBasePtr(); |
| SDValue Index = MGT->getIndex(); |
| SDValue Scale = MGT->getScale(); |
| EVT VT = Op.getValueType(); |
| EVT MemVT = MGT->getMemoryVT(); |
| ISD::LoadExtType ExtType = MGT->getExtensionType(); |
| ISD::MemIndexType IndexType = MGT->getIndexType(); |
| |
| // SVE supports zero (and so undef) passthrough values only; everything else |
| // must be handled manually by an explicit select on the load's output. |
| if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) { |
| SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale}; |
| SDValue Load = |
| DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops, |
| MGT->getMemOperand(), IndexType, ExtType); |
| SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru); |
| return DAG.getMergeValues({Select, Load.getValue(1)}, DL); |
| } |
| |
| bool IsScaled = MGT->isIndexScaled(); |
| bool IsSigned = MGT->isIndexSigned(); |
| |
| // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else |
| // must be calculated beforehand. |
| uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue(); |
| if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) { |
| assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types"); |
| EVT IndexVT = Index.getValueType(); |
| Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index, |
| DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT)); |
| Scale = DAG.getTargetConstant(1, DL, Scale.getValueType()); |
| |
| SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; |
| return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops, |
| MGT->getMemOperand(), IndexType, ExtType); |
| } |
| |
| // Lower fixed length gather to a scalable equivalent. |
| if (VT.isFixedLengthVector()) { |
| assert(Subtarget->useSVEForFixedLengthVectors() && |
| "Cannot lower when not using SVE for fixed vectors!"); |
| |
| // NOTE: Handle floating-point as if integer then bitcast the result. |
| EVT DataVT = VT.changeVectorElementTypeToInteger(); |
| MemVT = MemVT.changeVectorElementTypeToInteger(); |
| |
| // Find the smallest integer fixed length vector we can use for the gather. |
| EVT PromotedVT = VT.changeVectorElementType(MVT::i32); |
| if (DataVT.getVectorElementType() == MVT::i64 || |
| Index.getValueType().getVectorElementType() == MVT::i64 || |
| Mask.getValueType().getVectorElementType() == MVT::i64) |
| PromotedVT = VT.changeVectorElementType(MVT::i64); |
| |
| // Promote vector operands except for passthrough, which we know is either |
| // undef or zero, and thus best constructed directly. |
| unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
| Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index); |
| Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask); |
| |
| // A promoted result type forces the need for an extending load. |
| if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD) |
| ExtType = ISD::EXTLOAD; |
| |
| EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT); |
| |
| // Convert fixed length vector operands to scalable. |
| MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType()); |
| Index = convertToScalableVector(DAG, ContainerVT, Index); |
| Mask = convertFixedMaskToScalableVector(Mask, DAG); |
| PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT) |
| : DAG.getConstant(0, DL, ContainerVT); |
| |
| // Emit equivalent scalable vector gather. |
| SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; |
| SDValue Load = |
| DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL, |
| Ops, MGT->getMemOperand(), IndexType, ExtType); |
| |
| // Extract fixed length data then convert to the required result type. |
| SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load); |
| Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result); |
| if (VT.isFloatingPoint()) |
| Result = DAG.getNode(ISD::BITCAST, DL, VT, Result); |
| |
| return DAG.getMergeValues({Result, Load.getValue(1)}, DL); |
| } |
| |
| // Everything else is legal. |
| return Op; |
| } |
| |
| SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, |
| SelectionDAG &DAG) const { |
| MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op); |
| |
| SDLoc DL(Op); |
| SDValue Chain = MSC->getChain(); |
| SDValue StoreVal = MSC->getValue(); |
| SDValue Mask = MSC->getMask(); |
| SDValue BasePtr = MSC->getBasePtr(); |
| SDValue Index = MSC->getIndex(); |
| SDValue Scale = MSC->getScale(); |
| EVT VT = StoreVal.getValueType(); |
| EVT MemVT = MSC->getMemoryVT(); |
| ISD::MemIndexType IndexType = MSC->getIndexType(); |
| bool Truncating = MSC->isTruncatingStore(); |
| |
| bool IsScaled = MSC->isIndexScaled(); |
| bool IsSigned = MSC->isIndexSigned(); |
| |
| // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else |
| // must be calculated beforehand. |
| uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue(); |
| if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) { |
| assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types"); |
| EVT IndexVT = Index.getValueType(); |
| Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index, |
| DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT)); |
| Scale = DAG.getTargetConstant(1, DL, Scale.getValueType()); |
| |
| SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; |
| return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops, |
| MSC->getMemOperand(), IndexType, Truncating); |
| } |
| |
| // Lower fixed length scatter to a scalable equivalent. |
| if (VT.isFixedLengthVector()) { |
| assert(Subtarget->useSVEForFixedLengthVectors() && |
| "Cannot lower when not using SVE for fixed vectors!"); |
| |
| // Once bitcast we treat floating-point scatters as if integer. |
| if (VT.isFloatingPoint()) { |
| VT = VT.changeVectorElementTypeToInteger(); |
| MemVT = MemVT.changeVectorElementTypeToInteger(); |
| StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal); |
| } |
| |
| // Find the smallest integer fixed length vector we can use for the scatter. |
| EVT PromotedVT = VT.changeVectorElementType(MVT::i32); |
| if (VT.getVectorElementType() == MVT::i64 || |
| Index.getValueType().getVectorElementType() == MVT::i64 || |
| Mask.getValueType().getVectorElementType() == MVT::i64) |
| PromotedVT = VT.changeVectorElementType(MVT::i64); |
| |
| // Promote vector operands. |
| unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
| Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index); |
| Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask); |
| StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal); |
| |
| // A promoted value type forces the need for a truncating store. |
| if (PromotedVT != VT) |
| Truncating = true; |
| |
| EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT); |
| |
| // Convert fixed length vector operands to scalable. |
| MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType()); |
| Index = convertToScalableVector(DAG, ContainerVT, Index); |
| Mask = convertFixedMaskToScalableVector(Mask, DAG); |
| StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal); |
| |
| // Emit equivalent scalable vector scatter. |
| SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; |
| return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops, |
| MSC->getMemOperand(), IndexType, Truncating); |
| } |
| |
| // Everything else is legal. |
| return Op; |
| } |
| |
| SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op); |
| assert(LoadNode && "Expected custom lowering of a masked load node"); |
| EVT VT = Op->getValueType(0); |
| |
| if (useSVEForFixedLengthVectorVT( |
| VT, |
| /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) |
| return LowerFixedLengthVectorMLoadToSVE(Op, DAG); |
| |
| SDValue PassThru = LoadNode->getPassThru(); |
| SDValue Mask = LoadNode->getMask(); |
| |
| if (PassThru->isUndef() || isZerosVector(PassThru.getNode())) |
| return Op; |
| |
| SDValue Load = DAG.getMaskedLoad( |
| VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(), |
| LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(), |
| LoadNode->getMemOperand(), LoadNode->getAddressingMode(), |
| LoadNode->getExtensionType()); |
| |
| SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru); |
| |
| return DAG.getMergeValues({Result, Load.getValue(1)}, DL); |
| } |
| |
| // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16. |
| static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, |
| EVT VT, EVT MemVT, |
| SelectionDAG &DAG) { |
| assert(VT.isVector() && "VT should be a vector type"); |
| assert(MemVT == MVT::v4i8 && VT == MVT::v4i16); |
| |
| SDValue Value = ST->getValue(); |
| |
| // First extend the promoted v4i16 to v8i16, truncate to v8i8, and extract |
| // the word lane which represents the v4i8 subvector. This optimizes the |
| // store to: |
| // |
| // xtn v0.8b, v0.8h |
| // str s0, [x0] |
| |
| SDValue Undef = DAG.getUNDEF(MVT::i16); |
| SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL, |
| {Undef, Undef, Undef, Undef}); |
| |
| SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, |
| Value, UndefVec); |
| SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt); |
| |
| Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc); |
| SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, |
| Trunc, DAG.getConstant(0, DL, MVT::i64)); |
| |
| return DAG.getStore(ST->getChain(), DL, ExtractTrunc, |
| ST->getBasePtr(), ST->getMemOperand()); |
| } |
| |
| // Custom lowering for any store, vector or scalar, plain or truncating. |
| // Currently this handles truncating stores from v4i16 to v4i8, volatile |
| // i128 stores, i64x8 (LS64) stores, 256-bit non-temporal stores and |
| // fixed-length vector stores lowered via SVE. |
| SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc Dl(Op); |
| StoreSDNode *StoreNode = cast<StoreSDNode>(Op); |
| assert(StoreNode && "Can only custom lower store nodes"); |
| |
| SDValue Value = StoreNode->getValue(); |
| |
| EVT VT = Value.getValueType(); |
| EVT MemVT = StoreNode->getMemoryVT(); |
| |
| if (VT.isVector()) { |
| if (useSVEForFixedLengthVectorVT( |
| VT, |
| /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) |
| return LowerFixedLengthVectorStoreToSVE(Op, DAG); |
| |
| unsigned AS = StoreNode->getAddressSpace(); |
| Align Alignment = StoreNode->getAlign(); |
| if (Alignment < MemVT.getStoreSize() && |
| !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment, |
| StoreNode->getMemOperand()->getFlags(), |
| nullptr)) { |
| return scalarizeVectorStore(StoreNode, DAG); |
| } |
| |
| if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 && |
| MemVT == MVT::v4i8) { |
| return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG); |
| } |
| // 256-bit non-temporal stores can be lowered to STNP. Do this as part of |
| // the custom lowering, as there are no un-paired non-temporal stores and |
| // legalization will break up 256-bit inputs. |
| ElementCount EC = MemVT.getVectorElementCount(); |
| if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u && |
| EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() && |
| (MemVT.getScalarSizeInBits() == 8u || |
| MemVT.getScalarSizeInBits() == 16u || |
| MemVT.getScalarSizeInBits() == 32u || |
| MemVT.getScalarSizeInBits() == 64u)) { |
| SDValue Lo = |
| DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, |
| MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), |
| StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64)); |
| SDValue Hi = |
| DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, |
| MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), |
| StoreNode->getValue(), |
| DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64)); |
| SDValue Result = DAG.getMemIntrinsicNode( |
| AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other), |
| {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, |
| StoreNode->getMemoryVT(), StoreNode->getMemOperand()); |
| return Result; |
| } |
| } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) { |
| return LowerStore128(Op, DAG); |
| } else if (MemVT == MVT::i64x8) { |
| SDValue Value = StoreNode->getValue(); |
| assert(Value->getValueType(0) == MVT::i64x8); |
| SDValue Chain = StoreNode->getChain(); |
| SDValue Base = StoreNode->getBasePtr(); |
| EVT PtrVT = Base.getValueType(); |
| for (unsigned i = 0; i < 8; i++) { |
| SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64, |
| Value, DAG.getConstant(i, Dl, MVT::i32)); |
| SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base, |
| DAG.getConstant(i * 8, Dl, PtrVT)); |
| Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(), |
| StoreNode->getOriginalAlign()); |
| } |
| return Chain; |
| } |
| |
| return SDValue(); |
| } |
| |
| /// Lower atomic or volatile 128-bit stores to a single STP instruction. |
| SDValue AArch64TargetLowering::LowerStore128(SDValue Op, |
| SelectionDAG &DAG) const { |
| MemSDNode *StoreNode = cast<MemSDNode>(Op); |
| assert(StoreNode->getMemoryVT() == MVT::i128); |
| assert(StoreNode->isVolatile() || StoreNode->isAtomic()); |
| |
| bool IsStoreRelease = |
| StoreNode->getMergedOrdering() == AtomicOrdering::Release; |
| if (StoreNode->isAtomic()) |
| assert((Subtarget->hasFeature(AArch64::FeatureLSE2) && |
| Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) || |
| StoreNode->getMergedOrdering() == AtomicOrdering::Unordered || |
| StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic); |
| |
| SDValue Value = (StoreNode->getOpcode() == ISD::STORE || |
| StoreNode->getOpcode() == ISD::ATOMIC_STORE) |
| ? StoreNode->getOperand(1) |
| : StoreNode->getOperand(2); |
| SDLoc DL(Op); |
| auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64); |
| unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP; |
| if (DAG.getDataLayout().isBigEndian()) |
| std::swap(StoreValue.first, StoreValue.second); |
| SDValue Result = DAG.getMemIntrinsicNode( |
| Opcode, DL, DAG.getVTList(MVT::Other), |
| {StoreNode->getChain(), StoreValue.first, StoreValue.second, |
| StoreNode->getBasePtr()}, |
| StoreNode->getMemoryVT(), StoreNode->getMemOperand()); |
| return Result; |
| } |
| |
| SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| LoadSDNode *LoadNode = cast<LoadSDNode>(Op); |
| assert(LoadNode && "Expected custom lowering of a load node"); |
| |
| if (LoadNode->getMemoryVT() == MVT::i64x8) { |
| SmallVector<SDValue, 8> Ops; |
| SDValue Base = LoadNode->getBasePtr(); |
| SDValue Chain = LoadNode->getChain(); |
| EVT PtrVT = Base.getValueType(); |
| for (unsigned i = 0; i < 8; i++) { |
| SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base, |
| DAG.getConstant(i * 8, DL, PtrVT)); |
| SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr, |
| LoadNode->getPointerInfo(), |
| LoadNode->getOriginalAlign()); |
| Ops.push_back(Part); |
| Chain = SDValue(Part.getNode(), 1); |
| } |
| SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops); |
| return DAG.getMergeValues({Loaded, Chain}, DL); |
| } |
| |
| // Custom lowering for extending v4i8 vector loads. |
| EVT VT = Op->getValueType(0); |
| assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32"); |
| |
| if (LoadNode->getMemoryVT() != MVT::v4i8) |
| return SDValue(); |
| |
| unsigned ExtType; |
| if (LoadNode->getExtensionType() == ISD::SEXTLOAD) |
| ExtType = ISD::SIGN_EXTEND; |
| else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD || |
| LoadNode->getExtensionType() == ISD::EXTLOAD) |
| ExtType = ISD::ZERO_EXTEND; |
| else |
| return SDValue(); |
| |
| SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(), |
| LoadNode->getBasePtr(), MachinePointerInfo()); |
| SDValue Chain = Load.getValue(1); |
| SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load); |
| SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec); |
| SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC); |
| Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext, |
| DAG.getConstant(0, DL, MVT::i64)); |
| if (VT == MVT::v4i32) |
| Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext); |
| return DAG.getMergeValues({Ext, Chain}, DL); |
| } |
| |
| // Generate SUBS and CSEL for integer abs. |
| SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { |
| MVT VT = Op.getSimpleValueType(); |
| |
| if (VT.isVector()) |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU); |
| |
| SDLoc DL(Op); |
| SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), |
| Op.getOperand(0)); |
| // Generate SUBS & CSEL. |
| SDValue Cmp = |
| DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), |
| Op.getOperand(0), DAG.getConstant(0, DL, VT)); |
| return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg, |
| DAG.getConstant(AArch64CC::PL, DL, MVT::i32), |
| Cmp.getValue(1)); |
| } |
| |
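| // Lower BRCOND by emitting the condition as a conjunction of compares |
| // (a CCMP chain) feeding an AArch64ISD::BRCOND. If the condition cannot |
| // be emitted as a conjunction, fall back to the default expansion. |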
| static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) { |
| SDValue Chain = Op.getOperand(0); |
| SDValue Cond = Op.getOperand(1); |
| SDValue Dest = Op.getOperand(2); |
| |
| AArch64CC::CondCode CC; |
| if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) { |
| SDLoc dl(Op); |
| SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32); |
| return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, |
| Cmp); |
| } |
| |
| return SDValue(); |
| } |
| |
| // Treat FSHR with constant shifts as a legal operation; otherwise it is |
| // expanded. FSHL is converted to FSHR before deciding what to do with it. |
| static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) { |
| SDValue Shifts = Op.getOperand(2); |
| // Check if the shift amount is a constant. If the opcode is FSHL, convert |
| // it to FSHR. |
| if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) { |
| SDLoc DL(Op); |
| MVT VT = Op.getSimpleValueType(); |
| |
| if (Op.getOpcode() == ISD::FSHL) { |
| unsigned int NewShiftNo = |
| VT.getFixedSizeInBits() - ShiftNo->getZExtValue(); |
| return DAG.getNode( |
| ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1), |
| DAG.getConstant(NewShiftNo, DL, Shifts.getValueType())); |
| } else if (Op.getOpcode() == ISD::FSHR) { |
| return Op; |
| } |
| } |
| |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::LowerOperation(SDValue Op, |
| SelectionDAG &DAG) const { |
| LLVM_DEBUG(dbgs() << "Custom lowering: "); |
| LLVM_DEBUG(Op.dump()); |
| |
| switch (Op.getOpcode()) { |
| default: |
| llvm_unreachable("unimplemented operand"); |
| return SDValue(); |
| case ISD::BITCAST: |
| return LowerBITCAST(Op, DAG); |
| case ISD::GlobalAddress: |
| return LowerGlobalAddress(Op, DAG); |
| case ISD::GlobalTLSAddress: |
| return LowerGlobalTLSAddress(Op, DAG); |
| case ISD::SETCC: |
| case ISD::STRICT_FSETCC: |
| case ISD::STRICT_FSETCCS: |
| return LowerSETCC(Op, DAG); |
| case ISD::SETCCCARRY: |
| return LowerSETCCCARRY(Op, DAG); |
| case ISD::BRCOND: |
| return LowerBRCOND(Op, DAG); |
| case ISD::BR_CC: |
| return LowerBR_CC(Op, DAG); |
| case ISD::SELECT: |
| return LowerSELECT(Op, DAG); |
| case ISD::SELECT_CC: |
| return LowerSELECT_CC(Op, DAG); |
| case ISD::JumpTable: |
| return LowerJumpTable(Op, DAG); |
| case ISD::BR_JT: |
| return LowerBR_JT(Op, DAG); |
| case ISD::ConstantPool: |
| return LowerConstantPool(Op, DAG); |
| case ISD::BlockAddress: |
| return LowerBlockAddress(Op, DAG); |
| case ISD::VASTART: |
| return LowerVASTART(Op, DAG); |
| case ISD::VACOPY: |
| return LowerVACOPY(Op, DAG); |
| case ISD::VAARG: |
| return LowerVAARG(Op, DAG); |
| case ISD::UADDO_CARRY: |
| return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/); |
| case ISD::USUBO_CARRY: |
| return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/); |
| case ISD::SADDO_CARRY: |
| return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/); |
| case ISD::SSUBO_CARRY: |
| return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/); |
| case ISD::SADDO: |
| case ISD::UADDO: |
| case ISD::SSUBO: |
| case ISD::USUBO: |
| case ISD::SMULO: |
| case ISD::UMULO: |
| return LowerXALUO(Op, DAG); |
| case ISD::FADD: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED); |
| case ISD::FSUB: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED); |
| case ISD::FMUL: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED); |
| case ISD::FMA: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED); |
| case ISD::FDIV: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED); |
| case ISD::FNEG: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU); |
| case ISD::FCEIL: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU); |
| case ISD::FFLOOR: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU); |
| case ISD::FNEARBYINT: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU); |
| case ISD::FRINT: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU); |
| case ISD::FROUND: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU); |
| case ISD::FROUNDEVEN: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU); |
| case ISD::FTRUNC: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU); |
| case ISD::FSQRT: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU); |
| case ISD::FABS: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU); |
| case ISD::FP_ROUND: |
| case ISD::STRICT_FP_ROUND: |
| return LowerFP_ROUND(Op, DAG); |
| case ISD::FP_EXTEND: |
| return LowerFP_EXTEND(Op, DAG); |
| case ISD::FRAMEADDR: |
| return LowerFRAMEADDR(Op, DAG); |
| case ISD::SPONENTRY: |
| return LowerSPONENTRY(Op, DAG); |
| case ISD::RETURNADDR: |
| return LowerRETURNADDR(Op, DAG); |
| case ISD::ADDROFRETURNADDR: |
| return LowerADDROFRETURNADDR(Op, DAG); |
| case ISD::CONCAT_VECTORS: |
| return LowerCONCAT_VECTORS(Op, DAG); |
| case ISD::INSERT_VECTOR_ELT: |
| return LowerINSERT_VECTOR_ELT(Op, DAG); |
| case ISD::EXTRACT_VECTOR_ELT: |
| return LowerEXTRACT_VECTOR_ELT(Op, DAG); |
| case ISD::BUILD_VECTOR: |
| return LowerBUILD_VECTOR(Op, DAG); |
| case ISD::ZERO_EXTEND_VECTOR_INREG: |
| return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG); |
| case ISD::VECTOR_SHUFFLE: |
| return LowerVECTOR_SHUFFLE(Op, DAG); |
| case ISD::SPLAT_VECTOR: |
| return LowerSPLAT_VECTOR(Op, DAG); |
| case ISD::EXTRACT_SUBVECTOR: |
| return LowerEXTRACT_SUBVECTOR(Op, DAG); |
| case ISD::INSERT_SUBVECTOR: |
| return LowerINSERT_SUBVECTOR(Op, DAG); |
| case ISD::SDIV: |
| case ISD::UDIV: |
| return LowerDIV(Op, DAG); |
| case ISD::SMIN: |
| case ISD::UMIN: |
| case ISD::SMAX: |
| case ISD::UMAX: |
| return LowerMinMax(Op, DAG); |
| case ISD::SRA: |
| case ISD::SRL: |
| case ISD::SHL: |
| return LowerVectorSRA_SRL_SHL(Op, DAG); |
| case ISD::SHL_PARTS: |
| case ISD::SRL_PARTS: |
| case ISD::SRA_PARTS: |
| return LowerShiftParts(Op, DAG); |
| case ISD::CTPOP: |
| case ISD::PARITY: |
| return LowerCTPOP_PARITY(Op, DAG); |
| case ISD::FCOPYSIGN: |
| return LowerFCOPYSIGN(Op, DAG); |
| case ISD::OR: |
| return LowerVectorOR(Op, DAG); |
| case ISD::XOR: |
| return LowerXOR(Op, DAG); |
| case ISD::PREFETCH: |
| return LowerPREFETCH(Op, DAG); |
| case ISD::SINT_TO_FP: |
| case ISD::UINT_TO_FP: |
| case ISD::STRICT_SINT_TO_FP: |
| case ISD::STRICT_UINT_TO_FP: |
| return LowerINT_TO_FP(Op, DAG); |
| case ISD::FP_TO_SINT: |
| case ISD::FP_TO_UINT: |
| case ISD::STRICT_FP_TO_SINT: |
| case ISD::STRICT_FP_TO_UINT: |
| return LowerFP_TO_INT(Op, DAG); |
| case ISD::FP_TO_SINT_SAT: |
| case ISD::FP_TO_UINT_SAT: |
| return LowerFP_TO_INT_SAT(Op, DAG); |
| case ISD::FSINCOS: |
| return LowerFSINCOS(Op, DAG); |
| case ISD::GET_ROUNDING: |
| return LowerGET_ROUNDING(Op, DAG); |
| case ISD::SET_ROUNDING: |
| return LowerSET_ROUNDING(Op, DAG); |
| case ISD::MUL: |
| return LowerMUL(Op, DAG); |
| case ISD::MULHS: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED); |
| case ISD::MULHU: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED); |
| case ISD::INTRINSIC_W_CHAIN: |
| return LowerINTRINSIC_W_CHAIN(Op, DAG); |
| case ISD::INTRINSIC_WO_CHAIN: |
| return LowerINTRINSIC_WO_CHAIN(Op, DAG); |
| case ISD::INTRINSIC_VOID: |
| return LowerINTRINSIC_VOID(Op, DAG); |
| case ISD::ATOMIC_STORE: |
| if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) { |
| assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3()); |
| return LowerStore128(Op, DAG); |
| } |
| return SDValue(); |
| case ISD::STORE: |
| return LowerSTORE(Op, DAG); |
| case ISD::MSTORE: |
| return LowerFixedLengthVectorMStoreToSVE(Op, DAG); |
| case ISD::MGATHER: |
| return LowerMGATHER(Op, DAG); |
| case ISD::MSCATTER: |
| return LowerMSCATTER(Op, DAG); |
| case ISD::VECREDUCE_SEQ_FADD: |
| return LowerVECREDUCE_SEQ_FADD(Op, DAG); |
| case ISD::VECREDUCE_ADD: |
| case ISD::VECREDUCE_AND: |
| case ISD::VECREDUCE_OR: |
| case ISD::VECREDUCE_XOR: |
| case ISD::VECREDUCE_SMAX: |
| case ISD::VECREDUCE_SMIN: |
| case ISD::VECREDUCE_UMAX: |
| case ISD::VECREDUCE_UMIN: |
| case ISD::VECREDUCE_FADD: |
| case ISD::VECREDUCE_FMAX: |
| case ISD::VECREDUCE_FMIN: |
| case ISD::VECREDUCE_FMAXIMUM: |
| case ISD::VECREDUCE_FMINIMUM: |
| return LowerVECREDUCE(Op, DAG); |
| case ISD::ATOMIC_LOAD_AND: |
| return LowerATOMIC_LOAD_AND(Op, DAG); |
| case ISD::DYNAMIC_STACKALLOC: |
| return LowerDYNAMIC_STACKALLOC(Op, DAG); |
| case ISD::VSCALE: |
| return LowerVSCALE(Op, DAG); |
| case ISD::ANY_EXTEND: |
| case ISD::SIGN_EXTEND: |
| case ISD::ZERO_EXTEND: |
| return LowerFixedLengthVectorIntExtendToSVE(Op, DAG); |
| case ISD::SIGN_EXTEND_INREG: { |
| // Only custom lower when ExtraVT has a legal byte based element type. |
| EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); |
| EVT ExtraEltVT = ExtraVT.getVectorElementType(); |
| if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) && |
| (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64)) |
| return SDValue(); |
| |
| return LowerToPredicatedOp(Op, DAG, |
| AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU); |
| } |
| case ISD::TRUNCATE: |
| return LowerTRUNCATE(Op, DAG); |
| case ISD::MLOAD: |
| return LowerMLOAD(Op, DAG); |
| case ISD::LOAD: |
| if (useSVEForFixedLengthVectorVT(Op.getValueType(), |
| !Subtarget->isNeonAvailable())) |
| return LowerFixedLengthVectorLoadToSVE(Op, DAG); |
| return LowerLOAD(Op, DAG); |
| case ISD::ADD: |
| case ISD::AND: |
| case ISD::SUB: |
| return LowerToScalableOp(Op, DAG); |
| case ISD::FMAXIMUM: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED); |
| case ISD::FMAXNUM: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED); |
| case ISD::FMINIMUM: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED); |
| case ISD::FMINNUM: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED); |
| case ISD::VSELECT: |
| return LowerFixedLengthVectorSelectToSVE(Op, DAG); |
| case ISD::ABS: |
| return LowerABS(Op, DAG); |
| case ISD::ABDS: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED); |
| case ISD::ABDU: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED); |
| case ISD::AVGFLOORS: |
| return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED); |
| case ISD::AVGFLOORU: |
| return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED); |
| case ISD::AVGCEILS: |
| return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED); |
| case ISD::AVGCEILU: |
| return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED); |
| case ISD::BITREVERSE: |
| return LowerBitreverse(Op, DAG); |
| case ISD::BSWAP: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU); |
| case ISD::CTLZ: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU); |
| case ISD::CTTZ: |
| return LowerCTTZ(Op, DAG); |
| case ISD::VECTOR_SPLICE: |
| return LowerVECTOR_SPLICE(Op, DAG); |
| case ISD::VECTOR_DEINTERLEAVE: |
| return LowerVECTOR_DEINTERLEAVE(Op, DAG); |
| case ISD::VECTOR_INTERLEAVE: |
| return LowerVECTOR_INTERLEAVE(Op, DAG); |
| case ISD::LROUND: |
| case ISD::LLROUND: |
| case ISD::LRINT: |
| case ISD::LLRINT: { |
| assert(Op.getOperand(0).getValueType() == MVT::f16 && |
| "Expected custom lowering of rounding operations only for f16"); |
| SDLoc DL(Op); |
| SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0)); |
| return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext); |
| } |
| case ISD::STRICT_LROUND: |
| case ISD::STRICT_LLROUND: |
| case ISD::STRICT_LRINT: |
| case ISD::STRICT_LLRINT: { |
| assert(Op.getOperand(1).getValueType() == MVT::f16 && |
| "Expected custom lowering of rounding operations only for f16"); |
| SDLoc DL(Op); |
| SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other}, |
| {Op.getOperand(0), Op.getOperand(1)}); |
| return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other}, |
| {Ext.getValue(1), Ext.getValue(0)}); |
| } |
| case ISD::WRITE_REGISTER: { |
| assert(Op.getOperand(2).getValueType() == MVT::i128 && |
| "WRITE_REGISTER custom lowering is only for 128-bit sysregs"); |
| SDLoc DL(Op); |
| |
| SDValue Chain = Op.getOperand(0); |
| SDValue SysRegName = Op.getOperand(1); |
| std::pair<SDValue, SDValue> Pair = |
| DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64); |
| |
| // chain = MSRR(chain, sysregname, lo, hi) |
| SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain, |
| SysRegName, Pair.first, Pair.second); |
| |
| return Result; |
| } |
| case ISD::FSHL: |
| case ISD::FSHR: |
| return LowerFunnelShift(Op, DAG); |
| } |
| } |
| |
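| // Store merging after legalization is only enabled when SVE is not being |
| // used for fixed-length vectors. |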
| bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const { |
| return !Subtarget->useSVEForFixedLengthVectors(); |
| } |
| |
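| // Returns true if the given fixed-length vector type should be lowered |
| // using SVE: the element type must be supported, the vector must be wider |
| // than NEON (unless OverrideNEON is set), fit within the minimum SVE |
| // vector length, and have a power-of-two element count. |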
| bool AArch64TargetLowering::useSVEForFixedLengthVectorVT( |
| EVT VT, bool OverrideNEON) const { |
| if (!VT.isFixedLengthVector() || !VT.isSimple()) |
| return false; |
| |
| // Don't use SVE for vectors we cannot scalarize if required. |
| switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { |
| // Fixed length predicates should be promoted to i8. |
| // NOTE: This is consistent with how NEON (and thus 64/128-bit vectors) work. |
| case MVT::i1: |
| default: |
| return false; |
| case MVT::i8: |
| case MVT::i16: |
| case MVT::i32: |
| case MVT::i64: |
| case MVT::f16: |
| case MVT::f32: |
| case MVT::f64: |
| break; |
| } |
| |
| // NEON-sized vectors can be emulated using SVE instructions. |
| if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector())) |
| return Subtarget->hasSVEorSME(); |
| |
| // Ensure NEON MVTs only belong to a single register class. |
| if (VT.getFixedSizeInBits() <= 128) |
| return false; |
| |
| // Ensure wider than NEON code generation is enabled. |
| if (!Subtarget->useSVEForFixedLengthVectors()) |
| return false; |
| |
| // Don't use SVE for types that don't fit. |
| if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits()) |
| return false; |
| |
| // TODO: Perhaps an artificial restriction, but worth having whilst getting |
| // the base fixed length SVE support in place. |
| if (!VT.isPow2VectorType()) |
| return false; |
| |
| return true; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Calling Convention Implementation |
| //===----------------------------------------------------------------------===// |
| |
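| /// Returns the intrinsic ID of an INTRINSIC_WO_CHAIN node, or |
| /// Intrinsic::not_intrinsic if the node is not a recognised intrinsic. |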
| static unsigned getIntrinsicID(const SDNode *N) { |
| unsigned Opcode = N->getOpcode(); |
| switch (Opcode) { |
| default: |
| return Intrinsic::not_intrinsic; |
| case ISD::INTRINSIC_WO_CHAIN: { |
| unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); |
| if (IID < Intrinsic::num_intrinsics) |
| return IID; |
| return Intrinsic::not_intrinsic; |
| } |
| } |
| } |
| |
| bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, |
| SDValue N1) const { |
| if (!N0.hasOneUse()) |
| return false; |
| |
| unsigned IID = getIntrinsicID(N1.getNode()); |
| // Avoid reassociating expressions that can be lowered to smlal/umlal. |
| if (IID == Intrinsic::aarch64_neon_umull || |
| N1.getOpcode() == AArch64ISD::UMULL || |
| IID == Intrinsic::aarch64_neon_smull || |
| N1.getOpcode() == AArch64ISD::SMULL) |
| return N0.getOpcode() != ISD::ADD; |
| |
| return true; |
| } |
| |
| /// Selects the correct CCAssignFn for a given CallingConvention value. |
| CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, |
| bool IsVarArg) const { |
| switch (CC) { |
| default: |
| report_fatal_error("Unsupported calling convention."); |
| case CallingConv::WebKit_JS: |
| return CC_AArch64_WebKit_JS; |
| case CallingConv::GHC: |
| return CC_AArch64_GHC; |
| case CallingConv::C: |
| case CallingConv::Fast: |
| case CallingConv::PreserveMost: |
| case CallingConv::PreserveAll: |
| case CallingConv::CXX_FAST_TLS: |
| case CallingConv::Swift: |
| case CallingConv::SwiftTail: |
| case CallingConv::Tail: |
| if (Subtarget->isTargetWindows() && IsVarArg) { |
| if (Subtarget->isWindowsArm64EC()) |
| return CC_AArch64_Arm64EC_VarArg; |
| return CC_AArch64_Win64_VarArg; |
| } |
| if (!Subtarget->isTargetDarwin()) |
| return CC_AArch64_AAPCS; |
| if (!IsVarArg) |
| return CC_AArch64_DarwinPCS; |
| return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg |
| : CC_AArch64_DarwinPCS_VarArg; |
| case CallingConv::Win64: |
| if (IsVarArg) { |
| if (Subtarget->isWindowsArm64EC()) |
| return CC_AArch64_Arm64EC_VarArg; |
| return CC_AArch64_Win64_VarArg; |
| } |
| return CC_AArch64_AAPCS; |
| case CallingConv::CFGuard_Check: |
| return CC_AArch64_Win64_CFGuard_Check; |
| case CallingConv::AArch64_VectorCall: |
| case CallingConv::AArch64_SVE_VectorCall: |
| case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0: |
| case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2: |
| return CC_AArch64_AAPCS; |
| } |
| } |
| |
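| /// Selects the correct CCAssignFn for return values of a given |
| /// CallingConvention value. |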
| CCAssignFn * |
| AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const { |
| return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS |
| : RetCC_AArch64_AAPCS; |
| } |
| |
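| // Allocate the SME lazy-save buffer (SVL.B * SVL.B bytes, the worst case) |
| // plus a 16-byte TPIDR2 block on the stack, store the buffer pointer into |
| // the TPIDR2 block, and return the frame index of the TPIDR2 object. |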
| unsigned |
| AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL, |
| SelectionDAG &DAG) const { |
| MachineFunction &MF = DAG.getMachineFunction(); |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| |
| // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case) |
| SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, |
| DAG.getConstant(1, DL, MVT::i32)); |
| SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N); |
| SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)}; |
| SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other); |
| SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops); |
| Chain = Buffer.getValue(1); |
| MFI.CreateVariableSizedObject(Align(1), nullptr); |
| |
| // Allocate an additional TPIDR2 object on the stack (16 bytes) |
| unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false); |
| |
| // Store the buffer pointer to the TPIDR2 stack object. |
| MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); |
| SDValue Ptr = DAG.getFrameIndex( |
| TPIDR2Obj, |
| DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); |
| Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI); |
| |
| return TPIDR2Obj; |
| } |
| |
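| // Lower the incoming (formal) arguments: assign locations with the |
| // calling-convention-specific CCAssignFn, materialise register and stack |
| // arguments (including indirectly passed SVE tuples), insert SMSTART for |
| // locally streaming functions, set up the varargs save areas, record Win64 |
| // sret pointers and, for functions with ZA state, the SME lazy-save buffer. |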
| SDValue AArch64TargetLowering::LowerFormalArguments( |
| SDValue Chain, CallingConv::ID CallConv, bool isVarArg, |
| const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, |
| SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { |
| MachineFunction &MF = DAG.getMachineFunction(); |
| const Function &F = MF.getFunction(); |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv()); |
| AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
| |
| SmallVector<ISD::OutputArg, 4> Outs; |
| GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs, |
| DAG.getTargetLoweringInfo(), MF.getDataLayout()); |
| if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); })) |
| FuncInfo->setIsSVECC(true); |
| |
| // Assign locations to all of the incoming arguments. |
| SmallVector<CCValAssign, 16> ArgLocs; |
| DenseMap<unsigned, SDValue> CopiedRegs; |
| CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); |
| |
| // At this point, Ins[].VT may already be promoted to i32. To correctly |
| // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and |
| // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. |
| // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here |
| // we use a special version of AnalyzeFormalArguments to pass in ValVT and |
| // LocVT. |
| unsigned NumArgs = Ins.size(); |
| Function::const_arg_iterator CurOrigArg = F.arg_begin(); |
| unsigned CurArgIdx = 0; |
| for (unsigned i = 0; i != NumArgs; ++i) { |
| MVT ValVT = Ins[i].VT; |
| if (Ins[i].isOrigArg()) { |
| std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx); |
| CurArgIdx = Ins[i].getOrigArgIndex(); |
| |
| // Get type of the original argument. |
| EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), |
| /*AllowUnknown*/ true); |
| MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; |
| // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. |
| if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) |
| ValVT = MVT::i8; |
| else if (ActualMVT == MVT::i16) |
| ValVT = MVT::i16; |
| } |
| bool UseVarArgCC = false; |
| if (IsWin64) |
| UseVarArgCC = isVarArg; |
| CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC); |
| bool Res = |
| AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); |
| assert(!Res && "Call operand has unhandled type"); |
| (void)Res; |
| } |
| |
| SMEAttrs Attrs(MF.getFunction()); |
| bool IsLocallyStreaming = |
| !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody(); |
| assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value"); |
| SDValue Glue = Chain.getValue(1); |
| |
| SmallVector<SDValue, 16> ArgValues; |
| unsigned ExtraArgLocs = 0; |
| for (unsigned i = 0, e = Ins.size(); i != e; ++i) { |
| CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; |
| |
| if (Ins[i].Flags.isByVal()) { |
| // Byval is used for HFAs in the PCS, but the system should work in a |
| // non-compliant manner for larger structs. |
| EVT PtrVT = getPointerTy(DAG.getDataLayout()); |
| int Size = Ins[i].Flags.getByValSize(); |
| unsigned NumRegs = (Size + 7) / 8; |
| |
| // FIXME: This works on big-endian for composite byvals, which are the common |
| // case. It should work for fundamental types too. |
| unsigned FrameIdx = |
| MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); |
| SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); |
| InVals.push_back(FrameIdxN); |
| |
| continue; |
| } |
| |
| if (Ins[i].Flags.isSwiftAsync()) |
| MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true); |
| |
| SDValue ArgValue; |
| if (VA.isRegLoc()) { |
| // Arguments stored in registers. |
| EVT RegVT = VA.getLocVT(); |
| const TargetRegisterClass *RC; |
| |
| if (RegVT == MVT::i32) |
| RC = &AArch64::GPR32RegClass; |
| else if (RegVT == MVT::i64) |
| RC = &AArch64::GPR64RegClass; |
| else if (RegVT == MVT::f16 || RegVT == MVT::bf16) |
| RC = &AArch64::FPR16RegClass; |
| else if (RegVT == MVT::f32) |
| RC = &AArch64::FPR32RegClass; |
| else if (RegVT == MVT::f64 || RegVT.is64BitVector()) |
| RC = &AArch64::FPR64RegClass; |
| else if (RegVT == MVT::f128 || RegVT.is128BitVector()) |
| RC = &AArch64::FPR128RegClass; |
| else if (RegVT.isScalableVector() && |
| RegVT.getVectorElementType() == MVT::i1) { |
| FuncInfo->setIsSVECC(true); |
| RC = &AArch64::PPRRegClass; |
| } else if (RegVT == MVT::aarch64svcount) { |
| FuncInfo->setIsSVECC(true); |
| RC = &AArch64::PPRRegClass; |
| } else if (RegVT.isScalableVector()) { |
| FuncInfo->setIsSVECC(true); |
| RC = &AArch64::ZPRRegClass; |
| } else |
| llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); |
| |
| // Transform the arguments in physical registers into virtual ones. |
| Register Reg = MF.addLiveIn(VA.getLocReg(), RC); |
| |
| if (IsLocallyStreaming) { |
| // LocallyStreamingFunctions must insert the SMSTART in the correct |
| // position, so we use Glue to ensure no instructions can be scheduled |
| // between the chain of: |
| // t0: ch,glue = EntryNode |
| // t1: res,ch,glue = CopyFromReg |
| // ... |
| // tn: res,ch,glue = CopyFromReg t(n-1), .. |
| // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2 |
| // ^^^^^^ |
| // This will be the new Chain/Root node. |
| ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue); |
| Glue = ArgValue.getValue(2); |
| } else |
| ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); |
| |
| // If this is an 8, 16 or 32-bit value, it is really passed promoted |
| // to 64 bits. Insert an assert[sz]ext to capture this, then |
| // truncate to the right size. |
| switch (VA.getLocInfo()) { |
| default: |
| llvm_unreachable("Unknown loc info!"); |
| case CCValAssign::Full: |
| break; |
| case CCValAssign::Indirect: |
| assert( |
| (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) && |
| "Indirect arguments should be scalable on most subtargets"); |
| break; |
| case CCValAssign::BCvt: |
| ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); |
| break; |
| case CCValAssign::AExt: |
| case CCValAssign::SExt: |
| case CCValAssign::ZExt: |
| break; |
| case CCValAssign::AExtUpper: |
| ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue, |
| DAG.getConstant(32, DL, RegVT)); |
| ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT()); |
| break; |
| } |
| } else { // VA.isRegLoc() |
| assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); |
| unsigned ArgOffset = VA.getLocMemOffset(); |
| unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect |
| ? VA.getLocVT().getSizeInBits() |
| : VA.getValVT().getSizeInBits()) / 8; |
| |
| uint32_t BEAlign = 0; |
| if (!Subtarget->isLittleEndian() && ArgSize < 8 && |
| !Ins[i].Flags.isInConsecutiveRegs()) |
| BEAlign = 8 - ArgSize; |
| |
| SDValue FIN; |
| MachinePointerInfo PtrInfo; |
| if (isVarArg && Subtarget->isWindowsArm64EC()) { |
| // In the ARM64EC varargs convention, fixed arguments on the stack are |
| // accessed relative to x4, not sp. |
| unsigned ObjOffset = ArgOffset + BEAlign; |
| Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass); |
| SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); |
| FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val, |
| DAG.getConstant(ObjOffset, DL, MVT::i64)); |
| PtrInfo = MachinePointerInfo::getUnknownStack(MF); |
| } else { |
| int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); |
| |
| // Create load nodes to retrieve arguments from the stack. |
| FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); |
| PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); |
| } |
| |
| // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT. |
| ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; |
| MVT MemVT = VA.getValVT(); |
| |
| switch (VA.getLocInfo()) { |
| default: |
| break; |
| case CCValAssign::Trunc: |
| case CCValAssign::BCvt: |
| MemVT = VA.getLocVT(); |
| break; |
| case CCValAssign::Indirect: |
| assert((VA.getValVT().isScalableVector() || |
| Subtarget->isWindowsArm64EC()) && |
| "Indirect arguments should be scalable on most subtargets"); |
| MemVT = VA.getLocVT(); |
| break; |
| case CCValAssign::SExt: |
| ExtType = ISD::SEXTLOAD; |
| break; |
| case CCValAssign::ZExt: |
| ExtType = ISD::ZEXTLOAD; |
| break; |
| case CCValAssign::AExt: |
| ExtType = ISD::EXTLOAD; |
| break; |
| } |
| |
| ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo, |
| MemVT); |
| } |
| |
| if (VA.getLocInfo() == CCValAssign::Indirect) { |
| assert((VA.getValVT().isScalableVT() || |
| Subtarget->isWindowsArm64EC()) && |
| "Indirect arguments should be scalable on most subtargets"); |
| |
| uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue(); |
| unsigned NumParts = 1; |
| if (Ins[i].Flags.isInConsecutiveRegs()) { |
| assert(!Ins[i].Flags.isInConsecutiveRegsLast()); |
| while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) |
| ++NumParts; |
| } |
| |
| MVT PartLoad = VA.getValVT(); |
| SDValue Ptr = ArgValue; |
| |
| // Ensure we generate all loads for each tuple part, whilst updating the |
| // pointer after each load correctly using vscale. |
| while (NumParts > 0) { |
| ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo()); |
| InVals.push_back(ArgValue); |
| NumParts--; |
| if (NumParts > 0) { |
| SDValue BytesIncrement; |
| if (PartLoad.isScalableVector()) { |
| BytesIncrement = DAG.getVScale( |
| DL, Ptr.getValueType(), |
| APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize)); |
| } else { |
| BytesIncrement = DAG.getConstant( |
| APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL, |
| Ptr.getValueType()); |
| } |
| SDNodeFlags Flags; |
| Flags.setNoUnsignedWrap(true); |
| Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, |
| BytesIncrement, Flags); |
| ExtraArgLocs++; |
| i++; |
| } |
| } |
| } else { |
| if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer()) |
| ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(), |
| ArgValue, DAG.getValueType(MVT::i32)); |
| |
| // i1 arguments are zero-extended to i8 by the caller. Emit a |
| // hint to reflect this. |
| if (Ins[i].isOrigArg()) { |
| Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex()); |
| if (OrigArg->getType()->isIntegerTy(1)) { |
| if (!Ins[i].Flags.isZExt()) { |
| ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL, |
| ArgValue.getValueType(), ArgValue); |
| } |
| } |
| } |
| |
| InVals.push_back(ArgValue); |
| } |
| } |
| assert((ArgLocs.size() + ExtraArgLocs) == Ins.size()); |
| |
| // Insert the SMSTART if this is a locally streaming function and |
| // make sure it is Glued to the last CopyFromReg value. |
| if (IsLocallyStreaming) { |
| Chain = |
| changeStreamingMode(DAG, DL, /*Enable*/ true, DAG.getRoot(), Glue, |
| DAG.getConstant(0, DL, MVT::i64), /*Entry*/ true); |
| |
| // Ensure that the SMSTART happens after the CopyWithChain such that its |
| // chain result is used. |
| for (unsigned I = 0; I < InVals.size(); ++I) { |
| Register Reg = MF.getRegInfo().createVirtualRegister( |
| getRegClassFor(InVals[I].getValueType().getSimpleVT())); |
| Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]); |
| InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg, |
| InVals[I].getValueType()); |
| } |
| } |
| |
| // varargs |
| if (isVarArg) { |
| if (!Subtarget->isTargetDarwin() || IsWin64) { |
| // The AAPCS variadic function ABI is identical to the non-variadic |
| // one. As a result there may be more arguments in registers and we should |
| // save them for future reference. |
| // Win64 variadic functions also pass arguments in registers, but all float |
| // arguments are passed in integer registers. |
| saveVarArgRegisters(CCInfo, DAG, DL, Chain); |
| } |
| |
| // This will point to the next argument passed via stack. |
| unsigned VarArgsOffset = CCInfo.getStackSize(); |
| // We currently pass all varargs at 8-byte alignment, or 4 bytes for ILP32. |
| VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8); |
| FuncInfo->setVarArgsStackOffset(VarArgsOffset); |
| FuncInfo->setVarArgsStackIndex( |
| MFI.CreateFixedObject(4, VarArgsOffset, true)); |
| |
| if (MFI.hasMustTailInVarArgFunc()) { |
| SmallVector<MVT, 2> RegParmTypes; |
| RegParmTypes.push_back(MVT::i64); |
| RegParmTypes.push_back(MVT::f128); |
| // Compute the set of forwarded registers. The rest are scratch. |
| SmallVectorImpl<ForwardedRegister> &Forwards = |
| FuncInfo->getForwardedMustTailRegParms(); |
| CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, |
| CC_AArch64_AAPCS); |
| |
| // Conservatively forward X8, since it might be used for aggregate return. |
| if (!CCInfo.isAllocated(AArch64::X8)) { |
| Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass); |
| Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64)); |
| } |
| } |
| } |
| |
| // On Windows, InReg pointers must be returned, so record the pointer in a |
| // virtual register at the start of the function so it can be returned in the |
| // epilogue. |
| if (IsWin64) { |
| for (unsigned I = 0, E = Ins.size(); I != E; ++I) { |
| if (Ins[I].Flags.isInReg() && Ins[I].Flags.isSRet()) { |
| assert(!FuncInfo->getSRetReturnReg()); |
| |
| MVT PtrTy = getPointerTy(DAG.getDataLayout()); |
| Register Reg = |
| MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); |
| FuncInfo->setSRetReturnReg(Reg); |
| |
| SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]); |
| Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain); |
| break; |
| } |
| } |
| } |
| |
| unsigned StackArgSize = CCInfo.getStackSize(); |
| bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; |
| if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { |
| // This is a non-standard ABI so by fiat I say we're allowed to make full |
| // use of the stack area to be popped, which must be aligned to 16 bytes in |
| // any case: |
| StackArgSize = alignTo(StackArgSize, 16); |
| |
| // If we're expected to restore the stack (e.g. fastcc) then we'll be adding |
| // a multiple of 16. |
| FuncInfo->setArgumentStackToRestore(StackArgSize); |
| |
| // This realignment carries over to the available bytes below. Our own |
| // callers will guarantee the space is free by giving an aligned value to |
| // CALLSEQ_START. |
| } |
| // Even if we're not expected to free up the space, it's useful to know how |
| // much is there while considering tail calls (because we can reuse it). |
| FuncInfo->setBytesInStackArgArea(StackArgSize); |
| |
| if (Subtarget->hasCustomCallingConv()) |
| Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); |
| |
| // Conservatively assume the function requires the lazy-save mechanism. |
| if (SMEAttrs(MF.getFunction()).hasZAState()) { |
| unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG); |
| FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj); |
| } |
| |
| return Chain; |
| } |
| |
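| // Spill the unused (variadic) GPR and FPR argument registers to their |
| // save areas so va_arg can locate them. On Win64 the GPR area is a fixed |
| // stack object and FPRs are not saved; on Arm64EC the save area is |
| // addressed relative to X4 rather than a frame index. |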
| void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, |
| SelectionDAG &DAG, |
| const SDLoc &DL, |
| SDValue &Chain) const { |
| MachineFunction &MF = DAG.getMachineFunction(); |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
| auto PtrVT = getPointerTy(DAG.getDataLayout()); |
| bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); |
| |
| SmallVector<SDValue, 8> MemOps; |
| |
| auto GPRArgRegs = AArch64::getGPRArgRegs(); |
| unsigned NumGPRArgRegs = GPRArgRegs.size(); |
| if (Subtarget->isWindowsArm64EC()) { |
| // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs |
| // functions. |
| NumGPRArgRegs = 4; |
| } |
| unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); |
| |
| unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); |
| int GPRIdx = 0; |
| if (GPRSaveSize != 0) { |
| if (IsWin64) { |
| GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false); |
| if (GPRSaveSize & 15) |
| // The extra size here, if triggered, will always be 8. |
| MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false); |
| } else |
| GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false); |
| |
| SDValue FIN; |
| if (Subtarget->isWindowsArm64EC()) { |
| // With the Arm64EC ABI, we reserve the save area as usual, but we |
| // compute its address relative to x4. For a normal AArch64->AArch64 |
| // call, x4 == sp on entry, but calls from an entry thunk can pass in a |
| // different address. |
| Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass); |
| SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); |
| FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val, |
| DAG.getConstant(GPRSaveSize, DL, MVT::i64)); |
| } else { |
| FIN = DAG.getFrameIndex(GPRIdx, PtrVT); |
| } |
| |
| for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { |
| Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); |
| SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); |
| SDValue Store = |
| DAG.getStore(Val.getValue(1), DL, Val, FIN, |
| IsWin64 ? MachinePointerInfo::getFixedStack( |
| MF, GPRIdx, (i - FirstVariadicGPR) * 8) |
| : MachinePointerInfo::getStack(MF, i * 8)); |
| MemOps.push_back(Store); |
| FIN = |
| DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); |
| } |
| } |
| FuncInfo->setVarArgsGPRIndex(GPRIdx); |
| FuncInfo->setVarArgsGPRSize(GPRSaveSize); |
| |
| if (Subtarget->hasFPARMv8() && !IsWin64) { |
| auto FPRArgRegs = AArch64::getFPRArgRegs(); |
| const unsigned NumFPRArgRegs = FPRArgRegs.size(); |
| unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); |
| |
| unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); |
| int FPRIdx = 0; |
| if (FPRSaveSize != 0) { |
| FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false); |
| |
| SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); |
| |
| for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { |
| Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); |
| SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); |
| |
| SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, |
| MachinePointerInfo::getStack(MF, i * 16)); |
| MemOps.push_back(Store); |
| FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, |
| DAG.getConstant(16, DL, PtrVT)); |
| } |
| } |
| FuncInfo->setVarArgsFPRIndex(FPRIdx); |
| FuncInfo->setVarArgsFPRSize(FPRSaveSize); |
| } |
| |
| if (!MemOps.empty()) { |
| Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); |
| } |
| } |
| |
| /// LowerCallResult - Lower the result values of a call into the |
| /// appropriate copies out of appropriate physical registers. |
| SDValue AArch64TargetLowering::LowerCallResult( |
| SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, |
| const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL, |
| SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, |
| SDValue ThisVal) const { |
| DenseMap<unsigned, SDValue> CopiedRegs; |
| // Copy all of the result registers out of their specified physreg. |
| for (unsigned i = 0; i != RVLocs.size(); ++i) { |
| CCValAssign VA = RVLocs[i]; |
| |
| // Pass 'this' value directly from the argument to return value, to avoid |
| // reg unit interference |
| if (i == 0 && isThisReturn) { |
| assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && |
| "unexpected return calling convention register assignment"); |
| InVals.push_back(ThisVal); |
| continue; |
| } |
| |
| // Avoid copying a physreg twice since RegAllocFast is incompetent and only |
| // allows one use of a physreg per block. |
| SDValue Val = CopiedRegs.lookup(VA.getLocReg()); |
| if (!Val) { |
| Val = |
| DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue); |
| Chain = Val.getValue(1); |
| InGlue = Val.getValue(2); |
| CopiedRegs[VA.getLocReg()] = Val; |
| } |
| |
| switch (VA.getLocInfo()) { |
| default: |
| llvm_unreachable("Unknown loc info!"); |
| case CCValAssign::Full: |
| break; |
| case CCValAssign::BCvt: |
| Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); |
| break; |
| case CCValAssign::AExtUpper: |
| Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val, |
| DAG.getConstant(32, DL, VA.getLocVT())); |
| [[fallthrough]]; |
| case CCValAssign::AExt: |
| [[fallthrough]]; |
| case CCValAssign::ZExt: |
| Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT()); |
| break; |
| } |
| |
| InVals.push_back(Val); |
| } |
| |
| return Chain; |
| } |
| |
| /// Return true if the calling convention is one that we can guarantee TCO for. |
| static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) { |
| return (CC == CallingConv::Fast && GuaranteeTailCalls) || |
| CC == CallingConv::Tail || CC == CallingConv::SwiftTail; |
| } |
| |
| /// Return true if we might ever do TCO for calls with this calling convention. |
| static bool mayTailCallThisCC(CallingConv::ID CC) { |
| switch (CC) { |
| case CallingConv::C: |
| case CallingConv::AArch64_SVE_VectorCall: |
| case CallingConv::PreserveMost: |
| case CallingConv::PreserveAll: |
| case CallingConv::Swift: |
| case CallingConv::SwiftTail: |
| case CallingConv::Tail: |
| case CallingConv::Fast: |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
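| // Run the calling convention over the outgoing call operands, mirroring |
| // the ValVT handling in LowerFormalArguments (i1/i8/i16 keep their |
| // original width) and selecting the vararg CC where required (always for |
| // Win64 vararg calls, otherwise only for the variadic operands). |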
| static void analyzeCallOperands(const AArch64TargetLowering &TLI, |
| const AArch64Subtarget *Subtarget, |
| const TargetLowering::CallLoweringInfo &CLI, |
| CCState &CCInfo) { |
| const SelectionDAG &DAG = CLI.DAG; |
| CallingConv::ID CalleeCC = CLI.CallConv; |
| bool IsVarArg = CLI.IsVarArg; |
| const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; |
| bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); |
| |
| unsigned NumArgs = Outs.size(); |
| for (unsigned i = 0; i != NumArgs; ++i) { |
| MVT ArgVT = Outs[i].VT; |
| ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; |
| |
| bool UseVarArgCC = false; |
| if (IsVarArg) { |
| // On Windows, the fixed arguments in a vararg call are passed in GPRs |
| // too, so use the vararg CC to force them to integer registers. |
| if (IsCalleeWin64) { |
| UseVarArgCC = true; |
| } else { |
| UseVarArgCC = !Outs[i].IsFixed; |
| } |
| } |
| |
| if (!UseVarArgCC) { |
| // Get type of the original argument. |
| EVT ActualVT = |
| TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty, |
| /*AllowUnknown*/ true); |
| MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT; |
| // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. |
| if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) |
| ArgVT = MVT::i8; |
| else if (ActualMVT == MVT::i16) |
| ArgVT = MVT::i16; |
| } |
| |
| CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC); |
| bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); |
| assert(!Res && "Call operand has unhandled type"); |
| (void)Res; |
| } |
| } |
| |
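| // Check whether a call can be lowered as a tail call without changing |
| // observable behaviour: compatible calling conventions and preserved |
| // registers, no byval or inreg caller arguments, no SME streaming-mode or |
| // ZA state changes, and stack arguments that fit in our own argument area. |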
| bool AArch64TargetLowering::isEligibleForTailCallOptimization( |
| const CallLoweringInfo &CLI) const { |
| CallingConv::ID CalleeCC = CLI.CallConv; |
| if (!mayTailCallThisCC(CalleeCC)) |
| return false; |
| |
| SDValue Callee = CLI.Callee; |
| bool IsVarArg = CLI.IsVarArg; |
| const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; |
| const SmallVector<SDValue, 32> &OutVals = CLI.OutVals; |
| const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; |
| const SelectionDAG &DAG = CLI.DAG; |
| MachineFunction &MF = DAG.getMachineFunction(); |
| const Function &CallerF = MF.getFunction(); |
| CallingConv::ID CallerCC = CallerF.getCallingConv(); |
| |
| // SME Streaming functions are not eligible for TCO as they may require |
| // the streaming mode or ZA to be restored after returning from the call. |
| SMEAttrs CallerAttrs(MF.getFunction()); |
| auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal); |
| if (CallerAttrs.requiresSMChange(CalleeAttrs) || |
| CallerAttrs.requiresLazySave(CalleeAttrs) || |
| CallerAttrs.hasStreamingBody()) |
| return false; |
| |
| // Functions using the C or Fast calling convention that have an SVE signature |
| // preserve more registers and should assume the SVE_VectorCall CC. |
| // The check for matching callee-saved regs will determine whether it is |
| // eligible for TCO. |
| if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) && |
| MF.getInfo<AArch64FunctionInfo>()->isSVECC()) |
| CallerCC = CallingConv::AArch64_SVE_VectorCall; |
| |
| bool CCMatch = CallerCC == CalleeCC; |
| |
| // When using the Windows calling convention on a non-windows OS, we want |
| // to back up and restore X18 in such functions; we can't do a tail call |
| // from those functions. |
| if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() && |
| CalleeCC != CallingConv::Win64) |
| return false; |
| |
| // Byval parameters hand the function a pointer directly into the stack area |
| // we want to reuse during a tail call. Working around this *is* possible (see |
| // X86) but less efficient and uglier in LowerCall. |
| for (Function::const_arg_iterator i = CallerF.arg_begin(), |
| e = CallerF.arg_end(); |
| i != e; ++i) { |
| if (i->hasByValAttr()) |
| return false; |
| |
| // On Windows, "inreg" attributes signify non-aggregate indirect returns. |
| // In this case, it is necessary to save/restore X0 in the callee. Tail |
| // call opt interferes with this. So we disable tail call opt when the |
| // caller has an argument with "inreg" attribute. |
| |
| // FIXME: Check whether the callee also has an "inreg" argument. |
| if (i->hasInRegAttr()) |
| return false; |
| } |
| |
| if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt)) |
| return CCMatch; |
| |
| // Externally-defined functions with weak linkage should not be |
| // tail-called on AArch64 when the OS does not support dynamic |
| // pre-emption of symbols, as the AAELF spec requires normal calls |
| // to undefined weak functions to be replaced with a NOP or jump to the |
| // next instruction. The behaviour of branch instructions in this |
| // situation (as used for tail calls) is implementation-defined, so we |
| // cannot rely on the linker replacing the tail call with a return. |
| if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { |
| const GlobalValue *GV = G->getGlobal(); |
| const Triple &TT = getTargetMachine().getTargetTriple(); |
| if (GV->hasExternalWeakLinkage() && |
| (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) |
| return false; |
| } |
| |
| // Now we search for cases where we can use a tail call without changing the |
| // ABI. Sibcall is used in some places (particularly gcc) to refer to this |
| // concept. |
| |
| // I want anyone implementing a new calling convention to think long and hard |
| // about this assert. |
| assert((!IsVarArg || CalleeCC == CallingConv::C) && |
| "Unexpected variadic calling convention"); |
| |
| LLVMContext &C = *DAG.getContext(); |
| // Check that the call results are passed in the same way. |
| if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, |
| CCAssignFnForCall(CalleeCC, IsVarArg), |
| CCAssignFnForCall(CallerCC, IsVarArg))) |
| return false; |
| // The callee has to preserve all registers the caller needs to preserve. |
| const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); |
| const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); |
| if (!CCMatch) { |
| const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); |
| if (Subtarget->hasCustomCallingConv()) { |
| TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved); |
| TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved); |
| } |
| if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) |
| return false; |
| } |
| |
| // Nothing more to check if the callee is taking no arguments |
| if (Outs.empty()) |
| return true; |
| |
| SmallVector<CCValAssign, 16> ArgLocs; |
| CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C); |
| |
| analyzeCallOperands(*this, Subtarget, CLI, CCInfo); |
| |
| if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) { |
| // When the call is musttail, additional checks have already been done and |
| // we can safely ignore this check. |
| // At least two cases here: if caller is fastcc then we can't have any |
| // memory arguments (we'd be expected to clean up the stack afterwards). If |
| // caller is C then we could potentially use its argument area. |
| |
| // FIXME: for now we take the most conservative of these in both cases: |
| // disallow all variadic memory operands. |
| for (const CCValAssign &ArgLoc : ArgLocs) |
| if (!ArgLoc.isRegLoc()) |
| return false; |
| } |
| |
| const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
| |
| // If any of the arguments is passed indirectly, it must be SVE, so the |
| // 'getBytesInStackArgArea' is not sufficient to determine whether we need to |
| // allocate space on the stack. That is why we determine this explicitly here: |
| // if any argument is passed indirectly, the call cannot be a tail call. |
| if (llvm::any_of(ArgLocs, [&](CCValAssign &A) { |
| assert((A.getLocInfo() != CCValAssign::Indirect || |
| A.getValVT().isScalableVector() || |
| Subtarget->isWindowsArm64EC()) && |
| "Expected value to be scalable"); |
| return A.getLocInfo() == CCValAssign::Indirect; |
| })) |
| return false; |
| |
| // If the stack arguments for this call do not fit into our own save area then |
| // the call cannot be made tail. |
| if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) |
| return false; |
| |
| const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) |
| return false; |
| |
| return true; |
| } |
| |
| SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, |
| SelectionDAG &DAG, |
| MachineFrameInfo &MFI, |
| int ClobberedFI) const { |
| SmallVector<SDValue, 8> ArgChains; |
| int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); |
| int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; |
| |
| // Include the original chain at the beginning of the list. When this is |
| // used by target LowerCall hooks, this helps legalize find the |
| // CALLSEQ_BEGIN node. |
| ArgChains.push_back(Chain); |
| |
| // Add a chain value for each stack argument load that overlaps the
| // clobbered area.
| for (SDNode *U : DAG.getEntryNode().getNode()->uses()) |
| if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) |
| if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) |
| if (FI->getIndex() < 0) { |
| int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); |
| int64_t InLastByte = InFirstByte; |
| InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; |
| |
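| // The byte ranges [FirstByte, LastByte] and [InFirstByte, InLastByte]
| // overlap iff either interval's start lies within the other interval,
| // which is exactly what the check below tests.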
| if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || |
| (FirstByte <= InFirstByte && InFirstByte <= LastByte)) |
| ArgChains.push_back(SDValue(L, 1)); |
| } |
| |
| // Build a tokenfactor for all the chains. |
| return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); |
| } |
| |
| bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, |
| bool TailCallOpt) const { |
| return (CallCC == CallingConv::Fast && TailCallOpt) || |
| CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail; |
| } |
| |
| // Check if the value is zero-extended from i1 to i8 |
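| //
| // The mask 0xFE covers bits [1, 7]; if computeKnownBits proves those bits
| // are zero, the argument's low byte already holds 0 or 1 and no extra
| // zero-extension is needed.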
| static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) { |
| unsigned SizeInBits = Arg.getValueType().getSizeInBits(); |
| if (SizeInBits < 8) |
| return false; |
| |
| APInt RequiredZero(SizeInBits, 0xFE);
| KnownBits Bits = DAG.computeKnownBits(Arg, 4);
| bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
| return ZExtBool; |
| } |
| |
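| // Emit an AArch64ISD::SMSTART or SMSTOP that switches PSTATE.SM around a
| // call with a different streaming interface. Besides the chain and the
| // SVCR.SM operand, the node carries the caller's current PSTATE.SM value
| // (PStateSM), the expected value at this point (Entry ? Enable : !Enable)
| // and the SMSTART/SMSTOP call-preserved register mask, so that later
| // expansion can make the switch conditional on the caller's runtime
| // streaming state when required.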
| SDValue AArch64TargetLowering::changeStreamingMode( |
| SelectionDAG &DAG, SDLoc DL, bool Enable, |
| SDValue Chain, SDValue InGlue, SDValue PStateSM, bool Entry) const { |
| MachineFunction &MF = DAG.getMachineFunction(); |
| AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
| FuncInfo->setHasStreamingModeChanges(true); |
| |
| const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); |
| SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()); |
| SDValue MSROp = |
| DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32); |
| |
| SDValue ExpectedSMVal = |
| DAG.getTargetConstant(Entry ? Enable : !Enable, DL, MVT::i64); |
| SmallVector<SDValue> Ops = {Chain, MSROp, PStateSM, ExpectedSMVal, RegMask}; |
| |
| if (InGlue) |
| Ops.push_back(InGlue); |
| |
| unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP; |
| return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops); |
| } |
| |
| /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, |
| /// and add input and output parameter nodes. |
| SDValue |
| AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, |
| SmallVectorImpl<SDValue> &InVals) const { |
| SelectionDAG &DAG = CLI.DAG; |
| SDLoc &DL = CLI.DL; |
| SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; |
| SmallVector<SDValue, 32> &OutVals = CLI.OutVals; |
| SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; |
| SDValue Chain = CLI.Chain; |
| SDValue Callee = CLI.Callee; |
| bool &IsTailCall = CLI.IsTailCall; |
| CallingConv::ID &CallConv = CLI.CallConv; |
| bool IsVarArg = CLI.IsVarArg; |
| |
| MachineFunction &MF = DAG.getMachineFunction(); |
| MachineFunction::CallSiteInfo CSInfo; |
| bool IsThisReturn = false; |
| |
| AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
| bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; |
| bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType; |
| bool IsSibCall = false; |
| bool GuardWithBTI = false; |
| |
| if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) && |
| !Subtarget->noBTIAtReturnTwice()) { |
| GuardWithBTI = FuncInfo->branchTargetEnforcement(); |
| } |
| |
| // Analyze operands of the call, assigning locations to each operand. |
| SmallVector<CCValAssign, 16> ArgLocs; |
| CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); |
| |
| if (IsVarArg) { |
| unsigned NumArgs = Outs.size(); |
| |
| for (unsigned i = 0; i != NumArgs; ++i) { |
| if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector()) |
| report_fatal_error("Passing SVE types to variadic functions is " |
| "currently not supported"); |
| } |
| } |
| |
| analyzeCallOperands(*this, Subtarget, CLI, CCInfo); |
| |
| CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); |
| // Assign locations to each value returned by this call. |
| SmallVector<CCValAssign, 16> RVLocs; |
| CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, |
| *DAG.getContext()); |
| RetCCInfo.AnalyzeCallResult(Ins, RetCC); |
| |
| // Check callee args/returns for SVE registers and set calling convention |
| // accordingly. |
| if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) { |
| auto HasSVERegLoc = [](CCValAssign &Loc) { |
| if (!Loc.isRegLoc()) |
| return false; |
| return AArch64::ZPRRegClass.contains(Loc.getLocReg()) || |
| AArch64::PPRRegClass.contains(Loc.getLocReg()); |
| }; |
| if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc)) |
| CallConv = CallingConv::AArch64_SVE_VectorCall; |
| } |
| |
| if (IsTailCall) { |
| // Check if it's really possible to do a tail call. |
| IsTailCall = isEligibleForTailCallOptimization(CLI); |
| |
| // A sibling call is one where we're under the usual C ABI and not planning |
| // to change that but can still do a tail call: |
| if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail && |
| CallConv != CallingConv::SwiftTail) |
| IsSibCall = true; |
| |
| if (IsTailCall) |
| ++NumTailCalls; |
| } |
| |
| if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) |
| report_fatal_error("failed to perform tail call elimination on a call " |
| "site marked musttail"); |
| |
| // Get a count of how many bytes are to be pushed on the stack. |
| unsigned NumBytes = CCInfo.getStackSize(); |
| |
| if (IsSibCall) { |
| // Since we're not changing the ABI to make this a tail call, the memory |
| // operands are already available in the caller's incoming argument space. |
| NumBytes = 0; |
| } |
| |
| // FPDiff is the byte offset of the call's argument area from the callee's. |
| // Stores to callee stack arguments will be placed in FixedStackSlots offset |
| // by this amount for a tail call. In a sibling call it must be 0 because the |
| // caller will deallocate the entire stack and the callee still expects its |
| // arguments to begin at SP+0. Completely unused for non-tail calls. |
| int FPDiff = 0; |
| |
| if (IsTailCall && !IsSibCall) { |
| unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); |
| |
| // Since callee will pop argument stack as a tail call, we must keep the |
| // popped size 16-byte aligned. |
| NumBytes = alignTo(NumBytes, 16); |
| |
| // FPDiff will be negative if this tail call requires more space than we |
| // would automatically have in our incoming argument space. Positive if we |
| // can actually shrink the stack. |
| FPDiff = NumReusableBytes - NumBytes; |
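| // For example, if the caller reserved 32 bytes of incoming stack-argument
| // space and this tail call needs 48 bytes after alignment, FPDiff is -16
| // and 16 extra bytes must be reserved via setTailCallReservedStack below.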
| |
| // Update the required reserved area if this is the tail call requiring the |
| // most argument stack space. |
| if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff) |
| FuncInfo->setTailCallReservedStack(-FPDiff); |
| |
| // The stack pointer must be 16-byte aligned at all times it's used for a |
| // memory operation, which in practice means at *all* times and in |
| // particular across call boundaries. Therefore our own arguments started at |
| // a 16-byte aligned SP and the delta applied for the tail call should |
| // satisfy the same constraint. |
| assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); |
| } |
| |
| // Determine whether we need any streaming mode changes. |
| SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction()); |
| if (CLI.CB) |
| CalleeAttrs = SMEAttrs(*CLI.CB); |
| else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) |
| CalleeAttrs = SMEAttrs(ES->getSymbol()); |
| |
| bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs); |
| if (RequiresLazySave) { |
| SDValue NumZaSaveSlices; |
| if (!CalleeAttrs.preservesZA()) { |
| // Set up a lazy save mechanism by storing the runtime live slices |
| // (worst-case SVL*SVL) to the TPIDR2 stack object. |
| SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, |
| DAG.getConstant(1, DL, MVT::i32)); |
| NumZaSaveSlices = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); |
| } else {
| // The callee preserves ZA, so no live slices need to be recorded.
| NumZaSaveSlices = DAG.getConstant(0, DL, MVT::i64);
| }
| |
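| // Record the slice count in the TPIDR2 block allocated for the lazy save:
| // the count lives at offset 8 of the block and is truncated to 16 bits,
| // after which TPIDR2_EL0 is set to point at the block.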
| unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj(); |
| MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); |
| SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj, |
| DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); |
| SDValue NumZaSaveSlicesAddr = |
| DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr, |
| DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType())); |
| Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr, |
| MPI, MVT::i16); |
| Chain = DAG.getNode( |
| ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, |
| DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), |
| TPIDR2ObjAddr); |
| } |
| |
| SDValue PStateSM; |
| std::optional<bool> RequiresSMChange = |
| CallerAttrs.requiresSMChange(CalleeAttrs); |
| if (RequiresSMChange) |
| PStateSM = getPStateSM(DAG, Chain, CallerAttrs, DL, MVT::i64); |
| |
| // Adjust the stack pointer for the new arguments... |
| // These operations are automatically eliminated by the prolog/epilog pass |
| if (!IsSibCall) |
| Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL); |
| |
| SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, |
| getPointerTy(DAG.getDataLayout())); |
| |
| SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; |
| SmallSet<unsigned, 8> RegsUsed; |
| SmallVector<SDValue, 8> MemOpChains; |
| auto PtrVT = getPointerTy(DAG.getDataLayout()); |
| |
| if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) { |
| const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); |
| for (const auto &F : Forwards) { |
| SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT); |
| RegsToPass.emplace_back(F.PReg, Val); |
| } |
| } |
| |
| // Walk the register/memloc assignments, inserting copies/loads. |
| unsigned ExtraArgLocs = 0; |
| for (unsigned i = 0, e = Outs.size(); i != e; ++i) { |
| CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; |
| SDValue Arg = OutVals[i]; |
| ISD::ArgFlagsTy Flags = Outs[i].Flags; |
| |
| // Promote the value if needed. |
| switch (VA.getLocInfo()) { |
| default: |
| llvm_unreachable("Unknown loc info!"); |
| case CCValAssign::Full: |
| break; |
| case CCValAssign::SExt: |
| Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); |
| break; |
| case CCValAssign::ZExt: |
| Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); |
| break; |
| case CCValAssign::AExt: |
| if (Outs[i].ArgVT == MVT::i1) { |
| // AAPCS requires i1 to be zero-extended to 8-bits by the caller. |
| // |
| // Check if we actually have to do this, because the value may |
| // already be zero-extended. |
| // |
| // We cannot just emit a (zext i8 (trunc (assert-zext i8))) |
| // and rely on DAGCombiner to fold this, because the following |
| // (anyext i32) is combined with (zext i8) in DAG.getNode: |
| // |
| // (ext (zext x)) -> (zext x) |
| // |
| // This will give us (zext i32), which we cannot remove, so |
| // try to check this beforehand. |
| if (!checkZExtBool(Arg, DAG)) { |
| Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); |
| Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); |
| } |
| } |
| Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); |
| break; |
| case CCValAssign::AExtUpper: |
| assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); |
| Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); |
| Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, |
| DAG.getConstant(32, DL, VA.getLocVT())); |
| break; |
| case CCValAssign::BCvt: |
| Arg = DAG.getBitcast(VA.getLocVT(), Arg); |
| break; |
| case CCValAssign::Trunc: |
| Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); |
| break; |
| case CCValAssign::FPExt: |
| Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); |
| break; |
| case CCValAssign::Indirect: |
| bool isScalable = VA.getValVT().isScalableVT(); |
| assert((isScalable || Subtarget->isWindowsArm64EC()) && |
| "Indirect arguments should be scalable on most subtargets"); |
| |
| uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue(); |
| uint64_t PartSize = StoreSize; |
| unsigned NumParts = 1; |
| if (Outs[i].Flags.isInConsecutiveRegs()) { |
| assert(!Outs[i].Flags.isInConsecutiveRegsLast()); |
| while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) |
| ++NumParts; |
| StoreSize *= NumParts; |
| } |
| |
| Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext()); |
| Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| int FI = MFI.CreateStackObject(StoreSize, Alignment, false); |
| if (isScalable) |
| MFI.setStackID(FI, TargetStackID::ScalableVector); |
| |
| MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI); |
| SDValue Ptr = DAG.getFrameIndex( |
| FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); |
| SDValue SpillSlot = Ptr; |
| |
| // Ensure we generate all stores for each tuple part, whilst updating the |
| // pointer after each store correctly using vscale. |
| while (NumParts) { |
| SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI); |
| MemOpChains.push_back(Store); |
| |
| NumParts--; |
| if (NumParts > 0) { |
| SDValue BytesIncrement; |
| if (isScalable) { |
| BytesIncrement = DAG.getVScale( |
| DL, Ptr.getValueType(), |
| APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize)); |
| } else { |
| BytesIncrement = DAG.getConstant( |
| APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL, |
| Ptr.getValueType()); |
| } |
| SDNodeFlags Flags; |
| Flags.setNoUnsignedWrap(true); |
| |
| MPI = MachinePointerInfo(MPI.getAddrSpace()); |
| Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, |
| BytesIncrement, Flags); |
| ExtraArgLocs++; |
| i++; |
| } |
| } |
| |
| Arg = SpillSlot; |
| break; |
| } |
| |
| if (VA.isRegLoc()) { |
| if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && |
| Outs[0].VT == MVT::i64) { |
| assert(VA.getLocVT() == MVT::i64 && |
| "unexpected calling convention register assignment"); |
| assert(!Ins.empty() && Ins[0].VT == MVT::i64 && |
| "unexpected use of 'returned'"); |
| IsThisReturn = true; |
| } |
| if (RegsUsed.count(VA.getLocReg())) { |
| // If this register has already been used then we're trying to pack |
| // parts of an [N x i32] into an X-register. The extension type will |
| // take care of putting the two halves in the right place but we have to |
| // combine them. |
| SDValue &Bits = |
| llvm::find_if(RegsToPass, |
| [=](const std::pair<unsigned, SDValue> &Elt) { |
| return Elt.first == VA.getLocReg(); |
| }) |
| ->second; |
| Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); |
| // Call site info is used for the function's parameter entry-value
| // tracking. For now we only track the simple case in which a parameter
| // is transferred through a whole register.
| llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) { |
| return ArgReg.Reg == VA.getLocReg(); |
| }); |
| } else { |
| RegsToPass.emplace_back(VA.getLocReg(), Arg); |
| RegsUsed.insert(VA.getLocReg()); |
| const TargetOptions &Options = DAG.getTarget().Options; |
| if (Options.EmitCallSiteInfo) |
| CSInfo.emplace_back(VA.getLocReg(), i); |
| } |
| } else { |
| assert(VA.isMemLoc()); |
| |
| SDValue DstAddr; |
| MachinePointerInfo DstInfo; |
| |
| // FIXME: This works on big-endian for composite byvals, which are the
| // common case. It should also work for fundamental types.
| uint32_t BEAlign = 0; |
| unsigned OpSize; |
| if (VA.getLocInfo() == CCValAssign::Indirect || |
| VA.getLocInfo() == CCValAssign::Trunc) |
| OpSize = VA.getLocVT().getFixedSizeInBits(); |
| else |
| OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 |
| : VA.getValVT().getSizeInBits(); |
| OpSize = (OpSize + 7) / 8; |
| if (!Subtarget->isLittleEndian() && !Flags.isByVal() && |
| !Flags.isInConsecutiveRegs()) { |
| if (OpSize < 8) |
| BEAlign = 8 - OpSize; |
| } |
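| // For example, a 4-byte value passed in an 8-byte stack slot on a
| // big-endian target gets BEAlign == 4, shifting the store address so the
| // value occupies the correct half of the slot.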
| unsigned LocMemOffset = VA.getLocMemOffset(); |
| int32_t Offset = LocMemOffset + BEAlign; |
| SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); |
| PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); |
| |
| if (IsTailCall) { |
| Offset = Offset + FPDiff; |
| int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); |
| |
| DstAddr = DAG.getFrameIndex(FI, PtrVT); |
| DstInfo = MachinePointerInfo::getFixedStack(MF, FI); |
| |
| // Make sure any stack arguments overlapping with where we're storing |
| // are loaded before this eventual operation. Otherwise they'll be |
| // clobbered. |
| Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); |
| } else { |
| SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); |
| |
| DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); |
| DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); |
| } |
| |
| if (Outs[i].Flags.isByVal()) { |
| SDValue SizeNode = |
| DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); |
| SDValue Cpy = DAG.getMemcpy( |
| Chain, DL, DstAddr, Arg, SizeNode, |
| Outs[i].Flags.getNonZeroByValAlign(), |
| /*isVol = */ false, /*AlwaysInline = */ false, |
| /*isTailCall = */ false, DstInfo, MachinePointerInfo()); |
| |
| MemOpChains.push_back(Cpy); |
| } else { |
| // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already |
| // promoted to a legal register type i32, we should truncate Arg back to |
| // i1/i8/i16. |
| if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || |
| VA.getValVT() == MVT::i16) |
| Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); |
| |
| SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); |
| MemOpChains.push_back(Store); |
| } |
| } |
| } |
| |
| if (IsVarArg && Subtarget->isWindowsArm64EC()) { |
| // For vararg calls, the Arm64EC ABI requires values in x4 and x5 |
| // describing the argument list. x4 contains the address of the |
| // first stack parameter. x5 contains the size in bytes of all parameters |
| // passed on the stack. |
| RegsToPass.emplace_back(AArch64::X4, StackPtr); |
| RegsToPass.emplace_back(AArch64::X5, |
| DAG.getConstant(NumBytes, DL, MVT::i64)); |
| } |
| |
| if (!MemOpChains.empty()) |
| Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); |
| |
| SDValue InGlue; |
| if (RequiresSMChange) { |
| SDValue NewChain = changeStreamingMode(DAG, DL, *RequiresSMChange, Chain, |
| InGlue, PStateSM, true); |
| Chain = NewChain.getValue(0); |
| InGlue = NewChain.getValue(1); |
| } |
| |
| // Build a sequence of copy-to-reg nodes chained together with token chain |
| // and flag operands which copy the outgoing args into the appropriate regs. |
| for (auto &RegToPass : RegsToPass) { |
| Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, |
| RegToPass.second, InGlue); |
| InGlue = Chain.getValue(1); |
| } |
| |
| // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every |
| // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol |
| // node so that legalize doesn't hack it. |
| if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) { |
| auto GV = G->getGlobal(); |
| unsigned OpFlags = |
| Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()); |
| if (OpFlags & AArch64II::MO_GOT) { |
| Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); |
| Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); |
| } else { |
| const GlobalValue *GV = G->getGlobal(); |
| Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); |
| } |
| } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { |
| if (getTargetMachine().getCodeModel() == CodeModel::Large && |
| Subtarget->isTargetMachO()) { |
| const char *Sym = S->getSymbol(); |
| Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); |
| Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); |
| } else { |
| const char *Sym = S->getSymbol(); |
| Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); |
| } |
| } |
| |
| // We don't usually want to end the call-sequence here because we would tidy
| // the frame up *after* the call. However, in the ABI-changing tail-call case
| // we've carefully laid out the parameters so that when sp is reset they'll be
| // in the correct location.
| if (IsTailCall && !IsSibCall) { |
| Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL); |
| InGlue = Chain.getValue(1); |
| } |
| |
| std::vector<SDValue> Ops; |
| Ops.push_back(Chain); |
| Ops.push_back(Callee); |
| |
| if (IsTailCall) { |
| // Each tail call may have to adjust the stack by a different amount, so |
| // this information must travel along with the operation for eventual |
| // consumption by emitEpilogue. |
| Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); |
| } |
| |
| // Add argument registers to the end of the list so that they are known live |
| // into the call. |
| for (auto &RegToPass : RegsToPass) |
| Ops.push_back(DAG.getRegister(RegToPass.first, |
| RegToPass.second.getValueType())); |
| |
| // Add a register mask operand representing the call-preserved registers. |
| const uint32_t *Mask; |
| const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); |
| if (IsThisReturn) { |
| // For 'this' returns, use the X0-preserving mask if applicable |
| Mask = TRI->getThisReturnPreservedMask(MF, CallConv); |
| if (!Mask) { |
| IsThisReturn = false; |
| Mask = TRI->getCallPreservedMask(MF, CallConv); |
| } |
| } else |
| Mask = TRI->getCallPreservedMask(MF, CallConv); |
| |
| if (Subtarget->hasCustomCallingConv()) |
| TRI->UpdateCustomCallPreservedMask(MF, &Mask); |
| |
| if (TRI->isAnyArgRegReserved(MF)) |
| TRI->emitReservedArgRegCallError(MF); |
| |
| assert(Mask && "Missing call preserved mask for calling convention"); |
| Ops.push_back(DAG.getRegisterMask(Mask)); |
| |
| if (InGlue.getNode()) |
| Ops.push_back(InGlue); |
| |
| SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); |
| |
| // If we're doing a tail call, use a TC_RETURN here rather than an
| // actual call instruction. |
| if (IsTailCall) { |
| MF.getFrameInfo().setHasTailCall(); |
| SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); |
| |
| if (IsCFICall) |
| Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue()); |
| |
| DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); |
| DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); |
| return Ret; |
| } |
| |
| unsigned CallOpc = AArch64ISD::CALL; |
| // Calls with operand bundle "clang.arc.attachedcall" are special. They should |
| // be expanded to the call, directly followed by a special marker sequence and |
| // a call to an ObjC library function. Use CALL_RVMARKER to do that. |
| if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) { |
| assert(!IsTailCall && |
| "tail calls cannot be marked with clang.arc.attachedcall"); |
| CallOpc = AArch64ISD::CALL_RVMARKER; |
| |
| // Add a target global address for the retainRV/claimRV runtime function |
| // just before the call target. |
| Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB); |
| auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT); |
| Ops.insert(Ops.begin() + 1, GA); |
| } else if (GuardWithBTI) |
| CallOpc = AArch64ISD::CALL_BTI; |
| |
| // Returns a chain and a flag for retval copy to use. |
| Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops); |
| |
| if (IsCFICall) |
| Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue()); |
| |
| DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); |
| InGlue = Chain.getValue(1); |
| DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); |
| |
| uint64_t CalleePopBytes = |
| DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; |
| |
| Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL); |
| InGlue = Chain.getValue(1); |
| |
| // Handle result values, copying them out of physregs into vregs that we |
| // return. |
| SDValue Result = LowerCallResult(Chain, InGlue, CallConv, IsVarArg, RVLocs, |
| DL, DAG, InVals, IsThisReturn, |
| IsThisReturn ? OutVals[0] : SDValue()); |
| |
| if (!Ins.empty()) |
| InGlue = Result.getValue(Result->getNumValues() - 1); |
| |
| if (RequiresSMChange) { |
| assert(PStateSM && "Expected a PStateSM to be set"); |
| Result = changeStreamingMode(DAG, DL, !*RequiresSMChange, Result, InGlue, |
| PStateSM, false); |
| } |
| |
| if (RequiresLazySave) { |
| if (!CalleeAttrs.preservesZA()) { |
| // Unconditionally resume ZA. |
| Result = DAG.getNode( |
| AArch64ISD::SMSTART, DL, MVT::Other, Result, |
| DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), |
| DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); |
| |
| // Conditionally restore the lazy save using a pseudo node. |
| unsigned FI = FuncInfo->getLazySaveTPIDR2Obj(); |
| SDValue RegMask = DAG.getRegisterMask( |
| TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); |
| SDValue RestoreRoutine = DAG.getTargetExternalSymbol( |
| "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout())); |
| SDValue TPIDR2_EL0 = DAG.getNode( |
| ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, |
| DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); |
| |
| // Copy the address of the TPIDR2 block into X0 before 'calling' the |
| // RESTORE_ZA pseudo. |
| SDValue Glue; |
| SDValue TPIDR2Block = DAG.getFrameIndex( |
| FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); |
| Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); |
| Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, |
| {Result, TPIDR2_EL0, |
| DAG.getRegister(AArch64::X0, MVT::i64), |
| RestoreRoutine, RegMask, Result.getValue(1)}); |
| } |
| // Finally reset the TPIDR2_EL0 register to 0. |
| Result = DAG.getNode( |
| ISD::INTRINSIC_VOID, DL, MVT::Other, Result, |
| DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), |
| DAG.getConstant(0, DL, MVT::i64)); |
| } |
| |
| if (RequiresSMChange || RequiresLazySave) { |
| for (unsigned I = 0; I < InVals.size(); ++I) { |
| // The smstart/smstop is chained as part of the call, but when the |
| // resulting chain is discarded (which happens when the call is not part |
| // of a chain, e.g. a call to @llvm.cos()), we need to ensure the |
| // smstart/smstop is chained to the result value. We can do that by doing |
| // a vreg -> vreg copy. |
| Register Reg = MF.getRegInfo().createVirtualRegister( |
| getRegClassFor(InVals[I].getValueType().getSimpleVT())); |
| SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]); |
| InVals[I] = DAG.getCopyFromReg(X, DL, Reg, |
| InVals[I].getValueType()); |
| } |
| } |
| |
| return Result; |
| } |
| |
| bool AArch64TargetLowering::CanLowerReturn( |
| CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, |
| const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { |
| CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); |
| SmallVector<CCValAssign, 16> RVLocs; |
| CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); |
| return CCInfo.CheckReturn(Outs, RetCC); |
| } |
| |
| SDValue |
| AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, |
| bool isVarArg, |
| const SmallVectorImpl<ISD::OutputArg> &Outs, |
| const SmallVectorImpl<SDValue> &OutVals, |
| const SDLoc &DL, SelectionDAG &DAG) const { |
| auto &MF = DAG.getMachineFunction(); |
| auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
| |
| CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); |
| SmallVector<CCValAssign, 16> RVLocs; |
| CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); |
| CCInfo.AnalyzeReturn(Outs, RetCC); |
| |
| // Copy the result values into the output registers. |
| SDValue Glue; |
| SmallVector<std::pair<unsigned, SDValue>, 4> RetVals; |
| SmallSet<unsigned, 4> RegsUsed; |
| for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); |
| ++i, ++realRVLocIdx) { |
| CCValAssign &VA = RVLocs[i]; |
| assert(VA.isRegLoc() && "Can only return in registers!"); |
| SDValue Arg = OutVals[realRVLocIdx]; |
| |
| switch (VA.getLocInfo()) { |
| default: |
| llvm_unreachable("Unknown loc info!"); |
| case CCValAssign::Full: |
| if (Outs[i].ArgVT == MVT::i1) { |
| // AAPCS requires i1 to be zero-extended to i8 by the producer of the |
| // value. This is strictly redundant on Darwin (which uses "zeroext |
| // i1"), but will be optimised out before ISel. |
| Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); |
| Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); |
| } |
| break; |
| case CCValAssign::BCvt: |
| Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); |
| break; |
| case CCValAssign::AExt: |
| case CCValAssign::ZExt: |
| Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); |
| break; |
| case CCValAssign::AExtUpper: |
| assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); |
| Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); |
| Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, |
| DAG.getConstant(32, DL, VA.getLocVT())); |
| break; |
| } |
| |
| if (RegsUsed.count(VA.getLocReg())) { |
| SDValue &Bits = |
| llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) { |
| return Elt.first == VA.getLocReg(); |
| })->second; |
| Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); |
| } else { |
| RetVals.emplace_back(VA.getLocReg(), Arg); |
| RegsUsed.insert(VA.getLocReg()); |
| } |
| } |
| |
| const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); |
| |
| // Emit SMSTOP before returning from a locally streaming function |
| SMEAttrs FuncAttrs(MF.getFunction()); |
| if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) { |
| Chain = changeStreamingMode( |
| DAG, DL, /*Enable*/ false, Chain, /*Glue*/ SDValue(), |
| DAG.getConstant(1, DL, MVT::i64), /*Entry*/ true); |
| Glue = Chain.getValue(1); |
| } |
| |
| SmallVector<SDValue, 4> RetOps(1, Chain); |
| for (auto &RetVal : RetVals) { |
| Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue); |
| Glue = Chain.getValue(1); |
| RetOps.push_back( |
| DAG.getRegister(RetVal.first, RetVal.second.getValueType())); |
| } |
| |
| // Windows AArch64 ABIs require that for returning structs by value we copy |
| // the sret argument into X0 for the return. |
| // We saved the argument into a virtual register in the entry block, |
| // so now we copy the value out and into X0. |
| if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { |
| SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg, |
| getPointerTy(MF.getDataLayout())); |
| |
| unsigned RetValReg = AArch64::X0; |
| Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue); |
| Glue = Chain.getValue(1); |
| |
| RetOps.push_back( |
| DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); |
| } |
| |
| const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF); |
| if (I) { |
| for (; *I; ++I) { |
| if (AArch64::GPR64RegClass.contains(*I)) |
| RetOps.push_back(DAG.getRegister(*I, MVT::i64)); |
| else if (AArch64::FPR64RegClass.contains(*I)) |
| RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); |
| else |
| llvm_unreachable("Unexpected register class in CSRsViaCopy!"); |
| } |
| } |
| |
| RetOps[0] = Chain; // Update chain. |
| |
| // Add the glue if we have it. |
| if (Glue.getNode()) |
| RetOps.push_back(Glue); |
| |
| return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Other Lowering Code |
| //===----------------------------------------------------------------------===// |
| |
| SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty, |
| SelectionDAG &DAG, |
| unsigned Flag) const { |
| return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, |
| N->getOffset(), Flag); |
| } |
| |
| SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, |
| SelectionDAG &DAG, |
| unsigned Flag) const { |
| return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag); |
| } |
| |
| SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, |
| SelectionDAG &DAG, |
| unsigned Flag) const { |
| return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(), |
| N->getOffset(), Flag); |
| } |
| |
| SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty, |
| SelectionDAG &DAG, |
| unsigned Flag) const { |
| return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag); |
| } |
| |
| // (loadGOT sym) |
| template <class NodeTy> |
| SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG, |
| unsigned Flags) const { |
| LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n"); |
| SDLoc DL(N); |
| EVT Ty = getPointerTy(DAG.getDataLayout()); |
| SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags); |
| // FIXME: Once remat is capable of dealing with instructions with register |
| // operands, expand this into two nodes instead of using a wrapper node. |
| return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr); |
| } |
| |
| // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym)) |
| template <class NodeTy> |
| SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG, |
| unsigned Flags) const { |
| LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n"); |
| SDLoc DL(N); |
| EVT Ty = getPointerTy(DAG.getDataLayout()); |
| const unsigned char MO_NC = AArch64II::MO_NC; |
| return DAG.getNode( |
| AArch64ISD::WrapperLarge, DL, Ty, |
| getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags), |
| getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags), |
| getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags), |
| getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags)); |
| } |
| |
| // (addlow (adrp %hi(sym)) %lo(sym)) |
| template <class NodeTy> |
| SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, |
| unsigned Flags) const { |
| LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n"); |
| SDLoc DL(N); |
| EVT Ty = getPointerTy(DAG.getDataLayout()); |
| SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags); |
| SDValue Lo = getTargetNode(N, Ty, DAG, |
| AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags); |
| SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi); |
| return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo); |
| } |
| |
| // (adr sym) |
| template <class NodeTy> |
| SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG, |
| unsigned Flags) const { |
| LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n"); |
| SDLoc DL(N); |
| EVT Ty = getPointerTy(DAG.getDataLayout()); |
| SDValue Sym = getTargetNode(N, Ty, DAG, Flags); |
| return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym); |
| } |
| |
| SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, |
| SelectionDAG &DAG) const { |
| GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); |
| const GlobalValue *GV = GN->getGlobal(); |
| unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); |
| |
| if (OpFlags != AArch64II::MO_NO_FLAG) |
| assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && |
| "unexpected offset in global node"); |
| |
| // This also catches the large code model case for Darwin, and tiny code |
| // model with got relocations. |
| if ((OpFlags & AArch64II::MO_GOT) != 0) { |
| return getGOT(GN, DAG, OpFlags); |
| } |
| |
| SDValue Result; |
| if (getTargetMachine().getCodeModel() == CodeModel::Large) { |
| Result = getAddrLarge(GN, DAG, OpFlags); |
| } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { |
| Result = getAddrTiny(GN, DAG, OpFlags); |
| } else { |
| Result = getAddr(GN, DAG, OpFlags); |
| } |
| EVT PtrVT = getPointerTy(DAG.getDataLayout()); |
| SDLoc DL(GN); |
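| // For dllimport'ed and COFF-stub symbols, the address computed above is
| // that of a pointer to the actual symbol, so an extra load is needed to
| // obtain the variable's real address.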
| if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_DLLIMPORTAUX | |
| AArch64II::MO_COFFSTUB)) |
| Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, |
| MachinePointerInfo::getGOT(DAG.getMachineFunction())); |
| return Result; |
| } |
| |
| /// Convert a TLS address reference into the correct sequence of loads |
| /// and calls to compute the variable's address (for Darwin, currently) and |
| /// return an SDValue containing the final node. |
| |
| /// Darwin only has one TLS scheme which must be capable of dealing with the |
| /// fully general situation, in the worst case. This means: |
| /// + "extern __thread" declaration. |
| /// + Defined in a possibly unknown dynamic library. |
| /// |
| /// The general system is that each __thread variable has a [3 x i64] descriptor |
| /// which contains information used by the runtime to calculate the address. The |
| /// only part of this the compiler needs to know about is the first xword, which |
| /// contains a function pointer that must be called with the address of the |
| /// entire descriptor in "x0". |
| /// |
| /// Since this descriptor may be in a different unit, in general even the |
| /// descriptor must be accessed via an indirect load. The "ideal" code sequence |
| /// is: |
| /// adrp x0, _var@TLVPPAGE |
| /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor |
| /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, |
| /// ; the function pointer |
| /// blr x1 ; Uses descriptor address in x0 |
| /// ; Address of _var is now in x0. |
| /// |
| /// If the address of _var's descriptor *is* known to the linker, then it can |
| /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for |
| /// a slight efficiency gain. |
| SDValue |
| AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, |
| SelectionDAG &DAG) const { |
| assert(Subtarget->isTargetDarwin() && |
| "This function expects a Darwin target"); |
| |
| SDLoc DL(Op); |
| MVT PtrVT = getPointerTy(DAG.getDataLayout()); |
| MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout()); |
| const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); |
| |
| SDValue TLVPAddr = |
| DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); |
| SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); |
| |
| // The first entry in the descriptor is a function pointer that we must call |
| // to obtain the address of the variable. |
| SDValue Chain = DAG.getEntryNode(); |
| SDValue FuncTLVGet = DAG.getLoad( |
| PtrMemVT, DL, Chain, DescAddr, |
| MachinePointerInfo::getGOT(DAG.getMachineFunction()), |
| Align(PtrMemVT.getSizeInBits() / 8), |
| MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); |
| Chain = FuncTLVGet.getValue(1); |
| |
| // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer. |
| FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT); |
| |
| MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); |
| MFI.setAdjustsStack(true); |
| |
| // TLS calls preserve all registers except those that absolutely must be |
| // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be |
| // silly). |
| const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); |
| const uint32_t *Mask = TRI->getTLSCallPreservedMask(); |
| if (Subtarget->hasCustomCallingConv()) |
| TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask); |
| |
| // Finally, we can make the call. This is just a degenerate version of a |
| // normal AArch64 call node: x0 takes the address of the descriptor, and |
| // returns the address of the variable in this thread. |
| Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); |
| Chain = |
| DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), |
| Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), |
| DAG.getRegisterMask(Mask), Chain.getValue(1)); |
| return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); |
| } |
| |
| /// Convert a thread-local variable reference into a sequence of instructions to |
| /// compute the variable's address for the local exec TLS model of ELF targets. |
| /// The sequence depends on the maximum TLS area size. |
| SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV, |
| SDValue ThreadBase, |
| const SDLoc &DL, |
| SelectionDAG &DAG) const { |
| EVT PtrVT = getPointerTy(DAG.getDataLayout()); |
| SDValue TPOff, Addr; |
| |
| switch (DAG.getTarget().Options.TLSSize) { |
| default: |
| llvm_unreachable("Unexpected TLS size"); |
| |
| case 12: { |
| // mrs x0, TPIDR_EL0 |
| // add x0, x0, :tprel_lo12:a |
| SDValue Var = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF); |
| return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, |
| Var, |
| DAG.getTargetConstant(0, DL, MVT::i32)), |
| 0); |
| } |
| |
| case 24: { |
| // mrs x0, TPIDR_EL0 |
| // add x0, x0, :tprel_hi12:a |
| // add x0, x0, :tprel_lo12_nc:a |
| SDValue HiVar = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); |
| SDValue LoVar = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, |
| AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); |
| Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, |
| HiVar, |
| DAG.getTargetConstant(0, DL, MVT::i32)), |
| 0); |
| return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr, |
| LoVar, |
| DAG.getTargetConstant(0, DL, MVT::i32)), |
| 0); |
| } |
| |
| case 32: { |
| // mrs x1, TPIDR_EL0 |
| // movz x0, #:tprel_g1:a |
| // movk x0, #:tprel_g0_nc:a |
| // add x0, x1, x0 |
| SDValue HiVar = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1); |
| SDValue LoVar = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, |
| AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); |
| TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, |
| DAG.getTargetConstant(16, DL, MVT::i32)), |
| 0); |
| TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, |
| DAG.getTargetConstant(0, DL, MVT::i32)), |
| 0); |
| return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); |
| } |
| |
| case 48: { |
| // mrs x1, TPIDR_EL0 |
| // movz x0, #:tprel_g2:a |
| // movk x0, #:tprel_g1_nc:a |
| // movk x0, #:tprel_g0_nc:a |
| // add x0, x1, x0 |
| SDValue HiVar = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2); |
| SDValue MiVar = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, |
| AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC); |
| SDValue LoVar = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, |
| AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); |
| TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, |
| DAG.getTargetConstant(32, DL, MVT::i32)), |
| 0); |
| TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar, |
| DAG.getTargetConstant(16, DL, MVT::i32)), |
| 0); |
| TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, |
| DAG.getTargetConstant(0, DL, MVT::i32)), |
| 0); |
| return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); |
| } |
| } |
| } |
| |
| /// When accessing thread-local variables under either the general-dynamic or |
| /// local-dynamic system, we make a "TLS-descriptor" call. The variable will |
| /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry |
| /// is a function pointer to carry out the resolution. |
| /// |
| /// The sequence is: |
| /// adrp x0, :tlsdesc:var |
| /// ldr x1, [x0, #:tlsdesc_lo12:var] |
| /// add x0, x0, #:tlsdesc_lo12:var |
| /// .tlsdesccall var |
| /// blr x1 |
| /// (TPIDR_EL0 offset now in x0) |
| /// |
| /// The above sequence must be produced unscheduled, to enable the linker to |
| /// optimize/relax this sequence. |
| /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the |
| /// above sequence, and expanded really late in the compilation flow, to ensure |
| /// the sequence is produced as per above. |
| SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, |
| const SDLoc &DL, |
| SelectionDAG &DAG) const { |
| EVT PtrVT = getPointerTy(DAG.getDataLayout()); |
| |
| SDValue Chain = DAG.getEntryNode(); |
| SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); |
| |
| Chain = |
| DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr}); |
| SDValue Glue = Chain.getValue(1); |
| |
| return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); |
| } |
| |
| SDValue |
| AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, |
| SelectionDAG &DAG) const { |
| assert(Subtarget->isTargetELF() && "This function expects an ELF target"); |
| |
| const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); |
| |
| TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); |
| |
| if (!EnableAArch64ELFLocalDynamicTLSGeneration) { |
| if (Model == TLSModel::LocalDynamic) |
| Model = TLSModel::GeneralDynamic; |
| } |
| |
| if (getTargetMachine().getCodeModel() == CodeModel::Large && |
| Model != TLSModel::LocalExec) |
| report_fatal_error("ELF TLS only supported in small memory model or " |
| "in local exec TLS model"); |
| // Different choices can be made for the maximum size of the TLS area for a |
| // module. For the small address model, the default TLS size is 16MiB and the |
| // maximum TLS size is 4GiB. |
| // FIXME: add tiny and large code model support for TLS access models other |
| // than local exec. We currently generate the same code as small for tiny, |
| // which may be larger than needed. |
| |
| SDValue TPOff; |
| EVT PtrVT = getPointerTy(DAG.getDataLayout()); |
| SDLoc DL(Op); |
| const GlobalValue *GV = GA->getGlobal(); |
| |
| SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); |
| |
| if (Model == TLSModel::LocalExec) { |
| return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG); |
| } else if (Model == TLSModel::InitialExec) { |
| TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); |
| TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); |
| } else if (Model == TLSModel::LocalDynamic) { |
| // Local-dynamic accesses proceed in two phases. A general-dynamic TLS |
| // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate |
| // the beginning of the module's TLS region, followed by a DTPREL offset |
| // calculation. |
| |
| // These accesses will need deduplicating if there's more than one. |
| AArch64FunctionInfo *MFI = |
| DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); |
| MFI->incNumLocalDynamicTLSAccesses(); |
| |
| // The call needs a relocation too for linker relaxation. It doesn't make |
| // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of |
| // the address. |
| SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, |
| AArch64II::MO_TLS); |
| |
| // Now we can calculate the offset from TPIDR_EL0 to this module's |
| // thread-local area. |
| TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); |
| |
| // Now use :dtprel_whatever: operations to calculate this variable's offset |
| // in its thread-storage area. |
| SDValue HiVar = DAG.getTargetGlobalAddress( |
| GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); |
| SDValue LoVar = DAG.getTargetGlobalAddress( |
| GV, DL, MVT::i64, 0, |
| AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); |
| |
| TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, |
| DAG.getTargetConstant(0, DL, MVT::i32)), |
| 0); |
| TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, |
| DAG.getTargetConstant(0, DL, MVT::i32)), |
| 0); |
| } else if (Model == TLSModel::GeneralDynamic) { |
| // The call needs a relocation too for linker relaxation. It doesn't make |
| // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of |
| // the address. |
| SDValue SymAddr = |
| DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); |
| |
| // Finally we can make a call to calculate the offset from tpidr_el0. |
| TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); |
| } else |
| llvm_unreachable("Unsupported ELF TLS access model"); |
| |
| return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); |
| } |
| |
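| // Windows TLS lowering, in outline: read the TEB from x18, load the TLS
| // array pointer stored at offset 0x58, load the module's _tls_index, index
| // the array by _tls_index * 8 to get this module's TLS block, then add the
| // variable's offset from the section base using hi12/lo12-style additions.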
| SDValue |
| AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op, |
| SelectionDAG &DAG) const { |
| assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); |
| |
| SDValue Chain = DAG.getEntryNode(); |
| EVT PtrVT = getPointerTy(DAG.getDataLayout()); |
| SDLoc DL(Op); |
| |
| SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64); |
| |
| // Load the ThreadLocalStoragePointer from the TEB |
| // A pointer to the TLS array is located at offset 0x58 from the TEB. |
| SDValue TLSArray = |
| DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL)); |
| TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); |
| Chain = TLSArray.getValue(1); |
| |
| // Load the TLS index from the C runtime; |
| // This does the same as getAddr(), but without having a GlobalAddressSDNode. |
| // This also does the same as LOADgot, but using a generic i32 load, |
| // while LOADgot only loads i64. |
| SDValue TLSIndexHi = |
| DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE); |
| SDValue TLSIndexLo = DAG.getTargetExternalSymbol( |
| "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); |
| SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi); |
| SDValue TLSIndex = |
| DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo); |
| TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo()); |
| Chain = TLSIndex.getValue(1); |
| |
| // The pointer to the thread's TLS data area is found at an offset of
| // TLSIndex * 8 bytes into the TLS array.
| TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex); |
| SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, |
| DAG.getConstant(3, DL, PtrVT)); |
| SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, |
| DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), |
| MachinePointerInfo()); |
| Chain = TLS.getValue(1); |
| |
| const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); |
| const GlobalValue *GV = GA->getGlobal(); |
| SDValue TGAHi = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); |
| SDValue TGALo = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, |
| AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); |
| |
| // Add the offset from the start of the .tls section (section base). |
| SDValue Addr = |
| SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi, |
| DAG.getTargetConstant(0, DL, MVT::i32)), |
| 0); |
| Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo); |
| return Addr; |
| } |
| |
| SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, |
| SelectionDAG &DAG) const { |
| const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); |
| if (DAG.getTarget().useEmulatedTLS()) |
| return LowerToTLSEmulatedModel(GA, DAG); |
| |
| if (Subtarget->isTargetDarwin()) |
| return LowerDarwinGlobalTLSAddress(Op, DAG); |
| if (Subtarget->isTargetELF()) |
| return LowerELFGlobalTLSAddress(Op, DAG); |
| if (Subtarget->isTargetWindows()) |
| return LowerWindowsGlobalTLSAddress(Op, DAG); |
| |
| llvm_unreachable("Unexpected platform trying to use TLS"); |
| } |
| |
| // Looks through \param Val to determine the bit that can be used to |
| // check the sign of the value. It returns the unextended value and |
| // the sign bit position. |
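| // For example, for (sign_extend_inreg x, i8) this returns {x, 7}, letting
| // the callers below test the original sign bit with TBZ/TBNZ.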
| std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) { |
| if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG) |
| return {Val.getOperand(0), |
| cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() - |
| 1}; |
| |
| if (Val.getOpcode() == ISD::SIGN_EXTEND) |
| return {Val.getOperand(0), |
| Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1}; |
| |
| return {Val, Val.getValueSizeInBits() - 1}; |
| } |
| |
| SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { |
| SDValue Chain = Op.getOperand(0); |
| ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); |
| SDValue LHS = Op.getOperand(2); |
| SDValue RHS = Op.getOperand(3); |
| SDValue Dest = Op.getOperand(4); |
| SDLoc dl(Op); |
| |
| MachineFunction &MF = DAG.getMachineFunction(); |
| // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions |
| // will not be produced, as they are conditional branch instructions that do |
| // not set flags. |
| bool ProduceNonFlagSettingCondBr = |
| !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); |
| |
| // Handle f128 first, since lowering it will result in comparing the return |
| // value of a libcall against zero, which is just what the rest of LowerBR_CC |
| // is expecting to deal with. |
| if (LHS.getValueType() == MVT::f128) { |
| softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); |
| |
| // If softenSetCCOperands returned a scalar, we need to compare the result |
| // against zero to select between true and false values. |
| if (!RHS.getNode()) { |
| RHS = DAG.getConstant(0, dl, LHS.getValueType()); |
| CC = ISD::SETNE; |
| } |
| } |
| |
| // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch |
| // instruction. |
| if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && |
| (CC == ISD::SETEQ || CC == ISD::SETNE)) { |
| // Only lower legal XALUO ops. |
| if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) |
| return SDValue(); |
| |
| // The actual operation with overflow check. |
| AArch64CC::CondCode OFCC; |
| SDValue Value, Overflow; |
| std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); |
| |
| if (CC == ISD::SETNE) |
| OFCC = getInvertedCondCode(OFCC); |
| SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32); |
| |
| return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, |
| Overflow); |
| } |
| |
| if (LHS.getValueType().isInteger()) { |
| assert((LHS.getValueType() == RHS.getValueType()) && |
| (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); |
| |
| // If the RHS of the comparison is zero, we can potentially fold this |
| // to a specialized branch. |
| const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS); |
| if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) { |
| if (CC == ISD::SETEQ) { |
| // See if we can use a TBZ to fold in an AND as well. |
| // TBZ has a smaller branch displacement than CBZ. If the offset is |
| // out of bounds, a late MI-layer pass rewrites branches. |
| // 403.gcc is an example that hits this case. |
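| // For example, (br_cc seteq (and x, 8), 0, dest) becomes (tbz x, #3, dest).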
| if (LHS.getOpcode() == ISD::AND && |
| isa<ConstantSDNode>(LHS.getOperand(1)) && |
| isPowerOf2_64(LHS.getConstantOperandVal(1))) { |
| SDValue Test = LHS.getOperand(0); |
| uint64_t Mask = LHS.getConstantOperandVal(1); |
| return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, |
| DAG.getConstant(Log2_64(Mask), dl, MVT::i64), |
| Dest); |
| } |
| |
| return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); |
| } else if (CC == ISD::SETNE) { |
| // See if we can use a TBZ to fold in an AND as well. |
| // TBZ has a smaller branch displacement than CBZ. If the offset is |
| // out of bounds, a late MI-layer pass rewrites branches. |
| // 403.gcc is an example that hits this case. |
| if (LHS.getOpcode() == ISD::AND && |
| isa<ConstantSDNode>(LHS.getOperand(1)) && |
| isPowerOf2_64(LHS.getConstantOperandVal(1))) { |
| SDValue Test = LHS.getOperand(0); |
| uint64_t Mask = LHS.getConstantOperandVal(1); |
| return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, |
| DAG.getConstant(Log2_64(Mask), dl, MVT::i64), |
| Dest); |
| } |
| |
| return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); |
| } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { |
| // Don't combine AND since emitComparison converts the AND to an ANDS |
| // (a.k.a. TST) and the test in the test bit and branch instruction |
| // becomes redundant. This would also increase register pressure. |
| uint64_t SignBitPos; |
| std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); |
| return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, |
| DAG.getConstant(SignBitPos, dl, MVT::i64), Dest); |
| } |
| } |
| if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && |
| LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) { |
| // Don't combine AND since emitComparison converts the AND to an ANDS |
| // (a.k.a. TST) and the test in the test bit and branch instruction |
| // becomes redundant. This would also increase register pressure. |
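| // For example, (brcond (setgt x, -1), dest) with i64 x becomes:
| //   tbz x, #63, dest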
| uint64_t SignBitPos; |
| std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); |
| return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, |
| DAG.getConstant(SignBitPos, dl, MVT::i64), Dest); |
| } |
| |
| SDValue CCVal; |
| SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); |
| return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, |
| Cmp); |
| } |
| |
| assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 || |
| LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); |
| |
| // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally |
| // clean. Some of them require two branches to implement. |
| SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); |
| AArch64CC::CondCode CC1, CC2; |
| changeFPCCToAArch64CC(CC, CC1, CC2); |
| SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); |
| SDValue BR1 = |
| DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); |
| if (CC2 != AArch64CC::AL) { |
| SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); |
| return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, |
| Cmp); |
| } |
| |
| return BR1; |
| } |
| |
| SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, |
| SelectionDAG &DAG) const { |
| if (!Subtarget->hasNEON()) |
| return SDValue(); |
| |
| EVT VT = Op.getValueType(); |
| EVT IntVT = VT.changeTypeToInteger(); |
| SDLoc DL(Op); |
| |
| SDValue In1 = Op.getOperand(0); |
| SDValue In2 = Op.getOperand(1); |
| EVT SrcVT = In2.getValueType(); |
| |
| if (!SrcVT.bitsEq(VT)) |
| In2 = DAG.getFPExtendOrRound(In2, DL, VT); |
| |
| if (VT.isScalableVector()) |
| IntVT = |
| getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger()); |
| |
| if (VT.isFixedLengthVector() && |
| useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) { |
| EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); |
| |
| In1 = convertToScalableVector(DAG, ContainerVT, In1); |
| In2 = convertToScalableVector(DAG, ContainerVT, In2); |
| |
| SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2); |
| return convertFromScalableVector(DAG, VT, Res); |
| } |
| |
| auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) { |
| if (VT.isScalableVector()) |
| return getSVESafeBitCast(VT, Op, DAG); |
| |
| return DAG.getBitcast(VT, Op); |
| }; |
| |
| SDValue VecVal1, VecVal2; |
| EVT VecVT; |
| auto SetVecVal = [&](int Idx = -1) { |
| if (!VT.isVector()) { |
| VecVal1 = |
| DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1); |
| VecVal2 = |
| DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2); |
| } else { |
| VecVal1 = BitCast(VecVT, In1, DAG); |
| VecVal2 = BitCast(VecVT, In2, DAG); |
| } |
| }; |
| if (VT.isVector()) { |
| VecVT = IntVT; |
| SetVecVal(); |
| } else if (VT == MVT::f64) { |
| VecVT = MVT::v2i64; |
| SetVecVal(AArch64::dsub); |
| } else if (VT == MVT::f32) { |
| VecVT = MVT::v4i32; |
| SetVecVal(AArch64::ssub); |
| } else if (VT == MVT::f16) { |
| VecVT = MVT::v8i16; |
| SetVecVal(AArch64::hsub); |
| } else { |
| llvm_unreachable("Invalid type for copysign!"); |
| } |
| |
| unsigned BitWidth = In1.getScalarValueSizeInBits(); |
| SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT); |
| |
| // We want to materialize a mask with every bit but the high bit set, but the |
| // AdvSIMD immediate moves cannot materialize that in a single instruction for |
| // 64-bit elements. Instead, materialize all bits set and then negate that. |
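| // For example, for v2f64 the mask is built roughly as:
| //   movi v1.2d, #0xffffffffffffffff
| //   fneg v1.2d, v1.2d
| // which leaves 0x7fffffffffffffff in each lane.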
| if (VT == MVT::f64 || VT == MVT::v2f64) { |
| SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT); |
| SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV); |
| SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV); |
| SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV); |
| } |
| |
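| // BSP(mask, x, y) selects bits from x where the mask bit is set and from y
| // where it is clear, so with every bit but the sign bit set in the mask the
| // result takes its magnitude from In1 and its sign from In2.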
| SDValue BSP = |
| DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2); |
| if (VT == MVT::f16) |
| return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP); |
| if (VT == MVT::f32) |
| return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP); |
| if (VT == MVT::f64) |
| return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP); |
| |
| return BitCast(VT, BSP, DAG); |
| } |
| |
| SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op, |
| SelectionDAG &DAG) const { |
| if (DAG.getMachineFunction().getFunction().hasFnAttribute( |
| Attribute::NoImplicitFloat)) |
| return SDValue(); |
| |
| if (!Subtarget->hasNEON()) |
| return SDValue(); |
| |
| bool IsParity = Op.getOpcode() == ISD::PARITY; |
| SDValue Val = Op.getOperand(0); |
| SDLoc DL(Op); |
| EVT VT = Op.getValueType(); |
| |
| // For i32, the general parity lowering using EORs is more efficient than
| // going through the floating-point/SIMD registers.
| if (VT == MVT::i32 && IsParity) |
| return SDValue(); |
| |
| // Without a scalar popcount instruction, GPR popcount can be more
| // efficiently lowered to the following sequence that uses AdvSIMD
| // registers/instructions, as long as the copies to/from the AdvSIMD
| // registers are cheap.
| // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd |
| // CNT V0.8B, V0.8B // 8xbyte pop-counts |
| // ADDV B0, V0.8B // sum 8xbyte pop-counts |
| // UMOV X0, V0.B[0] // copy byte result back to integer reg |
| if (VT == MVT::i32 || VT == MVT::i64) { |
| if (VT == MVT::i32) |
| Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); |
| Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); |
| |
| SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); |
| SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop); |
| UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV, |
| DAG.getConstant(0, DL, MVT::i64)); |
| |
| if (IsParity) |
| UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV, |
| DAG.getConstant(1, DL, MVT::i32)); |
| |
| if (VT == MVT::i64) |
| UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); |
| return UaddLV; |
| } else if (VT == MVT::i128) { |
| Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val); |
| |
| SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val); |
| SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop); |
| UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV, |
| DAG.getConstant(0, DL, MVT::i64)); |
| |
| if (IsParity) |
| UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV, |
| DAG.getConstant(1, DL, MVT::i32)); |
| |
| return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV); |
| } |
| |
| assert(!IsParity && "ISD::PARITY of vector types not supported"); |
| |
| if (VT.isScalableVector() || |
| useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU); |
| |
| assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || |
| VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && |
| "Unexpected type for custom ctpop lowering"); |
| |
| EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; |
| Val = DAG.getBitcast(VT8Bit, Val); |
| Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val); |
| |
| // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. |
| unsigned EltSize = 8; |
| unsigned NumElts = VT.is64BitVector() ? 8 : 16; |
| while (EltSize != VT.getScalarSizeInBits()) { |
| EltSize *= 2; |
| NumElts /= 2; |
| MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); |
| Val = DAG.getNode( |
| ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, |
| DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val); |
| } |
| |
| return Val; |
| } |
| |
| SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { |
| EVT VT = Op.getValueType(); |
| assert(VT.isScalableVector() || |
| useSVEForFixedLengthVectorVT( |
| VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())); |
| |
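| // Lower CTTZ as CTLZ of the bit-reversed input: cttz(x) == ctlz(rbit(x)).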
| SDLoc DL(Op); |
| SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0)); |
| return DAG.getNode(ISD::CTLZ, DL, VT, RBIT); |
| } |
| |
| SDValue AArch64TargetLowering::LowerMinMax(SDValue Op, |
| SelectionDAG &DAG) const { |
| |
| EVT VT = Op.getValueType(); |
| SDLoc DL(Op); |
| unsigned Opcode = Op.getOpcode(); |
| ISD::CondCode CC; |
| switch (Opcode) { |
| default: |
| llvm_unreachable("Wrong instruction"); |
| case ISD::SMAX: |
| CC = ISD::SETGT; |
| break; |
| case ISD::SMIN: |
| CC = ISD::SETLT; |
| break; |
| case ISD::UMAX: |
| CC = ISD::SETUGT; |
| break; |
| case ISD::UMIN: |
| CC = ISD::SETULT; |
| break; |
| } |
| |
| if (VT.isScalableVector() || |
| useSVEForFixedLengthVectorVT( |
| VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) { |
| switch (Opcode) { |
| default: |
| llvm_unreachable("Wrong instruction"); |
| case ISD::SMAX: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED); |
| case ISD::SMIN: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED); |
| case ISD::UMAX: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED); |
| case ISD::UMIN: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED); |
| } |
| } |
| |
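| // Otherwise fall back to a compare and select, e.g. smax(a, b) becomes
| // (select (setgt a, b), a, b).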
| SDValue Op0 = Op.getOperand(0); |
| SDValue Op1 = Op.getOperand(1); |
| SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC); |
| return DAG.getSelect(DL, VT, Cond, Op0, Op1); |
| } |
| |
| SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op, |
| SelectionDAG &DAG) const { |
| EVT VT = Op.getValueType(); |
| |
| if (VT.isScalableVector() || |
| useSVEForFixedLengthVectorVT( |
| VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU); |
| |
| SDLoc DL(Op); |
| SDValue REVB; |
| MVT VST; |
| |
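| // For NEON, bit-reverse wide elements by first reversing the bytes within
| // each element (REV32/REV64) and then bit-reversing each byte (BITREVERSE
| // on the v8i8/v16i8 form, i.e. RBIT).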
| switch (VT.getSimpleVT().SimpleTy) { |
| default: |
| llvm_unreachable("Invalid type for bitreverse!"); |
| |
| case MVT::v2i32: { |
| VST = MVT::v8i8; |
| REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0)); |
| |
| break; |
| } |
| |
| case MVT::v4i32: { |
| VST = MVT::v16i8; |
| REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0)); |
| |
| break; |
| } |
| |
| case MVT::v1i64: { |
| VST = MVT::v8i8; |
| REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0)); |
| |
| break; |
| } |
| |
| case MVT::v2i64: { |
| VST = MVT::v16i8; |
| REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0)); |
| |
| break; |
| } |
| } |
| |
| return DAG.getNode(AArch64ISD::NVCAST, DL, VT, |
| DAG.getNode(ISD::BITREVERSE, DL, VST, REVB)); |
| } |
| |
| // Check whether N forms a continuous comparison chain: a tree of ORs whose
| // leaves are XORs.
| static bool |
| isOrXorChain(SDValue N, unsigned &Num, |
| SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) { |
| if (Num == MaxXors) |
| return false; |
| |
| // Skip the one-use zext |
| if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse()) |
| N = N->getOperand(0); |
| |
| // The leaf node must be XOR |
| if (N->getOpcode() == ISD::XOR) { |
| WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1))); |
| Num++; |
| return true; |
| } |
| |
| // All the non-leaf nodes must be OR. |
| if (N->getOpcode() != ISD::OR || !N->hasOneUse()) |
| return false; |
| |
| if (isOrXorChain(N->getOperand(0), Num, WorkList) && |
| isOrXorChain(N->getOperand(1), Num, WorkList)) |
| return true; |
| return false; |
| } |
| |
| // Transform chains of ORs and XORs, which are usually produced by
| // memcmp/bcmp expansions.
| static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) { |
| SDValue LHS = N->getOperand(0); |
| SDValue RHS = N->getOperand(1); |
| SDLoc DL(N); |
| EVT VT = N->getValueType(0); |
| SmallVector<std::pair<SDValue, SDValue>, 16> WorkList; |
| |
| // Only handle integer compares. |
| if (N->getOpcode() != ISD::SETCC) |
| return SDValue(); |
| |
| ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get(); |
| // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as: |
| // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag |
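| // For example, (setcc (or (xor a, b), (xor c, d)), 0, eq) is rebuilt as
| // (and (setcc a, b, eq), (setcc c, d, eq)), which later folds into a
| // CMP/CCMP/CSET sequence.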
| unsigned NumXors = 0; |
| if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) && |
| LHS->getOpcode() == ISD::OR && LHS->hasOneUse() && |
| isOrXorChain(LHS, NumXors, WorkList)) { |
| SDValue XOR0, XOR1; |
| std::tie(XOR0, XOR1) = WorkList[0]; |
| unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR; |
| SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond); |
| for (unsigned I = 1; I < WorkList.size(); I++) { |
| std::tie(XOR0, XOR1) = WorkList[I]; |
| SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond); |
| Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain); |
| } |
| |
| // Exit early by inverting the condition, which helps reduce indentation.
| return Cmp; |
| } |
| |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { |
| |
| if (Op.getValueType().isVector()) |
| return LowerVSETCC(Op, DAG); |
| |
| bool IsStrict = Op->isStrictFPOpcode(); |
| bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; |
| unsigned OpNo = IsStrict ? 1 : 0; |
| SDValue Chain; |
| if (IsStrict) |
| Chain = Op.getOperand(0); |
| SDValue LHS = Op.getOperand(OpNo + 0); |
| SDValue RHS = Op.getOperand(OpNo + 1); |
| ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get(); |
| SDLoc dl(Op); |
| |
| // We chose ZeroOrOneBooleanContents, so use zero and one. |
| EVT VT = Op.getValueType(); |
| SDValue TVal = DAG.getConstant(1, dl, VT); |
| SDValue FVal = DAG.getConstant(0, dl, VT); |
| |
| // Handle f128 first, since one possible outcome is a normal integer |
| // comparison which gets picked up by the next if statement. |
| if (LHS.getValueType() == MVT::f128) { |
| softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain, |
| IsSignaling); |
| |
| // If softenSetCCOperands returned a scalar, use it. |
| if (!RHS.getNode()) { |
| assert(LHS.getValueType() == Op.getValueType() && |
| "Unexpected setcc expansion!"); |
| return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS; |
| } |
| } |
| |
| if (LHS.getValueType().isInteger()) { |
| SDValue CCVal; |
| SDValue Cmp = getAArch64Cmp( |
| LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl); |
| |
| // Note that we inverted the condition above, so we reverse the order of |
| // the true and false operands here. This will allow the setcc to be |
| // matched to a single CSINC instruction. |
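| // For example, (setcc x, y, eq) becomes:
| //   cmp x, y
| //   cset w0, eq      // cset is an alias of csinc wd, wzr, wzr, invert(cc)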
| SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); |
| return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; |
| } |
| |
| // Now we know we're dealing with FP values. |
| assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || |
| LHS.getValueType() == MVT::f64); |
| |
| // For FP values we need to perform an FCMP followed by one or two CSELs.
| // Go ahead and do the comparison.
| SDValue Cmp; |
| if (IsStrict) |
| Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling); |
| else |
| Cmp = emitComparison(LHS, RHS, CC, dl, DAG); |
| |
| AArch64CC::CondCode CC1, CC2; |
| changeFPCCToAArch64CC(CC, CC1, CC2); |
| SDValue Res; |
| if (CC2 == AArch64CC::AL) { |
| changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1, |
| CC2); |
| SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); |
| |
| // Note that we inverted the condition above, so we reverse the order of |
| // the true and false operands here. This will allow the setcc to be |
| // matched to a single CSINC instruction. |
| Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); |
| } else { |
| // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't |
| // totally clean. Some of them require two CSELs to implement. In this
| // case, we emit the first CSEL and then emit a second using the output
| // of the first as the RHS. We're effectively OR'ing the two CC's together. |
| |
| // FIXME: It would be nice if we could match the two CSELs to two CSINCs. |
| SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); |
| SDValue CS1 = |
| DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); |
| |
| SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); |
| Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); |
| } |
| return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res; |
| } |
| |
| SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op, |
| SelectionDAG &DAG) const { |
| |
| SDValue LHS = Op.getOperand(0); |
| SDValue RHS = Op.getOperand(1); |
| EVT VT = LHS.getValueType(); |
| if (VT != MVT::i32 && VT != MVT::i64) |
| return SDValue(); |
| |
| SDLoc DL(Op); |
| SDValue Carry = Op.getOperand(2); |
| // SBCS uses a carry not a borrow so the carry flag should be inverted first. |
| SDValue InvCarry = valueToCarryFlag(Carry, DAG, true); |
| SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue), |
| LHS, RHS, InvCarry); |
| |
| EVT OpVT = Op.getValueType(); |
| SDValue TVal = DAG.getConstant(1, DL, OpVT); |
| SDValue FVal = DAG.getConstant(0, DL, OpVT); |
| |
| ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get(); |
| ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT); |
| SDValue CCVal = |
| DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32); |
| // Inputs are swapped because the condition is inverted. This will allow |
| // matching with a single CSINC instruction. |
| return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal, |
| Cmp.getValue(1)); |
| } |
| |
| SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, |
| SDValue RHS, SDValue TVal, |
| SDValue FVal, const SDLoc &dl, |
| SelectionDAG &DAG) const { |
| // Handle f128 first, because it will result in a comparison of some RTLIB |
| // call result against zero. |
| if (LHS.getValueType() == MVT::f128) { |
| softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); |
| |
| // If softenSetCCOperands returned a scalar, we need to compare the result |
| // against zero to select between true and false values. |
| if (!RHS.getNode()) { |
| RHS = DAG.getConstant(0, dl, LHS.getValueType()); |
| CC = ISD::SETNE; |
| } |
| } |
| |
| // Also handle f16, for which we need to do a f32 comparison. |
| if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { |
| LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); |
| RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); |
| } |
| |
| // Next, handle integers. |
| if (LHS.getValueType().isInteger()) { |
| assert((LHS.getValueType() == RHS.getValueType()) && |
| (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); |
| |
| ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); |
| ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); |
| ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS); |
| // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
| // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
| // supported types.
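| // For example, for i32: (select (setgt x, -1), 1, -1) becomes
| // (or (sra x, 31), 1).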
| if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal && |
| CTVal->isOne() && CFVal->isAllOnes() && |
| LHS.getValueType() == TVal.getValueType()) { |
| EVT VT = LHS.getValueType(); |
| SDValue Shift = |
| DAG.getNode(ISD::SRA, dl, VT, LHS, |
| DAG.getConstant(VT.getSizeInBits() - 1, dl, VT)); |
| return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT)); |
| } |
| |
| // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns. |
| // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1)) |
| // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1)) |
| // Both require fewer instructions than a compare and conditional select.
| if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal && |
| RHSC && RHSC->isZero() && CFVal && CFVal->isZero() && |
| LHS.getValueType() == RHS.getValueType()) { |
| EVT VT = LHS.getValueType(); |
| SDValue Shift = |
| DAG.getNode(ISD::SRA, dl, VT, LHS, |
| DAG.getConstant(VT.getSizeInBits() - 1, dl, VT)); |
| |
| if (CC == ISD::SETGT) |
| Shift = DAG.getNOT(dl, Shift, VT); |
| |
| return DAG.getNode(ISD::AND, dl, VT, LHS, Shift); |
| } |
| |
| unsigned Opcode = AArch64ISD::CSEL; |
| |
| // If both the TVal and the FVal are constants, see if we can swap them in
| // order to form a CSINV or CSINC out of them.
| if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) { |
| std::swap(TVal, FVal); |
| std::swap(CTVal, CFVal); |
| CC = ISD::getSetCCInverse(CC, LHS.getValueType()); |
| } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) { |
| std::swap(TVal, FVal); |
| std::swap(CTVal, CFVal); |
| CC = ISD::getSetCCInverse(CC, LHS.getValueType()); |
| } else if (TVal.getOpcode() == ISD::XOR) { |
| // If TVal is a NOT we want to swap TVal and FVal so that we can match |
| // with a CSINV rather than a CSEL. |
| if (isAllOnesConstant(TVal.getOperand(1))) { |
| std::swap(TVal, FVal); |
| std::swap(CTVal, CFVal); |
| CC = ISD::getSetCCInverse(CC, LHS.getValueType()); |
| } |
| } else if (TVal.getOpcode() == ISD::SUB) { |
| // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so |
| // that we can match with a CSNEG rather than a CSEL. |
| if (isNullConstant(TVal.getOperand(0))) { |
| std::swap(TVal, FVal); |
| std::swap(CTVal, CFVal); |
| CC = ISD::getSetCCInverse(CC, LHS.getValueType()); |
| } |
| } else if (CTVal && CFVal) { |
| const int64_t TrueVal = CTVal->getSExtValue(); |
| const int64_t FalseVal = CFVal->getSExtValue(); |
| bool Swap = false; |
| |
| // If both TVal and FVal are constants, see if FVal is the |
| // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC |
| // instead of a CSEL in that case. |
| if (TrueVal == ~FalseVal) { |
| Opcode = AArch64ISD::CSINV; |
| } else if (FalseVal > std::numeric_limits<int64_t>::min() && |
| TrueVal == -FalseVal) { |
| Opcode = AArch64ISD::CSNEG; |
| } else if (TVal.getValueType() == MVT::i32) { |
| // If our operands are only 32-bit wide, make sure we use 32-bit |
| // arithmetic for the check whether we can use CSINC. This ensures that |
| // the addition in the check will wrap around properly in case there is |
| // an overflow (which would not be the case if we do the check with |
| // 64-bit arithmetic). |
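| // For example, with i32 TVal == INT32_MIN and FVal == INT32_MAX,
| // TrueVal32 == FalseVal32 + 1 holds after wrapping, so a CSINC applies,
| // whereas the sign-extended 64-bit values would fail the check.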
| const uint32_t TrueVal32 = CTVal->getZExtValue(); |
| const uint32_t FalseVal32 = CFVal->getZExtValue(); |
| |
| if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { |
| Opcode = AArch64ISD::CSINC; |
| |
| if (TrueVal32 > FalseVal32) { |
| Swap = true; |
| } |
| } |
| } else { |
| // 64-bit check whether we can use CSINC. |
| const uint64_t TrueVal64 = TrueVal; |
| const uint64_t FalseVal64 = FalseVal; |
| |
| if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) { |
| Opcode = AArch64ISD::CSINC; |
| |
| if (TrueVal > FalseVal) { |
| Swap = true; |
| } |
| } |
| } |
| |
| // Swap TVal and FVal if necessary. |
| if (Swap) { |
| std::swap(TVal, FVal); |
| std::swap(CTVal, CFVal); |
| CC = ISD::getSetCCInverse(CC, LHS.getValueType()); |
| } |
| |
| if (Opcode != AArch64ISD::CSEL) { |
| // Drop FVal since we can get its value by simply inverting/negating |
| // TVal. |
| FVal = TVal; |
| } |
| } |
| |
| // Avoid materializing a constant when possible by reusing a known value in |
| // a register. However, don't perform this optimization if the known value |
| // is one, zero or negative one in the case of a CSEL. We can always |
| // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the |
| // FVal, respectively. |
| ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS); |
| if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() && |
| !RHSVal->isZero() && !RHSVal->isAllOnes()) { |
| AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); |
| // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to |
| // "a != C ? x : a" to avoid materializing C. |
| if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ) |
| TVal = LHS; |
| else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE) |
| FVal = LHS; |
| } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) { |
| assert (CTVal && CFVal && "Expected constant operands for CSNEG."); |
| // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to |
| // avoid materializing C. |
| AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); |
| if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) { |
| Opcode = AArch64ISD::CSINV; |
| TVal = LHS; |
| FVal = DAG.getConstant(0, dl, FVal.getValueType()); |
| } |
| } |
| |
| SDValue CCVal; |
| SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); |
| EVT VT = TVal.getValueType(); |
| return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); |
| } |
| |
| // Now we know we're dealing with FP values. |
| assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || |
| LHS.getValueType() == MVT::f64); |
| assert(LHS.getValueType() == RHS.getValueType()); |
| EVT VT = TVal.getValueType(); |
| SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); |
| |
| // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally |
| // clean. Some of them require two CSELs to implement. |
| AArch64CC::CondCode CC1, CC2; |
| changeFPCCToAArch64CC(CC, CC1, CC2); |
| |
| if (DAG.getTarget().Options.UnsafeFPMath) { |
| // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and |
| // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0. |
| ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS); |
| if (RHSVal && RHSVal->isZero()) { |
| ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal); |
| ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal); |
| |
| if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) && |
| CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType()) |
| TVal = LHS; |
| else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) && |
| CFVal && CFVal->isZero() && |
| FVal.getValueType() == LHS.getValueType()) |
| FVal = LHS; |
| } |
| } |
| |
| // Emit first, and possibly only, CSEL. |
| SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); |
| SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); |
| |
| // If we need a second CSEL, emit it, using the output of the first as the |
| // RHS. We're effectively OR'ing the two CC's together. |
| if (CC2 != AArch64CC::AL) { |
| SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); |
| return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); |
| } |
| |
| // Otherwise, return the output of the first CSEL. |
| return CS1; |
| } |
| |
| SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op, |
| SelectionDAG &DAG) const { |
| EVT Ty = Op.getValueType(); |
| auto Idx = Op.getConstantOperandAPInt(2); |
| int64_t IdxVal = Idx.getSExtValue(); |
| assert(Ty.isScalableVector() && |
| "Only expect scalable vectors for custom lowering of VECTOR_SPLICE"); |
| |
| // We can use the splice instruction for certain index values where we are |
| // able to efficiently generate the correct predicate. The index will be |
| // inverted and used directly as the input to the ptrue instruction, i.e. |
| // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the |
| // splice predicate. However, we can only do this if we can guarantee that |
| // there are enough elements in the vector, hence we check the index <= min |
| // number of elements. |
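| // For example, for nxv4i32 and IdxVal == -2 this becomes roughly:
| //   ptrue  p0.s, vl2
| //   rev    p0.s, p0.s
| //   splice z0.s, p0, z0.s, z1.s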
| std::optional<unsigned> PredPattern; |
| if (Ty.isScalableVector() && IdxVal < 0 && |
| (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) != |
| std::nullopt) { |
| SDLoc DL(Op); |
| |
| // Create a predicate where all but the last -IdxVal elements are false. |
| EVT PredVT = Ty.changeVectorElementType(MVT::i1); |
| SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern); |
| Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred); |
| |
| // Now splice the two inputs together using the predicate. |
| return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0), |
| Op.getOperand(1)); |
| } |
| |
| // This will select to an EXT instruction, which has a maximum immediate
| // value of 255 bytes, hence any index below 2048 bits can be lowered this
| // way.
| if (IdxVal >= 0 && |
| IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits())) |
| return Op; |
| |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, |
| SelectionDAG &DAG) const { |
| ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); |
| SDValue LHS = Op.getOperand(0); |
| SDValue RHS = Op.getOperand(1); |
| SDValue TVal = Op.getOperand(2); |
| SDValue FVal = Op.getOperand(3); |
| SDLoc DL(Op); |
| return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); |
| } |
| |
| SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDValue CCVal = Op->getOperand(0); |
| SDValue TVal = Op->getOperand(1); |
| SDValue FVal = Op->getOperand(2); |
| SDLoc DL(Op); |
| |
| EVT Ty = Op.getValueType(); |
| if (Ty == MVT::aarch64svcount) { |
| TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal); |
| FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal); |
| SDValue Sel = |
| DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal); |
| return DAG.getNode(ISD::BITCAST, DL, Ty, Sel); |
| } |
| |
| if (Ty.isScalableVector()) { |
| MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount()); |
| SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal); |
| return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal); |
| } |
| |
| if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) { |
| // FIXME: Ideally this would be the same as above using i1 types, however |
| // for the moment we can't deal with fixed i1 vector types properly, so |
| // instead extend the predicate to a result type sized integer vector. |
| MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits()); |
| MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount()); |
| SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT); |
| SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal); |
| return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal); |
| } |
| |
| // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select |
| // instruction. |
| if (ISD::isOverflowIntrOpRes(CCVal)) { |
| // Only lower legal XALUO ops. |
| if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) |
| return SDValue(); |
| |
| AArch64CC::CondCode OFCC; |
| SDValue Value, Overflow; |
| std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); |
| SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); |
| |
| return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, |
| CCVal, Overflow); |
| } |
| |
| // Lower it the same way as we would lower a SELECT_CC node. |
| ISD::CondCode CC; |
| SDValue LHS, RHS; |
| if (CCVal.getOpcode() == ISD::SETCC) { |
| LHS = CCVal.getOperand(0); |
| RHS = CCVal.getOperand(1); |
| CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get(); |
| } else { |
| LHS = CCVal; |
| RHS = DAG.getConstant(0, DL, CCVal.getValueType()); |
| CC = ISD::SETNE; |
| } |
| |
| // If we are lowering an f16/bf16 and we do not have FullFP16, convert to an
| // f32 in order to use FCSELSrrr.
| if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) { |
| TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32, |
| DAG.getUNDEF(MVT::f32), TVal); |
| FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32, |
| DAG.getUNDEF(MVT::f32), FVal); |
| } |
| |
| SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); |
| |
| if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) { |
| return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res); |
| } |
| |
| return Res; |
| } |
| |
| SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, |
| SelectionDAG &DAG) const { |
| // Jump table entries are emitted as PC-relative offsets. No additional
| // tweaking is necessary here. Just get the address of the jump table.
| JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); |
| |
| if (getTargetMachine().getCodeModel() == CodeModel::Large && |
| !Subtarget->isTargetMachO()) { |
| return getAddrLarge(JT, DAG); |
| } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { |
| return getAddrTiny(JT, DAG); |
| } |
| return getAddr(JT, DAG); |
| } |
| |
| SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op, |
| SelectionDAG &DAG) const { |
| // Jump table entries are emitted as PC-relative offsets. No additional
| // tweaking is necessary here. Just get the address of the jump table.
| SDLoc DL(Op); |
| SDValue JT = Op.getOperand(1); |
| SDValue Entry = Op.getOperand(2); |
| int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex(); |
| |
| auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); |
| AFI->setJumpTableEntryInfo(JTI, 4, nullptr); |
| |
| SDNode *Dest = |
| DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT, |
| Entry, DAG.getTargetJumpTable(JTI, MVT::i32)); |
| SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL); |
| return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0)); |
| } |
| |
| SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, |
| SelectionDAG &DAG) const { |
| ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); |
| |
| if (getTargetMachine().getCodeModel() == CodeModel::Large) { |
| // Use the GOT for the large code model on iOS. |
| if (Subtarget->isTargetMachO()) { |
| return getGOT(CP, DAG); |
| } |
| return getAddrLarge(CP, DAG); |
| } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { |
| return getAddrTiny(CP, DAG); |
| } else { |
| return getAddr(CP, DAG); |
| } |
| } |
| |
| SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, |
| SelectionDAG &DAG) const { |
| BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op); |
| if (getTargetMachine().getCodeModel() == CodeModel::Large && |
| !Subtarget->isTargetMachO()) { |
| return getAddrLarge(BA, DAG); |
| } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { |
| return getAddrTiny(BA, DAG); |
| } |
| return getAddr(BA, DAG); |
| } |
| |
| SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, |
| SelectionDAG &DAG) const { |
| AArch64FunctionInfo *FuncInfo = |
| DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); |
| |
| SDLoc DL(Op); |
| SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), |
| getPointerTy(DAG.getDataLayout())); |
| FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout())); |
| const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); |
| return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), |
| MachinePointerInfo(SV)); |
| } |
| |
| SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, |
| SelectionDAG &DAG) const { |
| MachineFunction &MF = DAG.getMachineFunction(); |
| AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
| |
| SDLoc DL(Op); |
| SDValue FR; |
| if (Subtarget->isWindowsArm64EC()) { |
| // With the Arm64EC ABI, we compute the address of the varargs save area |
| // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry, |
| // but calls from an entry thunk can pass in a different address. |
| Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass); |
| SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64); |
| uint64_t StackOffset; |
| if (FuncInfo->getVarArgsGPRSize() > 0) |
| StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize(); |
| else |
| StackOffset = FuncInfo->getVarArgsStackOffset(); |
| FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val, |
| DAG.getConstant(StackOffset, DL, MVT::i64)); |
| } else { |
| FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0 |
| ? FuncInfo->getVarArgsGPRIndex() |
| : FuncInfo->getVarArgsStackIndex(), |
| getPointerTy(DAG.getDataLayout())); |
| } |
| const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); |
| return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), |
| MachinePointerInfo(SV)); |
| } |
| |
| SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, |
| SelectionDAG &DAG) const { |
| // The layout of the va_list struct is specified in the AArch64 Procedure Call |
| // Standard, section B.3. |
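| // That is, the variadic state is described by:
| //   struct va_list {
| //     void *__stack;   // next stack argument
| //     void *__gr_top;  // end of the GP register save area
| //     void *__vr_top;  // end of the FP/SIMD register save area
| //     int __gr_offs;   // negative offset from __gr_top to the next GP arg
| //     int __vr_offs;   // negative offset from __vr_top to the next FP arg
| //   };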
| MachineFunction &MF = DAG.getMachineFunction(); |
| AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
| unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; |
| auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); |
| auto PtrVT = getPointerTy(DAG.getDataLayout()); |
| SDLoc DL(Op); |
| |
| SDValue Chain = Op.getOperand(0); |
| SDValue VAList = Op.getOperand(1); |
| const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); |
| SmallVector<SDValue, 4> MemOps; |
| |
| // void *__stack at offset 0 |
| unsigned Offset = 0; |
| SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); |
| Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT); |
| MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, |
| MachinePointerInfo(SV), Align(PtrSize))); |
| |
| // void *__gr_top at offset 8 (4 on ILP32) |
| Offset += PtrSize; |
| int GPRSize = FuncInfo->getVarArgsGPRSize(); |
| if (GPRSize > 0) { |
| SDValue GRTop, GRTopAddr; |
| |
| GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, |
| DAG.getConstant(Offset, DL, PtrVT)); |
| |
| GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); |
| GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, |
| DAG.getConstant(GPRSize, DL, PtrVT)); |
| GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT); |
| |
| MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, |
| MachinePointerInfo(SV, Offset), |
| Align(PtrSize))); |
| } |
| |
| // void *__vr_top at offset 16 (8 on ILP32) |
| Offset += PtrSize; |
| int FPRSize = FuncInfo->getVarArgsFPRSize(); |
| if (FPRSize > 0) { |
| SDValue VRTop, VRTopAddr; |
| VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, |
| DAG.getConstant(Offset, DL, PtrVT)); |
| |
| VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); |
| VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, |
| DAG.getConstant(FPRSize, DL, PtrVT)); |
| VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT); |
| |
| MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, |
| MachinePointerInfo(SV, Offset), |
| Align(PtrSize))); |
| } |
| |
| // int __gr_offs at offset 24 (12 on ILP32) |
| Offset += PtrSize; |
| SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, |
| DAG.getConstant(Offset, DL, PtrVT)); |
| MemOps.push_back( |
| DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), |
| GROffsAddr, MachinePointerInfo(SV, Offset), Align(4))); |
| |
| // int __vr_offs at offset 28 (16 on ILP32) |
| Offset += 4; |
| SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, |
| DAG.getConstant(Offset, DL, PtrVT)); |
| MemOps.push_back( |
| DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), |
| VROffsAddr, MachinePointerInfo(SV, Offset), Align(4))); |
| |
| return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); |
| } |
| |
| SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, |
| SelectionDAG &DAG) const { |
| MachineFunction &MF = DAG.getMachineFunction(); |
| |
| if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv())) |
| return LowerWin64_VASTART(Op, DAG); |
| else if (Subtarget->isTargetDarwin()) |
| return LowerDarwin_VASTART(Op, DAG); |
| else |
| return LowerAAPCS_VASTART(Op, DAG); |
| } |
| |
| SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, |
| SelectionDAG &DAG) const { |
| // AAPCS has three pointers and two ints (= 32 bytes), Darwin has a single
| // pointer.
| SDLoc DL(Op); |
| unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; |
| unsigned VaListSize = |
| (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) |
| ? PtrSize |
| : Subtarget->isTargetILP32() ? 20 : 32; |
| const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); |
| const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); |
| |
| return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), |
| DAG.getConstant(VaListSize, DL, MVT::i32), |
| Align(PtrSize), false, false, false, |
| MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); |
| } |
| |
| SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { |
| assert(Subtarget->isTargetDarwin() && |
| "automatic va_arg instruction only works on Darwin"); |
| |
| const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); |
| EVT VT = Op.getValueType(); |
| SDLoc DL(Op); |
| SDValue Chain = Op.getOperand(0); |
| SDValue Addr = Op.getOperand(1); |
| MaybeAlign Align(Op.getConstantOperandVal(3)); |
| unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8; |
| auto PtrVT = getPointerTy(DAG.getDataLayout()); |
| auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); |
| SDValue VAList = |
| DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V)); |
| Chain = VAList.getValue(1); |
| VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT); |
| |
| if (VT.isScalableVector()) |
| report_fatal_error("Passing SVE types to variadic functions is " |
| "currently not supported"); |
| |
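| // Round VAList up to the requested alignment, e.g. for a 16-byte aligned
| // argument: VAList = (VAList + 15) & ~15.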
| if (Align && *Align > MinSlotSize) { |
| VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, |
| DAG.getConstant(Align->value() - 1, DL, PtrVT)); |
| VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, |
| DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT)); |
| } |
| |
| Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); |
| unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); |
| |
| // Scalar integer and FP values smaller than 64 bits are implicitly extended |
| // up to 64 bits. At the very least, we have to increase the striding of the |
| // vaargs list to match this, and for FP values we need to introduce |
| // FP_ROUND nodes as well. |
| if (VT.isInteger() && !VT.isVector()) |
| ArgSize = std::max(ArgSize, MinSlotSize); |
| bool NeedFPTrunc = false; |
| if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { |
| ArgSize = 8; |
| NeedFPTrunc = true; |
| } |
| |
| // Increment the pointer, VAList, to the next vaarg |
| SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, |
| DAG.getConstant(ArgSize, DL, PtrVT)); |
| VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT); |
| |
| // Store the incremented VAList to the legalized pointer |
| SDValue APStore = |
| DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); |
| |
| // Load the actual argument out of the pointer VAList |
| if (NeedFPTrunc) { |
| // Load the value as an f64. |
| SDValue WideFP = |
| DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo()); |
| // Round the value down to an f32. |
| SDValue NarrowFP = |
| DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), |
| DAG.getIntPtrConstant(1, DL, /*isTarget=*/true)); |
| SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; |
| // Merge the rounded value with the chain output of the load. |
| return DAG.getMergeValues(Ops, DL); |
| } |
| |
| return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo()); |
| } |
| |
| SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, |
| SelectionDAG &DAG) const { |
| MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); |
| MFI.setFrameAddressIsTaken(true); |
| |
| EVT VT = Op.getValueType(); |
| SDLoc DL(Op); |
| unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); |
| SDValue FrameAddr = |
| DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64); |
| while (Depth--) |
| FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, |
| MachinePointerInfo()); |
| |
| if (Subtarget->isTargetILP32()) |
| FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr, |
| DAG.getValueType(VT)); |
| |
| return FrameAddr; |
| } |
| |
| SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op, |
| SelectionDAG &DAG) const { |
| MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); |
| |
| EVT VT = getPointerTy(DAG.getDataLayout()); |
| SDLoc DL(Op); |
| int FI = MFI.CreateFixedObject(4, 0, false); |
| return DAG.getFrameIndex(FI, VT); |
| } |
| |
| #define GET_REGISTER_MATCHER |
| #include "AArch64GenAsmMatcher.inc" |
| |
| // FIXME? Maybe this could be a TableGen attribute on some registers and |
| // this table could be generated automatically from RegInfo. |
| Register AArch64TargetLowering:: |
| getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { |
| Register Reg = MatchRegisterName(RegName); |
| if (AArch64::X1 <= Reg && Reg <= AArch64::X28) { |
| const MCRegisterInfo *MRI = Subtarget->getRegisterInfo(); |
| unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false); |
| if (!Subtarget->isXRegisterReserved(DwarfRegNum)) |
| Reg = 0; |
| } |
| if (Reg) |
| return Reg; |
| report_fatal_error(Twine("Invalid register name \"" |
| + StringRef(RegName) + "\".")); |
| } |
| |
| SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op, |
| SelectionDAG &DAG) const { |
| DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true); |
| |
| EVT VT = Op.getValueType(); |
| SDLoc DL(Op); |
| |
| SDValue FrameAddr = |
| DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); |
| SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); |
| |
| return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset); |
| } |
| |
| SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, |
| SelectionDAG &DAG) const { |
| MachineFunction &MF = DAG.getMachineFunction(); |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| MFI.setReturnAddressIsTaken(true); |
| |
| EVT VT = Op.getValueType(); |
| SDLoc DL(Op); |
| unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); |
| SDValue ReturnAddress; |
| if (Depth) { |
| SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); |
| SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); |
| ReturnAddress = DAG.getLoad( |
| VT, DL, DAG.getEntryNode(), |
| DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo()); |
| } else { |
| // Return LR, which contains the return address. Mark it an implicit |
| // live-in. |
| Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); |
| ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); |
| } |
| |
| // The XPACLRI instruction assembles to a hint-space instruction before
| // Armv8.3-A, therefore it can be safely used on any pre-Armv8.3-A
| // architecture. On Armv8.3-A and onwards XPACI is available, so use
| // that instead.
| SDNode *St; |
| if (Subtarget->hasPAuth()) { |
| St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress); |
| } else { |
| // XPACLRI operates on LR, therefore we must move the operand accordingly.
| SDValue Chain = |
| DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress); |
| St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain); |
| } |
| return SDValue(St, 0); |
| } |
| |
| /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
| /// i64 values and take a 2 x i64 value to shift plus a shift amount.
| SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDValue Lo, Hi; |
| expandShiftParts(Op.getNode(), Lo, Hi, DAG); |
| return DAG.getMergeValues({Lo, Hi}, SDLoc(Op)); |
| } |
| |
| bool AArch64TargetLowering::isOffsetFoldingLegal( |
| const GlobalAddressSDNode *GA) const { |
| // Offsets are folded in the DAG combine rather than here so that we can |
| // intelligently choose an offset based on the uses. |
| return false; |
| } |
| |
| bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, |
| bool OptForSize) const { |
| bool IsLegal = false; |
| // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
| // and for the 16-bit case when the target has full fp16 support.
| // FIXME: We should be able to handle f128 as well with a clever lowering. |
| const APInt ImmInt = Imm.bitcastToAPInt(); |
| if (VT == MVT::f64) |
| IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero(); |
| else if (VT == MVT::f32) |
| IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero(); |
| else if (VT == MVT::f16 || VT == MVT::bf16) |
| IsLegal = |
| (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) || |
| Imm.isPosZero(); |
| |
| // If we cannot materialize the value in an fmov immediate field, check if
| // it can be encoded as the immediate operand of a logical instruction.
| // The immediate value will be created with either MOVZ, MOVN, or ORR. |
| // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to |
| // generate that fmov. |
| if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) { |
| // The cost is actually exactly the same for mov+fmov vs. adrp+ldr; |
| // however the mov+fmov sequence is always better because of the reduced |
| // cache pressure. The timings are still the same if you consider |
| // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the |
| // movw+movk is fused). So we limit the expansion to at most 2 instructions.
| SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; |
| AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn); |
| unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2)); |
| IsLegal = Insn.size() <= Limit; |
| } |
| |
| LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT |
| << " imm value: "; Imm.dump();); |
| return IsLegal; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // AArch64 Optimization Hooks |
| //===----------------------------------------------------------------------===// |
| |
| static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, |
| SDValue Operand, SelectionDAG &DAG, |
| int &ExtraSteps) { |
| EVT VT = Operand.getValueType(); |
| if ((ST->hasNEON() && |
| (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 || |
| VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 || |
| VT == MVT::v4f32)) || |
| (ST->hasSVE() && |
| (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) { |
| if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) |
| // For the reciprocal estimates, convergence is quadratic, so the number |
| // of digits is doubled after each iteration. In ARMv8, the accuracy of |
| // the initial estimate is 2^-8. Thus the number of extra steps to refine |
| // the result for float (23 mantissa bits) is 2 and for double (52 |
| // mantissa bits) is 3. |
| ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2; |
| |
| return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); |
| } |
| |
| return SDValue(); |
| } |
| |
| SDValue |
| AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG, |
| const DenormalMode &Mode) const { |
| SDLoc DL(Op); |
| EVT VT = Op.getValueType(); |
| EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); |
| SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); |
| return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); |
| } |
| |
| SDValue |
| AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op, |
| SelectionDAG &DAG) const { |
| return Op; |
| } |
| |
| SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, |
| SelectionDAG &DAG, int Enabled, |
| int &ExtraSteps, |
| bool &UseOneConst, |
| bool Reciprocal) const { |
| if (Enabled == ReciprocalEstimate::Enabled || |
| (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt())) |
| if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand, |
| DAG, ExtraSteps)) { |
| SDLoc DL(Operand); |
| EVT VT = Operand.getValueType(); |
| |
| SDNodeFlags Flags; |
| Flags.setAllowReassociation(true); |
| |
| // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) |
| // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) |
| for (int i = ExtraSteps; i > 0; --i) { |
| SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, |
| Flags); |
| Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags); |
| Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); |
| } |
| if (!Reciprocal) |
| Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags); |
| |
| ExtraSteps = 0; |
| return Estimate; |
| } |
| |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, |
| SelectionDAG &DAG, int Enabled, |
| int &ExtraSteps) const { |
| if (Enabled == ReciprocalEstimate::Enabled) |
| if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, |
| DAG, ExtraSteps)) { |
| SDLoc DL(Operand); |
| EVT VT = Operand.getValueType(); |
| |
| SDNodeFlags Flags; |
| Flags.setAllowReassociation(true); |
| |
| // Newton reciprocal iteration: E * (2 - X * E) |
| // AArch64 reciprocal iteration instruction: (2 - M * N) |
| for (int i = ExtraSteps; i > 0; --i) { |
| SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, |
| Estimate, Flags); |
| Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); |
| } |
| |
| ExtraSteps = 0; |
| return Estimate; |
| } |
| |
| return SDValue(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // AArch64 Inline Assembly Support |
| //===----------------------------------------------------------------------===// |
| |
| // Table of Constraints |
| // TODO: This is the current set of constraints supported by ARM for the
| // compiler; not all of them may make sense.
| // |
| // r - A general register |
| // w - An FP/SIMD register of some size in the range v0-v31 |
| // x - An FP/SIMD register of some size in the range v0-v15 |
| // I - Constant that can be used with an ADD instruction |
| // J - Constant that can be used with a SUB instruction |
| // K - Constant that can be used with a 32-bit logical instruction |
| // L - Constant that can be used with a 64-bit logical instruction |
| // M - Constant that can be used as a 32-bit MOV immediate |
| // N - Constant that can be used as a 64-bit MOV immediate |
| // Q - A memory reference with base register and no offset |
| // S - A symbolic address |
| // Y - Floating point constant zero |
| // Z - Integer constant zero |
| // |
| // Note that general register operands will be output using their 64-bit x |
| // register name, whatever the size of the variable, unless the asm operand |
| // is prefixed by the %w modifier. Floating-point and SIMD register operands |
| // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or |
| // %q modifier. |
| const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { |
| // At this point, we have to lower this constraint to something else, so we |
| // lower it to an "r" or "w". However, by doing this we will force the result |
| // to be in a register, while the X constraint is much more permissive.
| // |
| // Although we are correct (we are free to emit anything, without |
| // constraints), we might break use cases that would expect us to be more |
| // efficient and emit something else. |
| if (!Subtarget->hasFPARMv8()) |
| return "r"; |
| |
| if (ConstraintVT.isFloatingPoint()) |
| return "w"; |
| |
| if (ConstraintVT.isVector() && |
| (ConstraintVT.getSizeInBits() == 64 || |
| ConstraintVT.getSizeInBits() == 128)) |
| return "w"; |
| |
| return "r"; |
| } |
| |
| enum PredicateConstraint { Uph, Upl, Upa, Invalid }; |
| |
| static PredicateConstraint parsePredicateConstraint(StringRef Constraint) { |
| return StringSwitch<PredicateConstraint>(Constraint) |
| .Case("Uph", PredicateConstraint::Uph) |
| .Case("Upl", PredicateConstraint::Upl) |
| .Case("Upa", PredicateConstraint::Upa) |
| .Default(PredicateConstraint::Invalid); |
| } |
| |
| static const TargetRegisterClass * |
| getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) { |
| if (VT != MVT::aarch64svcount && |
| (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)) |
| return nullptr; |
| |
| switch (Constraint) { |
| default: |
| return nullptr; |
| case PredicateConstraint::Uph: |
| return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass |
| : &AArch64::PPR_p8to15RegClass; |
| case PredicateConstraint::Upl: |
| return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass |
| : &AArch64::PPR_3bRegClass; |
| case PredicateConstraint::Upa: |
| return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass |
| : &AArch64::PPRRegClass; |
| } |
| } |
| |
// The set of cc codes supported is from
| // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands |
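// For example (illustrative only), a flag output operand such as
//   asm("cmp %w1, %w2" : "=@cclo"(is_lower) : "r"(a), "r"(b));
// is expected to reach this code as the "{@cclo}" constraint matched below;
// the flag value is then materialized by LowerAsmOutputForConstraint further
// down using CSINC.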
| static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) { |
| AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint) |
| .Case("{@cchi}", AArch64CC::HI) |
| .Case("{@cccs}", AArch64CC::HS) |
| .Case("{@cclo}", AArch64CC::LO) |
| .Case("{@ccls}", AArch64CC::LS) |
| .Case("{@cccc}", AArch64CC::LO) |
| .Case("{@cceq}", AArch64CC::EQ) |
| .Case("{@ccgt}", AArch64CC::GT) |
| .Case("{@ccge}", AArch64CC::GE) |
| .Case("{@cclt}", AArch64CC::LT) |
| .Case("{@ccle}", AArch64CC::LE) |
| .Case("{@cchs}", AArch64CC::HS) |
| .Case("{@ccne}", AArch64CC::NE) |
| .Case("{@ccvc}", AArch64CC::VC) |
| .Case("{@ccpl}", AArch64CC::PL) |
| .Case("{@ccvs}", AArch64CC::VS) |
| .Case("{@ccmi}", AArch64CC::MI) |
| .Default(AArch64CC::Invalid); |
| return Cond; |
| } |
| |
| /// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, |
| /// WZR, invert(<cond>)'. |
| static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, |
| SelectionDAG &DAG) { |
| return DAG.getNode( |
| AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), |
| DAG.getConstant(0, DL, MVT::i32), |
| DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV); |
| } |
| |
| // Lower @cc flag output via getSETCC. |
| SDValue AArch64TargetLowering::LowerAsmOutputForConstraint( |
| SDValue &Chain, SDValue &Glue, const SDLoc &DL, |
| const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const { |
| AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode); |
| if (Cond == AArch64CC::Invalid) |
| return SDValue(); |
| // The output variable should be a scalar integer. |
| if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() || |
| OpInfo.ConstraintVT.getSizeInBits() < 8) |
| report_fatal_error("Flag output operand is of invalid type"); |
| |
| // Get NZCV register. Only update chain when copyfrom is glued. |
| if (Glue.getNode()) { |
| Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue); |
| Chain = Glue.getValue(1); |
| } else |
| Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32); |
| // Extract CC code. |
| SDValue CC = getSETCC(Cond, Glue, DL, DAG); |
| |
| SDValue Result; |
| |
| // Truncate or ZERO_EXTEND based on value types. |
| if (OpInfo.ConstraintVT.getSizeInBits() <= 32) |
| Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC); |
| else |
| Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC); |
| |
| return Result; |
| } |
| |
| /// getConstraintType - Given a constraint letter, return the type of |
| /// constraint it is for this target. |
| AArch64TargetLowering::ConstraintType |
| AArch64TargetLowering::getConstraintType(StringRef Constraint) const { |
| if (Constraint.size() == 1) { |
| switch (Constraint[0]) { |
| default: |
| break; |
| case 'x': |
| case 'w': |
| case 'y': |
| return C_RegisterClass; |
| // An address with a single base register. Due to the way we |
| // currently handle addresses it is the same as 'r'. |
| case 'Q': |
| return C_Memory; |
| case 'I': |
| case 'J': |
| case 'K': |
| case 'L': |
| case 'M': |
| case 'N': |
| case 'Y': |
| case 'Z': |
| return C_Immediate; |
| case 'z': |
| case 'S': // A symbolic address |
| return C_Other; |
| } |
| } else if (parsePredicateConstraint(Constraint) != |
| PredicateConstraint::Invalid) |
| return C_RegisterClass; |
| else if (parseConstraintCode(Constraint) != AArch64CC::Invalid) |
| return C_Other; |
| return TargetLowering::getConstraintType(Constraint); |
| } |
| |
| /// Examine constraint type and operand type and determine a weight value. |
| /// This object must already have been set up with the operand type |
| /// and the current alternative constraint selected. |
| TargetLowering::ConstraintWeight |
| AArch64TargetLowering::getSingleConstraintMatchWeight( |
| AsmOperandInfo &info, const char *constraint) const { |
| ConstraintWeight weight = CW_Invalid; |
| Value *CallOperandVal = info.CallOperandVal; |
| // If we don't have a value, we can't do a match, |
| // but allow it at the lowest weight. |
| if (!CallOperandVal) |
| return CW_Default; |
| Type *type = CallOperandVal->getType(); |
| // Look at the constraint type. |
| switch (*constraint) { |
| default: |
| weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); |
| break; |
| case 'x': |
| case 'w': |
| case 'y': |
| if (type->isFloatingPointTy() || type->isVectorTy()) |
| weight = CW_Register; |
| break; |
| case 'z': |
| weight = CW_Constant; |
| break; |
| case 'U': |
| if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid) |
| weight = CW_Register; |
| break; |
| } |
| return weight; |
| } |
| |
| std::pair<unsigned, const TargetRegisterClass *> |
| AArch64TargetLowering::getRegForInlineAsmConstraint( |
| const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { |
| if (Constraint.size() == 1) { |
| switch (Constraint[0]) { |
| case 'r': |
| if (VT.isScalableVector()) |
| return std::make_pair(0U, nullptr); |
| if (Subtarget->hasLS64() && VT.getSizeInBits() == 512) |
| return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass); |
| if (VT.getFixedSizeInBits() == 64) |
| return std::make_pair(0U, &AArch64::GPR64commonRegClass); |
| return std::make_pair(0U, &AArch64::GPR32commonRegClass); |
| case 'w': { |
| if (!Subtarget->hasFPARMv8()) |
| break; |
| if (VT.isScalableVector()) { |
| if (VT.getVectorElementType() != MVT::i1) |
| return std::make_pair(0U, &AArch64::ZPRRegClass); |
| return std::make_pair(0U, nullptr); |
| } |
| uint64_t VTSize = VT.getFixedSizeInBits(); |
| if (VTSize == 16) |
| return std::make_pair(0U, &AArch64::FPR16RegClass); |
| if (VTSize == 32) |
| return std::make_pair(0U, &AArch64::FPR32RegClass); |
| if (VTSize == 64) |
| return std::make_pair(0U, &AArch64::FPR64RegClass); |
| if (VTSize == 128) |
| return std::make_pair(0U, &AArch64::FPR128RegClass); |
| break; |
| } |
| // The instructions that this constraint is designed for can |
| // only take 128-bit registers so just use that regclass. |
| case 'x': |
| if (!Subtarget->hasFPARMv8()) |
| break; |
| if (VT.isScalableVector()) |
| return std::make_pair(0U, &AArch64::ZPR_4bRegClass); |
| if (VT.getSizeInBits() == 128) |
| return std::make_pair(0U, &AArch64::FPR128_loRegClass); |
| break; |
| case 'y': |
| if (!Subtarget->hasFPARMv8()) |
| break; |
| if (VT.isScalableVector()) |
| return std::make_pair(0U, &AArch64::ZPR_3bRegClass); |
| break; |
| } |
| } else { |
| PredicateConstraint PC = parsePredicateConstraint(Constraint); |
| if (const TargetRegisterClass *RegClass = getPredicateRegisterClass(PC, VT)) |
| return std::make_pair(0U, RegClass); |
| } |
| if (StringRef("{cc}").equals_insensitive(Constraint) || |
| parseConstraintCode(Constraint) != AArch64CC::Invalid) |
| return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); |
| |
| // Use the default implementation in TargetLowering to convert the register |
| // constraint into a member of a register class. |
| std::pair<unsigned, const TargetRegisterClass *> Res; |
| Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); |
| |
| // Not found as a standard register? |
| if (!Res.second) { |
| unsigned Size = Constraint.size(); |
| if ((Size == 4 || Size == 5) && Constraint[0] == '{' && |
| tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { |
| int RegNo; |
| bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); |
| if (!Failed && RegNo >= 0 && RegNo <= 31) { |
| // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size. |
| // By default we'll emit v0-v31 for this unless there's a modifier where |
| // we'll emit the correct register as well. |
| if (VT != MVT::Other && VT.getSizeInBits() == 64) { |
| Res.first = AArch64::FPR64RegClass.getRegister(RegNo); |
| Res.second = &AArch64::FPR64RegClass; |
| } else { |
| Res.first = AArch64::FPR128RegClass.getRegister(RegNo); |
| Res.second = &AArch64::FPR128RegClass; |
| } |
| } |
| } |
| } |
| |
| if (Res.second && !Subtarget->hasFPARMv8() && |
| !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) && |
| !AArch64::GPR64allRegClass.hasSubClassEq(Res.second)) |
| return std::make_pair(0U, nullptr); |
| |
| return Res; |
| } |
| |
| EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL, |
| llvm::Type *Ty, |
| bool AllowUnknown) const { |
| if (Subtarget->hasLS64() && Ty->isIntegerTy(512)) |
| return EVT(MVT::i64x8); |
| |
| return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown); |
| } |
| |
| /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops |
| /// vector. If it is invalid, don't add anything to Ops. |
| void AArch64TargetLowering::LowerAsmOperandForConstraint( |
| SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops, |
| SelectionDAG &DAG) const { |
| SDValue Result; |
| |
| // Currently only support length 1 constraints. |
| if (Constraint.size() != 1) |
| return; |
| |
| char ConstraintLetter = Constraint[0]; |
| switch (ConstraintLetter) { |
| default: |
| break; |
| |
  // This set of constraints deals with valid constants for various
  // instructions. Validate and return a target constant for them if we can.
| case 'z': { |
| // 'z' maps to xzr or wzr so it needs an input of 0. |
| if (!isNullConstant(Op)) |
| return; |
| |
| if (Op.getValueType() == MVT::i64) |
| Result = DAG.getRegister(AArch64::XZR, MVT::i64); |
| else |
| Result = DAG.getRegister(AArch64::WZR, MVT::i32); |
| break; |
| } |
| case 'S': { |
| // An absolute symbolic address or label reference. |
| if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) { |
| Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op), |
| GA->getValueType(0)); |
| } else if (const BlockAddressSDNode *BA = |
| dyn_cast<BlockAddressSDNode>(Op)) { |
| Result = |
| DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0)); |
| } else |
| return; |
| break; |
| } |
| |
| case 'I': |
| case 'J': |
| case 'K': |
| case 'L': |
| case 'M': |
| case 'N': |
| ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); |
| if (!C) |
| return; |
| |
| // Grab the value and do some validation. |
| uint64_t CVal = C->getZExtValue(); |
| switch (ConstraintLetter) { |
| // The I constraint applies only to simple ADD or SUB immediate operands: |
| // i.e. 0 to 4095 with optional shift by 12 |
| // The J constraint applies only to ADD or SUB immediates that would be |
| // valid when negated, i.e. if [an add pattern] were to be output as a SUB |
| // instruction [or vice versa], in other words -1 to -4095 with optional |
| // left shift by 12. |
| case 'I': |
| if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) |
| break; |
| return; |
| case 'J': { |
| uint64_t NVal = -C->getSExtValue(); |
| if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { |
| CVal = C->getSExtValue(); |
| break; |
| } |
| return; |
| } |
| // The K and L constraints apply *only* to logical immediates, including |
| // what used to be the MOVI alias for ORR (though the MOVI alias has now |
| // been removed and MOV should be used). So these constraints have to |
| // distinguish between bit patterns that are valid 32-bit or 64-bit |
| // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but |
| // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice |
| // versa. |
| case 'K': |
| if (AArch64_AM::isLogicalImmediate(CVal, 32)) |
| break; |
| return; |
| case 'L': |
| if (AArch64_AM::isLogicalImmediate(CVal, 64)) |
| break; |
| return; |
    // The M and N constraints are a superset of K and L respectively, for use
    // with the MOV (immediate) alias. As well as the logical immediates they
    // also match 32 or 64-bit immediates that can be loaded either using a
    // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234,
    // 0xffffedca (M) or 64-bit 0x1234000000000000 (N) etc.
    // As a note, some of this code is liberally stolen from the asm parser.
| case 'M': { |
| if (!isUInt<32>(CVal)) |
| return; |
| if (AArch64_AM::isLogicalImmediate(CVal, 32)) |
| break; |
| if ((CVal & 0xFFFF) == CVal) |
| break; |
| if ((CVal & 0xFFFF0000ULL) == CVal) |
| break; |
| uint64_t NCVal = ~(uint32_t)CVal; |
| if ((NCVal & 0xFFFFULL) == NCVal) |
| break; |
| if ((NCVal & 0xFFFF0000ULL) == NCVal) |
| break; |
| return; |
| } |
| case 'N': { |
| if (AArch64_AM::isLogicalImmediate(CVal, 64)) |
| break; |
| if ((CVal & 0xFFFFULL) == CVal) |
| break; |
| if ((CVal & 0xFFFF0000ULL) == CVal) |
| break; |
| if ((CVal & 0xFFFF00000000ULL) == CVal) |
| break; |
| if ((CVal & 0xFFFF000000000000ULL) == CVal) |
| break; |
| uint64_t NCVal = ~CVal; |
| if ((NCVal & 0xFFFFULL) == NCVal) |
| break; |
| if ((NCVal & 0xFFFF0000ULL) == NCVal) |
| break; |
| if ((NCVal & 0xFFFF00000000ULL) == NCVal) |
| break; |
| if ((NCVal & 0xFFFF000000000000ULL) == NCVal) |
| break; |
| return; |
| } |
| default: |
| return; |
| } |
| |
| // All assembler immediates are 64-bit integers. |
| Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64); |
| break; |
| } |
| |
| if (Result.getNode()) { |
| Ops.push_back(Result); |
| return; |
| } |
| |
| return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // AArch64 Advanced SIMD Support |
| //===----------------------------------------------------------------------===// |
| |
| /// WidenVector - Given a value in the V64 register class, produce the |
| /// equivalent value in the V128 register class. |
| static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { |
| EVT VT = V64Reg.getValueType(); |
| unsigned NarrowSize = VT.getVectorNumElements(); |
| MVT EltTy = VT.getVectorElementType().getSimpleVT(); |
| MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); |
| SDLoc DL(V64Reg); |
| |
| return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), |
| V64Reg, DAG.getConstant(0, DL, MVT::i64)); |
| } |
| |
| /// getExtFactor - Determine the adjustment factor for the position when |
| /// generating an "extract from vector registers" instruction. |
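/// For example, for a vector of i16 elements the factor is 2, so lane index 3
/// corresponds to the byte immediate 6 in the resulting EXT instruction.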
| static unsigned getExtFactor(SDValue &V) { |
| EVT EltType = V.getValueType().getVectorElementType(); |
| return EltType.getSizeInBits() / 8; |
| } |
| |
// Check whether a vector is built by extracting elements of one vector at
// positions given by another (mask) vector, optionally combined with an AND
// mask that keeps all indices in range. This can be reconstructed using AND
// and NEON's TBL1.
| SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) { |
| assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); |
| SDLoc dl(Op); |
| EVT VT = Op.getValueType(); |
| assert(!VT.isScalableVector() && |
| "Scalable vectors cannot be used with ISD::BUILD_VECTOR"); |
| |
| // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map |
| // directly to TBL1. |
| if (VT != MVT::v16i8 && VT != MVT::v8i8) |
| return SDValue(); |
| |
| unsigned NumElts = VT.getVectorNumElements(); |
| assert((NumElts == 8 || NumElts == 16) && |
| "Need to have exactly 8 or 16 elements in vector."); |
| |
| SDValue SourceVec; |
| SDValue MaskSourceVec; |
| SmallVector<SDValue, 16> AndMaskConstants; |
| |
| for (unsigned i = 0; i < NumElts; ++i) { |
| SDValue V = Op.getOperand(i); |
| if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) |
| return SDValue(); |
| |
| SDValue OperandSourceVec = V.getOperand(0); |
| if (!SourceVec) |
| SourceVec = OperandSourceVec; |
| else if (SourceVec != OperandSourceVec) |
| return SDValue(); |
| |
| // This only looks at shuffles with elements that are |
| // a) truncated by a constant AND mask extracted from a mask vector, or |
| // b) extracted directly from a mask vector. |
| SDValue MaskSource = V.getOperand(1); |
| if (MaskSource.getOpcode() == ISD::AND) { |
| if (!isa<ConstantSDNode>(MaskSource.getOperand(1))) |
| return SDValue(); |
| |
| AndMaskConstants.push_back(MaskSource.getOperand(1)); |
| MaskSource = MaskSource->getOperand(0); |
| } else if (!AndMaskConstants.empty()) { |
| // Either all or no operands should have an AND mask. |
| return SDValue(); |
| } |
| |
| // An ANY_EXTEND may be inserted between the AND and the source vector |
| // extraction. We don't care about that, so we can just skip it. |
| if (MaskSource.getOpcode() == ISD::ANY_EXTEND) |
| MaskSource = MaskSource.getOperand(0); |
| |
| if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT) |
| return SDValue(); |
| |
| SDValue MaskIdx = MaskSource.getOperand(1); |
| if (!isa<ConstantSDNode>(MaskIdx) || |
| !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i)) |
| return SDValue(); |
| |
| // We only apply this if all elements come from the same vector with the |
| // same vector type. |
| if (!MaskSourceVec) { |
| MaskSourceVec = MaskSource->getOperand(0); |
| if (MaskSourceVec.getValueType() != VT) |
| return SDValue(); |
| } else if (MaskSourceVec != MaskSource->getOperand(0)) { |
| return SDValue(); |
| } |
| } |
| |
| // We need a v16i8 for TBL, so we extend the source with a placeholder vector |
| // for v8i8 to get a v16i8. As the pattern we are replacing is extract + |
| // insert, we know that the index in the mask must be smaller than the number |
| // of elements in the source, or we would have an out-of-bounds access. |
| if (NumElts == 8) |
| SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec, |
| DAG.getUNDEF(VT)); |
| |
| // Preconditions met, so we can use a vector (AND +) TBL to build this vector. |
| if (!AndMaskConstants.empty()) |
| MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec, |
| DAG.getBuildVector(VT, dl, AndMaskConstants)); |
| |
| return DAG.getNode( |
| ISD::INTRINSIC_WO_CHAIN, dl, VT, |
| DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec, |
| MaskSourceVec); |
| } |
| |
| // Gather data to see if the operation can be modelled as a |
| // shuffle in combination with VEXTs. |
| SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, |
| SelectionDAG &DAG) const { |
| assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); |
| LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n"); |
| SDLoc dl(Op); |
| EVT VT = Op.getValueType(); |
| assert(!VT.isScalableVector() && |
| "Scalable vectors cannot be used with ISD::BUILD_VECTOR"); |
| unsigned NumElts = VT.getVectorNumElements(); |
| |
| struct ShuffleSourceInfo { |
| SDValue Vec; |
| unsigned MinElt; |
| unsigned MaxElt; |
| |
| // We may insert some combination of BITCASTs and VEXT nodes to force Vec to |
| // be compatible with the shuffle we intend to construct. As a result |
| // ShuffleVec will be some sliding window into the original Vec. |
| SDValue ShuffleVec; |
| |
    // Code should guarantee that element i in Vec starts at element
    // "WindowBase + i * WindowScale" in ShuffleVec.
| int WindowBase; |
| int WindowScale; |
| |
| ShuffleSourceInfo(SDValue Vec) |
| : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0), |
| ShuffleVec(Vec), WindowBase(0), WindowScale(1) {} |
| |
    bool operator==(SDValue OtherVec) { return Vec == OtherVec; }
| }; |
| |
| // First gather all vectors used as an immediate source for this BUILD_VECTOR |
| // node. |
| SmallVector<ShuffleSourceInfo, 2> Sources; |
| for (unsigned i = 0; i < NumElts; ++i) { |
| SDValue V = Op.getOperand(i); |
| if (V.isUndef()) |
| continue; |
| else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || |
| !isa<ConstantSDNode>(V.getOperand(1)) || |
| V.getOperand(0).getValueType().isScalableVector()) { |
| LLVM_DEBUG( |
| dbgs() << "Reshuffle failed: " |
| "a shuffle can only come from building a vector from " |
| "various elements of other fixed-width vectors, provided " |
| "their indices are constant\n"); |
| return SDValue(); |
| } |
| |
| // Add this element source to the list if it's not already there. |
| SDValue SourceVec = V.getOperand(0); |
| auto Source = find(Sources, SourceVec); |
| if (Source == Sources.end()) |
| Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); |
| |
| // Update the minimum and maximum lane number seen. |
| unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); |
| Source->MinElt = std::min(Source->MinElt, EltNo); |
| Source->MaxElt = std::max(Source->MaxElt, EltNo); |
| } |
| |
| // If we have 3 or 4 sources, try to generate a TBL, which will at least be |
| // better than moving to/from gpr registers for larger vectors. |
| if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) { |
| // Construct a mask for the tbl. We may need to adjust the index for types |
| // larger than i8. |
| SmallVector<unsigned, 16> Mask; |
| unsigned OutputFactor = VT.getScalarSizeInBits() / 8; |
| for (unsigned I = 0; I < NumElts; ++I) { |
| SDValue V = Op.getOperand(I); |
| if (V.isUndef()) { |
| for (unsigned OF = 0; OF < OutputFactor; OF++) |
| Mask.push_back(-1); |
| continue; |
| } |
| // Set the Mask lanes adjusted for the size of the input and output |
| // lanes. The Mask is always i8, so it will set OutputFactor lanes per |
| // output element, adjusted in their positions per input and output types. |
| unsigned Lane = V.getConstantOperandVal(1); |
| for (unsigned S = 0; S < Sources.size(); S++) { |
| if (V.getOperand(0) == Sources[S].Vec) { |
| unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits(); |
| unsigned InputBase = 16 * S + Lane * InputSize / 8; |
| for (unsigned OF = 0; OF < OutputFactor; OF++) |
| Mask.push_back(InputBase + OF); |
| break; |
| } |
| } |
| } |
| |
| // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to |
| // v16i8, and the TBLMask |
| SmallVector<SDValue, 16> TBLOperands; |
| TBLOperands.push_back(DAG.getConstant(Sources.size() == 3 |
| ? Intrinsic::aarch64_neon_tbl3 |
| : Intrinsic::aarch64_neon_tbl4, |
| dl, MVT::i32)); |
| for (unsigned i = 0; i < Sources.size(); i++) { |
| SDValue Src = Sources[i].Vec; |
| EVT SrcVT = Src.getValueType(); |
| Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src); |
| assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) && |
| "Expected a legally typed vector"); |
| if (SrcVT.is64BitVector()) |
| Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src, |
| DAG.getUNDEF(MVT::v8i8)); |
| TBLOperands.push_back(Src); |
| } |
| |
| SmallVector<SDValue, 16> TBLMask; |
| for (unsigned i = 0; i < Mask.size(); i++) |
| TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32)); |
| assert((Mask.size() == 8 || Mask.size() == 16) && |
| "Expected a v8i8 or v16i8 Mask"); |
| TBLOperands.push_back( |
| DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask)); |
| |
| SDValue Shuffle = |
| DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, |
| Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands); |
| return DAG.getBitcast(VT, Shuffle); |
| } |
| |
| if (Sources.size() > 2) { |
| LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something " |
| << "sensible when at most two source vectors are " |
| << "involved\n"); |
| return SDValue(); |
| } |
| |
| // Find out the smallest element size among result and two sources, and use |
| // it as element size to build the shuffle_vector. |
| EVT SmallestEltTy = VT.getVectorElementType(); |
| for (auto &Source : Sources) { |
| EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); |
| if (SrcEltTy.bitsLT(SmallestEltTy)) { |
| SmallestEltTy = SrcEltTy; |
| } |
| } |
| unsigned ResMultiplier = |
| VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits(); |
| uint64_t VTSize = VT.getFixedSizeInBits(); |
| NumElts = VTSize / SmallestEltTy.getFixedSizeInBits(); |
| EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); |
| |
| // If the source vector is too wide or too narrow, we may nevertheless be able |
| // to construct a compatible shuffle either by concatenating it with UNDEF or |
| // extracting a suitable range of elements. |
| for (auto &Src : Sources) { |
| EVT SrcVT = Src.ShuffleVec.getValueType(); |
| |
| TypeSize SrcVTSize = SrcVT.getSizeInBits(); |
| if (SrcVTSize == TypeSize::Fixed(VTSize)) |
| continue; |
| |
| // This stage of the search produces a source with the same element type as |
| // the original, but with a total width matching the BUILD_VECTOR output. |
| EVT EltVT = SrcVT.getVectorElementType(); |
| unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); |
| EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); |
| |
| if (SrcVTSize.getFixedValue() < VTSize) { |
| assert(2 * SrcVTSize == VTSize); |
      // We can pad out the smaller vector for free, so if it's part of a
      // shuffle we simply concatenate it with UNDEF.
| Src.ShuffleVec = |
| DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, |
| DAG.getUNDEF(Src.ShuffleVec.getValueType())); |
| continue; |
| } |
| |
| if (SrcVTSize.getFixedValue() != 2 * VTSize) { |
| LLVM_DEBUG( |
| dbgs() << "Reshuffle failed: result vector too small to extract\n"); |
| return SDValue(); |
| } |
| |
| if (Src.MaxElt - Src.MinElt >= NumSrcElts) { |
| LLVM_DEBUG( |
| dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n"); |
| return SDValue(); |
| } |
| |
| if (Src.MinElt >= NumSrcElts) { |
| // The extraction can just take the second half |
| Src.ShuffleVec = |
| DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, |
| DAG.getConstant(NumSrcElts, dl, MVT::i64)); |
| Src.WindowBase = -NumSrcElts; |
| } else if (Src.MaxElt < NumSrcElts) { |
| // The extraction can just take the first half |
| Src.ShuffleVec = |
| DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, |
| DAG.getConstant(0, dl, MVT::i64)); |
| } else { |
| // An actual VEXT is needed |
| SDValue VEXTSrc1 = |
| DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, |
| DAG.getConstant(0, dl, MVT::i64)); |
| SDValue VEXTSrc2 = |
| DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, |
| DAG.getConstant(NumSrcElts, dl, MVT::i64)); |
| unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); |
| |
| if (!SrcVT.is64BitVector()) { |
| LLVM_DEBUG( |
| dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT " |
| "for SVE vectors."); |
| return SDValue(); |
| } |
| |
| Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, |
| VEXTSrc2, |
| DAG.getConstant(Imm, dl, MVT::i32)); |
| Src.WindowBase = -Src.MinElt; |
| } |
| } |
| |
| // Another possible incompatibility occurs from the vector element types. We |
| // can fix this by bitcasting the source vectors to the same type we intend |
| // for the shuffle. |
| for (auto &Src : Sources) { |
| EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); |
| if (SrcEltTy == SmallestEltTy) |
| continue; |
| assert(ShuffleVT.getVectorElementType() == SmallestEltTy); |
| Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); |
| Src.WindowScale = |
| SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits(); |
| Src.WindowBase *= Src.WindowScale; |
| } |
| |
| // Final check before we try to actually produce a shuffle. |
| LLVM_DEBUG(for (auto Src |
| : Sources) |
| assert(Src.ShuffleVec.getValueType() == ShuffleVT);); |
| |
| // The stars all align, our next step is to produce the mask for the shuffle. |
| SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); |
| int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); |
| for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { |
| SDValue Entry = Op.getOperand(i); |
| if (Entry.isUndef()) |
| continue; |
| |
| auto Src = find(Sources, Entry.getOperand(0)); |
| int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); |
| |
| // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit |
| // trunc. So only std::min(SrcBits, DestBits) actually get defined in this |
| // segment. |
| EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); |
| int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(), |
| VT.getScalarSizeInBits()); |
| int LanesDefined = BitsDefined / BitsPerShuffleLane; |
| |
| // This source is expected to fill ResMultiplier lanes of the final shuffle, |
| // starting at the appropriate offset. |
| int *LaneMask = &Mask[i * ResMultiplier]; |
| |
| int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; |
| ExtractBase += NumElts * (Src - Sources.begin()); |
| for (int j = 0; j < LanesDefined; ++j) |
| LaneMask[j] = ExtractBase + j; |
| } |
| |
| // Final check before we try to produce nonsense... |
| if (!isShuffleMaskLegal(Mask, ShuffleVT)) { |
| LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n"); |
| return SDValue(); |
| } |
| |
| SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; |
| for (unsigned i = 0; i < Sources.size(); ++i) |
| ShuffleOps[i] = Sources[i].ShuffleVec; |
| |
| SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], |
| ShuffleOps[1], Mask); |
| SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); |
| |
| LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump(); |
| dbgs() << "Reshuffle, creating node: "; V.dump();); |
| |
| return V; |
| } |
| |
// Check whether an EXT instruction can handle the shuffle mask when the two
// vector sources of the shuffle are the same.
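// For example, for v4i16 the mask <2, 3, 0, 1> is accepted with Imm = 2: the
// indices are consecutive and simply wrap around, which is what
// "ext v0.8b, v0.8b, v0.8b, #4" produces.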
| static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { |
| unsigned NumElts = VT.getVectorNumElements(); |
| |
| // Assume that the first shuffle index is not UNDEF. Fail if it is. |
| if (M[0] < 0) |
| return false; |
| |
| Imm = M[0]; |
| |
| // If this is a VEXT shuffle, the immediate value is the index of the first |
| // element. The other shuffle indices must be the successive elements after |
| // the first one. |
| unsigned ExpectedElt = Imm; |
| for (unsigned i = 1; i < NumElts; ++i) { |
| // Increment the expected index. If it wraps around, just follow it |
| // back to index zero and keep going. |
| ++ExpectedElt; |
| if (ExpectedElt == NumElts) |
| ExpectedElt = 0; |
| |
| if (M[i] < 0) |
| continue; // ignore UNDEF indices |
| if (ExpectedElt != static_cast<unsigned>(M[i])) |
| return false; |
| } |
| |
| return true; |
| } |
| |
// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
// v4i16 or v4i32 sources. This is really a truncate, which we can construct
// out of (legal) concats and truncate nodes.
static SDValue ReconstructTruncateFromBuildVector(SDValue V,
                                                  SelectionDAG &DAG) {
| if (V.getValueType() != MVT::v16i8) |
| return SDValue(); |
| assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR"); |
| |
| for (unsigned X = 0; X < 4; X++) { |
| // Check the first item in each group is an extract from lane 0 of a v4i32 |
| // or v4i16. |
| SDValue BaseExt = V.getOperand(X * 4); |
| if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || |
| (BaseExt.getOperand(0).getValueType() != MVT::v4i16 && |
| BaseExt.getOperand(0).getValueType() != MVT::v4i32) || |
| !isa<ConstantSDNode>(BaseExt.getOperand(1)) || |
| BaseExt.getConstantOperandVal(1) != 0) |
| return SDValue(); |
| SDValue Base = BaseExt.getOperand(0); |
| // And check the other items are extracts from the same vector. |
| for (unsigned Y = 1; Y < 4; Y++) { |
| SDValue Ext = V.getOperand(X * 4 + Y); |
| if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT || |
| Ext.getOperand(0) != Base || |
| !isa<ConstantSDNode>(Ext.getOperand(1)) || |
| Ext.getConstantOperandVal(1) != Y) |
| return SDValue(); |
| } |
| } |
| |
  // Turn the buildvector into a series of truncates and concats, which will
  // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
  // concatenated together to produce two v8i16s. These are both truncated and
  // concatenated together.
| SDLoc DL(V); |
| SDValue Trunc[4] = { |
| V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0), |
| V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)}; |
| for (SDValue &V : Trunc) |
| if (V.getValueType() == MVT::v4i32) |
| V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V); |
| SDValue Concat0 = |
| DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]); |
| SDValue Concat1 = |
| DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]); |
| SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0); |
| SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1); |
| return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1); |
| } |
| |
/// Check if a vector shuffle corresponds to a DUP instruction with a larger
/// element width than the vector lane type. If that is the case, the function
/// returns true and writes the value of the DUP instruction lane operand into
/// DupLaneOp.
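/// For example, a v8i16 shuffle with mask <2, 3, 2, 3, 2, 3, 2, 3> and
/// BlockSize == 32 is recognized with DupLaneOp == 1, i.e. a DUP of 32-bit
/// lane 1 of the bitcast source.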
| static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize, |
| unsigned &DupLaneOp) { |
| assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && |
| "Only possible block sizes for wide DUP are: 16, 32, 64"); |
| |
| if (BlockSize <= VT.getScalarSizeInBits()) |
| return false; |
| if (BlockSize % VT.getScalarSizeInBits() != 0) |
| return false; |
| if (VT.getSizeInBits() % BlockSize != 0) |
| return false; |
| |
| size_t SingleVecNumElements = VT.getVectorNumElements(); |
| size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits(); |
| size_t NumBlocks = VT.getSizeInBits() / BlockSize; |
| |
| // We are looking for masks like |
| // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element |
| // might be replaced by 'undefined'. BlockIndices will eventually contain |
| // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7] |
| // for the above examples) |
| SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1); |
| for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++) |
| for (size_t I = 0; I < NumEltsPerBlock; I++) { |
| int Elt = M[BlockIndex * NumEltsPerBlock + I]; |
| if (Elt < 0) |
| continue; |
| // For now we don't support shuffles that use the second operand |
| if ((unsigned)Elt >= SingleVecNumElements) |
| return false; |
| if (BlockElts[I] < 0) |
| BlockElts[I] = Elt; |
| else if (BlockElts[I] != Elt) |
| return false; |
| } |
| |
| // We found a candidate block (possibly with some undefs). It must be a |
| // sequence of consecutive integers starting with a value divisible by |
| // NumEltsPerBlock with some values possibly replaced by undef-s. |
| |
| // Find first non-undef element |
| auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; }); |
| assert(FirstRealEltIter != BlockElts.end() && |
| "Shuffle with all-undefs must have been caught by previous cases, " |
| "e.g. isSplat()"); |
| if (FirstRealEltIter == BlockElts.end()) { |
| DupLaneOp = 0; |
| return true; |
| } |
| |
| // Index of FirstRealElt in BlockElts |
| size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin(); |
| |
| if ((unsigned)*FirstRealEltIter < FirstRealIndex) |
| return false; |
| // BlockElts[0] must have the following value if it isn't undef: |
| size_t Elt0 = *FirstRealEltIter - FirstRealIndex; |
| |
| // Check the first element |
| if (Elt0 % NumEltsPerBlock != 0) |
| return false; |
| // Check that the sequence indeed consists of consecutive integers (modulo |
| // undefs) |
| for (size_t I = 0; I < NumEltsPerBlock; I++) |
| if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I) |
| return false; |
| |
| DupLaneOp = Elt0 / NumEltsPerBlock; |
| return true; |
| } |
| |
// Check whether an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are different.
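// For example, for two v4i32 sources the mask <3, 4, 5, 6> is matched with
// Imm = 3 and no input reversal, corresponding to
// "ext v0.16b, v0.16b, v1.16b, #12" (three 4-byte elements).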
| static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, |
| unsigned &Imm) { |
| // Look for the first non-undef element. |
| const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; }); |
| |
  // Benefit from APInt to handle overflow when calculating expected element.
| unsigned NumElts = VT.getVectorNumElements(); |
| unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); |
| APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); |
| // The following shuffle indices must be the successive elements after the |
| // first real element. |
| bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) { |
| return Elt != ExpectedElt++ && Elt != -1; |
| }); |
| if (FoundWrongElt) |
| return false; |
| |
| // The index of an EXT is the first element if it is not UNDEF. |
| // Watch out for the beginning UNDEFs. The EXT index should be the expected |
| // value of the first element. E.g. |
| // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. |
| // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. |
| // ExpectedElt is the last mask index plus 1. |
| Imm = ExpectedElt.getZExtValue(); |
| |
  // There are two different cases that require reversing the input vectors.
| // For example, for vector <4 x i32> we have the following cases, |
| // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) |
| // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) |
| // For both cases, we finally use mask <5, 6, 7, 0>, which requires |
| // to reverse two input vectors. |
| if (Imm < NumElts) |
| ReverseEXT = true; |
| else |
| Imm -= NumElts; |
| |
| return true; |
| } |
| |
| /// isREVMask - Check if a vector shuffle corresponds to a REV |
| /// instruction with the specified blocksize. (The order of the elements |
| /// within each block of the vector is reversed.) |
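/// For example, for v8i8 the mask <1, 0, 3, 2, 5, 4, 7, 6> with BlockSize 16
/// is a REV16: the bytes within every 16-bit block are reversed.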
| static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { |
| assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 || |
| BlockSize == 128) && |
| "Only possible block sizes for REV are: 16, 32, 64, 128"); |
| |
| unsigned EltSz = VT.getScalarSizeInBits(); |
| unsigned NumElts = VT.getVectorNumElements(); |
| unsigned BlockElts = M[0] + 1; |
| // If the first shuffle index is UNDEF, be optimistic. |
| if (M[0] < 0) |
| BlockElts = BlockSize / EltSz; |
| |
| if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) |
| return false; |
| |
| for (unsigned i = 0; i < NumElts; ++i) { |
| if (M[i] < 0) |
| continue; // ignore UNDEF indices |
| if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) |
| return false; |
| } |
| |
| return true; |
| } |
| |
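/// Check whether a shuffle mask corresponds to ZIP1 (WhichResult == 0) or
/// ZIP2 (WhichResult == 1). For example, for v4i32 the mask <0, 4, 1, 5> is
/// ZIP1 and <2, 6, 3, 7> is ZIP2.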
| static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { |
| unsigned NumElts = VT.getVectorNumElements(); |
| if (NumElts % 2 != 0) |
| return false; |
| WhichResult = (M[0] == 0 ? 0 : 1); |
| unsigned Idx = WhichResult * NumElts / 2; |
| for (unsigned i = 0; i != NumElts; i += 2) { |
| if ((M[i] >= 0 && (unsigned)M[i] != Idx) || |
| (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) |
| return false; |
| Idx += 1; |
| } |
| |
| return true; |
| } |
| |
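/// Check whether a shuffle mask corresponds to UZP1 (WhichResult == 0) or
/// UZP2 (WhichResult == 1). For example, for v4i32 the mask <0, 2, 4, 6> is
/// UZP1 and <1, 3, 5, 7> is UZP2.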
| static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { |
| unsigned NumElts = VT.getVectorNumElements(); |
| WhichResult = (M[0] == 0 ? 0 : 1); |
| for (unsigned i = 0; i != NumElts; ++i) { |
| if (M[i] < 0) |
| continue; // ignore UNDEF indices |
| if ((unsigned)M[i] != 2 * i + WhichResult) |
| return false; |
| } |
| |
| return true; |
| } |
| |
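/// Check whether a shuffle mask corresponds to TRN1 (WhichResult == 0) or
/// TRN2 (WhichResult == 1). For example, for v4i32 the mask <0, 4, 2, 6> is
/// TRN1 and <1, 5, 3, 7> is TRN2.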
| static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { |
| unsigned NumElts = VT.getVectorNumElements(); |
| if (NumElts % 2 != 0) |
| return false; |
| WhichResult = (M[0] == 0 ? 0 : 1); |
| for (unsigned i = 0; i < NumElts; i += 2) { |
| if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || |
| (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) |
| return false; |
| } |
| return true; |
| } |
| |
| /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of |
| /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". |
| /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. |
| static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { |
| unsigned NumElts = VT.getVectorNumElements(); |
| if (NumElts % 2 != 0) |
| return false; |
| WhichResult = (M[0] == 0 ? 0 : 1); |
| unsigned Idx = WhichResult * NumElts / 2; |
| for (unsigned i = 0; i != NumElts; i += 2) { |
| if ((M[i] >= 0 && (unsigned)M[i] != Idx) || |
| (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) |
| return false; |
| Idx += 1; |
| } |
| |
| return true; |
| } |
| |
| /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of |
| /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". |
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
| static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { |
| unsigned Half = VT.getVectorNumElements() / 2; |
| WhichResult = (M[0] == 0 ? 0 : 1); |
| for (unsigned j = 0; j != 2; ++j) { |
| unsigned Idx = WhichResult; |
| for (unsigned i = 0; i != Half; ++i) { |
| int MIdx = M[i + j * Half]; |
| if (MIdx >= 0 && (unsigned)MIdx != Idx) |
| return false; |
| Idx += 2; |
| } |
| } |
| |
| return true; |
| } |
| |
| /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of |
| /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". |
| /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. |
| static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { |
| unsigned NumElts = VT.getVectorNumElements(); |
| if (NumElts % 2 != 0) |
| return false; |
| WhichResult = (M[0] == 0 ? 0 : 1); |
| for (unsigned i = 0; i < NumElts; i += 2) { |
| if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || |
| (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) |
| return false; |
| } |
| return true; |
| } |
| |
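/// Check whether a shuffle mask is an identity copy of one input except for a
/// single anomalous element. For example, for v4i32 the mask <0, 1, 6, 3>
/// sets DstIsLeft = true and Anomaly = 2, and since lane 2 comes from the RHS
/// it can be emitted as "mov v0.s[2], v1.s[2]".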
| static bool isINSMask(ArrayRef<int> M, int NumInputElements, |
| bool &DstIsLeft, int &Anomaly) { |
| if (M.size() != static_cast<size_t>(NumInputElements)) |
| return false; |
| |
| int NumLHSMatch = 0, NumRHSMatch = 0; |
| int LastLHSMismatch = -1, LastRHSMismatch = -1; |
| |
| for (int i = 0; i < NumInputElements; ++i) { |
| if (M[i] == -1) { |
| ++NumLHSMatch; |
| ++NumRHSMatch; |
| continue; |
| } |
| |
| if (M[i] == i) |
| ++NumLHSMatch; |
| else |
| LastLHSMismatch = i; |
| |
| if (M[i] == i + NumInputElements) |
| ++NumRHSMatch; |
| else |
| LastRHSMismatch = i; |
| } |
| |
| if (NumLHSMatch == NumInputElements - 1) { |
| DstIsLeft = true; |
| Anomaly = LastLHSMismatch; |
| return true; |
| } else if (NumRHSMatch == NumInputElements - 1) { |
| DstIsLeft = false; |
| Anomaly = LastRHSMismatch; |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) { |
| if (VT.getSizeInBits() != 128) |
| return false; |
| |
| unsigned NumElts = VT.getVectorNumElements(); |
| |
| for (int I = 0, E = NumElts / 2; I != E; I++) { |
| if (Mask[I] != I) |
| return false; |
| } |
| |
| int Offset = NumElts / 2; |
| for (int I = NumElts / 2, E = NumElts; I != E; I++) { |
| if (Mask[I] != I + SplitLHS * Offset) |
| return false; |
| } |
| |
| return true; |
| } |
| |
| static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { |
| SDLoc DL(Op); |
| EVT VT = Op.getValueType(); |
| SDValue V0 = Op.getOperand(0); |
| SDValue V1 = Op.getOperand(1); |
| ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); |
| |
| if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || |
| VT.getVectorElementType() != V1.getValueType().getVectorElementType()) |
| return SDValue(); |
| |
| bool SplitV0 = V0.getValueSizeInBits() == 128; |
| |
| if (!isConcatMask(Mask, VT, SplitV0)) |
| return SDValue(); |
| |
| EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); |
| if (SplitV0) { |
| V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, |
| DAG.getConstant(0, DL, MVT::i64)); |
| } |
| if (V1.getValueSizeInBits() == 128) { |
| V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, |
| DAG.getConstant(0, DL, MVT::i64)); |
| } |
| return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); |
| } |
| |
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle. ID is the perfect-shuffle
/// ID, V1 and V2 are the original shuffle inputs, PFEntry is the perfect
/// shuffle table entry, and LHS/RHS are the immediate inputs for this stage
/// of the shuffle.
| static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, |
| SDValue V2, unsigned PFEntry, SDValue LHS, |
| SDValue RHS, SelectionDAG &DAG, |
| const SDLoc &dl) { |
| unsigned OpNum = (PFEntry >> 26) & 0x0F; |
| unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); |
| unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); |
| |
| enum { |
| OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> |
| OP_VREV, |
| OP_VDUP0, |
| OP_VDUP1, |
| OP_VDUP2, |
| OP_VDUP3, |
| OP_VEXT1, |
| OP_VEXT2, |
| OP_VEXT3, |
| OP_VUZPL, // VUZP, left result |
| OP_VUZPR, // VUZP, right result |
| OP_VZIPL, // VZIP, left result |
| OP_VZIPR, // VZIP, right result |
| OP_VTRNL, // VTRN, left result |
| OP_VTRNR, // VTRN, right result |
| OP_MOVLANE // Move lane. RHSID is the lane to move into |
| }; |
| |
| if (OpNum == OP_COPY) { |
| if (LHSID == (1 * 9 + 2) * 9 + 3) |
| return LHS; |
| assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); |
| return RHS; |
| } |
| |
| if (OpNum == OP_MOVLANE) { |
| // Decompose a PerfectShuffle ID to get the Mask for lane Elt |
| auto getPFIDLane = [](unsigned ID, int Elt) -> int { |
| assert(Elt < 4 && "Expected Perfect Lanes to be less than 4"); |
| Elt = 3 - Elt; |
| while (Elt > 0) { |
| ID /= 9; |
| Elt--; |
| } |
| return (ID % 9 == 8) ? -1 : ID % 9; |
| }; |
| |
| // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We |
| // get the lane to move from the PFID, which is always from the |
| // original vectors (V1 or V2). |
| SDValue OpLHS = GeneratePerfectShuffle( |
| LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); |
| EVT VT = OpLHS.getValueType(); |
| assert(RHSID < 8 && "Expected a lane index for RHSID!"); |
| unsigned ExtLane = 0; |
| SDValue Input; |
| |
    // OP_MOVLANE moves are either D movs (if bit 0x4 is set) or S movs. D
    // movs convert into a higher type.
| if (RHSID & 0x4) { |
| int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1; |
| if (MaskElt == -1) |
| MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1; |
| assert(MaskElt >= 0 && "Didn't expect an undef movlane index!"); |
| ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2); |
| Input = MaskElt < 2 ? V1 : V2; |
| if (VT.getScalarSizeInBits() == 16) { |
| Input = DAG.getBitcast(MVT::v2f32, Input); |
| OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS); |
| } else { |
| assert(VT.getScalarSizeInBits() == 32 && |
| "Expected 16 or 32 bit shuffle elemements"); |
| Input = DAG.getBitcast(MVT::v2f64, Input); |
| OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS); |
| } |
| } else { |
| int MaskElt = getPFIDLane(ID, RHSID); |
| assert(MaskElt >= 0 && "Didn't expect an undef movlane index!"); |
| ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4); |
| Input = MaskElt < 4 ? V1 : V2; |
| // Be careful about creating illegal types. Use f16 instead of i16. |
| if (VT == MVT::v4i16) { |
| Input = DAG.getBitcast(MVT::v4f16, Input); |
| OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS); |
| } |
| } |
| SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, |
| Input.getValueType().getVectorElementType(), |
| Input, DAG.getVectorIdxConstant(ExtLane, dl)); |
| SDValue Ins = |
| DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS, |
| Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl)); |
| return DAG.getBitcast(VT, Ins); |
| } |
| |
| SDValue OpLHS, OpRHS; |
| OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, |
| RHS, DAG, dl); |
| OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS, |
| RHS, DAG, dl); |
| EVT VT = OpLHS.getValueType(); |
| |
| switch (OpNum) { |
| default: |
| llvm_unreachable("Unknown shuffle opcode!"); |
| case OP_VREV: |
| // VREV divides the vector in half and swaps within the half. |
| if (VT.getVectorElementType() == MVT::i32 || |
| VT.getVectorElementType() == MVT::f32) |
| return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); |
| // vrev <4 x i16> -> REV32 |
| if (VT.getVectorElementType() == MVT::i16 || |
| VT.getVectorElementType() == MVT::f16 || |
| VT.getVectorElementType() == MVT::bf16) |
| return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); |
| // vrev <4 x i8> -> REV16 |
| assert(VT.getVectorElementType() == MVT::i8); |
| return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); |
| case OP_VDUP0: |
| case OP_VDUP1: |
| case OP_VDUP2: |
| case OP_VDUP3: { |
| EVT EltTy = VT.getVectorElementType(); |
| unsigned Opcode; |
| if (EltTy == MVT::i8) |
| Opcode = AArch64ISD::DUPLANE8; |
| else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16) |
| Opcode = AArch64ISD::DUPLANE16; |
| else if (EltTy == MVT::i32 || EltTy == MVT::f32) |
| Opcode = AArch64ISD::DUPLANE32; |
| else if (EltTy == MVT::i64 || EltTy == MVT::f64) |
| Opcode = AArch64ISD::DUPLANE64; |
| else |
| llvm_unreachable("Invalid vector element type?"); |
| |
| if (VT.getSizeInBits() == 64) |
| OpLHS = WidenVector(OpLHS, DAG); |
| SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64); |
| return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); |
| } |
| case OP_VEXT1: |
| case OP_VEXT2: |
| case OP_VEXT3: { |
| unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); |
| return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, |
| DAG.getConstant(Imm, dl, MVT::i32)); |
| } |
| case OP_VUZPL: |
| return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS); |
| case OP_VUZPR: |
| return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS); |
| case OP_VZIPL: |
| return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS); |
| case OP_VZIPR: |
| return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS); |
| case OP_VTRNL: |
| return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS); |
| case OP_VTRNR: |
| return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS); |
| } |
| } |
| |
| static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, |
| SelectionDAG &DAG) { |
| // Check to see if we can use the TBL instruction. |
| SDValue V1 = Op.getOperand(0); |
| SDValue V2 = Op.getOperand(1); |
| SDLoc DL(Op); |
| |
| EVT EltVT = Op.getValueType().getVectorElementType(); |
| unsigned BytesPerElt = EltVT.getSizeInBits() / 8; |
| |
| bool Swap = false; |
| if (V1.isUndef() || isZerosVector(V1.getNode())) { |
| std::swap(V1, V2); |
| Swap = true; |
| } |
| |
| // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill |
| // out of range values with 0s. We do need to make sure that any out-of-range |
| // values are really out-of-range for a v16i8 vector. |
| bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode()); |
| MVT IndexVT = MVT::v8i8; |
| unsigned IndexLen = 8; |
| if (Op.getValueSizeInBits() == 128) { |
| IndexVT = MVT::v16i8; |
| IndexLen = 16; |
| } |
| |
| SmallVector<SDValue, 8> TBLMask; |
| for (int Val : ShuffleMask) { |
| for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { |
| unsigned Offset = Byte + Val * BytesPerElt; |
| if (Swap) |
| Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen; |
| if (IsUndefOrZero && Offset >= IndexLen) |
| Offset = 255; |
| TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); |
| } |
| } |
| |
| SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); |
| SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); |
| |
| SDValue Shuffle; |
| if (IsUndefOrZero) { |
| if (IndexLen == 8) |
| V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); |
| Shuffle = DAG.getNode( |
| ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, |
| DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, |
| DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); |
| } else { |
| if (IndexLen == 8) { |
| V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); |
| Shuffle = DAG.getNode( |
| ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, |
| DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, |
| DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); |
| } else { |
| // FIXME: We cannot, for the moment, emit a TBL2 instruction because we |
| // cannot currently represent the register constraints on the input |
| // table registers. |
| // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, |
| // DAG.getBuildVector(IndexVT, DL, &TBLMask[0], |
| // IndexLen)); |
| Shuffle = DAG.getNode( |
| ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, |
| DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, |
| V2Cst, |
| DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); |
| } |
| } |
| return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); |
| } |
| |
| static unsigned getDUPLANEOp(EVT EltType) { |
| if (EltType == MVT::i8) |
| return AArch64ISD::DUPLANE8; |
| if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16) |
| return AArch64ISD::DUPLANE16; |
| if (EltType == MVT::i32 || EltType == MVT::f32) |
| return AArch64ISD::DUPLANE32; |
| if (EltType == MVT::i64 || EltType == MVT::f64) |
| return AArch64ISD::DUPLANE64; |
| |
| llvm_unreachable("Invalid vector element type?"); |
| } |
| |
| static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, |
| unsigned Opcode, SelectionDAG &DAG) { |
| // Try to eliminate a bitcasted extract subvector before a DUPLANE. |
| auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) { |
| // Match: dup (bitcast (extract_subv X, C)), LaneC |
| if (BitCast.getOpcode() != ISD::BITCAST || |
| BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR) |
| return false; |
| |
| // The extract index must align in the destination type. That may not |
| // happen if the bitcast is from narrow to wide type. |
| SDValue Extract = BitCast.getOperand(0); |
| unsigned ExtIdx = Extract.getConstantOperandVal(1); |
| unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits(); |
| unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth; |
| unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits(); |
| if (ExtIdxInBits % CastedEltBitWidth != 0) |
| return false; |
| |
| // Can't handle cases where vector size is not 128-bit |
| if (!Extract.getOperand(0).getValueType().is128BitVector()) |
| return false; |
| |
| // Update the lane value by offsetting with the scaled extract index. |
| LaneC += ExtIdxInBits / CastedEltBitWidth; |
| |
| // Determine the casted vector type of the wide vector input. |
| // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC' |
| // Examples: |
| // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3 |
| // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5 |
| unsigned SrcVecNumElts = |
| Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth; |
| CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(), |
| SrcVecNumElts); |
| return true; |
| }; |
| MVT CastVT; |
| if (getScaledOffsetDup(V, Lane, CastVT)) { |
| V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0)); |
| } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && |
| V.getOperand(0).getValueType().is128BitVector()) { |
| // The lane is incremented by the index of the extract. |
| // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3 |
| Lane += V.getConstantOperandVal(1); |
| V = V.getOperand(0); |
| } else if (V.getOpcode() == ISD::CONCAT_VECTORS) { |
| // The lane is decremented if we are splatting from the 2nd operand. |
| // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1 |
| unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; |
| Lane -= Idx * VT.getVectorNumElements() / 2; |
| V = WidenVector(V.getOperand(Idx), DAG); |
| } else if (VT.getSizeInBits() == 64) { |
| // Widen the operand to 128-bit register with undef. |
| V = WidenVector(V, DAG); |
| } |
| return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64)); |
| } |
| |
// Return true if we can derive a new (half-length) shuffle mask from the
// given mask: every pair of adjacent mask values must be consecutive, with
// each pair starting at an even index.
| static bool isWideTypeMask(ArrayRef<int> M, EVT VT, |
| SmallVectorImpl<int> &NewMask) { |
| unsigned NumElts = VT.getVectorNumElements(); |
| if (NumElts % 2 != 0) |
| return false; |
| |
| NewMask.clear(); |
| for (unsigned i = 0; i < NumElts; i += 2) { |
| int M0 = M[i]; |
| int M1 = M[i + 1]; |
| |
|     // If both elements are undef, the new mask element is undef too. |
| if (M0 == -1 && M1 == -1) { |
| NewMask.push_back(-1); |
| continue; |
| } |
| |
| if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) { |
| NewMask.push_back(M1 / 2); |
| continue; |
| } |
| |
| if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) { |
| NewMask.push_back(M0 / 2); |
| continue; |
| } |
| |
| NewMask.clear(); |
| return false; |
| } |
| |
| assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!"); |
| return true; |
| } |
| |
| // Try to widen the element type to get a new mask value for a better |
| // permutation sequence, so that we can use NEON shuffle instructions such as |
| // ZIP1/2, UZP1/2, TRN1/2, REV, INS, etc. |
| // For example: |
| // shufflevector <4 x i32> %a, <4 x i32> %b, |
| // <4 x i32> <i32 6, i32 7, i32 2, i32 3> |
| // is equivalent to: |
| // shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1> |
| // Finally, we can get: |
| // mov v0.d[0], v1.d[1] |
| static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) { |
| SDLoc DL(Op); |
| EVT VT = Op.getValueType(); |
| EVT ScalarVT = VT.getVectorElementType(); |
| unsigned ElementSize = ScalarVT.getFixedSizeInBits(); |
| SDValue V0 = Op.getOperand(0); |
| SDValue V1 = Op.getOperand(1); |
| ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); |
| |
|   // When combining adjacent elements, e.g. two i16's -> i32 or two i32's -> |
|   // i64, we need to make sure the wider element type is legal. Thus |
|   // ElementSize must not be larger than 32 bits, and the i1 type is also |
|   // excluded. |
| if (ElementSize > 32 || ElementSize == 1) |
| return SDValue(); |
| |
| SmallVector<int, 8> NewMask; |
| if (isWideTypeMask(Mask, VT, NewMask)) { |
| MVT NewEltVT = VT.isFloatingPoint() |
| ? MVT::getFloatingPointVT(ElementSize * 2) |
| : MVT::getIntegerVT(ElementSize * 2); |
| MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); |
| if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { |
| V0 = DAG.getBitcast(NewVT, V0); |
| V1 = DAG.getBitcast(NewVT, V1); |
| return DAG.getBitcast(VT, |
| DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask)); |
| } |
| } |
| |
| return SDValue(); |
| } |
| |
| // Try to fold shuffle (tbl2, tbl2) into a single tbl4. |
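| // A tbl2 reads from two 16-byte table registers, so its mask indices lie in |
| // [0, 32). When two tbl2 results are combined into one tbl4 (four table |
| // registers), indices that selected from the second tbl2 must be offset by |
| // 32 so they address the third and fourth table registers of the tbl4. |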
| static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, |
| ArrayRef<int> ShuffleMask, |
| SelectionDAG &DAG) { |
| SDValue Tbl1 = Op->getOperand(0); |
| SDValue Tbl2 = Op->getOperand(1); |
| SDLoc dl(Op); |
| SDValue Tbl2ID = |
| DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64); |
| |
| EVT VT = Op.getValueType(); |
| if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN || |
| Tbl1->getOperand(0) != Tbl2ID || |
| Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN || |
| Tbl2->getOperand(0) != Tbl2ID) |
| return SDValue(); |
| |
| if (Tbl1->getValueType(0) != MVT::v16i8 || |
| Tbl2->getValueType(0) != MVT::v16i8) |
| return SDValue(); |
| |
| SDValue Mask1 = Tbl1->getOperand(3); |
| SDValue Mask2 = Tbl2->getOperand(3); |
| SmallVector<SDValue, 16> TBLMaskParts(16, SDValue()); |
| for (unsigned I = 0; I < 16; I++) { |
| if (ShuffleMask[I] < 16) |
| TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]); |
| else { |
| auto *C = |
| dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16)); |
| if (!C) |
| return SDValue(); |
| TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32); |
| } |
| } |
| |
| SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts); |
| SDValue ID = |
| DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64); |
| |
| return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8, |
| {ID, Tbl1->getOperand(1), Tbl1->getOperand(2), |
| Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask}); |
| } |
| |
| // Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend in zeros, |
| // but we don't have an appropriate instruction for that, |
| // so custom-lower it as ZIP1-with-zeros. |
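| // For example (little-endian lane order), |
| //   (v8i16 zero_extend_vector_inreg (v16i8 X)) |
| // becomes |
| //   (v8i16 bitcast (v16i8 zip1 X, zeroes)) |
| // since the interleaved zero bytes form the high half of each i16 lane. |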
| SDValue |
| AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc dl(Op); |
| EVT VT = Op.getValueType(); |
| SDValue SrcOp = Op.getOperand(0); |
| EVT SrcVT = SrcOp.getValueType(); |
| assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 && |
| "Unexpected extension factor."); |
| unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits(); |
| // FIXME: support multi-step zipping? |
| if (Scale != 2) |
| return SDValue(); |
| SDValue Zeros = DAG.getConstant(0, dl, SrcVT); |
| return DAG.getBitcast(VT, |
| DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros)); |
| } |
| |
| SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc dl(Op); |
| EVT VT = Op.getValueType(); |
| |
| ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); |
| |
| if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) |
| return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG); |
| |
| // Convert shuffles that are directly supported on NEON to target-specific |
| // DAG nodes, instead of keeping them as shuffles and matching them again |
| // during code selection. This is more efficient and avoids the possibility |
| // of inconsistencies between legalization and selection. |
| ArrayRef<int> ShuffleMask = SVN->getMask(); |
| |
| SDValue V1 = Op.getOperand(0); |
| SDValue V2 = Op.getOperand(1); |
| |
| assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!"); |
| assert(ShuffleMask.size() == VT.getVectorNumElements() && |
| "Unexpected VECTOR_SHUFFLE mask size!"); |
| |
| if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG)) |
| return Res; |
| |
| if (SVN->isSplat()) { |
| int Lane = SVN->getSplatIndex(); |
|     // If this is an undef splat, generate it via "just" vdup, if possible. |
| if (Lane == -1) |
| Lane = 0; |
| |
| if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) |
| return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(), |
| V1.getOperand(0)); |
| // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- |
| // constant. If so, we can just reference the lane's definition directly. |
| if (V1.getOpcode() == ISD::BUILD_VECTOR && |
| !isa<ConstantSDNode>(V1.getOperand(Lane))) |
| return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane)); |
| |
| // Otherwise, duplicate from the lane of the input vector. |
| unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); |
| return constructDup(V1, Lane, dl, VT, Opcode, DAG); |
| } |
| |
| // Check if the mask matches a DUP for a wider element |
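|   // For example, the v8i16 mask <4, 5, 4, 5, 4, 5, 4, 5> splats 32-bit lane |
|   // 2, i.e. a DUPLANE32 of the v4i32 view of V1. |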
| for (unsigned LaneSize : {64U, 32U, 16U}) { |
| unsigned Lane = 0; |
| if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) { |
| unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64 |
| : LaneSize == 32 ? AArch64ISD::DUPLANE32 |
| : AArch64ISD::DUPLANE16; |
|       // Cast V1 to an integer vector with the required lane size |
| MVT NewEltTy = MVT::getIntegerVT(LaneSize); |
| unsigned NewEltCount = VT.getSizeInBits() / LaneSize; |
| MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount); |
| V1 = DAG.getBitcast(NewVecTy, V1); |
|       // Construct the DUP instruction |
| V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG); |
| // Cast back to the original type |
| return DAG.getBitcast(VT, V1); |
| } |
| } |
| |
| if (isREVMask(ShuffleMask, VT, 64)) |
| return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2); |
| if (isREVMask(ShuffleMask, VT, 32)) |
| return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2); |
| if (isREVMask(ShuffleMask, VT, 16)) |
| return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2); |
| |
| if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) || |
| (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) && |
| ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) { |
| SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1); |
| return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev, |
| DAG.getConstant(8, dl, MVT::i32)); |
| } |
| |
| bool ReverseEXT = false; |
| unsigned Imm; |
| if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { |
| if (ReverseEXT) |
| std::swap(V1, V2); |
| Imm *= getExtFactor(V1); |
| return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, |
| DAG.getConstant(Imm, dl, MVT::i32)); |
| } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) { |
| Imm *= getExtFactor(V1); |
| return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, |
| DAG.getConstant(Imm, dl, MVT::i32)); |
| } |
| |
| unsigned WhichResult; |
| if (isZIPMask(ShuffleMask, VT, WhichResult)) { |
| unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; |
| return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); |
| } |
| if (isUZPMask(ShuffleMask, VT, WhichResult)) { |
| unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; |
| return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); |
| } |
| if (isTRNMask(ShuffleMask, VT, WhichResult)) { |
| unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; |
| return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); |
| } |
| |
| if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { |
| unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; |
| return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); |
| } |
| if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { |
| unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; |
| return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); |
| } |
| if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { |
| unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; |
| return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); |
| } |
| |
| if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG)) |
| return Concat; |
| |
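|   // An INS-compatible mask is an identity mask for one of the two inputs, |
|   // except for a single element (the "anomaly") that is taken from some |
|   // other lane; that element can be filled in with a lane-to-lane INS. |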
| bool DstIsLeft; |
| int Anomaly; |
| int NumInputElements = V1.getValueType().getVectorNumElements(); |
| if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { |
| SDValue DstVec = DstIsLeft ? V1 : V2; |
| SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64); |
| |
| SDValue SrcVec = V1; |
| int SrcLane = ShuffleMask[Anomaly]; |
| if (SrcLane >= NumInputElements) { |
| SrcVec = V2; |
| SrcLane -= VT.getVectorNumElements(); |
| } |
| SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64); |
| |
| EVT ScalarVT = VT.getVectorElementType(); |
| |
| if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger()) |
| ScalarVT = MVT::i32; |
| |
| return DAG.getNode( |
| ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, |
| DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), |
| DstLaneV); |
| } |
| |
| if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG)) |
| return NewSD; |
| |
| // If the shuffle is not directly supported and it has 4 elements, use |
| // the PerfectShuffle-generated table to synthesize it from other shuffles. |
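|   // Each mask element is encoded in base 9 (lane 0-7 of the two inputs, or 8 |
|   // for undef), so the four elements form an index into a 9^4-entry table. |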
| unsigned NumElts = VT.getVectorNumElements(); |
| if (NumElts == 4) { |
| unsigned PFIndexes[4]; |
| for (unsigned i = 0; i != 4; ++i) { |
| if (ShuffleMask[i] < 0) |
| PFIndexes[i] = 8; |
| else |
| PFIndexes[i] = ShuffleMask[i]; |
| } |
| |
| // Compute the index in the perfect shuffle table. |
| unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + |
| PFIndexes[2] * 9 + PFIndexes[3]; |
| unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; |
| return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG, |
| dl); |
| } |
| |
| return GenerateTBL(Op, ShuffleMask, DAG); |
| } |
| |
| SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, |
| SelectionDAG &DAG) const { |
| EVT VT = Op.getValueType(); |
| |
| if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) |
| return LowerToScalableOp(Op, DAG); |
| |
| assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 && |
| "Unexpected vector type!"); |
| |
| // We can handle the constant cases during isel. |
| if (isa<ConstantSDNode>(Op.getOperand(0))) |
| return Op; |
| |
| // There isn't a natural way to handle the general i1 case, so we use some |
| // trickery with whilelo. |
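|   // After the sign-extension below, SplatVal is either 0 or all-ones in |
|   // i64, so whilelo(0, SplatVal) produces an all-false or an all-true |
|   // predicate respectively. |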
| SDLoc DL(Op); |
| SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64); |
| SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal, |
| DAG.getValueType(MVT::i1)); |
| SDValue ID = |
| DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64); |
| SDValue Zero = DAG.getConstant(0, DL, MVT::i64); |
| if (VT == MVT::nxv1i1) |
| return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1, |
| DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID, |
| Zero, SplatVal), |
| Zero); |
| return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal); |
| } |
| |
| SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| |
| EVT VT = Op.getValueType(); |
| if (!isTypeLegal(VT) || !VT.isScalableVector()) |
| return SDValue(); |
| |
| // Current lowering only supports the SVE-ACLE types. |
| if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock) |
| return SDValue(); |
| |
|   // The DUPQ operation is independent of element type, so normalise to i64s. |
| SDValue Idx128 = Op.getOperand(2); |
| |
| // DUPQ can be used when idx is in range. |
| auto *CIdx = dyn_cast<ConstantSDNode>(Idx128); |
| if (CIdx && (CIdx->getZExtValue() <= 3)) { |
| SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64); |
| return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI); |
| } |
| |
| SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1)); |
| |
| // The ACLE says this must produce the same result as: |
| // svtbl(data, svadd_x(svptrue_b64(), |
| // svand_x(svptrue_b64(), svindex_u64(0, 1), 1), |
| // index * 2)) |
| SDValue One = DAG.getConstant(1, DL, MVT::i64); |
| SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One); |
| |
| // create the vector 0,1,0,1,... |
| SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64); |
| SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne); |
| |
| // create the vector idx64,idx64+1,idx64,idx64+1,... |
| SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128); |
| SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64); |
| SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64); |
| |
| // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],... |
| SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask); |
| return DAG.getNode(ISD::BITCAST, DL, VT, TBL); |
| } |
| |
| |
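| // Collect the constant bits and undef bits of a constant-splat BUILD_VECTOR, |
| // replicated out to the full width of the vector, so the AdvSIMD |
| // modified-immediate helpers below can inspect them. |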
| static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, |
| APInt &UndefBits) { |
| EVT VT = BVN->getValueType(0); |
| APInt SplatBits, SplatUndef; |
| unsigned SplatBitSize; |
| bool HasAnyUndefs; |
| if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { |
| unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; |
| |
| for (unsigned i = 0; i < NumSplats; ++i) { |
| CnstBits <<= SplatBitSize; |
| UndefBits <<= SplatBitSize; |
| CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); |
| UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); |
| } |
| |
| return true; |
| } |
| |
| return false; |
| } |
| |
| // Try 64-bit splatted SIMD immediate. |
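| // A value qualifies when every byte of the replicated 64-bit pattern is |
| // either 0x00 or 0xFF (modified-immediate type 10), e.g. 0xFF00FF00FF00FF00, |
| // which fits the 64-bit byte-mask form of MOVI. |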
| static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, |
| const APInt &Bits) { |
| if (Bits.getHiBits(64) == Bits.getLoBits(64)) { |
| uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); |
| EVT VT = Op.getValueType(); |
| MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64; |
| |
| if (AArch64_AM::isAdvSIMDModImmType10(Value)) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType10(Value); |
| |
| SDLoc dl(Op); |
| SDValue Mov = DAG.getNode(NewOp, dl, MovTy, |
| DAG.getConstant(Value, dl, MVT::i32)); |
| return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); |
| } |
| } |
| |
| return SDValue(); |
| } |
| |
| // Try 32-bit splatted SIMD immediate. |
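| // Modified-immediate types 1-4 cover an 8-bit value placed at byte position |
| // 0, 1, 2 or 3 of each 32-bit lane (i.e. LSL #0, #8, #16 or #24) with the |
| // remaining bytes zero. When LHS is provided, the immediate is applied to it |
| // (e.g. for a vector ORR/BIC with an immediate operand). |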
| static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, |
| const APInt &Bits, |
| const SDValue *LHS = nullptr) { |
| EVT VT = Op.getValueType(); |
| if (VT.isFixedLengthVector() && |
| !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()) |
| return SDValue(); |
| |
| if (Bits.getHiBits(64) == Bits.getLoBits(64)) { |
| uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); |
| MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; |
| bool isAdvSIMDModImm = false; |
| uint64_t Shift; |
| |
| if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType1(Value); |
| Shift = 0; |
| } |
| else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType2(Value); |
| Shift = 8; |
| } |
| else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType3(Value); |
| Shift = 16; |
| } |
| else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType4(Value); |
| Shift = 24; |
| } |
| |
| if (isAdvSIMDModImm) { |
| SDLoc dl(Op); |
| SDValue Mov; |
| |
| if (LHS) |
| Mov = DAG.getNode(NewOp, dl, MovTy, |
| DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS), |
| DAG.getConstant(Value, dl, MVT::i32), |
| DAG.getConstant(Shift, dl, MVT::i32)); |
| else |
| Mov = DAG.getNode(NewOp, dl, MovTy, |
| DAG.getConstant(Value, dl, MVT::i32), |
| DAG.getConstant(Shift, dl, MVT::i32)); |
| |
| return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); |
| } |
| } |
| |
| return SDValue(); |
| } |
| |
| // Try 16-bit splatted SIMD immediate. |
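| // Modified-immediate types 5 and 6 cover an 8-bit value placed in the low or |
| // high byte of each 16-bit lane (LSL #0 or #8) with the other byte zero. |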
| static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, |
| const APInt &Bits, |
| const SDValue *LHS = nullptr) { |
| EVT VT = Op.getValueType(); |
| if (VT.isFixedLengthVector() && |
| !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()) |
| return SDValue(); |
| |
| if (Bits.getHiBits(64) == Bits.getLoBits(64)) { |
| uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); |
| MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; |
| bool isAdvSIMDModImm = false; |
| uint64_t Shift; |
| |
| if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType5(Value); |
| Shift = 0; |
| } |
| else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType6(Value); |
| Shift = 8; |
| } |
| |
| if (isAdvSIMDModImm) { |
| SDLoc dl(Op); |
| SDValue Mov; |
| |
| if (LHS) |
| Mov = DAG.getNode(NewOp, dl, MovTy, |
| DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS), |
| DAG.getConstant(Value, dl, MVT::i32), |
| DAG.getConstant(Shift, dl, MVT::i32)); |
| else |
| Mov = DAG.getNode(NewOp, dl, MovTy, |
| DAG.getConstant(Value, dl, MVT::i32), |
| DAG.getConstant(Shift, dl, MVT::i32)); |
| |
| return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); |
| } |
| } |
| |
| return SDValue(); |
| } |
| |
| // Try 32-bit splatted SIMD immediate with shifted ones. |
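| // Modified-immediate types 7 and 8 cover an 8-bit value followed by 8 or 16 |
| // set bits in each 32-bit lane, i.e. the "MSL" shifting-ones form of |
| // MOVI/MVNI. |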
| static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, |
| SelectionDAG &DAG, const APInt &Bits) { |
| if (Bits.getHiBits(64) == Bits.getLoBits(64)) { |
| uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); |
| EVT VT = Op.getValueType(); |
| MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; |
| bool isAdvSIMDModImm = false; |