| //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements the AArch64TargetLowering class. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "AArch64ISelLowering.h" |
| #include "AArch64CallingConvention.h" |
| #include "AArch64ExpandImm.h" |
| #include "AArch64MachineFunctionInfo.h" |
| #include "AArch64PerfectShuffle.h" |
| #include "AArch64RegisterInfo.h" |
| #include "AArch64Subtarget.h" |
| #include "MCTargetDesc/AArch64AddressingModes.h" |
| #include "Utils/AArch64BaseInfo.h" |
| #include "llvm/ADT/APFloat.h" |
| #include "llvm/ADT/APInt.h" |
| #include "llvm/ADT/ArrayRef.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/SmallSet.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/ADT/Triple.h" |
| #include "llvm/ADT/Twine.h" |
| #include "llvm/Analysis/ObjCARCUtil.h" |
| #include "llvm/Analysis/VectorUtils.h" |
| #include "llvm/CodeGen/Analysis.h" |
| #include "llvm/CodeGen/CallingConvLower.h" |
| #include "llvm/CodeGen/MachineBasicBlock.h" |
| #include "llvm/CodeGen/MachineFrameInfo.h" |
| #include "llvm/CodeGen/MachineFunction.h" |
| #include "llvm/CodeGen/MachineInstr.h" |
| #include "llvm/CodeGen/MachineInstrBuilder.h" |
| #include "llvm/CodeGen/MachineMemOperand.h" |
| #include "llvm/CodeGen/MachineRegisterInfo.h" |
| #include "llvm/CodeGen/RuntimeLibcalls.h" |
| #include "llvm/CodeGen/SelectionDAG.h" |
| #include "llvm/CodeGen/SelectionDAGNodes.h" |
| #include "llvm/CodeGen/TargetCallingConv.h" |
| #include "llvm/CodeGen/TargetInstrInfo.h" |
| #include "llvm/CodeGen/ValueTypes.h" |
| #include "llvm/IR/Attributes.h" |
| #include "llvm/IR/Constants.h" |
| #include "llvm/IR/DataLayout.h" |
| #include "llvm/IR/DebugLoc.h" |
| #include "llvm/IR/DerivedTypes.h" |
| #include "llvm/IR/Function.h" |
| #include "llvm/IR/GetElementPtrTypeIterator.h" |
| #include "llvm/IR/GlobalValue.h" |
| #include "llvm/IR/IRBuilder.h" |
| #include "llvm/IR/Instruction.h" |
| #include "llvm/IR/Instructions.h" |
| #include "llvm/IR/IntrinsicInst.h" |
| #include "llvm/IR/Intrinsics.h" |
| #include "llvm/IR/IntrinsicsAArch64.h" |
| #include "llvm/IR/Module.h" |
| #include "llvm/IR/OperandTraits.h" |
| #include "llvm/IR/PatternMatch.h" |
| #include "llvm/IR/Type.h" |
| #include "llvm/IR/Use.h" |
| #include "llvm/IR/Value.h" |
| #include "llvm/MC/MCRegisterInfo.h" |
| #include "llvm/Support/Casting.h" |
| #include "llvm/Support/CodeGen.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Compiler.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/ErrorHandling.h" |
| #include "llvm/Support/KnownBits.h" |
| #include "llvm/Support/MachineValueType.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Target/TargetMachine.h" |
| #include "llvm/Target/TargetOptions.h" |
| #include <algorithm> |
| #include <bitset> |
| #include <cassert> |
| #include <cctype> |
| #include <cstdint> |
| #include <cstdlib> |
| #include <iterator> |
| #include <limits> |
| #include <tuple> |
| #include <utility> |
| #include <vector> |
| |
| using namespace llvm; |
| using namespace llvm::PatternMatch; |
| |
| #define DEBUG_TYPE "aarch64-lower" |
| |
| STATISTIC(NumTailCalls, "Number of tail calls"); |
| STATISTIC(NumShiftInserts, "Number of vector shift inserts"); |
| STATISTIC(NumOptimizedImms, "Number of times immediates were optimized"); |
| |
| // FIXME: The necessary dtprel relocations don't seem to be supported |
| // well in the GNU bfd and gold linkers at the moment. Therefore, for now, |
| // fall back to GeneralDynamic code generation by default. |
| cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( |
| "aarch64-elf-ldtls-generation", cl::Hidden, |
| cl::desc("Allow AArch64 Local Dynamic TLS code generation"), |
| cl::init(false)); |
| |
| static cl::opt<bool> |
| EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, |
| cl::desc("Enable AArch64 logical imm instruction " |
| "optimization"), |
| cl::init(true)); |
| |
| // Temporary option added for testing the functionality added to |
| // DAGCombiner.cpp in D92230. It is expected that this can be removed in the |
| // future once both implementations are based on MGATHER rather than the |
| // GLD1 nodes added for the SVE gather load intrinsics. |
| static cl::opt<bool> |
| EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, |
| cl::desc("Combine extends of AArch64 masked " |
| "gather intrinsics"), |
| cl::init(true)); |
| |
| /// Value type used for condition codes. |
| static const MVT MVT_CC = MVT::i32; |
| |
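| /// Returns the packed (i.e. fully utilized) scalable vector type with |
| /// elements of type VT. |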
| static inline EVT getPackedSVEVectorVT(EVT VT) { |
| switch (VT.getSimpleVT().SimpleTy) { |
| default: |
| llvm_unreachable("unexpected element type for vector"); |
| case MVT::i8: |
| return MVT::nxv16i8; |
| case MVT::i16: |
| return MVT::nxv8i16; |
| case MVT::i32: |
| return MVT::nxv4i32; |
| case MVT::i64: |
| return MVT::nxv2i64; |
| case MVT::f16: |
| return MVT::nxv8f16; |
| case MVT::f32: |
| return MVT::nxv4f32; |
| case MVT::f64: |
| return MVT::nxv2f64; |
| case MVT::bf16: |
| return MVT::nxv8bf16; |
| } |
| } |
| |
| // NOTE: Currently only integer vector types need to be returned. If this |
| // changes, just add an extra "type" parameter. |
| static inline EVT getPackedSVEVectorVT(ElementCount EC) { |
| switch (EC.getKnownMinValue()) { |
| default: |
| llvm_unreachable("unexpected element count for vector"); |
| case 16: |
| return MVT::nxv16i8; |
| case 8: |
| return MVT::nxv8i16; |
| case 4: |
| return MVT::nxv4i32; |
| case 2: |
| return MVT::nxv2i64; |
| } |
| } |
| |
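| /// Returns the packed integer vector type that a scalable i1 predicate |
| /// vector of type VT is promoted to. |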
| static inline EVT getPromotedVTForPredicate(EVT VT) { |
| assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) && |
| "Expected scalable predicate vector type!"); |
| switch (VT.getVectorMinNumElements()) { |
| default: |
| llvm_unreachable("unexpected element count for vector"); |
| case 2: |
| return MVT::nxv2i64; |
| case 4: |
| return MVT::nxv4i32; |
| case 8: |
| return MVT::nxv8i16; |
| case 16: |
| return MVT::nxv16i8; |
| } |
| } |
| |
| /// Returns true if VT's elements occupy the lowest bit positions of its |
| /// associated register class without any intervening space. |
| /// |
| /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the |
| /// same register class, but only nxv8f16 can be treated as a packed vector. |
| static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) { |
| assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && |
| "Expected legal vector type!"); |
| return VT.isFixedLengthVector() || |
| VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock; |
| } |
| |
| // Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading |
| // predicate and end with a passthru value matching the result type. |
| static bool isMergePassthruOpcode(unsigned Opc) { |
| switch (Opc) { |
| default: |
| return false; |
| case AArch64ISD::BITREVERSE_MERGE_PASSTHRU: |
| case AArch64ISD::BSWAP_MERGE_PASSTHRU: |
| case AArch64ISD::CTLZ_MERGE_PASSTHRU: |
| case AArch64ISD::CTPOP_MERGE_PASSTHRU: |
| case AArch64ISD::DUP_MERGE_PASSTHRU: |
| case AArch64ISD::ABS_MERGE_PASSTHRU: |
| case AArch64ISD::NEG_MERGE_PASSTHRU: |
| case AArch64ISD::FNEG_MERGE_PASSTHRU: |
| case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU: |
| case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU: |
| case AArch64ISD::FCEIL_MERGE_PASSTHRU: |
| case AArch64ISD::FFLOOR_MERGE_PASSTHRU: |
| case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU: |
| case AArch64ISD::FRINT_MERGE_PASSTHRU: |
| case AArch64ISD::FROUND_MERGE_PASSTHRU: |
| case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU: |
| case AArch64ISD::FTRUNC_MERGE_PASSTHRU: |
| case AArch64ISD::FP_ROUND_MERGE_PASSTHRU: |
| case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU: |
| case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU: |
| case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU: |
| case AArch64ISD::FCVTZU_MERGE_PASSTHRU: |
| case AArch64ISD::FCVTZS_MERGE_PASSTHRU: |
| case AArch64ISD::FSQRT_MERGE_PASSTHRU: |
| case AArch64ISD::FRECPX_MERGE_PASSTHRU: |
| case AArch64ISD::FABS_MERGE_PASSTHRU: |
| return true; |
| } |
| } |
| |
| AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, |
| const AArch64Subtarget &STI) |
| : TargetLowering(TM), Subtarget(&STI) { |
| // AArch64 doesn't have comparisons that set GPRs or setcc instructions, so |
| // we have to make something up. Arbitrarily, choose ZeroOrOne. |
| setBooleanContents(ZeroOrOneBooleanContent); |
| // When comparing vectors, the result sets each element in the vector to |
| // all-ones or all-zeros. |
| setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); |
| |
| // Set up the register classes. |
| addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass); |
| addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass); |
| |
| if (Subtarget->hasLS64()) { |
| addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass); |
| setOperationAction(ISD::LOAD, MVT::i64x8, Custom); |
| setOperationAction(ISD::STORE, MVT::i64x8, Custom); |
| } |
| |
| if (Subtarget->hasFPARMv8()) { |
| addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); |
| addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass); |
| addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); |
| addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); |
| addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); |
| } |
| |
| if (Subtarget->hasNEON()) { |
| addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); |
| addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); |
| // Someone set us up the NEON. |
| addDRTypeForNEON(MVT::v2f32); |
| addDRTypeForNEON(MVT::v8i8); |
| addDRTypeForNEON(MVT::v4i16); |
| addDRTypeForNEON(MVT::v2i32); |
| addDRTypeForNEON(MVT::v1i64); |
| addDRTypeForNEON(MVT::v1f64); |
| addDRTypeForNEON(MVT::v4f16); |
| if (Subtarget->hasBF16()) |
| addDRTypeForNEON(MVT::v4bf16); |
| |
| addQRTypeForNEON(MVT::v4f32); |
| addQRTypeForNEON(MVT::v2f64); |
| addQRTypeForNEON(MVT::v16i8); |
| addQRTypeForNEON(MVT::v8i16); |
| addQRTypeForNEON(MVT::v4i32); |
| addQRTypeForNEON(MVT::v2i64); |
| addQRTypeForNEON(MVT::v8f16); |
| if (Subtarget->hasBF16()) |
| addQRTypeForNEON(MVT::v8bf16); |
| } |
| |
| if (Subtarget->hasSVE()) { |
| // Add legal SVE predicate types. |
| addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass); |
| addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass); |
| addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass); |
| addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass); |
| |
| // Add legal SVE data types. |
| addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass); |
| |
| addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass); |
| |
| if (Subtarget->hasBF16()) { |
| addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass); |
| addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass); |
| } |
| |
| if (Subtarget->useSVEForFixedLengthVectors()) { |
| for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) |
| if (useSVEForFixedLengthVectorVT(VT)) |
| addRegisterClass(VT, &AArch64::ZPRRegClass); |
| |
| for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) |
| if (useSVEForFixedLengthVectorVT(VT)) |
| addRegisterClass(VT, &AArch64::ZPRRegClass); |
| } |
| |
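| // Saturating add/sub are legal for packed SVE integer vectors; remainder |
| // and combined div/rem operations are expanded. |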
| for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) { |
| setOperationAction(ISD::SADDSAT, VT, Legal); |
| setOperationAction(ISD::UADDSAT, VT, Legal); |
| setOperationAction(ISD::SSUBSAT, VT, Legal); |
| setOperationAction(ISD::USUBSAT, VT, Legal); |
| setOperationAction(ISD::UREM, VT, Expand); |
| setOperationAction(ISD::SREM, VT, Expand); |
| setOperationAction(ISD::SDIVREM, VT, Expand); |
| setOperationAction(ISD::UDIVREM, VT, Expand); |
| } |
| |
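| // In-register sign extension is legal for these partially packed SVE |
| // integer vector types. |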
| for (auto VT : |
| { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8, |
| MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 }) |
| setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal); |
| |
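| // For SVE floating-point vectors, expand the unsupported condition codes |
| // and the libm-style operations that have no SVE equivalent. |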
| for (auto VT : |
| { MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32, |
| MVT::nxv2f64 }) { |
| setCondCodeAction(ISD::SETO, VT, Expand); |
| setCondCodeAction(ISD::SETOLT, VT, Expand); |
| setCondCodeAction(ISD::SETLT, VT, Expand); |
| setCondCodeAction(ISD::SETOLE, VT, Expand); |
| setCondCodeAction(ISD::SETLE, VT, Expand); |
| setCondCodeAction(ISD::SETULT, VT, Expand); |
| setCondCodeAction(ISD::SETULE, VT, Expand); |
| setCondCodeAction(ISD::SETUGE, VT, Expand); |
| setCondCodeAction(ISD::SETUGT, VT, Expand); |
| setCondCodeAction(ISD::SETUEQ, VT, Expand); |
| setCondCodeAction(ISD::SETUNE, VT, Expand); |
| |
| setOperationAction(ISD::FREM, VT, Expand); |
| setOperationAction(ISD::FPOW, VT, Expand); |
| setOperationAction(ISD::FPOWI, VT, Expand); |
| setOperationAction(ISD::FCOS, VT, Expand); |
| setOperationAction(ISD::FSIN, VT, Expand); |
| setOperationAction(ISD::FSINCOS, VT, Expand); |
| setOperationAction(ISD::FEXP, VT, Expand); |
| setOperationAction(ISD::FEXP2, VT, Expand); |
| setOperationAction(ISD::FLOG, VT, Expand); |
| setOperationAction(ISD::FLOG2, VT, Expand); |
| setOperationAction(ISD::FLOG10, VT, Expand); |
| } |
| } |
| |
| // Compute derived properties from the register classes |
| computeRegisterProperties(Subtarget->getRegisterInfo()); |
| |
| // Provide all sorts of operation actions |
| setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); |
| setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); |
| setOperationAction(ISD::SETCC, MVT::i32, Custom); |
| setOperationAction(ISD::SETCC, MVT::i64, Custom); |
| setOperationAction(ISD::SETCC, MVT::f16, Custom); |
| setOperationAction(ISD::SETCC, MVT::f32, Custom); |
| setOperationAction(ISD::SETCC, MVT::f64, Custom); |
| setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); |
| setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom); |
| setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom); |
| setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); |
| setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom); |
| setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); |
| setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); |
| setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); |
| setOperationAction(ISD::BRCOND, MVT::Other, Expand); |
| setOperationAction(ISD::BR_CC, MVT::i32, Custom); |
| setOperationAction(ISD::BR_CC, MVT::i64, Custom); |
| setOperationAction(ISD::BR_CC, MVT::f16, Custom); |
| setOperationAction(ISD::BR_CC, MVT::f32, Custom); |
| setOperationAction(ISD::BR_CC, MVT::f64, Custom); |
| setOperationAction(ISD::SELECT, MVT::i32, Custom); |
| setOperationAction(ISD::SELECT, MVT::i64, Custom); |
| setOperationAction(ISD::SELECT, MVT::f16, Custom); |
| setOperationAction(ISD::SELECT, MVT::f32, Custom); |
| setOperationAction(ISD::SELECT, MVT::f64, Custom); |
| setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); |
| setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); |
| setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); |
| setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); |
| setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); |
| setOperationAction(ISD::BR_JT, MVT::Other, Custom); |
| setOperationAction(ISD::JumpTable, MVT::i64, Custom); |
| |
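| // 128-bit shifts are legalized via the i64 *_PARTS nodes, which are custom |
| // lowered. |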
| setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); |
| setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); |
| setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); |
| |
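| // There is no FP remainder instruction; FREM expands to a libcall. |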
| setOperationAction(ISD::FREM, MVT::f32, Expand); |
| setOperationAction(ISD::FREM, MVT::f64, Expand); |
| setOperationAction(ISD::FREM, MVT::f80, Expand); |
| |
| setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); |
| |
| // Custom lowering hooks are needed for XOR |
| // to fold it into CSINC/CSINV. |
| setOperationAction(ISD::XOR, MVT::i32, Custom); |
| setOperationAction(ISD::XOR, MVT::i64, Custom); |
| |
| // Virtually no operations on f128 are legal, but LLVM can't expand them when |
| // there's a valid register class, so we need custom operations in most cases. |
| setOperationAction(ISD::FABS, MVT::f128, Expand); |
| setOperationAction(ISD::FADD, MVT::f128, LibCall); |
| setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); |
| setOperationAction(ISD::FCOS, MVT::f128, Expand); |
| setOperationAction(ISD::FDIV, MVT::f128, LibCall); |
| setOperationAction(ISD::FMA, MVT::f128, Expand); |
| setOperationAction(ISD::FMUL, MVT::f128, LibCall); |
| setOperationAction(ISD::FNEG, MVT::f128, Expand); |
| setOperationAction(ISD::FPOW, MVT::f128, Expand); |
| setOperationAction(ISD::FREM, MVT::f128, Expand); |
| setOperationAction(ISD::FRINT, MVT::f128, Expand); |
| setOperationAction(ISD::FSIN, MVT::f128, Expand); |
| setOperationAction(ISD::FSINCOS, MVT::f128, Expand); |
| setOperationAction(ISD::FSQRT, MVT::f128, Expand); |
| setOperationAction(ISD::FSUB, MVT::f128, LibCall); |
| setOperationAction(ISD::FTRUNC, MVT::f128, Expand); |
| setOperationAction(ISD::SETCC, MVT::f128, Custom); |
| setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom); |
| setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom); |
| setOperationAction(ISD::BR_CC, MVT::f128, Custom); |
| setOperationAction(ISD::SELECT, MVT::f128, Custom); |
| setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); |
| setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); |
| |
| // Lowering for many of the conversions is actually specified by the non-f128 |
| // type. The LowerXXX function will be trivial when f128 isn't involved. |
| setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); |
| setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); |
| setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom); |
| setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); |
| setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); |
| setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom); |
| setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); |
| setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); |
| setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); |
| setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); |
| setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); |
| setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); |
| setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); |
| setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); |
| setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); |
| |
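| // Saturating FP-to-integer conversions are custom lowered; the FCVTZ[SU] |
| // instructions already saturate. |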
| setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom); |
| setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); |
| setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom); |
| setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); |
| |
| // Variable arguments. |
| setOperationAction(ISD::VASTART, MVT::Other, Custom); |
| setOperationAction(ISD::VAARG, MVT::Other, Custom); |
| setOperationAction(ISD::VACOPY, MVT::Other, Custom); |
| setOperationAction(ISD::VAEND, MVT::Other, Expand); |
| |
| // Variable-sized objects. |
| setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); |
| setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); |
| |
| if (Subtarget->isTargetWindows()) |
| setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); |
| else |
| setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); |
| |
| // Constant pool entries |
| setOperationAction(ISD::ConstantPool, MVT::i64, Custom); |
| |
| // BlockAddress |
| setOperationAction(ISD::BlockAddress, MVT::i64, Custom); |
| |
| // Add/Sub overflow ops with MVT::Glue are lowered to NZCV dependencies. |
| setOperationAction(ISD::ADDC, MVT::i32, Custom); |
| setOperationAction(ISD::ADDE, MVT::i32, Custom); |
| setOperationAction(ISD::SUBC, MVT::i32, Custom); |
| setOperationAction(ISD::SUBE, MVT::i32, Custom); |
| setOperationAction(ISD::ADDC, MVT::i64, Custom); |
| setOperationAction(ISD::ADDE, MVT::i64, Custom); |
| setOperationAction(ISD::SUBC, MVT::i64, Custom); |
| setOperationAction(ISD::SUBE, MVT::i64, Custom); |
| |
| // AArch64 lacks both left-rotate and popcount instructions. |
| setOperationAction(ISD::ROTL, MVT::i32, Expand); |
| setOperationAction(ISD::ROTL, MVT::i64, Expand); |
| for (MVT VT : MVT::fixedlen_vector_valuetypes()) { |
| setOperationAction(ISD::ROTL, VT, Expand); |
| setOperationAction(ISD::ROTR, VT, Expand); |
| } |
| |
| // AArch64 doesn't have i32 MULH{S|U}. |
| setOperationAction(ISD::MULHU, MVT::i32, Expand); |
| setOperationAction(ISD::MULHS, MVT::i32, Expand); |
| |
| // AArch64 doesn't have {U|S}MUL_LOHI. |
| setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); |
| setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); |
| |
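| // CTPOP has no scalar instruction; it is custom lowered (via NEON CNT when |
| // available). |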
| setOperationAction(ISD::CTPOP, MVT::i32, Custom); |
| setOperationAction(ISD::CTPOP, MVT::i64, Custom); |
| setOperationAction(ISD::CTPOP, MVT::i128, Custom); |
| |
| setOperationAction(ISD::ABS, MVT::i32, Custom); |
| setOperationAction(ISD::ABS, MVT::i64, Custom); |
| |
| setOperationAction(ISD::SDIVREM, MVT::i32, Expand); |
| setOperationAction(ISD::SDIVREM, MVT::i64, Expand); |
| for (MVT VT : MVT::fixedlen_vector_valuetypes()) { |
| setOperationAction(ISD::SDIVREM, VT, Expand); |
| setOperationAction(ISD::UDIVREM, VT, Expand); |
| } |
| setOperationAction(ISD::SREM, MVT::i32, Expand); |
| setOperationAction(ISD::SREM, MVT::i64, Expand); |
| setOperationAction(ISD::UDIVREM, MVT::i32, Expand); |
| setOperationAction(ISD::UDIVREM, MVT::i64, Expand); |
| setOperationAction(ISD::UREM, MVT::i32, Expand); |
| setOperationAction(ISD::UREM, MVT::i64, Expand); |
| |
| // Custom lower Add/Sub/Mul with overflow. |
| setOperationAction(ISD::SADDO, MVT::i32, Custom); |
| setOperationAction(ISD::SADDO, MVT::i64, Custom); |
| setOperationAction(ISD::UADDO, MVT::i32, Custom); |
| setOperationAction(ISD::UADDO, MVT::i64, Custom); |
| setOperationAction(ISD::SSUBO, MVT::i32, Custom); |
| setOperationAction(ISD::SSUBO, MVT::i64, Custom); |
| setOperationAction(ISD::USUBO, MVT::i32, Custom); |
| setOperationAction(ISD::USUBO, MVT::i64, Custom); |
| setOperationAction(ISD::SMULO, MVT::i32, Custom); |
| setOperationAction(ISD::SMULO, MVT::i64, Custom); |
| setOperationAction(ISD::UMULO, MVT::i32, Custom); |
| setOperationAction(ISD::UMULO, MVT::i64, Custom); |
| |
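| // No instructions exist for FSIN/FCOS/FPOW; expand them to libcalls. |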
| setOperationAction(ISD::FSIN, MVT::f32, Expand); |
| setOperationAction(ISD::FSIN, MVT::f64, Expand); |
| setOperationAction(ISD::FCOS, MVT::f32, Expand); |
| setOperationAction(ISD::FCOS, MVT::f64, Expand); |
| setOperationAction(ISD::FPOW, MVT::f32, Expand); |
| setOperationAction(ISD::FPOW, MVT::f64, Expand); |
| setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); |
| setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); |
| if (Subtarget->hasFullFP16()) |
| setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom); |
| else |
| setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); |
| |
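| // Scalar f16 libm-style operations are promoted to f32; the v4f16/v8f16 |
| // forms are expanded. |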
| setOperationAction(ISD::FREM, MVT::f16, Promote); |
| setOperationAction(ISD::FREM, MVT::v4f16, Expand); |
| setOperationAction(ISD::FREM, MVT::v8f16, Expand); |
| setOperationAction(ISD::FPOW, MVT::f16, Promote); |
| setOperationAction(ISD::FPOW, MVT::v4f16, Expand); |
| setOperationAction(ISD::FPOW, MVT::v8f16, Expand); |
| setOperationAction(ISD::FPOWI, MVT::f16, Promote); |
| setOperationAction(ISD::FPOWI, MVT::v4f16, Expand); |
| setOperationAction(ISD::FPOWI, MVT::v8f16, Expand); |
| setOperationAction(ISD::FCOS, MVT::f16, Promote); |
| setOperationAction(ISD::FCOS, MVT::v4f16, Expand); |
| setOperationAction(ISD::FCOS, MVT::v8f16, Expand); |
| setOperationAction(ISD::FSIN, MVT::f16, Promote); |
| setOperationAction(ISD::FSIN, MVT::v4f16, Expand); |
| setOperationAction(ISD::FSIN, MVT::v8f16, Expand); |
| setOperationAction(ISD::FSINCOS, MVT::f16, Promote); |
| setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand); |
| setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand); |
| setOperationAction(ISD::FEXP, MVT::f16, Promote); |
| setOperationAction(ISD::FEXP, MVT::v4f16, Expand); |
| setOperationAction(ISD::FEXP, MVT::v8f16, Expand); |
| setOperationAction(ISD::FEXP2, MVT::f16, Promote); |
| setOperationAction(ISD::FEXP2, MVT::v4f16, Expand); |
| setOperationAction(ISD::FEXP2, MVT::v8f16, Expand); |
| setOperationAction(ISD::FLOG, MVT::f16, Promote); |
| setOperationAction(ISD::FLOG, MVT::v4f16, Expand); |
| setOperationAction(ISD::FLOG, MVT::v8f16, Expand); |
| setOperationAction(ISD::FLOG2, MVT::f16, Promote); |
| setOperationAction(ISD::FLOG2, MVT::v4f16, Expand); |
| setOperationAction(ISD::FLOG2, MVT::v8f16, Expand); |
| setOperationAction(ISD::FLOG10, MVT::f16, Promote); |
| setOperationAction(ISD::FLOG10, MVT::v4f16, Expand); |
| setOperationAction(ISD::FLOG10, MVT::v8f16, Expand); |
| |
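| // Without full fp16 support, promote scalar f16 operations to f32 and |
| // expand most vector f16 operations. |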
| if (!Subtarget->hasFullFP16()) { |
| setOperationAction(ISD::SELECT, MVT::f16, Promote); |
| setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); |
| setOperationAction(ISD::SETCC, MVT::f16, Promote); |
| setOperationAction(ISD::BR_CC, MVT::f16, Promote); |
| setOperationAction(ISD::FADD, MVT::f16, Promote); |
| setOperationAction(ISD::FSUB, MVT::f16, Promote); |
| setOperationAction(ISD::FMUL, MVT::f16, Promote); |
| setOperationAction(ISD::FDIV, MVT::f16, Promote); |
| setOperationAction(ISD::FMA, MVT::f16, Promote); |
| setOperationAction(ISD::FNEG, MVT::f16, Promote); |
| setOperationAction(ISD::FABS, MVT::f16, Promote); |
| setOperationAction(ISD::FCEIL, MVT::f16, Promote); |
| setOperationAction(ISD::FSQRT, MVT::f16, Promote); |
| setOperationAction(ISD::FFLOOR, MVT::f16, Promote); |
| setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); |
| setOperationAction(ISD::FRINT, MVT::f16, Promote); |
| setOperationAction(ISD::FROUND, MVT::f16, Promote); |
| setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote); |
| setOperationAction(ISD::FTRUNC, MVT::f16, Promote); |
| setOperationAction(ISD::FMINNUM, MVT::f16, Promote); |
| setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); |
| setOperationAction(ISD::FMINIMUM, MVT::f16, Promote); |
| setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote); |
| |
| // Promote v4f16 to v4f32 when that is known to be safe. |
| setOperationAction(ISD::FADD, MVT::v4f16, Promote); |
| setOperationAction(ISD::FSUB, MVT::v4f16, Promote); |
| setOperationAction(ISD::FMUL, MVT::v4f16, Promote); |
| setOperationAction(ISD::FDIV, MVT::v4f16, Promote); |
| AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32); |
| AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32); |
| AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32); |
| AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32); |
| |
| setOperationAction(ISD::FABS, MVT::v4f16, Expand); |
| setOperationAction(ISD::FNEG, MVT::v4f16, Expand); |
| setOperationAction(ISD::FROUND, MVT::v4f16, Expand); |
| setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Expand); |
| setOperationAction(ISD::FMA, MVT::v4f16, Expand); |
| setOperationAction(ISD::SETCC, MVT::v4f16, Expand); |
| setOperationAction(ISD::BR_CC, MVT::v4f16, Expand); |
| setOperationAction(ISD::SELECT, MVT::v4f16, Expand); |
| setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand); |
| setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand); |
| setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand); |
| setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand); |
| setOperationAction(ISD::FCEIL, MVT::v4f16, Expand); |
| setOperationAction(ISD::FRINT, MVT::v4f16, Expand); |
| setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); |
| setOperationAction(ISD::FSQRT, MVT::v4f16, Expand); |
| |
| setOperationAction(ISD::FABS, MVT::v8f16, Expand); |
| setOperationAction(ISD::FADD, MVT::v8f16, Expand); |
| setOperationAction(ISD::FCEIL, MVT::v8f16, Expand); |
| setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand); |
| setOperationAction(ISD::FDIV, MVT::v8f16, Expand); |
| setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand); |
| setOperationAction(ISD::FMA, MVT::v8f16, Expand); |
| setOperationAction(ISD::FMUL, MVT::v8f16, Expand); |
| setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); |
| setOperationAction(ISD::FNEG, MVT::v8f16, Expand); |
| setOperationAction(ISD::FROUND, MVT::v8f16, Expand); |
| setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Expand); |
| setOperationAction(ISD::FRINT, MVT::v8f16, Expand); |
| setOperationAction(ISD::FSQRT, MVT::v8f16, Expand); |
| setOperationAction(ISD::FSUB, MVT::v8f16, Expand); |
| setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand); |
| setOperationAction(ISD::SETCC, MVT::v8f16, Expand); |
| setOperationAction(ISD::BR_CC, MVT::v8f16, Expand); |
| setOperationAction(ISD::SELECT, MVT::v8f16, Expand); |
| setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand); |
| setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand); |
| } |
| |
| // AArch64 has implementations of a lot of rounding-like FP operations. |
| for (MVT Ty : {MVT::f32, MVT::f64}) { |
| setOperationAction(ISD::FFLOOR, Ty, Legal); |
| setOperationAction(ISD::FNEARBYINT, Ty, Legal); |
| setOperationAction(ISD::FCEIL, Ty, Legal); |
| setOperationAction(ISD::FRINT, Ty, Legal); |
| setOperationAction(ISD::FTRUNC, Ty, Legal); |
| setOperationAction(ISD::FROUND, Ty, Legal); |
| setOperationAction(ISD::FROUNDEVEN, Ty, Legal); |
| setOperationAction(ISD::FMINNUM, Ty, Legal); |
| setOperationAction(ISD::FMAXNUM, Ty, Legal); |
| setOperationAction(ISD::FMINIMUM, Ty, Legal); |
| setOperationAction(ISD::FMAXIMUM, Ty, Legal); |
| setOperationAction(ISD::LROUND, Ty, Legal); |
| setOperationAction(ISD::LLROUND, Ty, Legal); |
| setOperationAction(ISD::LRINT, Ty, Legal); |
| setOperationAction(ISD::LLRINT, Ty, Legal); |
| } |
| |
| if (Subtarget->hasFullFP16()) { |
| setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal); |
| setOperationAction(ISD::FFLOOR, MVT::f16, Legal); |
| setOperationAction(ISD::FCEIL, MVT::f16, Legal); |
| setOperationAction(ISD::FRINT, MVT::f16, Legal); |
| setOperationAction(ISD::FTRUNC, MVT::f16, Legal); |
| setOperationAction(ISD::FROUND, MVT::f16, Legal); |
| setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); |
| setOperationAction(ISD::FMINNUM, MVT::f16, Legal); |
| setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); |
| setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); |
| setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); |
| } |
| |
| setOperationAction(ISD::PREFETCH, MVT::Other, Custom); |
| |
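| // Reading and setting the FP rounding mode requires custom lowering (the |
| // mode lives in FPCR). |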
| setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); |
| setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom); |
| |
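| // 128-bit compare-and-swap and the subtract/and atomic RMW operations need |
| // custom lowering. |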
| setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); |
| setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom); |
| setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); |
| setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom); |
| setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); |
| |
| // Generate outline atomics library calls only if LSE was not specified for |
| // the subtarget. |
| if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) { |
| setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall); |
| setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall); |
| setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall); |
| setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall); |
| setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall); |
| setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall); |
| setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall); |
| setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall); |
| setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall); |
| setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall); |
| #define LCALLNAMES(A, B, N) \ |
| setLibcallName(A##N##_RELAX, #B #N "_relax"); \ |
| setLibcallName(A##N##_ACQ, #B #N "_acq"); \ |
| setLibcallName(A##N##_REL, #B #N "_rel"); \ |
| setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel"); |
| #define LCALLNAME4(A, B) \ |
| LCALLNAMES(A, B, 1) \ |
| LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) |
| #define LCALLNAME5(A, B) \ |
| LCALLNAMES(A, B, 1) \ |
| LCALLNAMES(A, B, 2) \ |
| LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16) |
| LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas) |
| LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp) |
| LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd) |
| LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset) |
| LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr) |
| LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor) |
| #undef LCALLNAMES |
| #undef LCALLNAME4 |
| #undef LCALLNAME5 |
| } |
| |
| // 128-bit loads and stores can be done without expanding |
| setOperationAction(ISD::LOAD, MVT::i128, Custom); |
| setOperationAction(ISD::STORE, MVT::i128, Custom); |
| |
| // Aligned 128-bit loads and stores are single-copy atomic according to the |
| // v8.4a spec. |
| if (Subtarget->hasLSE2()) { |
| setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom); |
| setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom); |
| } |
| |
| // 256-bit non-temporal stores can be lowered to STNP. Do this as part of the |
| // custom lowering, as there are no unpaired non-temporal stores and |
| // legalization will break up 256-bit inputs. |
| setOperationAction(ISD::STORE, MVT::v32i8, Custom); |
| setOperationAction(ISD::STORE, MVT::v16i16, Custom); |
| setOperationAction(ISD::STORE, MVT::v16f16, Custom); |
| setOperationAction(ISD::STORE, MVT::v8i32, Custom); |
| setOperationAction(ISD::STORE, MVT::v8f32, Custom); |
| setOperationAction(ISD::STORE, MVT::v4f64, Custom); |
| setOperationAction(ISD::STORE, MVT::v4i64, Custom); |
| |
| // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. |
| // This requires the Performance Monitors extension. |
| if (Subtarget->hasPerfMon()) |
| setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); |
| |
| if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && |
| getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { |
| // Issue __sincos_stret if available. |
| setOperationAction(ISD::FSINCOS, MVT::f64, Custom); |
| setOperationAction(ISD::FSINCOS, MVT::f32, Custom); |
| } else { |
| setOperationAction(ISD::FSINCOS, MVT::f64, Expand); |
| setOperationAction(ISD::FSINCOS, MVT::f32, Expand); |
| } |
| |
| if (Subtarget->getTargetTriple().isOSMSVCRT()) { |
| // MSVCRT doesn't have powi; fall back to pow |
| setLibcallName(RTLIB::POWI_F32, nullptr); |
| setLibcallName(RTLIB::POWI_F64, nullptr); |
| } |
| |
| // Make floating-point constants legal for the large code model, so they don't |
| // become loads from the constant pool. |
| if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { |
| setOperationAction(ISD::ConstantFP, MVT::f32, Legal); |
| setOperationAction(ISD::ConstantFP, MVT::f64, Legal); |
| } |
| |
| // AArch64 does not have floating-point extending loads, i1 sign-extending |
| // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores. |
| for (MVT VT : MVT::fp_valuetypes()) { |
| setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); |
| setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); |
| setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand); |
| setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); |
| } |
| for (MVT VT : MVT::integer_valuetypes()) |
| setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand); |
| |
| setTruncStoreAction(MVT::f32, MVT::f16, Expand); |
| setTruncStoreAction(MVT::f64, MVT::f32, Expand); |
| setTruncStoreAction(MVT::f64, MVT::f16, Expand); |
| setTruncStoreAction(MVT::f128, MVT::f80, Expand); |
| setTruncStoreAction(MVT::f128, MVT::f64, Expand); |
| setTruncStoreAction(MVT::f128, MVT::f32, Expand); |
| setTruncStoreAction(MVT::f128, MVT::f16, Expand); |
| |
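| // Bitcasts involving 16-bit scalar types are custom lowered. |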
| setOperationAction(ISD::BITCAST, MVT::i16, Custom); |
| setOperationAction(ISD::BITCAST, MVT::f16, Custom); |
| setOperationAction(ISD::BITCAST, MVT::bf16, Custom); |
| |
| // Indexed loads and stores are supported. |
| for (unsigned im = (unsigned)ISD::PRE_INC; |
| im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { |
| setIndexedLoadAction(im, MVT::i8, Legal); |
| setIndexedLoadAction(im, MVT::i16, Legal); |
| setIndexedLoadAction(im, MVT::i32, Legal); |
| setIndexedLoadAction(im, MVT::i64, Legal); |
| setIndexedLoadAction(im, MVT::f64, Legal); |
| setIndexedLoadAction(im, MVT::f32, Legal); |
| setIndexedLoadAction(im, MVT::f16, Legal); |
| setIndexedLoadAction(im, MVT::bf16, Legal); |
| setIndexedStoreAction(im, MVT::i8, Legal); |
| setIndexedStoreAction(im, MVT::i16, Legal); |
| setIndexedStoreAction(im, MVT::i32, Legal); |
| setIndexedStoreAction(im, MVT::i64, Legal); |
| setIndexedStoreAction(im, MVT::f64, Legal); |
| setIndexedStoreAction(im, MVT::f32, Legal); |
| setIndexedStoreAction(im, MVT::f16, Legal); |
| setIndexedStoreAction(im, MVT::bf16, Legal); |
| } |
| |
| // Trap. |
| setOperationAction(ISD::TRAP, MVT::Other, Legal); |
| setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); |
| setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal); |
| |
| // We combine OR nodes for bitfield operations. |
| setTargetDAGCombine(ISD::OR); |
| // Try to create BICs for vector ANDs. |
| setTargetDAGCombine(ISD::AND); |
| |
| // Vector add and sub nodes may conceal a high-half opportunity. |
| // Also, try to fold ADD into CSINC/CSINV. |
| setTargetDAGCombine(ISD::ADD); |
| setTargetDAGCombine(ISD::ABS); |
| setTargetDAGCombine(ISD::SUB); |
| setTargetDAGCombine(ISD::XOR); |
| setTargetDAGCombine(ISD::SINT_TO_FP); |
| setTargetDAGCombine(ISD::UINT_TO_FP); |
| |
| setTargetDAGCombine(ISD::FP_TO_SINT); |
| setTargetDAGCombine(ISD::FP_TO_UINT); |
| setTargetDAGCombine(ISD::FP_TO_SINT_SAT); |
| setTargetDAGCombine(ISD::FP_TO_UINT_SAT); |
| setTargetDAGCombine(ISD::FDIV); |
| |
| // Try to combine setcc with csel. |
| setTargetDAGCombine(ISD::SETCC); |
| |
| setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); |
| |
| setTargetDAGCombine(ISD::ANY_EXTEND); |
| setTargetDAGCombine(ISD::ZERO_EXTEND); |
| setTargetDAGCombine(ISD::SIGN_EXTEND); |
| setTargetDAGCombine(ISD::VECTOR_SPLICE); |
| setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); |
| setTargetDAGCombine(ISD::TRUNCATE); |
| setTargetDAGCombine(ISD::CONCAT_VECTORS); |
| setTargetDAGCombine(ISD::INSERT_SUBVECTOR); |
| setTargetDAGCombine(ISD::STORE); |
| if (Subtarget->supportsAddressTopByteIgnored()) |
| setTargetDAGCombine(ISD::LOAD); |
| |
| setTargetDAGCombine(ISD::MUL); |
| |
| setTargetDAGCombine(ISD::SELECT); |
| setTargetDAGCombine(ISD::VSELECT); |
| |
| setTargetDAGCombine(ISD::INTRINSIC_VOID); |
| setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); |
| setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); |
| setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); |
| setTargetDAGCombine(ISD::VECREDUCE_ADD); |
| setTargetDAGCombine(ISD::STEP_VECTOR); |
| |
| setTargetDAGCombine(ISD::GlobalAddress); |
| |
| // In case of strict alignment, avoid an excessive number of byte-wide stores. |
| MaxStoresPerMemsetOptSize = 8; |
| MaxStoresPerMemset = Subtarget->requiresStrictAlign() |
| ? MaxStoresPerMemsetOptSize : 32; |
| |
| MaxGluedStoresPerMemcpy = 4; |
| MaxStoresPerMemcpyOptSize = 4; |
| MaxStoresPerMemcpy = Subtarget->requiresStrictAlign() |
| ? MaxStoresPerMemcpyOptSize : 16; |
| |
| MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4; |
| |
| MaxLoadsPerMemcmpOptSize = 4; |
| MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign() |
| ? MaxLoadsPerMemcmpOptSize : 8; |
| |
| setStackPointerRegisterToSaveRestore(AArch64::SP); |
| |
| setSchedulingPreference(Sched::Hybrid); |
| |
| EnableExtLdPromotion = true; |
| |
| // Set required alignment. |
| setMinFunctionAlignment(Align(4)); |
| // Set preferred alignments. |
| setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment())); |
| setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment())); |
| |
| // Only change the limit for entries in a jump table if specified by |
| // the subtarget, but not at the command line. |
| unsigned MaxJT = STI.getMaximumJumpTableSize(); |
| if (MaxJT && getMaximumJumpTableSize() == UINT_MAX) |
| setMaximumJumpTableSize(MaxJT); |
| |
| setHasExtractBitsInsn(true); |
| |
| setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); |
| |
| if (Subtarget->hasNEON()) { |
| // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to |
| // silliness like this: |
| setOperationAction(ISD::FABS, MVT::v1f64, Expand); |
| setOperationAction(ISD::FADD, MVT::v1f64, Expand); |
| setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); |
| setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); |
| setOperationAction(ISD::FCOS, MVT::v1f64, Expand); |
| setOperationAction(ISD::FDIV, MVT::v1f64, Expand); |
| setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); |
| setOperationAction(ISD::FMA, MVT::v1f64, Expand); |
| setOperationAction(ISD::FMUL, MVT::v1f64, Expand); |
| setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); |
| setOperationAction(ISD::FNEG, MVT::v1f64, Expand); |
| setOperationAction(ISD::FPOW, MVT::v1f64, Expand); |
| setOperationAction(ISD::FREM, MVT::v1f64, Expand); |
| setOperationAction(ISD::FROUND, MVT::v1f64, Expand); |
| setOperationAction(ISD::FROUNDEVEN, MVT::v1f64, Expand); |
| setOperationAction(ISD::FRINT, MVT::v1f64, Expand); |
| setOperationAction(ISD::FSIN, MVT::v1f64, Expand); |
| setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); |
| setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); |
| setOperationAction(ISD::FSUB, MVT::v1f64, Expand); |
| setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); |
| setOperationAction(ISD::SETCC, MVT::v1f64, Expand); |
| setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); |
| setOperationAction(ISD::SELECT, MVT::v1f64, Expand); |
| setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); |
| setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); |
| |
| setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); |
| setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); |
| setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); |
| setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); |
| setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); |
| |
| setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v1i64, Expand); |
| setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v1i64, Expand); |
| |
| setOperationAction(ISD::MUL, MVT::v1i64, Expand); |
| |
| // AArch64 doesn't have direct vector->f32 conversion instructions for |
| // elements smaller than i32, so promote the input to i32 first. |
| setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32); |
| setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32); |
| |
| // Similarly, there is no direct i32 -> f64 vector conversion instruction. |
| setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); |
| setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); |
| // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the |
| // conversion happens in two steps: v4i32 -> v4f32 -> v4f16. |
| setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); |
| |
| if (Subtarget->hasFullFP16()) { |
| setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); |
| setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom); |
| setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); |
| setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); |
| } else { |
| // When AArch64 doesn't have full fp16 support, promote the input |
| // to i32 first. |
| setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32); |
| setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32); |
| setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32); |
| setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32); |
| setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32); |
| setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32); |
| setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32); |
| setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32); |
| } |
| |
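| // 64-bit element CTLZ is expanded; BITREVERSE is legal for byte vectors |
| // (RBIT) and custom lowered for wider element types. |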
| setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); |
| setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); |
| setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal); |
| setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal); |
| setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom); |
| setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom); |
| setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom); |
| setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom); |
| for (auto VT : {MVT::v1i64, MVT::v2i64}) { |
| setOperationAction(ISD::UMAX, VT, Custom); |
| setOperationAction(ISD::SMAX, VT, Custom); |
| setOperationAction(ISD::UMIN, VT, Custom); |
| setOperationAction(ISD::SMIN, VT, Custom); |
| } |
| |
| // AArch64 doesn't have MUL.2d: |
| setOperationAction(ISD::MUL, MVT::v2i64, Expand); |
| // Custom handling for some quad-vector types to detect MULL. |
| setOperationAction(ISD::MUL, MVT::v8i16, Custom); |
| setOperationAction(ISD::MUL, MVT::v4i32, Custom); |
| setOperationAction(ISD::MUL, MVT::v2i64, Custom); |
| |
| // Saturates |
| for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, |
| MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { |
| setOperationAction(ISD::SADDSAT, VT, Legal); |
| setOperationAction(ISD::UADDSAT, VT, Legal); |
| setOperationAction(ISD::SSUBSAT, VT, Legal); |
| setOperationAction(ISD::USUBSAT, VT, Legal); |
| } |
| |
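| // Signed and unsigned absolute difference are legal NEON operations. |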
| for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16, |
| MVT::v4i32}) { |
| setOperationAction(ISD::ABDS, VT, Legal); |
| setOperationAction(ISD::ABDU, VT, Legal); |
| } |
| |
| // Vector reductions |
| for (MVT VT : { MVT::v4f16, MVT::v2f32, |
| MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { |
| if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) { |
| setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); |
| |
| setOperationAction(ISD::VECREDUCE_FADD, VT, Legal); |
| } |
| } |
| for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, |
| MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { |
| setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); |
| } |
| setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom); |
| |
| setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); |
| setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); |
| // Likewise, narrowing and extending vector loads/stores aren't handled |
| // directly. |
| for (MVT VT : MVT::fixedlen_vector_valuetypes()) { |
| setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); |
| |
| if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) { |
| setOperationAction(ISD::MULHS, VT, Legal); |
| setOperationAction(ISD::MULHU, VT, Legal); |
| } else { |
| setOperationAction(ISD::MULHS, VT, Expand); |
| setOperationAction(ISD::MULHU, VT, Expand); |
| } |
| setOperationAction(ISD::SMUL_LOHI, VT, Expand); |
| setOperationAction(ISD::UMUL_LOHI, VT, Expand); |
| |
| setOperationAction(ISD::BSWAP, VT, Expand); |
| setOperationAction(ISD::CTTZ, VT, Expand); |
| |
| for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { |
| setTruncStoreAction(VT, InnerVT, Expand); |
| setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); |
| setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); |
| setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); |
| } |
| } |
| |
| // AArch64 has implementations of a lot of rounding-like FP operations. |
| for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) { |
| setOperationAction(ISD::FFLOOR, Ty, Legal); |
| setOperationAction(ISD::FNEARBYINT, Ty, Legal); |
| setOperationAction(ISD::FCEIL, Ty, Legal); |
| setOperationAction(ISD::FRINT, Ty, Legal); |
| setOperationAction(ISD::FTRUNC, Ty, Legal); |
| setOperationAction(ISD::FROUND, Ty, Legal); |
| setOperationAction(ISD::FROUNDEVEN, Ty, Legal); |
| } |
| |
| if (Subtarget->hasFullFP16()) { |
| for (MVT Ty : {MVT::v4f16, MVT::v8f16}) { |
| setOperationAction(ISD::FFLOOR, Ty, Legal); |
| setOperationAction(ISD::FNEARBYINT, Ty, Legal); |
| setOperationAction(ISD::FCEIL, Ty, Legal); |
| setOperationAction(ISD::FRINT, Ty, Legal); |
| setOperationAction(ISD::FTRUNC, Ty, Legal); |
| setOperationAction(ISD::FROUND, Ty, Legal); |
| setOperationAction(ISD::FROUNDEVEN, Ty, Legal); |
| } |
| } |
| |
| if (Subtarget->hasSVE()) |
| setOperationAction(ISD::VSCALE, MVT::i32, Custom); |
| |
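| // Truncating stores to v4i8 and extending loads from v4i8 are custom |
| // lowered. |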
| setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); |
| |
| setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom); |
| setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); |
| setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); |
| setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom); |
| setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); |
| setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); |
| } |
| |
| if (Subtarget->hasSVE()) { |
| for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) { |
| setOperationAction(ISD::BITREVERSE, VT, Custom); |
| setOperationAction(ISD::BSWAP, VT, Custom); |
| setOperationAction(ISD::CTLZ, VT, Custom); |
| setOperationAction(ISD::CTPOP, VT, Custom); |
| setOperationAction(ISD::CTTZ, VT, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); |
| setOperationAction(ISD::UINT_TO_FP, VT, Custom); |
| setOperationAction(ISD::SINT_TO_FP, VT, Custom); |
| setOperationAction(ISD::FP_TO_UINT, VT, Custom); |
| setOperationAction(ISD::FP_TO_SINT, VT, Custom); |
| setOperationAction(ISD::MGATHER, VT, Custom); |
| setOperationAction(ISD::MSCATTER, VT, Custom); |
| setOperationAction(ISD::MLOAD, VT, Custom); |
| setOperationAction(ISD::MUL, VT, Custom); |
| setOperationAction(ISD::MULHS, VT, Custom); |
| setOperationAction(ISD::MULHU, VT, Custom); |
| setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); |
| setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); |
| setOperationAction(ISD::SELECT, VT, Custom); |
| setOperationAction(ISD::SETCC, VT, Custom); |
| setOperationAction(ISD::SDIV, VT, Custom); |
| setOperationAction(ISD::UDIV, VT, Custom); |
| setOperationAction(ISD::SMIN, VT, Custom); |
| setOperationAction(ISD::UMIN, VT, Custom); |
| setOperationAction(ISD::SMAX, VT, Custom); |
| setOperationAction(ISD::UMAX, VT, Custom); |
| setOperationAction(ISD::SHL, VT, Custom); |
| setOperationAction(ISD::SRL, VT, Custom); |
| setOperationAction(ISD::SRA, VT, Custom); |
| setOperationAction(ISD::ABS, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_AND, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_OR, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); |
| |
| setOperationAction(ISD::UMUL_LOHI, VT, Expand); |
| setOperationAction(ISD::SMUL_LOHI, VT, Expand); |
| setOperationAction(ISD::SELECT_CC, VT, Expand); |
| setOperationAction(ISD::ROTL, VT, Expand); |
| setOperationAction(ISD::ROTR, VT, Expand); |
| } |
| |
| // Illegal unpacked integer vector types. |
| for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) { |
| setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); |
| } |
| |
| // Legalize unpacked bitcasts to REINTERPRET_CAST. |
| for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16, |
| MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32}) |
| setOperationAction(ISD::BITCAST, VT, Custom); |
| |
| for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) { |
| setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); |
| setOperationAction(ISD::SELECT, VT, Custom); |
| setOperationAction(ISD::SETCC, VT, Custom); |
| setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); |
| setOperationAction(ISD::TRUNCATE, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_AND, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_OR, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); |
| |
| setOperationAction(ISD::SELECT_CC, VT, Expand); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); |
| |
| // There are no legal MVT::nxv16f## based types. |
| if (VT != MVT::nxv16i1) { |
| setOperationAction(ISD::SINT_TO_FP, VT, Custom); |
| setOperationAction(ISD::UINT_TO_FP, VT, Custom); |
| } |
| } |
| |
| // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does |
| for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64, |
| MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, |
| MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) { |
| setOperationAction(ISD::MLOAD, VT, Custom); |
| setOperationAction(ISD::MSTORE, VT, Custom); |
| setOperationAction(ISD::MGATHER, VT, Custom); |
| setOperationAction(ISD::MSCATTER, VT, Custom); |
| } |
| |
| for (MVT VT : MVT::fp_scalable_vector_valuetypes()) { |
| for (MVT InnerVT : MVT::fp_scalable_vector_valuetypes()) { |
| // Avoid marking truncating FP stores as legal to prevent the |
| // DAGCombiner from creating unsupported truncating stores. |
| setTruncStoreAction(VT, InnerVT, Expand); |
| // SVE does not have floating-point extending loads. |
| setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); |
| setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); |
| setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); |
| } |
| } |
| |
| // SVE supports truncating stores of 64- and 128-bit vectors. |
| setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom); |
| setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom); |
| setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom); |
| setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); |
| setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); |
| |
| for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, |
| MVT::nxv4f32, MVT::nxv2f64}) { |
| setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); |
| setOperationAction(ISD::MGATHER, VT, Custom); |
| setOperationAction(ISD::MSCATTER, VT, Custom); |
| setOperationAction(ISD::MLOAD, VT, Custom); |
| setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); |
| setOperationAction(ISD::SELECT, VT, Custom); |
| setOperationAction(ISD::FADD, VT, Custom); |
| setOperationAction(ISD::FCOPYSIGN, VT, Custom); |
| setOperationAction(ISD::FDIV, VT, Custom); |
| setOperationAction(ISD::FMA, VT, Custom); |
| setOperationAction(ISD::FMAXIMUM, VT, Custom); |
| setOperationAction(ISD::FMAXNUM, VT, Custom); |
| setOperationAction(ISD::FMINIMUM, VT, Custom); |
| setOperationAction(ISD::FMINNUM, VT, Custom); |
| setOperationAction(ISD::FMUL, VT, Custom); |
| setOperationAction(ISD::FNEG, VT, Custom); |
| setOperationAction(ISD::FSUB, VT, Custom); |
| setOperationAction(ISD::FCEIL, VT, Custom); |
| setOperationAction(ISD::FFLOOR, VT, Custom); |
| setOperationAction(ISD::FNEARBYINT, VT, Custom); |
| setOperationAction(ISD::FRINT, VT, Custom); |
| setOperationAction(ISD::FROUND, VT, Custom); |
| setOperationAction(ISD::FROUNDEVEN, VT, Custom); |
| setOperationAction(ISD::FTRUNC, VT, Custom); |
| setOperationAction(ISD::FSQRT, VT, Custom); |
| setOperationAction(ISD::FABS, VT, Custom); |
| setOperationAction(ISD::FP_EXTEND, VT, Custom); |
| setOperationAction(ISD::FP_ROUND, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); |
| setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); |
| |
| setOperationAction(ISD::SELECT_CC, VT, Expand); |
| } |
| |
| for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) { |
| setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); |
| setOperationAction(ISD::MGATHER, VT, Custom); |
| setOperationAction(ISD::MSCATTER, VT, Custom); |
| setOperationAction(ISD::MLOAD, VT, Custom); |
| } |
| |
| setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom); |
| |
| setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); |
| setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); |
| |
| // NOTE: Currently this has to happen after computeRegisterProperties rather |
| // than the preferred option of combining it with the addRegisterClass call. |
| if (Subtarget->useSVEForFixedLengthVectors()) { |
| for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) |
| if (useSVEForFixedLengthVectorVT(VT)) |
| addTypeForFixedLengthSVE(VT); |
| for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) |
| if (useSVEForFixedLengthVectorVT(VT)) |
| addTypeForFixedLengthSVE(VT); |
| |
| // 64-bit results can imply an input wider than a NEON register. |
| for (auto VT : {MVT::v8i8, MVT::v4i16}) |
| setOperationAction(ISD::TRUNCATE, VT, Custom); |
| setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom); |
| |
| // 128-bit results imply an input wider than a NEON register. |
| for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) |
| setOperationAction(ISD::TRUNCATE, VT, Custom); |
| for (auto VT : {MVT::v8f16, MVT::v4f32}) |
| setOperationAction(ISD::FP_ROUND, VT, Custom); |
| |
| // These operations are not supported on NEON but SVE can do them. |
| setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom); |
| setOperationAction(ISD::CTLZ, MVT::v1i64, Custom); |
| setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); |
| setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); |
| setOperationAction(ISD::MUL, MVT::v1i64, Custom); |
| setOperationAction(ISD::MUL, MVT::v2i64, Custom); |
| setOperationAction(ISD::MULHS, MVT::v1i64, Custom); |
| setOperationAction(ISD::MULHS, MVT::v2i64, Custom); |
| setOperationAction(ISD::MULHU, MVT::v1i64, Custom); |
| setOperationAction(ISD::MULHU, MVT::v2i64, Custom); |
| setOperationAction(ISD::SDIV, MVT::v8i8, Custom); |
| setOperationAction(ISD::SDIV, MVT::v16i8, Custom); |
| setOperationAction(ISD::SDIV, MVT::v4i16, Custom); |
| setOperationAction(ISD::SDIV, MVT::v8i16, Custom); |
| setOperationAction(ISD::SDIV, MVT::v2i32, Custom); |
| setOperationAction(ISD::SDIV, MVT::v4i32, Custom); |
| setOperationAction(ISD::SDIV, MVT::v1i64, Custom); |
| setOperationAction(ISD::SDIV, MVT::v2i64, Custom); |
| setOperationAction(ISD::SMAX, MVT::v1i64, Custom); |
| setOperationAction(ISD::SMAX, MVT::v2i64, Custom); |
| setOperationAction(ISD::SMIN, MVT::v1i64, Custom); |
| setOperationAction(ISD::SMIN, MVT::v2i64, Custom); |
| setOperationAction(ISD::UDIV, MVT::v8i8, Custom); |
| setOperationAction(ISD::UDIV, MVT::v16i8, Custom); |
| setOperationAction(ISD::UDIV, MVT::v4i16, Custom); |
| setOperationAction(ISD::UDIV, MVT::v8i16, Custom); |
| setOperationAction(ISD::UDIV, MVT::v2i32, Custom); |
| setOperationAction(ISD::UDIV, MVT::v4i32, Custom); |
| setOperationAction(ISD::UDIV, MVT::v1i64, Custom); |
| setOperationAction(ISD::UDIV, MVT::v2i64, Custom); |
| setOperationAction(ISD::UMAX, MVT::v1i64, Custom); |
| setOperationAction(ISD::UMAX, MVT::v2i64, Custom); |
| setOperationAction(ISD::UMIN, MVT::v1i64, Custom); |
| setOperationAction(ISD::UMIN, MVT::v2i64, Custom); |
| setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom); |
| setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom); |
| setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom); |
| setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom); |
| |
| // Int operations with no NEON support. |
| for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, |
| MVT::v2i32, MVT::v4i32, MVT::v2i64}) { |
| setOperationAction(ISD::BITREVERSE, VT, Custom); |
| setOperationAction(ISD::CTTZ, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_AND, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_OR, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); |
| } |
| |
| // FP operations with no NEON support. |
| for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, |
| MVT::v1f64, MVT::v2f64}) |
| setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); |
| |
| // Use SVE for vectors with more than 2 elements. |
| for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32}) |
| setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); |
| } |
| |
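| // Splices of predicate vectors are performed via the integer vector types |
| // with matching element counts. |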
| setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64); |
| setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32); |
| setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16); |
| setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8); |
| } |
| |
| PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); |
| } |
| |
| void AArch64TargetLowering::addTypeForNEON(MVT VT) { |
| assert(VT.isVector() && "VT should be a vector type"); |
| |
| if (VT.isFloatingPoint()) { |
| MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT(); |
| setOperationPromotedToType(ISD::LOAD, VT, PromoteTo); |
| setOperationPromotedToType(ISD::STORE, VT, PromoteTo); |
| } |
| |
| // Mark vector float intrinsics as expand. |
| if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { |
| setOperationAction(ISD::FSIN, VT, Expand); |
| setOperationAction(ISD::FCOS, VT, Expand); |
| setOperationAction(ISD::FPOW, VT, Expand); |
| setOperationAction(ISD::FLOG, VT, Expand); |
| setOperationAction(ISD::FLOG2, VT, Expand); |
| setOperationAction(ISD::FLOG10, VT, Expand); |
| setOperationAction(ISD::FEXP, VT, Expand); |
| setOperationAction(ISD::FEXP2, VT, Expand); |
| } |
| |
| // But we do support custom-lowering for FCOPYSIGN. |
| if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 || |
| ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16())) |
| setOperationAction(ISD::FCOPYSIGN, VT, Custom); |
| |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::BUILD_VECTOR, VT, Custom); |
| setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); |
| setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); |
| setOperationAction(ISD::SRA, VT, Custom); |
| setOperationAction(ISD::SRL, VT, Custom); |
| setOperationAction(ISD::SHL, VT, Custom); |
| setOperationAction(ISD::OR, VT, Custom); |
| setOperationAction(ISD::SETCC, VT, Custom); |
| setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); |
| |
| setOperationAction(ISD::SELECT, VT, Expand); |
| setOperationAction(ISD::SELECT_CC, VT, Expand); |
| setOperationAction(ISD::VSELECT, VT, Expand); |
| for (MVT InnerVT : MVT::all_valuetypes()) |
| setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); |
| |
| // The CNT instruction only supports byte elements; wider element types are |
| // custom lowered as CNT followed by UADDLP to widen the result. |
| if (VT != MVT::v8i8 && VT != MVT::v16i8) |
| setOperationAction(ISD::CTPOP, VT, Custom); |
| |
| setOperationAction(ISD::UDIV, VT, Expand); |
| setOperationAction(ISD::SDIV, VT, Expand); |
| setOperationAction(ISD::UREM, VT, Expand); |
| setOperationAction(ISD::SREM, VT, Expand); |
| setOperationAction(ISD::FREM, VT, Expand); |
| |
| setOperationAction(ISD::FP_TO_SINT, VT, Custom); |
| setOperationAction(ISD::FP_TO_UINT, VT, Custom); |
| setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); |
| setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); |
| |
| if (!VT.isFloatingPoint()) |
| setOperationAction(ISD::ABS, VT, Legal); |
| |
| // [SU][MIN|MAX] are available for all NEON types apart from i64. |
| if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) |
| for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) |
| setOperationAction(Opcode, VT, Legal); |
| |
| // F[MIN|MAX][NUM|NAN] are available for all FP NEON types. |
| if (VT.isFloatingPoint() && |
| VT.getVectorElementType() != MVT::bf16 && |
| (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())) |
| for (unsigned Opcode : |
| {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM}) |
| setOperationAction(Opcode, VT, Legal); |
| |
| if (Subtarget->isLittleEndian()) { |
| for (unsigned im = (unsigned)ISD::PRE_INC; |
| im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { |
| setIndexedLoadAction(im, VT, Legal); |
| setIndexedStoreAction(im, VT, Legal); |
| } |
| } |
| } |
| |
| bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT, |
| EVT OpVT) const { |
| // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo). |
| if (!Subtarget->hasSVE()) |
| return true; |
| |
| // We can only support legal predicate result types. |
| if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 && |
| ResVT != MVT::nxv16i1) |
| return true; |
| |
| // The whilelo instruction only works with i32 or i64 scalar inputs. |
| if (OpVT != MVT::i32 && OpVT != MVT::i64) |
| return true; |
| |
| return false; |
| } |
| |
| void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { |
| assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); |
| |
| // By default everything must be expanded. |
| for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) |
| setOperationAction(Op, VT, Expand); |
| |
| // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one. |
| setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); |
| |
| if (VT.isFloatingPoint()) { |
| setCondCodeAction(ISD::SETO, VT, Expand); |
| setCondCodeAction(ISD::SETOLT, VT, Expand); |
| setCondCodeAction(ISD::SETLT, VT, Expand); |
| setCondCodeAction(ISD::SETOLE, VT, Expand); |
| setCondCodeAction(ISD::SETLE, VT, Expand); |
| setCondCodeAction(ISD::SETULT, VT, Expand); |
| setCondCodeAction(ISD::SETULE, VT, Expand); |
| setCondCodeAction(ISD::SETUGE, VT, Expand); |
| setCondCodeAction(ISD::SETUGT, VT, Expand); |
| setCondCodeAction(ISD::SETUEQ, VT, Expand); |
| setCondCodeAction(ISD::SETUNE, VT, Expand); |
| } |
| |
| // Mark integer truncating stores/extending loads as having custom lowering |
| if (VT.isInteger()) { |
| MVT InnerVT = VT.changeVectorElementType(MVT::i8); |
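| // Walk the integer element types from i8 up to (but excluding) VT's own |
| // element type; e.g. for v4i32 this covers v4i8 and v4i16. |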
| while (InnerVT != VT) { |
| setTruncStoreAction(VT, InnerVT, Custom); |
| setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom); |
| setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom); |
| InnerVT = InnerVT.changeVectorElementType( |
| MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits())); |
| } |
| } |
| |
| // Mark floating-point truncating stores/extending loads as having custom |
| // lowering |
| if (VT.isFloatingPoint()) { |
| MVT InnerVT = VT.changeVectorElementType(MVT::f16); |
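| // As above, but for FP element types; e.g. for v4f64 this covers v4f16 |
| // and v4f32. |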
| while (InnerVT != VT) { |
| setTruncStoreAction(VT, InnerVT, Custom); |
| setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom); |
| InnerVT = InnerVT.changeVectorElementType( |
| MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits())); |
| } |
| } |
| |
| // Lower fixed length vector operations to scalable equivalents. |
| setOperationAction(ISD::ABS, VT, Custom); |
| setOperationAction(ISD::ADD, VT, Custom); |
| setOperationAction(ISD::AND, VT, Custom); |
| setOperationAction(ISD::ANY_EXTEND, VT, Custom); |
| setOperationAction(ISD::BITCAST, VT, Custom); |
| setOperationAction(ISD::BITREVERSE, VT, Custom); |
| setOperationAction(ISD::BSWAP, VT, Custom); |
| setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); |
| setOperationAction(ISD::CTLZ, VT, Custom); |
| setOperationAction(ISD::CTPOP, VT, Custom); |
| setOperationAction(ISD::CTTZ, VT, Custom); |
| setOperationAction(ISD::FABS, VT, Custom); |
| setOperationAction(ISD::FADD, VT, Custom); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::FCEIL, VT, Custom); |
| setOperationAction(ISD::FDIV, VT, Custom); |
| setOperationAction(ISD::FFLOOR, VT, Custom); |
| setOperationAction(ISD::FMA, VT, Custom); |
| setOperationAction(ISD::FMAXIMUM, VT, Custom); |
| setOperationAction(ISD::FMAXNUM, VT, Custom); |
| setOperationAction(ISD::FMINIMUM, VT, Custom); |
| setOperationAction(ISD::FMINNUM, VT, Custom); |
| setOperationAction(ISD::FMUL, VT, Custom); |
| setOperationAction(ISD::FNEARBYINT, VT, Custom); |
| setOperationAction(ISD::FNEG, VT, Custom); |
| setOperationAction(ISD::FP_EXTEND, VT, Custom); |
| setOperationAction(ISD::FP_ROUND, VT, Custom); |
| setOperationAction(ISD::FP_TO_SINT, VT, Custom); |
| setOperationAction(ISD::FP_TO_UINT, VT, Custom); |
| setOperationAction(ISD::FRINT, VT, Custom); |
| setOperationAction(ISD::FROUND, VT, Custom); |
| setOperationAction(ISD::FROUNDEVEN, VT, Custom); |
| setOperationAction(ISD::FSQRT, VT, Custom); |
| setOperationAction(ISD::FSUB, VT, Custom); |
| setOperationAction(ISD::FTRUNC, VT, Custom); |
| setOperationAction(ISD::LOAD, VT, Custom); |
| setOperationAction(ISD::MGATHER, VT, Custom); |
| setOperationAction(ISD::MLOAD, VT, Custom); |
| setOperationAction(ISD::MSCATTER, VT, Custom); |
| setOperationAction(ISD::MSTORE, VT, Custom); |
| setOperationAction(ISD::MUL, VT, Custom); |
| setOperationAction(ISD::MULHS, VT, Custom); |
| setOperationAction(ISD::MULHU, VT, Custom); |
| setOperationAction(ISD::OR, VT, Custom); |
| setOperationAction(ISD::SDIV, VT, Custom); |
| setOperationAction(ISD::SELECT, VT, Custom); |
| setOperationAction(ISD::SETCC, VT, Custom); |
| setOperationAction(ISD::SHL, VT, Custom); |
| setOperationAction(ISD::SIGN_EXTEND, VT, Custom); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom); |
| setOperationAction(ISD::SINT_TO_FP, VT, Custom); |
| setOperationAction(ISD::SMAX, VT, Custom); |
| setOperationAction(ISD::SMIN, VT, Custom); |
| setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); |
| setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); |
| setOperationAction(ISD::SRA, VT, Custom); |
| setOperationAction(ISD::SRL, VT, Custom); |
| setOperationAction(ISD::STORE, VT, Custom); |
| setOperationAction(ISD::SUB, VT, Custom); |
| setOperationAction(ISD::TRUNCATE, VT, Custom); |
| setOperationAction(ISD::UDIV, VT, Custom); |
| setOperationAction(ISD::UINT_TO_FP, VT, Custom); |
| setOperationAction(ISD::UMAX, VT, Custom); |
| setOperationAction(ISD::UMIN, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_AND, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_OR, VT, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); |
| setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); |
| setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); |
| setOperationAction(ISD::VSELECT, VT, Custom); |
| setOperationAction(ISD::XOR, VT, Custom); |
| setOperationAction(ISD::ZERO_EXTEND, VT, Custom); |
| } |
| |
| void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { |
| addRegisterClass(VT, &AArch64::FPR64RegClass); |
| addTypeForNEON(VT); |
| } |
| |
| void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { |
| addRegisterClass(VT, &AArch64::FPR128RegClass); |
| addTypeForNEON(VT); |
| } |
| |
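| // SETCC results are i32 for scalars, i1-element predicate vectors for |
| // scalable vectors, and same-shaped integer vectors (lane masks) for |
| // fixed-length vectors. |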
| EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, |
| LLVMContext &C, EVT VT) const { |
| if (!VT.isVector()) |
| return MVT::i32; |
| if (VT.isScalableVector()) |
| return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount()); |
| return VT.changeVectorElementTypeToInteger(); |
| } |
| |
| static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, |
| const APInt &Demanded, |
| TargetLowering::TargetLoweringOpt &TLO, |
| unsigned NewOpc) { |
| uint64_t OldImm = Imm, NewImm, Enc; |
| uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask; |
| |
| // Return if the immediate is already all zeros, all ones, a bimm32 or a |
| // bimm64. |
| if (Imm == 0 || Imm == Mask || |
| AArch64_AM::isLogicalImmediate(Imm & Mask, Size)) |
| return false; |
| |
| unsigned EltSize = Size; |
| uint64_t DemandedBits = Demanded.getZExtValue(); |
| |
| // Clear bits that are not demanded. |
| Imm &= DemandedBits; |
| |
| while (true) { |
| // The goal here is to set the non-demanded bits in a way that minimizes |
| // the number of transitions between 0 and 1. In order to achieve this goal, |
| // we set the non-demanded bits to the value of the preceding demanded bits. |
| // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a |
| // non-demanded bit), we copy bit0 (1) to the least significant 'x', |
| // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'. |
| // The final result is 0b11000011. |
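| // The bit trickery below implements this: RotatedImm seeds the lowest bit |
| // of each run of non-demanded bits with the inverse of the demanded bit |
| // preceding it (with wrap-around), and the addition ripples that seed |
| // through the run. Ones therefore ends up set exactly in the runs whose |
| // preceding demanded bit is 1, with Carry extending the ripple across the |
| // element's wrap-around boundary. |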
| uint64_t NonDemandedBits = ~DemandedBits; |
| uint64_t InvertedImm = ~Imm & DemandedBits; |
| uint64_t RotatedImm = |
| ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) & |
| NonDemandedBits; |
| uint64_t Sum = RotatedImm + NonDemandedBits; |
| bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1)); |
| uint64_t Ones = (Sum + Carry) & NonDemandedBits; |
| NewImm = (Imm | Ones) & Mask; |
| |
| // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate |
| // or all-ones or all-zeros, in which case we can stop searching. Otherwise, |
| // we halve the element size and continue the search. |
| if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask))) |
| break; |
| |
| // We cannot shrink the element size any further if it is already 2 bits. |
| if (EltSize == 2) |
| return false; |
| |
| EltSize /= 2; |
| Mask >>= EltSize; |
| uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize; |
| |
| // Return if there is a mismatch in any of the demanded bits of Imm and Hi. |
| if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0) |
| return false; |
| |
| // Merge the upper and lower halves of Imm and DemandedBits. |
| Imm |= Hi; |
| DemandedBits |= DemandedBitsHi; |
| } |
| |
| ++NumOptimizedImms; |
| |
| // Replicate the element across the register width. |
| while (EltSize < Size) { |
| NewImm |= NewImm << EltSize; |
| EltSize *= 2; |
| } |
| |
| (void)OldImm; |
| assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 && |
| "demanded bits should never be altered"); |
| assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm"); |
| |
| // Create the new constant immediate node. |
| EVT VT = Op.getValueType(); |
| SDLoc DL(Op); |
| SDValue New; |
| |
| // If the new constant immediate is all-zeros or all-ones, let the target |
| // independent DAG combine optimize this node. |
| if (NewImm == 0 || NewImm == OrigMask) { |
| New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), |
| TLO.DAG.getConstant(NewImm, DL, VT)); |
| // Otherwise, create a machine node so that target independent DAG combine |
| // doesn't undo this optimization. |
| } else { |
| Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); |
| SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); |
| New = SDValue( |
| TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); |
| } |
| |
| return TLO.CombineTo(Op, New); |
| } |
| |
| bool AArch64TargetLowering::targetShrinkDemandedConstant( |
| SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, |
| TargetLoweringOpt &TLO) const { |
| // Delay this optimization to as late as possible. |
| if (!TLO.LegalOps) |
| return false; |
| |
| if (!EnableOptimizeLogicalImm) |
| return false; |
| |
| EVT VT = Op.getValueType(); |
| if (VT.isVector()) |
| return false; |
| |
| unsigned Size = VT.getSizeInBits(); |
| assert((Size == 32 || Size == 64) && |
| "i32 or i64 is expected after legalization."); |
| |
| // Exit early if we demand all bits. |
| if (DemandedBits.countPopulation() == Size) |
| return false; |
| |
| unsigned NewOpc; |
| switch (Op.getOpcode()) { |
| default: |
| return false; |
| case ISD::AND: |
| NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri; |
| break; |
| case ISD::OR: |
| NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri; |
| break; |
| case ISD::XOR: |
| NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri; |
| break; |
| } |
| ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); |
| if (!C) |
| return false; |
| uint64_t Imm = C->getZExtValue(); |
| return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc); |
| } |
| |
| /// computeKnownBitsForTargetNode - Determine which of the bits specified in |
| /// Mask are known to be either zero or one and return them in Known. |
| void AArch64TargetLowering::computeKnownBitsForTargetNode( |
| const SDValue Op, KnownBits &Known, |
| const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { |
| switch (Op.getOpcode()) { |
| default: |
| break; |
| case AArch64ISD::CSEL: { |
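| // CSEL produces one of its two operands, so only the bits known in both |
| // operands are known in the result. |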
| KnownBits Known2; |
| Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); |
| Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); |
| Known = KnownBits::commonBits(Known, Known2); |
| break; |
| } |
| case AArch64ISD::LOADgot: |
| case AArch64ISD::ADDlow: { |
| if (!Subtarget->isTargetILP32()) |
| break; |
| // In ILP32 mode all valid pointers are in the low 4GB of the address space. |
| Known.Zero = APInt::getHighBitsSet(64, 32); |
| break; |
| } |
| case AArch64ISD::ASSERT_ZEXT_BOOL: { |
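| // The operand is asserted to be a zero-extended boolean (0 or 1), so mark |
| // bits [7:1] as known zero on top of whatever is known about the operand. |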
| Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); |
| Known.Zero |= APInt(Known.getBitWidth(), 0xFE); |
| break; |
| } |
| case ISD::INTRINSIC_W_CHAIN: { |
| ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); |
| Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); |
| switch (IntID) { |
| default: return; |
| case Intrinsic::aarch64_ldaxr: |
| case Intrinsic::aarch64_ldxr: { |
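| // Exclusive loads zero-extend the loaded value to the register width, so |
| // all bits above the memory width are known to be zero. |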
| unsigned BitWidth = Known.getBitWidth(); |
| EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); |
| unsigned MemBits = VT.getScalarSizeInBits(); |
| Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); |
| return; |
| } |
| } |
| break; |
| } |
| case ISD::INTRINSIC_WO_CHAIN: |
| case ISD::INTRINSIC_VOID: { |
| unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); |
| switch (IntNo) { |
| default: |
| break; |
| case Intrinsic::aarch64_neon_umaxv: |
| case Intrinsic::aarch64_neon_uminv: { |
| // Figure out the datatype of the vector operand. The UMINV instruction |
| // will zero extend the result, so we can mark as known zero all the |
| // bits larger than the element datatype. 32-bit or larger doesn't need |
| // this as those are legal types and will be handled by isel directly. |
| MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); |
| unsigned BitWidth = Known.getBitWidth(); |
| if (VT == MVT::v8i8 || VT == MVT::v16i8) { |
| assert(BitWidth >= 8 && "Unexpected width!"); |
| APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); |
| Known.Zero |= Mask; |
| } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { |
| assert(BitWidth >= 16 && "Unexpected width!"); |
| APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); |
| Known.Zero |= Mask; |
| } |
| break; |
| } |
| } |
| } |
| } |
| } |
| |
| MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, |
| EVT) const { |
| return MVT::i64; |
| } |
| |
| bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( |
| EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, |
| bool *Fast) const { |
| if (Subtarget->requiresStrictAlign()) |
| return false; |
| |
| if (Fast) { |
| // Some CPUs are fine with unaligned stores except for 128-bit ones. |
| *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 || |
| // See comments in performSTORECombine() for more details about |
| // these conditions. |
| |
| // Code that uses clang vector extensions can mark that it |
| // wants unaligned accesses to be treated as fast by |
| // underspecifying alignment to be 1 or 2. |
| Alignment <= 2 || |
| |
| // Disregard v2i64. Memcpy lowering produces those and splitting |
| // them regresses performance on micro-benchmarks and olden/bh. |
| VT == MVT::v2i64; |
| } |
| return true; |
| } |
| |
| // Same as above but handling LLTs instead. |
| bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( |
| LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, |
| bool *Fast) const { |
| if (Subtarget->requiresStrictAlign()) |
| return false; |
| |
| if (Fast) { |
| // Some CPUs are fine with unaligned stores except for 128-bit ones. |
| *Fast = !Subtarget->isMisaligned128StoreSlow() || |
| Ty.getSizeInBytes() != 16 || |
| // See comments in performSTORECombine() for more details about |
| // these conditions. |
| |
| // Code that uses clang vector extensions can mark that it |
| // wants unaligned accesses to be treated as fast by |
| // underspecifying alignment to be 1 or 2. |
| Alignment <= 2 || |
| |
| // Disregard v2i64. Memcpy lowering produces those and splitting |
| // them regresses performance on micro-benchmarks and olden/bh. |
| Ty == LLT::fixed_vector(2, 64); |
| } |
| return true; |
| } |
| |
| FastISel * |
| AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, |
| const TargetLibraryInfo *libInfo) const { |
| return AArch64::createFastISel(funcInfo, libInfo); |
| } |
| |
| const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { |
| #define MAKE_CASE(V) \ |
| case V: \ |
| return #V; |
| switch ((AArch64ISD::NodeType)Opcode) { |
| case AArch64ISD::FIRST_NUMBER: |
| break; |
| MAKE_CASE(AArch64ISD::CALL) |
| MAKE_CASE(AArch64ISD::ADRP) |
| MAKE_CASE(AArch64ISD::ADR) |
| MAKE_CASE(AArch64ISD::ADDlow) |
| MAKE_CASE(AArch64ISD::LOADgot) |
| MAKE_CASE(AArch64ISD::RET_FLAG) |
| MAKE_CASE(AArch64ISD::BRCOND) |
| MAKE_CASE(AArch64ISD::CSEL) |
| MAKE_CASE(AArch64ISD::CSINV) |
| MAKE_CASE(AArch64ISD::CSNEG) |
| MAKE_CASE(AArch64ISD::CSINC) |
| MAKE_CASE(AArch64ISD::THREAD_POINTER) |
| MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) |
| MAKE_CASE(AArch64ISD::ADD_PRED) |
| MAKE_CASE(AArch64ISD::MUL_PRED) |
| MAKE_CASE(AArch64ISD::MULHS_PRED) |
| MAKE_CASE(AArch64ISD::MULHU_PRED) |
| MAKE_CASE(AArch64ISD::SDIV_PRED) |
| MAKE_CASE(AArch64ISD::SHL_PRED) |
| MAKE_CASE(AArch64ISD::SMAX_PRED) |
| MAKE_CASE(AArch64ISD::SMIN_PRED) |
| MAKE_CASE(AArch64ISD::SRA_PRED) |
| MAKE_CASE(AArch64ISD::SRL_PRED) |
| MAKE_CASE(AArch64ISD::SUB_PRED) |
| MAKE_CASE(AArch64ISD::UDIV_PRED) |
| MAKE_CASE(AArch64ISD::UMAX_PRED) |
| MAKE_CASE(AArch64ISD::UMIN_PRED) |
| MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1) |
| MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::ADC) |
| MAKE_CASE(AArch64ISD::SBC) |
| MAKE_CASE(AArch64ISD::ADDS) |
| MAKE_CASE(AArch64ISD::SUBS) |
| MAKE_CASE(AArch64ISD::ADCS) |
| MAKE_CASE(AArch64ISD::SBCS) |
| MAKE_CASE(AArch64ISD::ANDS) |
| MAKE_CASE(AArch64ISD::CCMP) |
| MAKE_CASE(AArch64ISD::CCMN) |
| MAKE_CASE(AArch64ISD::FCCMP) |
| MAKE_CASE(AArch64ISD::FCMP) |
| MAKE_CASE(AArch64ISD::STRICT_FCMP) |
| MAKE_CASE(AArch64ISD::STRICT_FCMPE) |
| MAKE_CASE(AArch64ISD::DUP) |
| MAKE_CASE(AArch64ISD::DUPLANE8) |
| MAKE_CASE(AArch64ISD::DUPLANE16) |
| MAKE_CASE(AArch64ISD::DUPLANE32) |
| MAKE_CASE(AArch64ISD::DUPLANE64) |
| MAKE_CASE(AArch64ISD::MOVI) |
| MAKE_CASE(AArch64ISD::MOVIshift) |
| MAKE_CASE(AArch64ISD::MOVIedit) |
| MAKE_CASE(AArch64ISD::MOVImsl) |
| MAKE_CASE(AArch64ISD::FMOV) |
| MAKE_CASE(AArch64ISD::MVNIshift) |
| MAKE_CASE(AArch64ISD::MVNImsl) |
| MAKE_CASE(AArch64ISD::BICi) |
| MAKE_CASE(AArch64ISD::ORRi) |
| MAKE_CASE(AArch64ISD::BSP) |
| MAKE_CASE(AArch64ISD::EXTR) |
| MAKE_CASE(AArch64ISD::ZIP1) |
| MAKE_CASE(AArch64ISD::ZIP2) |
| MAKE_CASE(AArch64ISD::UZP1) |
| MAKE_CASE(AArch64ISD::UZP2) |
| MAKE_CASE(AArch64ISD::TRN1) |
| MAKE_CASE(AArch64ISD::TRN2) |
| MAKE_CASE(AArch64ISD::REV16) |
| MAKE_CASE(AArch64ISD::REV32) |
| MAKE_CASE(AArch64ISD::REV64) |
| MAKE_CASE(AArch64ISD::EXT) |
| MAKE_CASE(AArch64ISD::SPLICE) |
| MAKE_CASE(AArch64ISD::VSHL) |
| MAKE_CASE(AArch64ISD::VLSHR) |
| MAKE_CASE(AArch64ISD::VASHR) |
| MAKE_CASE(AArch64ISD::VSLI) |
| MAKE_CASE(AArch64ISD::VSRI) |
| MAKE_CASE(AArch64ISD::CMEQ) |
| MAKE_CASE(AArch64ISD::CMGE) |
| MAKE_CASE(AArch64ISD::CMGT) |
| MAKE_CASE(AArch64ISD::CMHI) |
| MAKE_CASE(AArch64ISD::CMHS) |
| MAKE_CASE(AArch64ISD::FCMEQ) |
| MAKE_CASE(AArch64ISD::FCMGE) |
| MAKE_CASE(AArch64ISD::FCMGT) |
| MAKE_CASE(AArch64ISD::CMEQz) |
| MAKE_CASE(AArch64ISD::CMGEz) |
| MAKE_CASE(AArch64ISD::CMGTz) |
| MAKE_CASE(AArch64ISD::CMLEz) |
| MAKE_CASE(AArch64ISD::CMLTz) |
| MAKE_CASE(AArch64ISD::FCMEQz) |
| MAKE_CASE(AArch64ISD::FCMGEz) |
| MAKE_CASE(AArch64ISD::FCMGTz) |
| MAKE_CASE(AArch64ISD::FCMLEz) |
| MAKE_CASE(AArch64ISD::FCMLTz) |
| MAKE_CASE(AArch64ISD::SADDV) |
| MAKE_CASE(AArch64ISD::UADDV) |
| MAKE_CASE(AArch64ISD::SRHADD) |
| MAKE_CASE(AArch64ISD::URHADD) |
| MAKE_CASE(AArch64ISD::SHADD) |
| MAKE_CASE(AArch64ISD::UHADD) |
| MAKE_CASE(AArch64ISD::SDOT) |
| MAKE_CASE(AArch64ISD::UDOT) |
| MAKE_CASE(AArch64ISD::SMINV) |
| MAKE_CASE(AArch64ISD::UMINV) |
| MAKE_CASE(AArch64ISD::SMAXV) |
| MAKE_CASE(AArch64ISD::UMAXV) |
| MAKE_CASE(AArch64ISD::SADDV_PRED) |
| MAKE_CASE(AArch64ISD::UADDV_PRED) |
| MAKE_CASE(AArch64ISD::SMAXV_PRED) |
| MAKE_CASE(AArch64ISD::UMAXV_PRED) |
| MAKE_CASE(AArch64ISD::SMINV_PRED) |
| MAKE_CASE(AArch64ISD::UMINV_PRED) |
| MAKE_CASE(AArch64ISD::ORV_PRED) |
| MAKE_CASE(AArch64ISD::EORV_PRED) |
| MAKE_CASE(AArch64ISD::ANDV_PRED) |
| MAKE_CASE(AArch64ISD::CLASTA_N) |
| MAKE_CASE(AArch64ISD::CLASTB_N) |
| MAKE_CASE(AArch64ISD::LASTA) |
| MAKE_CASE(AArch64ISD::LASTB) |
| MAKE_CASE(AArch64ISD::REINTERPRET_CAST) |
| MAKE_CASE(AArch64ISD::LS64_BUILD) |
| MAKE_CASE(AArch64ISD::LS64_EXTRACT) |
| MAKE_CASE(AArch64ISD::TBL) |
| MAKE_CASE(AArch64ISD::FADD_PRED) |
| MAKE_CASE(AArch64ISD::FADDA_PRED) |
| MAKE_CASE(AArch64ISD::FADDV_PRED) |
| MAKE_CASE(AArch64ISD::FDIV_PRED) |
| MAKE_CASE(AArch64ISD::FMA_PRED) |
| MAKE_CASE(AArch64ISD::FMAX_PRED) |
| MAKE_CASE(AArch64ISD::FMAXV_PRED) |
| MAKE_CASE(AArch64ISD::FMAXNM_PRED) |
| MAKE_CASE(AArch64ISD::FMAXNMV_PRED) |
| MAKE_CASE(AArch64ISD::FMIN_PRED) |
| MAKE_CASE(AArch64ISD::FMINV_PRED) |
| MAKE_CASE(AArch64ISD::FMINNM_PRED) |
| MAKE_CASE(AArch64ISD::FMINNMV_PRED) |
| MAKE_CASE(AArch64ISD::FMUL_PRED) |
| MAKE_CASE(AArch64ISD::FSUB_PRED) |
| MAKE_CASE(AArch64ISD::BIC) |
| MAKE_CASE(AArch64ISD::BIT) |
| MAKE_CASE(AArch64ISD::CBZ) |
| MAKE_CASE(AArch64ISD::CBNZ) |
| MAKE_CASE(AArch64ISD::TBZ) |
| MAKE_CASE(AArch64ISD::TBNZ) |
| MAKE_CASE(AArch64ISD::TC_RETURN) |
| MAKE_CASE(AArch64ISD::PREFETCH) |
| MAKE_CASE(AArch64ISD::SITOF) |
| MAKE_CASE(AArch64ISD::UITOF) |
| MAKE_CASE(AArch64ISD::NVCAST) |
| MAKE_CASE(AArch64ISD::MRS) |
| MAKE_CASE(AArch64ISD::SQSHL_I) |
| MAKE_CASE(AArch64ISD::UQSHL_I) |
| MAKE_CASE(AArch64ISD::SRSHR_I) |
| MAKE_CASE(AArch64ISD::URSHR_I) |
| MAKE_CASE(AArch64ISD::SQSHLU_I) |
| MAKE_CASE(AArch64ISD::WrapperLarge) |
| MAKE_CASE(AArch64ISD::LD2post) |
| MAKE_CASE(AArch64ISD::LD3post) |
| MAKE_CASE(AArch64ISD::LD4post) |
| MAKE_CASE(AArch64ISD::ST2post) |
| MAKE_CASE(AArch64ISD::ST3post) |
| MAKE_CASE(AArch64ISD::ST4post) |
| MAKE_CASE(AArch64ISD::LD1x2post) |
| MAKE_CASE(AArch64ISD::LD1x3post) |
| MAKE_CASE(AArch64ISD::LD1x4post) |
| MAKE_CASE(AArch64ISD::ST1x2post) |
| MAKE_CASE(AArch64ISD::ST1x3post) |
| MAKE_CASE(AArch64ISD::ST1x4post) |
| MAKE_CASE(AArch64ISD::LD1DUPpost) |
| MAKE_CASE(AArch64ISD::LD2DUPpost) |
| MAKE_CASE(AArch64ISD::LD3DUPpost) |
| MAKE_CASE(AArch64ISD::LD4DUPpost) |
| MAKE_CASE(AArch64ISD::LD1LANEpost) |
| MAKE_CASE(AArch64ISD::LD2LANEpost) |
| MAKE_CASE(AArch64ISD::LD3LANEpost) |
| MAKE_CASE(AArch64ISD::LD4LANEpost) |
| MAKE_CASE(AArch64ISD::ST2LANEpost) |
| MAKE_CASE(AArch64ISD::ST3LANEpost) |
| MAKE_CASE(AArch64ISD::ST4LANEpost) |
| MAKE_CASE(AArch64ISD::SMULL) |
| MAKE_CASE(AArch64ISD::UMULL) |
| MAKE_CASE(AArch64ISD::FRECPE) |
| MAKE_CASE(AArch64ISD::FRECPS) |
| MAKE_CASE(AArch64ISD::FRSQRTE) |
| MAKE_CASE(AArch64ISD::FRSQRTS) |
| MAKE_CASE(AArch64ISD::STG) |
| MAKE_CASE(AArch64ISD::STZG) |
| MAKE_CASE(AArch64ISD::ST2G) |
| MAKE_CASE(AArch64ISD::STZ2G) |
| MAKE_CASE(AArch64ISD::SUNPKHI) |
| MAKE_CASE(AArch64ISD::SUNPKLO) |
| MAKE_CASE(AArch64ISD::UUNPKHI) |
| MAKE_CASE(AArch64ISD::UUNPKLO) |
| MAKE_CASE(AArch64ISD::INSR) |
| MAKE_CASE(AArch64ISD::PTEST) |
| MAKE_CASE(AArch64ISD::PTRUE) |
| MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO) |
| MAKE_CASE(AArch64ISD::ST1_PRED) |
| MAKE_CASE(AArch64ISD::SST1_PRED) |
| MAKE_CASE(AArch64ISD::SST1_SCALED_PRED) |
| MAKE_CASE(AArch64ISD::SST1_SXTW_PRED) |
| MAKE_CASE(AArch64ISD::SST1_UXTW_PRED) |
| MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED) |
| MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED) |
| MAKE_CASE(AArch64ISD::SST1_IMM_PRED) |
| MAKE_CASE(AArch64ISD::SSTNT1_PRED) |
| MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED) |
| MAKE_CASE(AArch64ISD::LDP) |
| MAKE_CASE(AArch64ISD::STP) |
| MAKE_CASE(AArch64ISD::STNP) |
| MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU) |
| MAKE_CASE(AArch64ISD::INDEX_VECTOR) |
| MAKE_CASE(AArch64ISD::UADDLP) |
| MAKE_CASE(AArch64ISD::CALL_RVMARKER) |
| MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL) |
| } |
| #undef MAKE_CASE |
| return nullptr; |
| } |
| |
| MachineBasicBlock * |
| AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, |
| MachineBasicBlock *MBB) const { |
| // We materialise the F128CSEL pseudo-instruction as some control flow and a |
| // phi node: |
| |
| // OrigBB: |
| // [... previous instrs leading to comparison ...] |
| // b.ne TrueBB |
| // b EndBB |
| // TrueBB: |
| // ; Fallthrough |
| // EndBB: |
| // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] |
| |
| MachineFunction *MF = MBB->getParent(); |
| const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
| const BasicBlock *LLVM_BB = MBB->getBasicBlock(); |
| DebugLoc DL = MI.getDebugLoc(); |
| MachineFunction::iterator It = ++MBB->getIterator(); |
| |
| Register DestReg = MI.getOperand(0).getReg(); |
| Register IfTrueReg = MI.getOperand(1).getReg(); |
| Register IfFalseReg = MI.getOperand(2).getReg(); |
| unsigned CondCode = MI.getOperand(3).getImm(); |
| bool NZCVKilled = MI.getOperand(4).isKill(); |
| |
| MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); |
| MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); |
| MF->insert(It, TrueBB); |
| MF->insert(It, EndBB); |
| |
| // Transfer rest of current basic-block to EndBB |
| EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), |
| MBB->end()); |
| EndBB->transferSuccessorsAndUpdatePHIs(MBB); |
| |
| BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); |
| BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); |
| MBB->addSuccessor(TrueBB); |
| MBB->addSuccessor(EndBB); |
| |
| // TrueBB falls through to the end. |
| TrueBB->addSuccessor(EndBB); |
| |
| if (!NZCVKilled) { |
| TrueBB->addLiveIn(AArch64::NZCV); |
| EndBB->addLiveIn(AArch64::NZCV); |
| } |
| |
| BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) |
| .addReg(IfTrueReg) |
| .addMBB(TrueBB) |
| .addReg(IfFalseReg) |
| .addMBB(MBB); |
| |
| MI.eraseFromParent(); |
| return EndBB; |
| } |
| |
| MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( |
| MachineInstr &MI, MachineBasicBlock *BB) const { |
| assert(!isAsynchronousEHPersonality(classifyEHPersonality( |
| BB->getParent()->getFunction().getPersonalityFn())) && |
| "SEH does not use catchret!"); |
| return BB; |
| } |
| |
| MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( |
| MachineInstr &MI, MachineBasicBlock *BB) const { |
| switch (MI.getOpcode()) { |
| default: |
| #ifndef NDEBUG |
| MI.dump(); |
| #endif |
| llvm_unreachable("Unexpected instruction for custom inserter!"); |
| |
| case AArch64::F128CSEL: |
| return EmitF128CSEL(MI, BB); |
| |
| case TargetOpcode::STATEPOINT: |
| // STATEPOINT is a pseudo instruction which has no implicit defs/uses, |
| // while the bl call instruction (which the statepoint is lowered to at |
| // the end) has an implicit def. Add this implicit dead def here as a |
| // workaround. |
| MI.addOperand(*MI.getMF(), MachineOperand::CreateReg(AArch64::LR, true, |
| true, false, true)); |
| LLVM_FALLTHROUGH; |
| case TargetOpcode::STACKMAP: |
| case TargetOpcode::PATCHPOINT: |
| return emitPatchPoint(MI, BB); |
| |
| case AArch64::CATCHRET: |
| return EmitLoweredCatchRet(MI, BB); |
| } |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // AArch64 Lowering private implementation. |
| //===----------------------------------------------------------------------===// |
| |
| //===----------------------------------------------------------------------===// |
| // Lowering Code |
| //===----------------------------------------------------------------------===// |
| |
| // Forward declarations of SVE fixed length lowering helpers |
| static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT); |
| static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V); |
| static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V); |
| static SDValue convertFixedMaskToScalableVector(SDValue Mask, |
| SelectionDAG &DAG); |
| static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, |
| EVT VT); |
| |
| /// isZerosVector - Check whether SDNode N is a zero-filled vector. |
| static bool isZerosVector(const SDNode *N) { |
| // Look through a bit convert. |
| while (N->getOpcode() == ISD::BITCAST) |
| N = N->getOperand(0).getNode(); |
| |
| if (ISD::isConstantSplatVectorAllZeros(N)) |
| return true; |
| |
| if (N->getOpcode() != AArch64ISD::DUP) |
| return false; |
| |
| auto Opnd0 = N->getOperand(0); |
| auto *CINT = dyn_cast<ConstantSDNode>(Opnd0); |
| auto *CFP = dyn_cast<ConstantFPSDNode>(Opnd0); |
| return (CINT && CINT->isZero()) || (CFP && CFP->isZero()); |
| } |
| |
| /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 |
| /// CC |
| static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { |
| switch (CC) { |
| default: |
| llvm_unreachable("Unknown condition code!"); |
| case ISD::SETNE: |
| return AArch64CC::NE; |
| case ISD::SETEQ: |
| return AArch64CC::EQ; |
| case ISD::SETGT: |
| return AArch64CC::GT; |
| case ISD::SETGE: |
| return AArch64CC::GE; |
| case ISD::SETLT: |
| return AArch64CC::LT; |
| case ISD::SETLE: |
| return AArch64CC::LE; |
| case ISD::SETUGT: |
| return AArch64CC::HI; |
| case ISD::SETUGE: |
| return AArch64CC::HS; |
| case ISD::SETULT: |
| return AArch64CC::LO; |
| case ISD::SETULE: |
| return AArch64CC::LS; |
| } |
| } |
| |
| /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. |
| static void changeFPCCToAArch64CC(ISD::CondCode CC, |
| AArch64CC::CondCode &CondCode, |
| AArch64CC::CondCode &CondCode2) { |
| CondCode2 = AArch64CC::AL; |
| switch (CC) { |
| default: |
| llvm_unreachable("Unknown FP condition!"); |
| case ISD::SETEQ: |
| case ISD::SETOEQ: |
| CondCode = AArch64CC::EQ; |
| break; |
| case ISD::SETGT: |
| case ISD::SETOGT: |
| CondCode = AArch64CC::GT; |
| break; |
| case ISD::SETGE: |
| case ISD::SETOGE: |
| CondCode = AArch64CC::GE; |
| break; |
| case ISD::SETOLT: |
| CondCode = AArch64CC::MI; |
| break; |
| case ISD::SETOLE: |
| CondCode = AArch64CC::LS; |
| break; |
| case ISD::SETONE: |
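| // ONE (ordered and not equal) needs two checks: (a olt b) || (a ogt b). |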
| CondCode = AArch64CC::MI; |
| CondCode2 = AArch64CC::GT; |
| break; |
| case ISD::SETO: |
| CondCode = AArch64CC::VC; |
| break; |
| case ISD::SETUO: |
| CondCode = AArch64CC::VS; |
| break; |
| case ISD::SETUEQ: |
| CondCode = AArch64CC::EQ; |
| CondCode2 = AArch64CC::VS; |
| break; |
| case ISD::SETUGT: |
| CondCode = AArch64CC::HI; |
| break; |
| case ISD::SETUGE: |
| CondCode = AArch64CC::PL; |
| break; |
| case ISD::SETLT: |
| case ISD::SETULT: |
| CondCode = AArch64CC::LT; |
| break; |
| case ISD::SETLE: |
| case ISD::SETULE: |
| CondCode = AArch64CC::LE; |
| break; |
| case ISD::SETNE: |
| case ISD::SETUNE: |
| CondCode = AArch64CC::NE; |
| break; |
| } |
| } |
| |
| /// Convert a DAG fp condition code to an AArch64 CC. |
| /// This differs from changeFPCCToAArch64CC in that it returns cond codes that |
| /// should be AND'ed instead of OR'ed. |
| static void changeFPCCToANDAArch64CC(ISD::CondCode CC, |
| AArch64CC::CondCode &CondCode, |
| AArch64CC::CondCode &CondCode2) { |
| CondCode2 = AArch64CC::AL; |
| switch (CC) { |
| default: |
| changeFPCCToAArch64CC(CC, CondCode, CondCode2); |
| assert(CondCode2 == AArch64CC::AL); |
| break; |
| case ISD::SETONE: |
| // (a one b) |
| // == ((a olt b) || (a ogt b)) |
| // == ((a ord b) && (a une b)) |
| CondCode = AArch64CC::VC; |
| CondCode2 = AArch64CC::NE; |
| break; |
| case ISD::SETUEQ: |
| // (a ueq b) |
| // == ((a uno b) || (a oeq b)) |
| // == ((a ule b) && (a uge b)) |
| CondCode = AArch64CC::PL; |
| CondCode2 = AArch64CC::LE; |
| break; |
| } |
| } |
| |
| /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 |
| /// CC usable with the vector instructions. Fewer operations are available |
| /// without a real NZCV register, so we have to use less efficient combinations |
| /// to get the same effect. |
| static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, |
| AArch64CC::CondCode &CondCode, |
| AArch64CC::CondCode &CondCode2, |
| bool &Invert) { |
| Invert = false; |
| switch (CC) { |
| default: |
| // Mostly the scalar mappings work fine. |
| changeFPCCToAArch64CC(CC, CondCode, CondCode2); |
| break; |
| case ISD::SETUO: |
| Invert = true; |
| LLVM_FALLTHROUGH; |
| case ISD::SETO: |
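| // Ordered is checked as (a lt b) || (a ge b), which holds exactly when |
| // neither operand is NaN; SETUO uses the inverted result. |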
| CondCode = AArch64CC::MI; |
| CondCode2 = AArch64CC::GE; |
| break; |
| case ISD::SETUEQ: |
| case ISD::SETULT: |
| case ISD::SETULE: |
| case ISD::SETUGT: |
| case ISD::SETUGE: |
| // All of the compare-mask comparisons are ordered, but we can switch |
| // between the two by a double inversion. E.g. ULE == !OGT. |
| Invert = true; |
| changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32), |
| CondCode, CondCode2); |
| break; |
| } |
| } |
| |
| static bool isLegalArithImmed(uint64_t C) { |
| // Matches AArch64DAGToDAGISel::SelectArithImmed(). |
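| // A legal arithmetic immediate is an unsigned 12-bit value, optionally |
| // shifted left by 12, e.g. 0xFFF or 0xFFF000 but not 0x1001. |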
| bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); |
| LLVM_DEBUG(dbgs() << "Is imm " << C |
| << " legal: " << (IsLegal ? "yes\n" : "no\n")); |
| return IsLegal; |
| } |
| |
| // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on |
| // the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags |
| // can be set differently by this operation. It comes down to whether |
| // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then |
| // everything is fine. If not then the optimization is wrong. Thus general |
| // comparisons are only valid if op2 != 0. |
| // |
| // So, finally, the only LLVM-native comparisons that don't mention C and V |
| // are SETEQ and SETNE. They're the only ones we can safely use CMN for in |
| // the absence of information about op2. |
| static bool isCMN(SDValue Op, ISD::CondCode CC) { |
| return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) && |
| (CC == ISD::SETEQ || CC == ISD::SETNE); |
| } |
| |
| static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, |
| SelectionDAG &DAG, SDValue Chain, |
| bool IsSignaling) { |
| EVT VT = LHS.getValueType(); |
| assert(VT != MVT::f128); |
| assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented"); |
| unsigned Opcode = |
| IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP; |
| return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS}); |
| } |
| |
| static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, |
| const SDLoc &dl, SelectionDAG &DAG) { |
| EVT VT = LHS.getValueType(); |
| const bool FullFP16 = |
| static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); |
| |
| if (VT.isFloatingPoint()) { |
| assert(VT != MVT::f128); |
| if (VT == MVT::f16 && !FullFP16) { |
| LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); |
| RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); |
| VT = MVT::f32; |
| } |
| return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); |
| } |
| |
| // The CMP instruction is just an alias for SUBS, and representing it as |
| // SUBS means that it's possible to get CSE with subtract operations. |
| // A later phase can perform the optimization of setting the destination |
| // register to WZR/XZR if it ends up being unused. |
| unsigned Opcode = AArch64ISD::SUBS; |
| |
| if (isCMN(RHS, CC)) { |
| // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction? |
| Opcode = AArch64ISD::ADDS; |
| RHS = RHS.getOperand(1); |
| } else if (isCMN(LHS, CC)) { |
| // As we are looking for EQ/NE compares, the operands can be commuted; can |
| // we combine a (CMP (sub 0, op1), op2) into a CMN instruction? |
| Opcode = AArch64ISD::ADDS; |
| LHS = LHS.getOperand(1); |
| } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) { |
| if (LHS.getOpcode() == ISD::AND) { |
| // Similarly, (CMP (and X, Y), 0) can be implemented with a TST |
| // (a.k.a. ANDS) except that the flags are only guaranteed to work for one |
| // of the signed comparisons. |
| const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl, |
| DAG.getVTList(VT, MVT_CC), |
| LHS.getOperand(0), |
| LHS.getOperand(1)); |
| // Replace all users of (and X, Y) with newly generated (ands X, Y) |
| DAG.ReplaceAllUsesWith(LHS, ANDSNode); |
| return ANDSNode.getValue(1); |
| } else if (LHS.getOpcode() == AArch64ISD::ANDS) { |
| // Use result of ANDS |
| return LHS.getValue(1); |
| } |
| } |
| |
| return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) |
| .getValue(1); |
| } |
| |
| /// \defgroup AArch64CCMP CMP;CCMP matching |
| /// |
| /// These functions deal with the formation of CMP;CCMP;... sequences. |
| /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of |
| /// a comparison. They set the NZCV flags to a predefined value if their |
| /// predicate is false. This allows arbitrary conjunctions to be expressed; for |
| /// example, "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))" can be |
| /// expressed as: |
| /// cmp A |
| /// ccmp B, inv(CB), CA |
| /// check for CB flags |
| /// |
| /// This naturally lets us implement chains of AND operations with SETCC |
| /// operands. And we can even implement some other situations by transforming |
| /// them: |
| /// - We can implement (NEG SETCC) i.e. negating a single comparison by |
| /// negating the flags used in the CCMP/FCCMP operation. |
| /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations |
| /// by negating the flags we test for afterwards. i.e. |
| /// NEG (CMP CCMP CCCMP ...) can be implemented. |
| /// - Note that we can only ever negate all previously processed results. |
| /// What we can not implement by flipping the flags to test is a negation |
| /// of two sub-trees (because the negation affects all sub-trees emitted so |
| /// far, so the 2nd sub-tree we emit would also affect the first). |
| /// With those tools we can implement some OR operations: |
| /// - (OR (SETCC A) (SETCC B)) can be implemented via: |
| /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B))) |
| /// - After transforming OR to NEG/AND combinations we may be able to use NEG |
| /// elimination rules from earlier to implement the whole thing as a |
| /// CCMP/FCCMP chain. |
| /// |
| /// As a complete example: |
| ///   or (or (setCA (cmp A)) (setCB (cmp B))) |
| ///      (and (setCC (cmp C)) (setCD (cmp D))) |
| /// can be reassociated to: |
| ///   or (and (setCC (cmp C)) (setCD (cmp D))) |
| ///      (or (setCA (cmp A)) (setCB (cmp B))) |
| /// can be transformed to: |
| ///   not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) |
| ///            (and (not (setCA (cmp A))) (not (setCB (cmp B))))) |
| /// which can be implemented as: |
| /// cmp C |
| /// ccmp D, inv(CD), CC |
| /// ccmp A, CA, inv(CD) |
| /// ccmp B, CB, inv(CA) |
| /// check for CB flags |
| /// |
| /// A counterexample is "or (and A B) (and C D)", which translates to |
| /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we |
| /// can only implement one of the inner (not) operations, but not both! |
| /// @{ |
| |
| /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. |
| static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, |
| ISD::CondCode CC, SDValue CCOp, |
| AArch64CC::CondCode Predicate, |
| AArch64CC::CondCode OutCC, |
| const SDLoc &DL, SelectionDAG &DAG) { |
| unsigned Opcode = 0; |
| const bool FullFP16 = |
| static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); |
| |
| if (LHS.getValueType().isFloatingPoint()) { |
| assert(LHS.getValueType() != MVT::f128); |
| if (LHS.getValueType() == MVT::f16 && !FullFP16) { |
| LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS); |
| RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS); |
| } |
| Opcode = AArch64ISD::FCCMP; |
| } else if (RHS.getOpcode() == ISD::SUB) { |
| SDValue SubOp0 = RHS.getOperand(0); |
| if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { |
| // See emitComparison() on why we can only do this for SETEQ and SETNE. |
| Opcode = AArch64ISD::CCMN; |
| RHS = RHS.getOperand(1); |
| } |
| } |
| if (Opcode == 0) |
| Opcode = AArch64ISD::CCMP; |
| |
| SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC); |
| AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); |
| unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); |
| SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); |
| return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); |
| } |
| |
| /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be |
| /// expressed as a conjunction. See \ref AArch64CCMP. |
| /// \param CanNegate Set to true if we can negate the whole sub-tree just by |
| /// changing the conditions on the SETCC tests. |
| /// (this means we can call emitConjunctionRec() with |
| /// Negate==true on this sub-tree) |
| /// \param MustBeFirst Set to true if this subtree needs to be negated and we |
| /// cannot do the negation naturally. We are required to |
| /// emit the subtree first in this case. |
| /// \param WillNegate Is true if we are called when the result of this |
| /// subexpression must be negated. This happens when the |
| /// outer expression is an OR. We can use this fact to know |
| /// that we have a double negation (or (or ...) ...) that |
| /// can be implemented for free. |
| static bool canEmitConjunction(const SDValue Val, bool &CanNegate, |
| bool &MustBeFirst, bool WillNegate, |
| unsigned Depth = 0) { |
| if (!Val.hasOneUse()) |
| return false; |
| unsigned Opcode = Val->getOpcode(); |
| if (Opcode == ISD::SETCC) { |
| if (Val->getOperand(0).getValueType() == MVT::f128) |
| return false; |
| CanNegate = true; |
| MustBeFirst = false; |
| return true; |
| } |
| // Protect against exponential runtime and stack overflow. |
| if (Depth > 6) |
| return false; |
| if (Opcode == ISD::AND || Opcode == ISD::OR) { |
| bool IsOR = Opcode == ISD::OR; |
| SDValue O0 = Val->getOperand(0); |
| SDValue O1 = Val->getOperand(1); |
| bool CanNegateL; |
| bool MustBeFirstL; |
| if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1)) |
| return false; |
| bool CanNegateR; |
| bool MustBeFirstR; |
| if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1)) |
| return false; |
| |
| if (MustBeFirstL && MustBeFirstR) |
| return false; |
| |
| if (IsOR) { |
| // For an OR expression we need to be able to naturally negate at least |
| // one side or we cannot do the transformation at all. |
| if (!CanNegateL && !CanNegateR) |
| return false; |
| // If the result of the OR will be negated and we can naturally negate the |
| // leaves, then this sub-tree as a whole negates naturally. |
| CanNegate = WillNegate && CanNegateL && CanNegateR; |
| // If we cannot naturally negate the whole sub-tree, then this must be |
| // emitted first. |
| MustBeFirst = !CanNegate; |
| } else { |
| assert(Opcode == ISD::AND && "Must be OR or AND"); |
| // We cannot naturally negate an AND operation. |
| CanNegate = false; |
| MustBeFirst = MustBeFirstL || MustBeFirstR; |
| } |
| return true; |
| } |
| return false; |
| } |
| |
| /// Emit a conjunction or disjunction tree with the CMP/FCMP followed by a |
| /// chain of CCMP/FCCMP ops. See @ref AArch64CCMP. |
| /// Tries to transform the given i1 producing node @p Val to a series of |
| /// compare and conditional compare operations. @returns an NZCV flags |
| /// producing node and sets @p OutCC to the flags that should be tested, or |
| /// returns SDValue() if the transformation was not possible. |
| /// \p Negate is true if we want this sub-tree to be negated just by changing |
| /// SETCC conditions. |
| static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, |
| AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, |
| AArch64CC::CondCode Predicate) { |
| // We're at a tree leaf, produce a conditional comparison operation. |
| unsigned Opcode = Val->getOpcode(); |
| if (Opcode == ISD::SETCC) { |
| SDValue LHS = Val->getOperand(0); |
| SDValue RHS = Val->getOperand(1); |
| ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get(); |
| bool isInteger = LHS.getValueType().isInteger(); |
| if (Negate) |
| CC = getSetCCInverse(CC, LHS.getValueType()); |
| SDLoc DL(Val); |
| // Determine OutCC and handle FP special case. |
| if (isInteger) { |
| OutCC = changeIntCCToAArch64CC(CC); |
| } else { |
| assert(LHS.getValueType().isFloatingPoint()); |
| AArch64CC::CondCode ExtraCC; |
| changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); |
| // Some floating point conditions can't be tested with a single condition |
| // code. Construct an additional comparison in this case. |
| if (ExtraCC != AArch64CC::AL) { |
| SDValue ExtraCmp; |
| if (!CCOp.getNode()) |
| ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); |
| else |
| ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, |
| ExtraCC, DL, DAG); |
| CCOp = ExtraCmp; |
| Predicate = ExtraCC; |
| } |
| } |
| |
| // Produce a normal comparison if we are first in the chain |
| if (!CCOp) |
| return emitComparison(LHS, RHS, CC, DL, DAG); |
| // Otherwise produce a ccmp. |
| return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, |
| DAG); |
| } |
| assert(Val->hasOneUse() && "Valid conjunction/disjunction tree"); |
| |
| bool IsOR = Opcode == ISD::OR; |
| |
| SDValue LHS = Val->getOperand(0); |
| bool CanNegateL; |
| bool MustBeFirstL; |
| bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR); |
| assert(ValidL && "Valid conjunction/disjunction tree"); |
| (void)ValidL; |
| |
| SDValue RHS = Val->getOperand(1); |
| bool CanNegateR; |
| bool MustBeFirstR; |
| bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR); |
| assert(ValidR && "Valid conjunction/disjunction tree"); |
| (void)ValidR; |
| |
| // Swap sub-tree that must come first to the right side. |
| if (MustBeFirstL) { |
| assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); |
| std::swap(LHS, RHS); |
| std::swap(CanNegateL, CanNegateR); |
| std::swap(MustBeFirstL, MustBeFirstR); |
| } |
| |
| bool NegateR; |
| bool NegateAfterR; |
| bool NegateL; |
| bool NegateAfterAll; |
| if (Opcode == ISD::OR) { |
| // Swap the sub-tree that we can negate naturally to the left. |
| if (!CanNegateL) { |
| assert(CanNegateR && "at least one side must be negatable"); |
| assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); |
| assert(!Negate); |
| std::swap(LHS, RHS); |
| NegateR = false; |
| NegateAfterR = true; |
| } else { |
| // Negate the right sub-tree if possible, otherwise negate the result. |
| NegateR = CanNegateR; |
| NegateAfterR = !CanNegateR; |
| } |
| NegateL = true; |
| NegateAfterAll = !Negate; |
| } else { |
| assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree"); |
| assert(!Negate && "Valid conjunction/disjunction tree"); |
| |
| NegateL = false; |
| NegateR = false; |
| NegateAfterR = false; |
| NegateAfterAll = false; |
| } |
| |
| // Emit sub-trees. |
| AArch64CC::CondCode RHSCC; |
| SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate); |
| if (NegateAfterR) |
| RHSCC = AArch64CC::getInvertedCondCode(RHSCC); |
| SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC); |
| if (NegateAfterAll) |
| OutCC = AArch64CC::getInvertedCondCode(OutCC); |
| return CmpL; |
| } |
| |
| /// Emit expression as a conjunction (a series of CCMP/CFCMP ops). |
| /// In some cases this is even possible with OR operations in the expression. |
| /// See \ref AArch64CCMP. |
| /// \see emitConjunctionRec(). |
| static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, |
| AArch64CC::CondCode &OutCC) { |
| bool DummyCanNegate; |
| bool DummyMustBeFirst; |
| if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false)) |
| return SDValue(); |
| |
| return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL); |
| } |
| |
| /// @} |
| |
| /// Returns how profitable it is to fold a comparison's operand's shift and/or |
| /// extension operations. |
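| /// |
| /// For example (illustrative), "(sext_inreg X, i8) shl 2" scores 2 because |
| /// both the extend and the shift can fold into the compare operand (as in |
| /// "cmp w0, w1, sxtb #2"), while a plain shift by a constant scores 1. |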
| static unsigned getCmpOperandFoldingProfit(SDValue Op) { |
| auto isSupportedExtend = [&](SDValue V) { |
| if (V.getOpcode() == ISD::SIGN_EXTEND_INREG) |
| return true; |
| |
| if (V.getOpcode() == ISD::AND) |
| if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) { |
| uint64_t Mask = MaskCst->getZExtValue(); |
| return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF); |
| } |
| |
| return false; |
| }; |
| |
| if (!Op.hasOneUse()) |
| return 0; |
| |
| if (isSupportedExtend(Op)) |
| return 1; |
| |
| unsigned Opc = Op.getOpcode(); |
| if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) |
| if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { |
| uint64_t Shift = ShiftCst->getZExtValue(); |
| if (isSupportedExtend(Op.getOperand(0))) |
| return (Shift <= 4) ? 2 : 1; |
| EVT VT = Op.getValueType(); |
| if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63)) |
| return 1; |
| } |
| |
| return 0; |
| } |
| |
| static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, |
| SDValue &AArch64cc, SelectionDAG &DAG, |
| const SDLoc &dl) { |
| if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { |
| EVT VT = RHS.getValueType(); |
| uint64_t C = RHSC->getZExtValue(); |
| if (!isLegalArithImmed(C)) { |
| // Constant does not fit; try adjusting it by one. |
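| // For example (illustrative), "x u< 0x1001" cannot use the immediate |
| // directly because 0x1001 is not a legal arithmetic immediate, but it is |
| // equivalent to "x u<= 0x1000", and 0x1000 is encodable as #1, lsl #12. |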
| switch (CC) { |
| default: |
| break; |
| case ISD::SETLT: |
| case ISD::SETGE: |
| if ((VT == MVT::i32 && C != 0x80000000 && |
| isLegalArithImmed((uint32_t)(C - 1))) || |
| (VT == MVT::i64 && C != 0x80000000ULL && |
| isLegalArithImmed(C - 1ULL))) { |
| CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; |
| C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; |
| RHS = DAG.getConstant(C, dl, VT); |
| } |
| break; |
| case ISD::SETULT: |
| case ISD::SETUGE: |
| if ((VT == MVT::i32 && C != 0 && |
| isLegalArithImmed((uint32_t)(C - 1))) || |
| (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { |
| CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; |
| C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; |
| RHS = DAG.getConstant(C, dl, VT); |
| } |
| break; |
| case ISD::SETLE: |
| case ISD::SETGT: |
| if ((VT == MVT::i32 && C != INT32_MAX && |
| isLegalArithImmed((uint32_t)(C + 1))) || |
| (VT == MVT::i64 && C != INT64_MAX && |
| isLegalArithImmed(C + 1ULL))) { |
| CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; |
| C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; |
| RHS = DAG.getConstant(C, dl, VT); |
| } |
| break; |
| case ISD::SETULE: |
| case ISD::SETUGT: |
| if ((VT == MVT::i32 && C != UINT32_MAX && |
| isLegalArithImmed((uint32_t)(C + 1))) || |
| (VT == MVT::i64 && C != UINT64_MAX && |
| isLegalArithImmed(C + 1ULL))) { |
| CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; |
| C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; |
| RHS = DAG.getConstant(C, dl, VT); |
| } |
| break; |
| } |
| } |
| } |
| |
| // Comparisons are canonicalized so that the RHS operand is simpler than the |
| // LHS one, the extreme case being when RHS is an immediate. However, AArch64 |
| // can fold some shift+extend operations on the RHS operand, so swap the |
| // operands if that can be done. |
| // |
| // For example: |
| // lsl w13, w11, #1 |
| // cmp w13, w12 |
| // can be turned into: |
| // cmp w12, w11, lsl #1 |
| if (!isa<ConstantSDNode>(RHS) || |
| !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) { |
| SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS; |
| |
| if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) { |
| std::swap(LHS, RHS); |
| CC = ISD::getSetCCSwappedOperands(CC); |
| } |
| } |
| |
| SDValue Cmp; |
| AArch64CC::CondCode AArch64CC; |
| if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { |
| const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS); |
| |
| // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. |
| // For the i8 operand, the largest immediate is 255, so this can be easily |
| // encoded in the compare instruction. For the i16 operand, however, the |
| // largest immediate cannot be encoded in the compare. |
| // Therefore, use a sign extending load and cmn to avoid materializing the |
| // -1 constant. For example, |
| // movz w1, #65535 |
| // ldrh w0, [x0, #0] |
| // cmp w0, w1 |
| // --> |
| // ldrsh w0, [x0, #0] |
| // cmn w0, #1 |
| // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS) |
| // if and only if (sext LHS) == (sext RHS). The checks are in place to |
| // ensure both the LHS and RHS are truly zero extended and to make sure the |
| // transformation is profitable. |
| if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && |
| cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && |
| cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && |
| LHS.getNode()->hasNUsesOfValue(1, 0)) { |
| int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); |
| if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { |
| SDValue SExt = |
| DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, |
| DAG.getValueType(MVT::i16)); |
| Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, |
| RHS.getValueType()), |
| CC, dl, DAG); |
| AArch64CC = changeIntCCToAArch64CC(CC); |
| } |
| } |
| |
| if (!Cmp && (RHSC->isZero() || RHSC->isOne())) { |
| if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) { |
| if ((CC == ISD::SETNE) ^ RHSC->isZero()) |
| AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); |
| } |
| } |
| } |
| |
| if (!Cmp) { |
| Cmp = emitComparison(LHS, RHS, CC, dl, DAG); |
| AArch64CC = changeIntCCToAArch64CC(CC); |
| } |
| AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); |
| return Cmp; |
| } |
| |
| static std::pair<SDValue, SDValue> |
| getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { |
| assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && |
| "Unsupported value type"); |
| SDValue Value, Overflow; |
| SDLoc DL(Op); |
| SDValue LHS = Op.getOperand(0); |
| SDValue RHS = Op.getOperand(1); |
| unsigned Opc = 0; |
| switch (Op.getOpcode()) { |
| default: |
| llvm_unreachable("Unknown overflow instruction!"); |
| case ISD::SADDO: |
| Opc = AArch64ISD::ADDS; |
| CC = AArch64CC::VS; |
| break; |
| case ISD::UADDO: |
| Opc = AArch64ISD::ADDS; |
| CC = AArch64CC::HS; |
| break; |
| case ISD::SSUBO: |
| Opc = AArch64ISD::SUBS; |
| CC = AArch64CC::VS; |
| break; |
| case ISD::USUBO: |
| Opc = AArch64ISD::SUBS; |
| CC = AArch64CC::LO; |
| break; |
| // Multiply needs a little bit of extra work. |
| case ISD::SMULO: |
| case ISD::UMULO: { |
| CC = AArch64CC::NE; |
| bool IsSigned = Op.getOpcode() == ISD::SMULO; |
| if (Op.getValueType() == MVT::i32) { |
| // Extend to 64-bits, then perform a 64-bit multiply. |
| unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
| LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); |
| RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); |
| SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); |
| Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul); |
| |
| // Check that the result fits into a 32-bit integer. |
| SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC); |
| if (IsSigned) { |
| // cmp xreg, wreg, sxtw |
| SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value); |
| Overflow = |
| DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1); |
| } else { |
| // tst xreg, #0xffffffff00000000 |
| SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64); |
| Overflow = |
| DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1); |
| } |
| break; |
| } |
| assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); |
| // For the 64-bit multiply: |
| Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); |
| if (IsSigned) { |
| SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); |
| SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, |
| DAG.getConstant(63, DL, MVT::i64)); |
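| // Signed overflow occurred iff the top 64 bits of the 128-bit product |
| // (MULHS) differ from the sign extension of the low 64 bits, i.e. the low |
| // half shifted right arithmetically by 63. |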
| // It is important that LowerBits is last, otherwise the arithmetic |
| // shift will not be folded into the compare (SUBS). |
| SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); |
| Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) |
| .getValue(1); |
| } else { |
| SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); |
| SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); |
| Overflow = |
| DAG.getNode(AArch64ISD::SUBS, DL, VTs, |
| DAG.getConstant(0, DL, MVT::i64), |
| UpperBits).getValue(1); |
| } |
| break; |
| } |
| } // switch (...) |
| |
| if (Opc) { |
| SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); |
| |
| // Emit the AArch64 operation with overflow check. |
| Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); |
| Overflow = Value.getValue(1); |
| } |
| return std::make_pair(Value, Overflow); |
| } |
| |
| SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const { |
| if (useSVEForFixedLengthVectorVT(Op.getValueType())) |
| return LowerToScalableOp(Op, DAG); |
| |
| SDValue Sel = Op.getOperand(0); |
| SDValue Other = Op.getOperand(1); |
| SDLoc dl(Sel); |
| |
| // If the operand is an overflow checking operation, invert the condition |
| // code and kill the Not operation. I.e., transform: |
| // (xor overflow_op_bool, 1) |
| // --> |
| // (csel 1, 0, invert(cc), overflow_op_bool) |
| // ... which later gets transformed to just a cset instruction with an |
| // inverted condition code, rather than a cset + eor sequence. |
| if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) { |
| // Only lower legal XALUO ops. |
| if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0))) |
| return SDValue(); |
| |
| SDValue TVal = DAG.getConstant(1, dl, MVT::i32); |
| SDValue FVal = DAG.getConstant(0, dl, MVT::i32); |
| AArch64CC::CondCode CC; |
| SDValue Value, Overflow; |
| std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG); |
| SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); |
| return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal, |
| CCVal, Overflow); |
| } |
| // If neither operand is a SELECT_CC, give up. |
| if (Sel.getOpcode() != ISD::SELECT_CC) |
| std::swap(Sel, Other); |
| if (Sel.getOpcode() != ISD::SELECT_CC) |
| return Op; |
| |
| // The folding we want to perform is: |
| // (xor x, (select_cc a, b, cc, 0, -1) ) |
| // --> |
| // (csel x, (xor x, -1), cc ...) |
| // |
| // The latter will get matched to a CSINV instruction. |
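| // |
| // For example (illustrative, register assignments arbitrary), |
| // "x ^ ((a < b) ? 0 : -1)" becomes: |
| //   cmp   w8, w9 |
| //   csinv w0, w10, w10, lt   ; w0 = (a < b) ? x : ~x |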
| |
| ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get(); |
| SDValue LHS = Sel.getOperand(0); |
| SDValue RHS = Sel.getOperand(1); |
| SDValue TVal = Sel.getOperand(2); |
| SDValue FVal = Sel.getOperand(3); |
| |
| // FIXME: This could be generalized to non-integer comparisons. |
| if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) |
| return Op; |
| |
| ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); |
| ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); |
| |
| // The values aren't constants, this isn't the pattern we're looking for. |
| if (!CFVal || !CTVal) |
| return Op; |
| |
| // We can commute the SELECT_CC by inverting the condition. This |
| // might be needed to make this fit into a CSINV pattern. |
| if (CTVal->isAllOnes() && CFVal->isZero()) { |
| std::swap(TVal, FVal); |
| std::swap(CTVal, CFVal); |
| CC = ISD::getSetCCInverse(CC, LHS.getValueType()); |
| } |
| |
| // If the constants line up, perform the transform! |
| if (CTVal->isZero() && CFVal->isAllOnes()) { |
| SDValue CCVal; |
| SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); |
| |
| FVal = Other; |
| TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, |
| DAG.getConstant(-1ULL, dl, Other.getValueType())); |
| |
| return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, |
| CCVal, Cmp); |
| } |
| |
| return Op; |
| } |
| |
| static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { |
| EVT VT = Op.getValueType(); |
| |
| // Let legalize expand this if it isn't a legal type yet. |
| if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) |
| return SDValue(); |
| |
| SDVTList VTs = DAG.getVTList(VT, MVT::i32); |
| |
| unsigned Opc; |
| bool ExtraOp = false; |
| switch (Op.getOpcode()) { |
| default: |
| llvm_unreachable("Invalid code"); |
| case ISD::ADDC: |
| Opc = AArch64ISD::ADDS; |
| break; |
| case ISD::SUBC: |
| Opc = AArch64ISD::SUBS; |
| break; |
| case ISD::ADDE: |
| Opc = AArch64ISD::ADCS; |
| ExtraOp = true; |
| break; |
| case ISD::SUBE: |
| Opc = AArch64ISD::SBCS; |
| ExtraOp = true; |
| break; |
| } |
| |
| if (!ExtraOp) |
| return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); |
| return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), |
| Op.getOperand(2)); |
| } |
| |
| static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { |
| // Let legalize expand this if it isn't a legal type yet. |
| if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) |
| return SDValue(); |
| |
| SDLoc dl(Op); |
| AArch64CC::CondCode CC; |
| // The actual operation that sets the overflow or carry flag. |
| SDValue Value, Overflow; |
| std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); |
| |
| // We use 0 and 1 as false and true values. |
| SDValue TVal = DAG.getConstant(1, dl, MVT::i32); |
| SDValue FVal = DAG.getConstant(0, dl, MVT::i32); |
| |
| // We use an inverted condition, because the conditional select is inverted |
| // too. This will allow it to be selected to a single instruction: |
| // CSINC Wd, WZR, WZR, invert(cond). |
| SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); |
| Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, |
| CCVal, Overflow); |
| |
| SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); |
| return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); |
| } |
| |
| // Prefetch operands are: |
| // 1: Address to prefetch |
| // 2: bool isWrite |
| // 3: int locality (0 = no locality ... 3 = extreme locality) |
| // 4: bool isDataCache |
| static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { |
| SDLoc DL(Op); |
| unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); |
| unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); |
| unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); |
| |
| bool IsStream = !Locality; |
| // When the locality number is set: |
| if (Locality) { |
| // The front-end should have filtered out the out-of-range values. |
| assert(Locality <= 3 && "Prefetch locality out-of-range"); |
| // The locality degree is the opposite of the target cache level: the |
| // highest locality maps to the fastest (level 1) cache, whose encoding |
| // is 0, so flip the number around. |
| Locality = 3 - Locality; |
| } |
| |
| // Build the mask value encoding the expected behavior. |
| unsigned PrfOp = (IsWrite << 4) | // Load/Store bit |
| (!IsData << 3) | // IsDataCache bit |
| (Locality << 1) | // Cache level bits |
| (unsigned)IsStream; // Stream bit |
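| // For example (illustrative), a data read prefetch with maximal locality |
| // (IsWrite = 0, IsData = 1, Locality = 3) encodes as 0b00000, i.e. |
| // PLDL1KEEP, while Locality = 0 sets the stream bit instead (PLDL1STRM). |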
| return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), |
| DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1)); |
| } |
| |
| SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, |
| SelectionDAG &DAG) const { |
| EVT VT = Op.getValueType(); |
| if (VT.isScalableVector()) |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU); |
| |
| if (useSVEForFixedLengthVectorVT(VT)) |
| return LowerFixedLengthFPExtendToSVE(Op, DAG); |
| |
| assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, |
| SelectionDAG &DAG) const { |
| if (Op.getValueType().isScalableVector()) |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU); |
| |
| bool IsStrict = Op->isStrictFPOpcode(); |
| SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); |
| EVT SrcVT = SrcVal.getValueType(); |
| |
| if (useSVEForFixedLengthVectorVT(SrcVT)) |
| return LowerFixedLengthFPRoundToSVE(Op, DAG); |
| |
| if (SrcVT != MVT::f128) { |
| // Expand cases where the input is a vector bigger than NEON. |
| if (useSVEForFixedLengthVectorVT(SrcVT)) |
| return SDValue(); |
| |
| // It's legal except when f128 is involved |
| return Op; |
| } |
| |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, |
| SelectionDAG &DAG) const { |
| // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. |
| // Any additional optimization in this function should be recorded |
| // in the cost tables. |
| EVT InVT = Op.getOperand(0).getValueType(); |
| EVT VT = Op.getValueType(); |
| |
| if (VT.isScalableVector()) { |
| unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT |
| ? AArch64ISD::FCVTZU_MERGE_PASSTHRU |
| : AArch64ISD::FCVTZS_MERGE_PASSTHRU; |
| return LowerToPredicatedOp(Op, DAG, Opcode); |
| } |
| |
| if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT)) |
| return LowerFixedLengthFPToIntToSVE(Op, DAG); |
| |
| unsigned NumElts = InVT.getVectorNumElements(); |
| |
| // f16 conversions are promoted to f32 when full fp16 is not supported. |
| if (InVT.getVectorElementType() == MVT::f16 && |
| !Subtarget->hasFullFP16()) { |
| MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); |
| SDLoc dl(Op); |
| return DAG.getNode( |
| Op.getOpcode(), dl, Op.getValueType(), |
| DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); |
| } |
| |
| uint64_t VTSize = VT.getFixedSizeInBits(); |
| uint64_t InVTSize = InVT.getFixedSizeInBits(); |
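| // Conversions that change the total vector width are split in two steps. |
| // For example (illustrative), v2f64 -> v2i32 is emitted as a v2f64 -> v2i64 |
| // conversion followed by a truncate, and (given full fp16) v4f16 -> v4i32 |
| // as an fp_extend to v4f32 followed by the conversion. |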
| if (VTSize < InVTSize) { |
| SDLoc dl(Op); |
| SDValue Cv = |
| DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), |
| Op.getOperand(0)); |
| return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); |
| } |
| |
| if (VTSize > InVTSize) { |
| SDLoc dl(Op); |
| MVT ExtVT = |
| MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), |
| VT.getVectorNumElements()); |
| SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); |
| return DAG.getNode(Op.getOpcode(), dl, VT, Ext); |
| } |
| |
| // Type changing conversions are illegal. |
| return Op; |
| } |
| |
| SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, |
| SelectionDAG &DAG) const { |
| bool IsStrict = Op->isStrictFPOpcode(); |
| SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); |
| |
| if (SrcVal.getValueType().isVector()) |
| return LowerVectorFP_TO_INT(Op, DAG); |
| |
| // f16 conversions are promoted to f32 when full fp16 is not supported. |
| if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { |
| assert(!IsStrict && "Lowering of strict fp16 not yet implemented"); |
| SDLoc dl(Op); |
| return DAG.getNode( |
| Op.getOpcode(), dl, Op.getValueType(), |
| DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal)); |
| } |
| |
| if (SrcVal.getValueType() != MVT::f128) { |
| // It's legal except when f128 is involved |
| return Op; |
| } |
| |
| return SDValue(); |
| } |
| |
| SDValue |
| AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op, |
| SelectionDAG &DAG) const { |
| // AArch64 FP-to-int conversions saturate to the destination element size, so |
| // we can lower common saturating conversions to simple instructions. |
| SDValue SrcVal = Op.getOperand(0); |
| EVT SrcVT = SrcVal.getValueType(); |
| EVT DstVT = Op.getValueType(); |
| EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); |
| |
| uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits(); |
| uint64_t DstElementWidth = DstVT.getScalarSizeInBits(); |
| uint64_t SatWidth = SatVT.getScalarSizeInBits(); |
| assert(SatWidth <= DstElementWidth && |
| "Saturation width cannot exceed result width"); |
| |
| // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT. |
| // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable |
| // types, so this is hard to reach. |
| if (DstVT.isScalableVector()) |
| return SDValue(); |
| |
| EVT SrcElementVT = SrcVT.getVectorElementType(); |
| |
| // Promote f16 to f32 when full fp16 support is missing or the result |
| // element is wider than 16 bits, and saturate the result. |
| if (SrcElementVT == MVT::f16 && |
| (!Subtarget->hasFullFP16() || DstElementWidth > 16)) { |
| MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements()); |
| SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal); |
| SrcVT = F32VT; |
| SrcElementVT = MVT::f32; |
| SrcElementWidth = 32; |
| } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 && |
| SrcElementVT != MVT::f16) |
| return SDValue(); |
| |
| SDLoc DL(Op); |
| // Cases that we can emit directly. |
| if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) |
| return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, |
| DAG.getValueType(DstVT.getScalarType())); |
| |
| // Otherwise we emit a cvt that saturates to a higher BW, and saturate the |
| // result. This is only valid if the legal cvt is larger than the saturate |
| // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize |
| // (at least until sqxtn is selected). |
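| // For example (illustrative, assuming full fp16 support), a v8f16 -> v8i8 |
| // fptosi.sat is emitted as a v8f16 -> v8i16 fcvtzs, clamped with |
| // smin(x, 127) and smax(x, -128), and then truncated to v8i8. |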
| if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64) |
| return SDValue(); |
| |
| EVT IntVT = SrcVT.changeVectorElementTypeToInteger(); |
| SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal, |
| DAG.getValueType(IntVT.getScalarType())); |
| SDValue Sat; |
| if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { |
| SDValue MinC = DAG.getConstant( |
| APInt::getSignedMaxValue(SatWidth).sextOrSelf(SrcElementWidth), DL, |
| IntVT); |
| SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC); |
| SDValue MaxC = DAG.getConstant( |
| APInt::getSignedMinValue(SatWidth).sextOrSelf(SrcElementWidth), DL, |
| IntVT); |
| Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC); |
| } else { |
| SDValue MinC = DAG.getConstant( |
| APInt::getAllOnesValue(SatWidth).zextOrSelf(SrcElementWidth), DL, |
| IntVT); |
| Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC); |
| } |
| |
| return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat); |
| } |
| |
| SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, |
| SelectionDAG &DAG) const { |
| // AArch64 FP-to-int conversions saturate to the destination register size, so |
| // we can lower common saturating conversions to simple instructions. |
| SDValue SrcVal = Op.getOperand(0); |
| EVT SrcVT = SrcVal.getValueType(); |
| |
| if (SrcVT.isVector()) |
| return LowerVectorFP_TO_INT_SAT(Op, DAG); |
| |
| EVT DstVT = Op.getValueType(); |
| EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); |
| uint64_t SatWidth = SatVT.getScalarSizeInBits(); |
| uint64_t DstWidth = DstVT.getScalarSizeInBits(); |
| assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width"); |
| |
| // In the absence of FP16 support, promote f16 to f32 and saturate the result. |
| if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) { |
| SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal); |
| SrcVT = MVT::f32; |
| } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16) |
| return SDValue(); |
| |
| SDLoc DL(Op); |
| // Cases that we can emit directly. |
| if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 || |
| (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) && |
| DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32)) |
| return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, |
| DAG.getValueType(DstVT)); |
| |
| // Otherwise we emit a cvt that saturates to a higher BW, and saturate the |
| // result. This is only valid if the legal cvt is larger than the saturate |
| // width. |
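| // For example (illustrative), an f32 fptosi.sat with an i8 saturation |
| // width (promoted to an i32 result) is emitted as a 32-bit saturating |
| // fcvtzs followed by smin(x, 127) and smax(x, -128). |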
| if (DstWidth < SatWidth) |
| return SDValue(); |
| |
| SDValue NativeCvt = |
| DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT)); |
| SDValue Sat; |
| if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { |
| SDValue MinC = DAG.getConstant( |
| APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth), DL, DstVT); |
| SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC); |
| SDValue MaxC = DAG.getConstant( |
| APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth), DL, DstVT); |
| Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC); |
| } else { |
| SDValue MinC = DAG.getConstant( |
| APInt::getAllOnesValue(SatWidth).zextOrSelf(DstWidth), DL, DstVT); |
| Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC); |
| } |
| |
| return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat); |
| } |
| |
| SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, |
| SelectionDAG &DAG) const { |
| // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. |
| // Any additional optimization in this function should be recorded |
| // in the cost tables. |
| EVT VT = Op.getValueType(); |
| SDLoc dl(Op); |
| SDValue In = Op.getOperand(0); |
| EVT InVT = In.getValueType(); |
| unsigned Opc = Op.getOpcode(); |
| bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP; |
| |
| if (VT.isScalableVector()) { |
| if (InVT.getVectorElementType() == MVT::i1) { |
| // We can't convert an SVE predicate directly; extend it to an integer first. |
| unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
| EVT CastVT = getPromotedVTForPredicate(InVT); |
| In = DAG.getNode(CastOpc, dl, CastVT, In); |
| return DAG.getNode(Opc, dl, VT, In); |
| } |
| |
| unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU |
| : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU; |
| return LowerToPredicatedOp(Op, DAG, Opcode); |
| } |
| |
| if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT)) |
| return LowerFixedLengthIntToFPToSVE(Op, DAG); |
| |
| uint64_t VTSize = VT.getFixedSizeInBits(); |
| uint64_t InVTSize = InVT.getFixedSizeInBits(); |
| if (VTSize < InVTSize) { |
| MVT CastVT = |
| MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), |
| InVT.getVectorNumElements()); |
| In = DAG.getNode(Opc, dl, CastVT, In); |
| return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl)); |
| } |
| |
| if (VTSize > InVTSize) { |
| unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
| EVT CastVT = VT.changeVectorElementTypeToInteger(); |
| In = DAG.getNode(CastOpc, dl, CastVT, In); |
| return DAG.getNode(Opc, dl, VT, In); |
| } |
| |
| return Op; |
| } |
| |
| SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, |
| SelectionDAG &DAG) const { |
| if (Op.getValueType().isVector()) |
| return LowerVectorINT_TO_FP(Op, DAG); |
| |
| bool IsStrict = Op->isStrictFPOpcode(); |
| SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); |
| |
| // f16 conversions are promoted to f32 when full fp16 is not supported. |
| if (Op.getValueType() == MVT::f16 && |
| !Subtarget->hasFullFP16()) { |
| assert(!IsStrict && "Lowering of strict fp16 not yet implemented"); |
| SDLoc dl(Op); |
| return DAG.getNode( |
| ISD::FP_ROUND, dl, MVT::f16, |
| DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal), |
| DAG.getIntPtrConstant(0, dl)); |
| } |
| |
| // i128 conversions are libcalls. |
| if (SrcVal.getValueType() == MVT::i128) |
| return SDValue(); |
| |
| // Other conversions are legal, unless the destination is the completely |
| // software-based fp128. |
| if (Op.getValueType() != MVT::f128) |
| return Op; |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, |
| SelectionDAG &DAG) const { |
| // For iOS, we want to call an alternative entry point: __sincos_stret, |
| // which returns the values in two S / D registers. |
| SDLoc dl(Op); |
| SDValue Arg = Op.getOperand(0); |
| EVT ArgVT = Arg.getValueType(); |
| Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); |
| |
| ArgListTy Args; |
| ArgListEntry Entry; |
| |
| Entry.Node = Arg; |
| Entry.Ty = ArgTy; |
| Entry.IsSExt = false; |
| Entry.IsZExt = false; |
| Args.push_back(Entry); |
| |
| RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64 |
| : RTLIB::SINCOS_STRET_F32; |
| const char *LibcallName = getLibcallName(LC); |
| SDValue Callee = |
| DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); |
| |
| StructType *RetTy = StructType::get(ArgTy, ArgTy); |
| TargetLowering::CallLoweringInfo CLI(DAG); |
| CLI.setDebugLoc(dl) |
| .setChain(DAG.getEntryNode()) |
| .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); |
| |
| std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); |
| return CallResult.first; |
| } |
| |
| static MVT getSVEContainerType(EVT ContentTy); |
| |
| SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op, |
| SelectionDAG &DAG) const { |
| EVT OpVT = Op.getValueType(); |
| EVT ArgVT = Op.getOperand(0).getValueType(); |
| |
| if (useSVEForFixedLengthVectorVT(OpVT)) |
| return LowerFixedLengthBitcastToSVE(Op, DAG); |
| |
| if (OpVT.isScalableVector()) { |
| if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) { |
| assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() && |
| "Expected int->fp bitcast!"); |
| SDValue ExtResult = |
| DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT), |
| Op.getOperand(0)); |
| return getSVESafeBitCast(OpVT, ExtResult, DAG); |
| } |
| return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG); |
| } |
| |
| if (OpVT != MVT::f16 && OpVT != MVT::bf16) |
| return SDValue(); |
| |
| assert(ArgVT == MVT::i16); |
| SDLoc DL(Op); |
| |
| Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); |
| Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); |
| return SDValue( |
| DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op, |
| DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), |
| 0); |
| } |
| |
| static EVT getExtensionTo64Bits(const EVT &OrigVT) { |
| if (OrigVT.getSizeInBits() >= 64) |
| return OrigVT; |
| |
| assert(OrigVT.isSimple() && "Expecting a simple value type"); |
| |
| MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; |
| switch (OrigSimpleTy) { |
| default: llvm_unreachable("Unexpected Vector Type"); |
| case MVT::v2i8: |
| case MVT::v2i16: |
| return MVT::v2i32; |
| case MVT::v4i8: |
| return MVT::v4i16; |
| } |
| } |
| |
| static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, |
| const EVT &OrigTy, |
| const EVT &ExtTy, |
| unsigned ExtOpcode) { |
| // The vector originally had a size of OrigTy. It was then extended to ExtTy. |
| // We expect the ExtTy to be 128-bits total. If the OrigTy is less than |
| // 64-bits we need to insert a new extension so that it will be 64-bits. |
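| // For example (illustrative), a v4i8 value that was sign-extended to v4i32 |
| // is re-extended here to v4i16 so it can be used as a 64-bit SMULL operand. |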
| assert(ExtTy.is128BitVector() && "Unexpected extension size"); |
| if (OrigTy.getSizeInBits() >= 64) |
| return N; |
| |
| // Must extend size to at least 64 bits to be used as an operand for VMULL. |
| EVT NewVT = getExtensionTo64Bits(OrigTy); |
| |
| return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); |
| } |
| |
| static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, |
| bool isSigned) { |
| EVT VT = N->getValueType(0); |
| |
| if (N->getOpcode() != ISD::BUILD_VECTOR) |
| return false; |
| |
| for (const SDValue &Elt : N->op_values()) { |
| if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { |
| unsigned EltSize = VT.getScalarSizeInBits(); |
| unsigned HalfSize = EltSize / 2; |
| if (isSigned) { |
| if (!isIntN(HalfSize, C->getSExtValue())) |
| return false; |
| } else { |
| if (!isUIntN(HalfSize, C->getZExtValue())) |
| return false; |
| } |
| continue; |
| } |
| return false; |
| } |
| |
| return true; |
| } |
| |
| static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { |
| if (N->getOpcode() == ISD::SIGN_EXTEND || |
| N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND) |
| return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, |
| N->getOperand(0)->getValueType(0), |
| N->getValueType(0), |
| N->getOpcode()); |
| |
| assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); |
| EVT VT = N->getValueType(0); |
| SDLoc dl(N); |
| unsigned EltSize = VT.getScalarSizeInBits() / 2; |
| unsigned NumElts = VT.getVectorNumElements(); |
| MVT TruncVT = MVT::getIntegerVT(EltSize); |
| SmallVector<SDValue, 8> Ops; |
| for (unsigned i = 0; i != NumElts; ++i) { |
| ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); |
| const APInt &CInt = C->getAPIntValue(); |
| // Element types smaller than 32 bits are not legal, so use i32 elements. |
| // The values are implicitly truncated so sext vs. zext doesn't matter. |
| Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); |
| } |
| return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); |
| } |
| |
| static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { |
| return N->getOpcode() == ISD::SIGN_EXTEND || |
| N->getOpcode() == ISD::ANY_EXTEND || |
| isExtendedBUILD_VECTOR(N, DAG, true); |
| } |
| |
| static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { |
| return N->getOpcode() == ISD::ZERO_EXTEND || |
| N->getOpcode() == ISD::ANY_EXTEND || |
| isExtendedBUILD_VECTOR(N, DAG, false); |
| } |
| |
| static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { |
| unsigned Opcode = N->getOpcode(); |
| if (Opcode == ISD::ADD || Opcode == ISD::SUB) { |
| SDNode *N0 = N->getOperand(0).getNode(); |
| SDNode *N1 = N->getOperand(1).getNode(); |
| return N0->hasOneUse() && N1->hasOneUse() && |
| isSignExtended(N0, DAG) && isSignExtended(N1, DAG); |
| } |
| return false; |
| } |
| |
| static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { |
| unsigned Opcode = N->getOpcode(); |
| if (Opcode == ISD::ADD || Opcode == ISD::SUB) { |
| SDNode *N0 = N->getOperand(0).getNode(); |
| SDNode *N1 = N->getOperand(1).getNode(); |
| return N0->hasOneUse() && N1->hasOneUse() && |
| isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); |
| } |
| return false; |
| } |
| |
| SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op, |
| SelectionDAG &DAG) const { |
| // The rounding mode is in bits 23:22 of the FPCR. |
| // The AArch64 rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, |
| // 2->3, 3->0. |
| // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3) |
| // so that the shift + and get folded into a bitfield extract. |
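| // For example, FPCR.RMode == 0b00 (round to nearest) yields |
| // ((0 + 1) & 3) == 1 and RMode == 0b11 (round toward zero) yields |
| // ((3 + 1) & 3) == 0, matching the corresponding FLT_ROUNDS values. |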
| SDLoc dl(Op); |
| |
| SDValue Chain = Op.getOperand(0); |
| SDValue FPCR_64 = DAG.getNode( |
| ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other}, |
| {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)}); |
| Chain = FPCR_64.getValue(1); |
| SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64); |
| SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32, |
| DAG.getConstant(1U << 22, dl, MVT::i32)); |
| SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, |
| DAG.getConstant(22, dl, MVT::i32)); |
| SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, |
| DAG.getConstant(3, dl, MVT::i32)); |
| return DAG.getMergeValues({AND, Chain}, dl); |
| } |
| |
| SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| SDValue Chain = Op->getOperand(0); |
| SDValue RMValue = Op->getOperand(1); |
| |
| // The rounding mode is in bits 23:22 of the FPCR. |
| // The llvm.set.rounding argument value to the rounding mode in FPCR mapping |
| // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is |
| // (((arg - 1) & 3) << 22). |
| // |
| // The argument of llvm.set.rounding must be within the range [0, 3], so |
| // NearestTiesToAway (4) is not handled here. It is the responsibility of |
| // the code that generates llvm.set.rounding to ensure this condition. |
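| // For example, llvm.set.rounding(1) (round to nearest) computes |
| // ((1 - 1) & 3) << 22 == 0, i.e. FPCR RMode 0b00. |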
| |
| // Calculate new value of FPCR[23:22]. |
| RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue, |
| DAG.getConstant(1, DL, MVT::i32)); |
| RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue, |
| DAG.getConstant(0x3, DL, MVT::i32)); |
| RMValue = |
| DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue, |
| DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32)); |
| RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue); |
| |
| // Get current value of FPCR. |
| SDValue Ops[] = { |
| Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)}; |
| SDValue FPCR = |
| DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops); |
| Chain = FPCR.getValue(1); |
| FPCR = FPCR.getValue(0); |
| |
| // Put the new rounding mode into FPCR[23:22]. |
| const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos); |
| FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR, |
| DAG.getConstant(RMMask, DL, MVT::i64)); |
| FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue); |
| SDValue Ops2[] = { |
| Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), |
| FPCR}; |
| return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); |
| } |
| |
| SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { |
| EVT VT = Op.getValueType(); |
| |
| // If SVE is available then i64 vector multiplications can also be made legal. |
| bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64; |
| |
| if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON)) |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED, OverrideNEON); |
| |
| // Multiplications are only custom-lowered for 128-bit vectors so that |
| // VMULL can be detected. Otherwise v2i64 multiplications are not legal. |
| assert(VT.is128BitVector() && VT.isInteger() && |
| "unexpected type for custom-lowering ISD::MUL"); |
| SDNode *N0 = Op.getOperand(0).getNode(); |
| SDNode *N1 = Op.getOperand(1).getNode(); |
| unsigned NewOpc = 0; |
| bool isMLA = false; |
| bool isN0SExt = isSignExtended(N0, DAG); |
| bool isN1SExt = isSignExtended(N1, DAG); |
| if (isN0SExt && isN1SExt) |
| NewOpc = AArch64ISD::SMULL; |
| else { |
| bool isN0ZExt = isZeroExtended(N0, DAG); |
| bool isN1ZExt = isZeroExtended(N1, DAG); |
| if (isN0ZExt && isN1ZExt) |
| NewOpc = AArch64ISD::UMULL; |
| else if (isN1SExt || isN1ZExt) { |
| // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these |
| // into (s/zext A * s/zext C) + (s/zext B * s/zext C) |
| if (isN1SExt && isAddSubSExt(N0, DAG)) { |
| NewOpc = AArch64ISD::SMULL; |
| isMLA = true; |
| } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { |
| NewOpc = AArch64ISD::UMULL; |
| isMLA = true; |
| } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { |
| std::swap(N0, N1); |
| NewOpc = AArch64ISD::UMULL; |
| isMLA = true; |
| } |
| } |
| |
| if (!NewOpc) { |
| if (VT == MVT::v2i64) |
| // Fall through to expand this. It is not legal. |
| return SDValue(); |
| else |
| // Other vector multiplications are legal. |
| return Op; |
| } |
| } |
| |
| // Legalize to an S/UMULL instruction |
| SDLoc DL(Op); |
| SDValue Op0; |
| SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); |
| if (!isMLA) { |
| Op0 = skipExtensionForVectorMULL(N0, DAG); |
| assert(Op0.getValueType().is64BitVector() && |
| Op1.getValueType().is64BitVector() && |
| "unexpected types for extended operands to VMULL"); |
| return DAG.getNode(NewOpc, DL, VT, Op0, Op1); |
| } |
| // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during |
| // isel lowering, to take advantage of no-stall back-to-back s/umul + s/umla. |
| // This is beneficial for CPUs with accumulate forwarding such as |
| // Cortex-A53/A57. |
| SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); |
| SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); |
| EVT Op1VT = Op1.getValueType(); |
| return DAG.getNode(N0->getOpcode(), DL, VT, |
| DAG.getNode(NewOpc, DL, VT, |
| DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), |
| DAG.getNode(NewOpc, DL, VT, |
| DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); |
| } |
| |
| static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, |
| int Pattern) { |
| return DAG.getNode(AArch64ISD::PTRUE, DL, VT, |
| DAG.getTargetConstant(Pattern, DL, MVT::i32)); |
| } |
| |
| static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG) { |
| SDLoc DL(Op); |
| EVT OutVT = Op.getValueType(); |
| SDValue InOp = Op.getOperand(1); |
| EVT InVT = InOp.getValueType(); |
| |
| // Return the operand if the cast isn't changing type, |
| // i.e. <n x 16 x i1> -> <n x 16 x i1> |
| if (InVT == OutVT) |
| return InOp; |
| |
| SDValue Reinterpret = |
| DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, InOp); |
| |
| // If the argument converted to an svbool is a ptrue or a comparison, the |
| // lanes introduced by the widening are zero by construction. |
| switch (InOp.getOpcode()) { |
| case AArch64ISD::SETCC_MERGE_ZERO: |
| return Reinterpret; |
| case ISD::INTRINSIC_WO_CHAIN: |
| if (InOp.getConstantOperandVal(0) == Intrinsic::aarch64_sve_ptrue) |
| return Reinterpret; |
| } |
| |
| // Otherwise, zero the newly introduced lanes. |
| SDValue Mask = getPTrue(DAG, DL, InVT, AArch64SVEPredPattern::all); |
| SDValue MaskReinterpret = |
| DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, Mask); |
| return DAG.getNode(ISD::AND, DL, OutVT, Reinterpret, MaskReinterpret); |
| } |
| |
| SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, |
| SelectionDAG &DAG) const { |
| unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); |
| SDLoc dl(Op); |
| switch (IntNo) { |
| default: return SDValue(); // Don't custom lower most intrinsics. |
| case Intrinsic::thread_pointer: { |
| EVT PtrVT = getPointerTy(DAG.getDataLayout()); |
| return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); |
| } |
| case Intrinsic::aarch64_neon_abs: { |
| EVT Ty = Op.getValueType(); |
| if (Ty == MVT::i64) { |
| SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, |
| Op.getOperand(1)); |
| Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result); |
| return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result); |
| } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) { |
| return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1)); |
| } else { |
| report_fatal_error("Unexpected type for AArch64 NEON intrinsic"); |
| } |
| } |
| case Intrinsic::aarch64_neon_smax: |
| return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_neon_umax: |
| return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_neon_smin: |
| return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_neon_umin: |
| return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| |
| case Intrinsic::aarch64_sve_sunpkhi: |
| return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_sunpklo: |
| return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_uunpkhi: |
| return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_uunpklo: |
| return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_clasta_n: |
| return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); |
| case Intrinsic::aarch64_sve_clastb_n: |
| return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); |
| case Intrinsic::aarch64_sve_lasta: |
| return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_lastb: |
| return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_rev: |
| return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_tbl: |
| return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_trn1: |
| return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_trn2: |
| return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_uzp1: |
| return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_uzp2: |
| return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_zip1: |
| return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_zip2: |
| return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_splice: |
| return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); |
| case Intrinsic::aarch64_sve_ptrue: |
| return getPTrue(DAG, dl, Op.getValueType(), |
| cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); |
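| // Note: the merging unary SVE intrinsics handled below (e.g. sve_clz, |
| // sve_fneg) take (passthru, pg, op) while the corresponding |
| // *_MERGE_PASSTHRU nodes take (pg, op, passthru), hence the operand |
| // reordering. |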
| case Intrinsic::aarch64_sve_clz: |
| return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_cnt: { |
| SDValue Data = Op.getOperand(3); |
| // CTPOP only supports integer operands. |
| if (Data.getValueType().isFloatingPoint()) |
| Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data); |
| return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Data, Op.getOperand(1)); |
| } |
| case Intrinsic::aarch64_sve_dupq_lane: |
| return LowerDUPQLane(Op, DAG); |
| case Intrinsic::aarch64_sve_convert_from_svbool: |
| return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_convert_to_svbool: |
| return lowerConvertToSVBool(Op, DAG); |
| case Intrinsic::aarch64_sve_fneg: |
| return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frintp: |
| return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frintm: |
| return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frinti: |
| return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frintx: |
| return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frinta: |
| return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frintn: |
| return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frintz: |
| return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_ucvtf: |
| return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl, |
| Op.getValueType(), Op.getOperand(2), Op.getOperand(3), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_scvtf: |
| return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl, |
| Op.getValueType(), Op.getOperand(2), Op.getOperand(3), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_fcvtzu: |
| return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl, |
| Op.getValueType(), Op.getOperand(2), Op.getOperand(3), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_fcvtzs: |
| return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl, |
| Op.getValueType(), Op.getOperand(2), Op.getOperand(3), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_fsqrt: |
| return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frecpx: |
| return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frecpe_x: |
| return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frecps_x: |
| return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_frsqrte_x: |
| return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_frsqrts_x: |
| return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(), |
| Op.getOperand(1), Op.getOperand(2)); |
| case Intrinsic::aarch64_sve_fabs: |
| return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_abs: |
| return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_neg: |
| return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_insr: { |
| SDValue Scalar = Op.getOperand(2); |
| EVT ScalarTy = Scalar.getValueType(); |
| if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) |
| Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar); |
| |
| return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(), |
| Op.getOperand(1), Scalar); |
| } |
| case Intrinsic::aarch64_sve_rbit: |
| return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl, |
| Op.getValueType(), Op.getOperand(2), Op.getOperand(3), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_revb: |
| return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_sxtb: |
| return DAG.getNode( |
| AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), |
| DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_sxth: |
| return DAG.getNode( |
| AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), |
| DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_sxtw: |
| return DAG.getNode( |
| AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), |
| DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_uxtb: |
| return DAG.getNode( |
| AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), |
| DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_uxth: |
| return DAG.getNode( |
| AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), |
| DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)), |
| Op.getOperand(1)); |
| case Intrinsic::aarch64_sve_uxtw: |
| return DAG.getNode( |
| AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), |
| Op.getOperand(2), Op.getOperand(3), |
| DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)), |
| Op.getOperand(1)); |
| |
| case Intrinsic::localaddress: { |
| const auto &MF = DAG.getMachineFunction(); |
| const auto *RegInfo = Subtarget->getRegisterInfo(); |
| unsigned Reg = RegInfo->getLocalAddressRegister(MF); |
| return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, |
| Op.getSimpleValueType()); |
| } |
| |
| case Intrinsic::eh_recoverfp: { |
| // FIXME: This needs to be implemented to correctly handle highly aligned |
| // stack objects. For now we simply return the incoming FP. Refer to |
| // D53541 for more details. |
| SDValue FnOp = Op.getOperand(1); |
| SDValue IncomingFPOp = Op.getOperand(2); |
| GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp); |
| auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr); |
| if (!Fn) |
| report_fatal_error( |
| "llvm.eh.recoverfp must take a function as the first argument"); |
| return IncomingFPOp; |
| } |
| |
| case Intrinsic::aarch64_neon_vsri: |
| case Intrinsic::aarch64_neon_vsli: { |
| EVT Ty = Op.getValueType(); |
| |
| if (!Ty.isVector()) |
| report_fatal_error("Unexpected type for aarch64_neon_vsli/vsri"); |
| |
| assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits()); |
| |
| bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri; |
| unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; |
| return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2), |
| Op.getOperand(3)); |
| } |
| |
| case Intrinsic::aarch64_neon_srhadd: |
| case Intrinsic::aarch64_neon_urhadd: |
| case Intrinsic::aarch64_neon_shadd: |
| case Intrinsic::aarch64_neon_uhadd: { |
| bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || |
| IntNo == Intrinsic::aarch64_neon_shadd); |
| bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || |
| IntNo == Intrinsic::aarch64_neon_urhadd); |
| unsigned Opcode = |
| IsSignedAdd ? (IsRoundingAdd ? AArch64ISD::SRHADD : AArch64ISD::SHADD) |
| : (IsRoundingAdd ? AArch64ISD::URHADD : AArch64ISD::UHADD); |
| return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), |
| Op.getOperand(2)); |
| } |
| case Intrinsic::aarch64_neon_sabd: |
| case Intrinsic::aarch64_neon_uabd: { |
| unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uabd ? ISD::ABDU |
| : ISD::ABDS; |
| return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), |
| Op.getOperand(2)); |
| } |
| case Intrinsic::aarch64_neon_uaddlp: { |
| unsigned Opcode = AArch64ISD::UADDLP; |
| return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1)); |
| } |
| case Intrinsic::aarch64_neon_sdot: |
| case Intrinsic::aarch64_neon_udot: |
| case Intrinsic::aarch64_sve_sdot: |
| case Intrinsic::aarch64_sve_udot: { |
| unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot || |
| IntNo == Intrinsic::aarch64_sve_udot) |
| ? AArch64ISD::UDOT |
| : AArch64ISD::SDOT; |
| return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), |
| Op.getOperand(2), Op.getOperand(3)); |
| } |
| case Intrinsic::get_active_lane_mask: { |
| SDValue ID = |
| DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64); |
| return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID, |
| Op.getOperand(1), Op.getOperand(2)); |
| } |
| } |
| } |
| |
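| // SVE gathers/scatters cannot use index elements narrower than 32 bits, so |
| // request that i8/i16 index vectors be extended to i32. |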
| bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const { |
| if (VT.getVectorElementType() == MVT::i8 || |
| VT.getVectorElementType() == MVT::i16) { |
| EltTy = MVT::i32; |
| return true; |
| } |
| return false; |
| } |
| |
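| // An extend of an i32 index vector can be folded into the [SU]XTW forms of |
| // the gather/scatter instructions, but only for scalable vectors with a |
| // known minimum of at least four elements. |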
| bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const { |
| if (VT.getVectorElementType() == MVT::i32 && |
| VT.getVectorElementCount().getKnownMinValue() >= 4 && |
| !VT.isFixedLengthVector()) |
| return true; |
| |
| return false; |
| } |
| |
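| // Extending vector loads are desirable whenever the load will be lowered |
| // using SVE, which provides native extending load instructions. |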
| bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { |
| return ExtVal.getValueType().isScalableVector() || |
| useSVEForFixedLengthVectorVT(ExtVal.getValueType(), |
| /*OverrideNEON=*/true); |
| } |
| |
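| // Select the SVE gather load opcode (GLD1*) matching the addressing mode: |
| // whether the offsets are scaled by the element size, whether they are |
| // signed, and whether they need extending from 32 to 64 bits. |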
| unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { |
| std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = { |
| {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false), |
| AArch64ISD::GLD1_MERGE_ZERO}, |
| {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true), |
| AArch64ISD::GLD1_UXTW_MERGE_ZERO}, |
| {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false), |
| AArch64ISD::GLD1_MERGE_ZERO}, |
| {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true), |
| AArch64ISD::GLD1_SXTW_MERGE_ZERO}, |
| {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false), |
| AArch64ISD::GLD1_SCALED_MERGE_ZERO}, |
| {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true), |
| AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO}, |
| {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false), |
| AArch64ISD::GLD1_SCALED_MERGE_ZERO}, |
| {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true), |
| AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO}, |
| }; |
| auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend); |
| return AddrModes.find(Key)->second; |
| } |
| |
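| // Select the SVE scatter store opcode (SST1*) matching the addressing mode, |
| // mirroring getGatherVecOpcode above. |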
| unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { |
| std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = { |
| {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false), |
| AArch64ISD::SST1_PRED}, |
| {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true), |
| AArch64ISD::SST1_UXTW_PRED}, |
| {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false), |
| AArch64ISD::SST1_PRED}, |
| {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true), |
| AArch64ISD::SST1_SXTW_PRED}, |
| {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false), |
| AArch64ISD::SST1_SCALED_PRED}, |
| {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true), |
| AArch64ISD::SST1_UXTW_SCALED_PRED}, |
| {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false), |
| AArch64ISD::SST1_SCALED_PRED}, |
| {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true), |
| AArch64ISD::SST1_SXTW_SCALED_PRED}, |
| }; |
| auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend); |
| return AddrModes.find(Key)->second; |
| } |
| |
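| // Map a zero/any-extending gather opcode to its sign-extending (GLD1S*) |
| // equivalent. |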
| unsigned getSignExtendedGatherOpcode(unsigned Opcode) { |
| switch (Opcode) { |
| default: |
| llvm_unreachable("unimplemented opcode"); |
| return Opcode; |
| case AArch64ISD::GLD1_MERGE_ZERO: |
| return AArch64ISD::GLD1S_MERGE_ZERO; |
| case AArch64ISD::GLD1_IMM_MERGE_ZERO: |
| return AArch64ISD::GLD1S_IMM_MERGE_ZERO; |
| case AArch64ISD::GLD1_UXTW_MERGE_ZERO: |
| return AArch64ISD::GLD1S_UXTW_MERGE_ZERO; |
| case AArch64ISD::GLD1_SXTW_MERGE_ZERO: |
| return AArch64ISD::GLD1S_SXTW_MERGE_ZERO; |
| case AArch64ISD::GLD1_SCALED_MERGE_ZERO: |
| return AArch64ISD::GLD1S_SCALED_MERGE_ZERO; |
| case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: |
| return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO; |
| case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: |
| return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO; |
| } |
| } |
| |
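| // Return true if Index is known to be extended from an i32 vector, either |
| // via an explicit SIGN_EXTEND_INREG or by masking with a splat of |
| // 0xFFFFFFFF. |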
| bool getGatherScatterIndexIsExtended(SDValue Index) { |
| unsigned Opcode = Index.getOpcode(); |
| if (Opcode == ISD::SIGN_EXTEND_INREG) |
| return true; |
| |
| if (Opcode == ISD::AND) { |
| SDValue Splat = Index.getOperand(1); |
| if (Splat.getOpcode() != ISD::SPLAT_VECTOR) |
| return false; |
| ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Splat.getOperand(0)); |
| if (!Mask || Mask->getZExtValue() != 0xFFFFFFFF) |
| return false; |
| return true; |
| } |
| |
| return false; |
| } |
| |
| // If the base pointer of a masked gather or scatter is null, we |
| // may be able to swap BasePtr & Index and use the vector + register |
| // or vector + immediate addressing mode, e.g. |
| // VECTOR + REGISTER: |
| // getelementptr nullptr, <vscale x N x T> (splat(%offset) + %indices) |
| // -> getelementptr %offset, <vscale x N x T> %indices |
| // VECTOR + IMMEDIATE: |
| // getelementptr nullptr, <vscale x N x T> (splat(#x) + %indices) |
| // -> getelementptr #x, <vscale x N x T> %indices |
| void selectGatherScatterAddrMode(SDValue &BasePtr, SDValue &Index, EVT MemVT, |
| unsigned &Opcode, bool IsGather, |
| SelectionDAG &DAG) { |
| if (!isNullConstant(BasePtr)) |
| return; |
| |
| // FIXME: This will not match for fixed vector type codegen as the nodes in |
| // question will have fixed<->scalable conversions around them. This should be |
| // moved to a DAG combine or complex pattern so that it executes after all of |
| // the fixed vector inserts and extracts have been removed. This deficiency |
| // will result in a sub-optimal addressing mode being used, i.e. an ADD not |
| // being folded into the scatter/gather. |
| ConstantSDNode *Offset = nullptr; |
| if (Index.getOpcode() == ISD::ADD) |
| if (auto SplatVal = DAG.getSplatValue(Index.getOperand(1))) { |
| if (isa<ConstantSDNode>(SplatVal)) |
| Offset = cast<ConstantSDNode>(SplatVal); |
| else { |
| BasePtr = SplatVal; |
| Index = Index->getOperand(0); |
| return; |
| } |
| } |
| |
| unsigned NewOp = |
| IsGather ? AArch64ISD::GLD1_IMM_MERGE_ZERO : AArch64ISD::SST1_IMM_PRED; |
| |
| if (!Offset) { |
| std::swap(BasePtr, Index); |
| Opcode = NewOp; |
| return; |
| } |
| |
| uint64_t OffsetVal = Offset->getZExtValue(); |
| unsigned ScalarSizeInBytes = MemVT.getScalarSizeInBits() / 8; |
| auto ConstOffset = DAG.getConstant(OffsetVal, SDLoc(Index), MVT::i64); |
| |
| if (OffsetVal % ScalarSizeInBytes || OffsetVal / ScalarSizeInBytes > 31) { |
| // Index is out of range for the immediate addressing mode |
| BasePtr = ConstOffset; |
| Index = Index->getOperand(0); |
| return; |
| } |
| |
| // Immediate is in range |
| Opcode = NewOp; |
| BasePtr = Index->getOperand(0); |
| Index = ConstOffset; |
| } |
| |
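| // Lower MGATHER to an SVE gather load (GLD1*). Fixed-length vectors are |
| // widened to scalable containers, and floating-point data is gathered via |
| // an equivalent integer type and bitcast back afterwards. |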
| SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op); |
| assert(MGT && "Can only custom lower gather load nodes"); |
| |
| bool IsFixedLength = MGT->getMemoryVT().isFixedLengthVector(); |
| |
| SDValue Index = MGT->getIndex(); |
| SDValue Chain = MGT->getChain(); |
| SDValue PassThru = MGT->getPassThru(); |
| SDValue Mask = MGT->getMask(); |
| SDValue BasePtr = MGT->getBasePtr(); |
| ISD::LoadExtType ExtTy = MGT->getExtensionType(); |
| |
| ISD::MemIndexType IndexType = MGT->getIndexType(); |
| bool IsScaled = |
| IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED; |
| bool IsSigned = |
| IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED; |
| bool IdxNeedsExtend = |
| getGatherScatterIndexIsExtended(Index) || |
| Index.getSimpleValueType().getVectorElementType() == MVT::i32; |
| bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD; |
| |
| EVT VT = PassThru.getSimpleValueType(); |
| EVT IndexVT = Index.getSimpleValueType(); |
| EVT MemVT = MGT->getMemoryVT(); |
| SDValue InputVT = DAG.getValueType(MemVT); |
| |
| if (VT.getVectorElementType() == MVT::bf16 && |
| !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) |
| return SDValue(); |
| |
| if (IsFixedLength) { |
| assert(Subtarget->useSVEForFixedLengthVectors() && |
| "Cannot lower when not using SVE for fixed vectors"); |
| if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) { |
| IndexVT = getContainerForFixedLengthVector(DAG, IndexVT); |
| MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType()); |
| } else { |
| MemVT = getContainerForFixedLengthVector(DAG, MemVT); |
| IndexVT = MemVT.changeTypeToInteger(); |
| } |
| InputVT = DAG.getValueType(MemVT.changeTypeToInteger()); |
| Mask = DAG.getNode( |
| ISD::SIGN_EXTEND, DL, |
| VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask); |
| } |
| |
| if (PassThru->isUndef() || isZerosVector(PassThru.getNode())) |
| PassThru = SDValue(); |
| |
| if (VT.isFloatingPoint() && !IsFixedLength) { |
| // Handle FP data by using an integer gather and casting the result. |
| if (PassThru) { |
| EVT PassThruVT = getPackedSVEVectorVT(VT.getVectorElementCount()); |
| PassThru = getSVESafeBitCast(PassThruVT, PassThru, DAG); |
| } |
| InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger()); |
| } |
| |
| SDVTList VTs = DAG.getVTList(IndexVT, MVT::Other); |
| |
| if (getGatherScatterIndexIsExtended(Index)) |
| Index = Index.getOperand(0); |
| |
| unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend); |
| selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode, |
| /*isGather=*/true, DAG); |
| |
| if (ResNeedsSignExtend) |
| Opcode = getSignExtendedGatherOpcode(Opcode); |
| |
| if (IsFixedLength) { |
| if (Index.getSimpleValueType().isFixedLengthVector()) |
| Index = convertToScalableVector(DAG, IndexVT, Index); |
| if (BasePtr.getSimpleValueType().isFixedLengthVector()) |
| BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr); |
| Mask = convertFixedMaskToScalableVector(Mask, DAG); |
| } |
| |
| SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT}; |
| SDValue Result = DAG.getNode(Opcode, DL, VTs, Ops); |
| Chain = Result.getValue(1); |
| |
| if (IsFixedLength) { |
| Result = convertFromScalableVector( |
| DAG, VT.changeVectorElementType(IndexVT.getVectorElementType()), |
| Result); |
| Result = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Result); |
| Result = DAG.getNode(ISD::BITCAST, DL, VT, Result); |
| |
| if (PassThru) |
| Result = DAG.getSelect(DL, VT, MGT->getMask(), Result, PassThru); |
| } else { |
| if (PassThru) |
| Result = DAG.getSelect(DL, IndexVT, Mask, Result, PassThru); |
| |
| if (VT.isFloatingPoint()) |
| Result = getSVESafeBitCast(VT, Result, DAG); |
| } |
| |
| return DAG.getMergeValues({Result, Chain}, DL); |
| } |
| |
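| // Lower MSCATTER to an SVE scatter store (SST1*), mirroring the gather |
| // lowering above. |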
| SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op); |
| assert(MSC && "Can only custom lower scatter store nodes"); |
| |
| bool IsFixedLength = MSC->getMemoryVT().isFixedLengthVector(); |
| |
| SDValue Index = MSC->getIndex(); |
| SDValue Chain = MSC->getChain(); |
| SDValue StoreVal = MSC->getValue(); |
| SDValue Mask = MSC->getMask(); |
| SDValue BasePtr = MSC->getBasePtr(); |
| |
| ISD::MemIndexType IndexType = MSC->getIndexType(); |
| bool IsScaled = |
| IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED; |
| bool IsSigned = |
| IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED; |
| bool NeedsExtend = |
| getGatherScatterIndexIsExtended(Index) || |
| Index.getSimpleValueType().getVectorElementType() == MVT::i32; |
| |
| EVT VT = StoreVal.getSimpleValueType(); |
| EVT IndexVT = Index.getSimpleValueType(); |
| SDVTList VTs = DAG.getVTList(MVT::Other); |
| EVT MemVT = MSC->getMemoryVT(); |
| SDValue InputVT = DAG.getValueType(MemVT); |
| |
| if (VT.getVectorElementType() == MVT::bf16 && |
| !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) |
| return SDValue(); |
| |
| if (IsFixedLength) { |
| assert(Subtarget->useSVEForFixedLengthVectors() && |
| "Cannot lower when not using SVE for fixed vectors"); |
| if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) { |
| IndexVT = getContainerForFixedLengthVector(DAG, IndexVT); |
| MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType()); |
| } else { |
| MemVT = getContainerForFixedLengthVector(DAG, MemVT); |
| IndexVT = MemVT.changeTypeToInteger(); |
| } |
| InputVT = DAG.getValueType(MemVT.changeTypeToInteger()); |
| |
| StoreVal = |
| DAG.getNode(ISD::BITCAST, DL, VT.changeTypeToInteger(), StoreVal); |
| StoreVal = DAG.getNode( |
| ISD::ANY_EXTEND, DL, |
| VT.changeVectorElementType(IndexVT.getVectorElementType()), StoreVal); |
| StoreVal = convertToScalableVector(DAG, IndexVT, StoreVal); |
| Mask = DAG.getNode( |
| ISD::SIGN_EXTEND, DL, |
| VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask); |
| } else if (VT.isFloatingPoint()) { |
| // Handle FP data by casting the data so an integer scatter can be used. |
| EVT StoreValVT = getPackedSVEVectorVT(VT.getVectorElementCount()); |
| StoreVal = getSVESafeBitCast(StoreValVT, StoreVal, DAG); |
| InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger()); |
| } |
| |
| if (getGatherScatterIndexIsExtended(Index)) |
| Index = Index.getOperand(0); |
| |
| unsigned Opcode = getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend); |
| selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode, |
| /*isGather=*/false, DAG); |
| |
| if (IsFixedLength) { |
| if (Index.getSimpleValueType().isFixedLengthVector()) |
| Index = convertToScalableVector(DAG, IndexVT, Index); |
| if (BasePtr.getSimpleValueType().isFixedLengthVector()) |
| BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr); |
| Mask = convertFixedMaskToScalableVector(Mask, DAG); |
| } |
| |
| SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT}; |
| return DAG.getNode(Opcode, DL, VTs, Ops); |
| } |
| |
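| // Lower masked loads with a non-trivial passthru by loading with an undef |
| // passthru and selecting the real passthru value afterwards. |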
| SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op); |
| assert(LoadNode && "Expected custom lowering of a masked load node"); |
| EVT VT = Op->getValueType(0); |
| |
| if (useSVEForFixedLengthVectorVT(VT, true)) |
| return LowerFixedLengthVectorMLoadToSVE(Op, DAG); |
| |
| SDValue PassThru = LoadNode->getPassThru(); |
| SDValue Mask = LoadNode->getMask(); |
| |
| if (PassThru->isUndef() || isZerosVector(PassThru.getNode())) |
| return Op; |
| |
| SDValue Load = DAG.getMaskedLoad( |
| VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(), |
| LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(), |
| LoadNode->getMemOperand(), LoadNode->getAddressingMode(), |
| LoadNode->getExtensionType()); |
| |
| SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru); |
| |
| return DAG.getMergeValues({Result, Load.getValue(1)}, DL); |
| } |
| |
| // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16. |
| static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, |
| EVT VT, EVT MemVT, |
| SelectionDAG &DAG) { |
| assert(VT.isVector() && "VT should be a vector type"); |
| assert(MemVT == MVT::v4i8 && VT == MVT::v4i16); |
| |
| SDValue Value = ST->getValue(); |
| |
| // It first extends the promoted v4i16 to v8i16, truncates it to v8i8, and |
| // extracts the word lane that represents the v4i8 subvector. It optimizes |
| // the store to: |
| // |
| // xtn v0.8b, v0.8h |
| // str s0, [x0] |
| |
| SDValue Undef = DAG.getUNDEF(MVT::i16); |
| SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL, |
| {Undef, Undef, Undef, Undef}); |
| |
| SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, |
| Value, UndefVec); |
| SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt); |
| |
| Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc); |
| SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, |
| Trunc, DAG.getConstant(0, DL, MVT::i64)); |
| |
| return DAG.getStore(ST->getChain(), DL, ExtractTrunc, |
| ST->getBasePtr(), ST->getMemOperand()); |
| } |
| |
| // Custom lowering for any store, vector or scalar and/or default or with |
| // a truncate operation. Currently we custom lower truncating stores from |
| // v4i16 to v4i8, 256-bit non-temporal vector stores, volatile i128 stores |
| // and i64x8 (LS64) stores. |
| SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc Dl(Op); |
| StoreSDNode *StoreNode = cast<StoreSDNode>(Op); |
| assert(StoreNode && "Can only custom lower store nodes"); |
| |
| SDValue Value = StoreNode->getValue(); |
| |
| EVT VT = Value.getValueType(); |
| EVT MemVT = StoreNode->getMemoryVT(); |
| |
| if (VT.isVector()) { |
| if (useSVEForFixedLengthVectorVT(VT, true)) |
| return LowerFixedLengthVectorStoreToSVE(Op, DAG); |
| |
| unsigned AS = StoreNode->getAddressSpace(); |
| Align Alignment = StoreNode->getAlign(); |
| if (Alignment < MemVT.getStoreSize() && |
| !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment, |
| StoreNode->getMemOperand()->getFlags(), |
| nullptr)) { |
| return scalarizeVectorStore(StoreNode, DAG); |
| } |
| |
| if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 && |
| MemVT == MVT::v4i8) { |
| return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG); |
| } |
| // 256-bit non-temporal stores can be lowered to STNP. Do this as part of |
| // the custom lowering, as there are no unpaired non-temporal stores and |
| // legalization will break up 256-bit inputs. |
| ElementCount EC = MemVT.getVectorElementCount(); |
| if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u && |
| EC.isKnownEven() && |
| ((MemVT.getScalarSizeInBits() == 8u || |
| MemVT.getScalarSizeInBits() == 16u || |
| MemVT.getScalarSizeInBits() == 32u || |
| MemVT.getScalarSizeInBits() == 64u))) { |
| SDValue Lo = |
| DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, |
| MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), |
| StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64)); |
| SDValue Hi = |
| DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, |
| MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), |
| StoreNode->getValue(), |
| DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64)); |
| SDValue Result = DAG.getMemIntrinsicNode( |
| AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other), |
| {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, |
| StoreNode->getMemoryVT(), StoreNode->getMemOperand()); |
| return Result; |
| } |
| } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) { |
| return LowerStore128(Op, DAG); |
| } else if (MemVT == MVT::i64x8) { |
| SDValue Value = StoreNode->getValue(); |
| assert(Value->getValueType(0) == MVT::i64x8); |
| SDValue Chain = StoreNode->getChain(); |
| SDValue Base = StoreNode->getBasePtr(); |
| EVT PtrVT = Base.getValueType(); |
| for (unsigned i = 0; i < 8; i++) { |
| SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64, |
| Value, DAG.getConstant(i, Dl, MVT::i32)); |
| SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base, |
| DAG.getConstant(i * 8, Dl, PtrVT)); |
| Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(), |
| StoreNode->getOriginalAlign()); |
| } |
| return Chain; |
| } |
| |
| return SDValue(); |
| } |
| |
| /// Lower atomic or volatile 128-bit stores to a single STP instruction. |
| SDValue AArch64TargetLowering::LowerStore128(SDValue Op, |
| SelectionDAG &DAG) const { |
| MemSDNode *StoreNode = cast<MemSDNode>(Op); |
| assert(StoreNode->getMemoryVT() == MVT::i128); |
| assert(StoreNode->isVolatile() || StoreNode->isAtomic()); |
| assert(!StoreNode->isAtomic() || |
| StoreNode->getMergedOrdering() == AtomicOrdering::Unordered || |
| StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic); |
| |
| SDValue Value = StoreNode->getOpcode() == ISD::STORE |
| ? StoreNode->getOperand(1) |
| : StoreNode->getOperand(2); |
| SDLoc DL(Op); |
| SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value, |
| DAG.getConstant(0, DL, MVT::i64)); |
| SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value, |
| DAG.getConstant(1, DL, MVT::i64)); |
| SDValue Result = DAG.getMemIntrinsicNode( |
| AArch64ISD::STP, DL, DAG.getVTList(MVT::Other), |
| {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, |
| StoreNode->getMemoryVT(), StoreNode->getMemOperand()); |
| return Result; |
| } |
| |
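| // Custom lowering for LS64 (i64x8) loads and for extending loads of v4i8. |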
| SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| LoadSDNode *LoadNode = cast<LoadSDNode>(Op); |
| assert(LoadNode && "Expected custom lowering of a load node"); |
| |
| if (LoadNode->getMemoryVT() == MVT::i64x8) { |
| SmallVector<SDValue, 8> Ops; |
| SDValue Base = LoadNode->getBasePtr(); |
| SDValue Chain = LoadNode->getChain(); |
| EVT PtrVT = Base.getValueType(); |
| for (unsigned i = 0; i < 8; i++) { |
| SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base, |
| DAG.getConstant(i * 8, DL, PtrVT)); |
| SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr, |
| LoadNode->getPointerInfo(), |
| LoadNode->getOriginalAlign()); |
| Ops.push_back(Part); |
| Chain = SDValue(Part.getNode(), 1); |
| } |
| SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops); |
| return DAG.getMergeValues({Loaded, Chain}, DL); |
| } |
| |
| // Custom lowering for extending v4i8 vector loads. |
| EVT VT = Op->getValueType(0); |
| assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32"); |
| |
| if (LoadNode->getMemoryVT() != MVT::v4i8) |
| return SDValue(); |
| |
| unsigned ExtType; |
| if (LoadNode->getExtensionType() == ISD::SEXTLOAD) |
| ExtType = ISD::SIGN_EXTEND; |
| else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD || |
| LoadNode->getExtensionType() == ISD::EXTLOAD) |
| ExtType = ISD::ZERO_EXTEND; |
| else |
| return SDValue(); |
| |
| SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(), |
| LoadNode->getBasePtr(), MachinePointerInfo()); |
| SDValue Chain = Load.getValue(1); |
| SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load); |
| SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec); |
| SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC); |
| Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext, |
| DAG.getConstant(0, DL, MVT::i64)); |
| if (VT == MVT::v4i32) |
| Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext); |
| return DAG.getMergeValues({Ext, Chain}, DL); |
| } |
| |
| // Generate SUBS and CSEL for integer abs. |
| SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { |
| MVT VT = Op.getSimpleValueType(); |
| |
| if (VT.isVector()) |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU); |
| |
| SDLoc DL(Op); |
| SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), |
| Op.getOperand(0)); |
| // Generate SUBS & CSEL. |
| SDValue Cmp = |
| DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), |
| Op.getOperand(0), DAG.getConstant(0, DL, VT)); |
| return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg, |
| DAG.getConstant(AArch64CC::PL, DL, MVT::i32), |
| Cmp.getValue(1)); |
| } |
| |
| SDValue AArch64TargetLowering::LowerOperation(SDValue Op, |
| SelectionDAG &DAG) const { |
| LLVM_DEBUG(dbgs() << "Custom lowering: "); |
| LLVM_DEBUG(Op.dump()); |
| |
| switch (Op.getOpcode()) { |
| default: |
| llvm_unreachable("unimplemented operand"); |
| return SDValue(); |
| case ISD::BITCAST: |
| return LowerBITCAST(Op, DAG); |
| case ISD::GlobalAddress: |
| return LowerGlobalAddress(Op, DAG); |
| case ISD::GlobalTLSAddress: |
| return LowerGlobalTLSAddress(Op, DAG); |
| case ISD::SETCC: |
| case ISD::STRICT_FSETCC: |
| case ISD::STRICT_FSETCCS: |
| return LowerSETCC(Op, DAG); |
| case ISD::BR_CC: |
| return LowerBR_CC(Op, DAG); |
| case ISD::SELECT: |
| return LowerSELECT(Op, DAG); |
| case ISD::SELECT_CC: |
| return LowerSELECT_CC(Op, DAG); |
| case ISD::JumpTable: |
| return LowerJumpTable(Op, DAG); |
| case ISD::BR_JT: |
| return LowerBR_JT(Op, DAG); |
| case ISD::ConstantPool: |
| return LowerConstantPool(Op, DAG); |
| case ISD::BlockAddress: |
| return LowerBlockAddress(Op, DAG); |
| case ISD::VASTART: |
| return LowerVASTART(Op, DAG); |
| case ISD::VACOPY: |
| return LowerVACOPY(Op, DAG); |
| case ISD::VAARG: |
| return LowerVAARG(Op, DAG); |
| case ISD::ADDC: |
| case ISD::ADDE: |
| case ISD::SUBC: |
| case ISD::SUBE: |
| return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); |
| case ISD::SADDO: |
| case ISD::UADDO: |
| case ISD::SSUBO: |
| case ISD::USUBO: |
| case ISD::SMULO: |
| case ISD::UMULO: |
| return LowerXALUO(Op, DAG); |
| case ISD::FADD: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED); |
| case ISD::FSUB: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED); |
| case ISD::FMUL: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED); |
| case ISD::FMA: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED); |
| case ISD::FDIV: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED); |
| case ISD::FNEG: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU); |
| case ISD::FCEIL: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU); |
| case ISD::FFLOOR: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU); |
| case ISD::FNEARBYINT: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU); |
| case ISD::FRINT: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU); |
| case ISD::FROUND: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU); |
| case ISD::FROUNDEVEN: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU); |
| case ISD::FTRUNC: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU); |
| case ISD::FSQRT: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU); |
| case ISD::FABS: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU); |
| case ISD::FP_ROUND: |
| case ISD::STRICT_FP_ROUND: |
| return LowerFP_ROUND(Op, DAG); |
| case ISD::FP_EXTEND: |
| return LowerFP_EXTEND(Op, DAG); |
| case ISD::FRAMEADDR: |
| return LowerFRAMEADDR(Op, DAG); |
| case ISD::SPONENTRY: |
| return LowerSPONENTRY(Op, DAG); |
| case ISD::RETURNADDR: |
| return LowerRETURNADDR(Op, DAG); |
| case ISD::ADDROFRETURNADDR: |
| return LowerADDROFRETURNADDR(Op, DAG); |
| case ISD::CONCAT_VECTORS: |
| return LowerCONCAT_VECTORS(Op, DAG); |
| case ISD::INSERT_VECTOR_ELT: |
| return LowerINSERT_VECTOR_ELT(Op, DAG); |
| case ISD::EXTRACT_VECTOR_ELT: |
| return LowerEXTRACT_VECTOR_ELT(Op, DAG); |
| case ISD::BUILD_VECTOR: |
| return LowerBUILD_VECTOR(Op, DAG); |
| case ISD::VECTOR_SHUFFLE: |
| return LowerVECTOR_SHUFFLE(Op, DAG); |
| case ISD::SPLAT_VECTOR: |
| return LowerSPLAT_VECTOR(Op, DAG); |
| case ISD::EXTRACT_SUBVECTOR: |
| return LowerEXTRACT_SUBVECTOR(Op, DAG); |
| case ISD::INSERT_SUBVECTOR: |
| return LowerINSERT_SUBVECTOR(Op, DAG); |
| case ISD::SDIV: |
| case ISD::UDIV: |
| return LowerDIV(Op, DAG); |
| case ISD::SMIN: |
| case ISD::UMIN: |
| case ISD::SMAX: |
| case ISD::UMAX: |
| return LowerMinMax(Op, DAG); |
| case ISD::SRA: |
| case ISD::SRL: |
| case ISD::SHL: |
| return LowerVectorSRA_SRL_SHL(Op, DAG); |
| case ISD::SHL_PARTS: |
| case ISD::SRL_PARTS: |
| case ISD::SRA_PARTS: |
| return LowerShiftParts(Op, DAG); |
| case ISD::CTPOP: |
| return LowerCTPOP(Op, DAG); |
| case ISD::FCOPYSIGN: |
| return LowerFCOPYSIGN(Op, DAG); |
| case ISD::OR: |
| return LowerVectorOR(Op, DAG); |
| case ISD::XOR: |
| return LowerXOR(Op, DAG); |
| case ISD::PREFETCH: |
| return LowerPREFETCH(Op, DAG); |
| case ISD::SINT_TO_FP: |
| case ISD::UINT_TO_FP: |
| case ISD::STRICT_SINT_TO_FP: |
| case ISD::STRICT_UINT_TO_FP: |
| return LowerINT_TO_FP(Op, DAG); |
| case ISD::FP_TO_SINT: |
| case ISD::FP_TO_UINT: |
| case ISD::STRICT_FP_TO_SINT: |
| case ISD::STRICT_FP_TO_UINT: |
| return LowerFP_TO_INT(Op, DAG); |
| case ISD::FP_TO_SINT_SAT: |
| case ISD::FP_TO_UINT_SAT: |
| return LowerFP_TO_INT_SAT(Op, DAG); |
| case ISD::FSINCOS: |
| return LowerFSINCOS(Op, DAG); |
| case ISD::FLT_ROUNDS_: |
| return LowerFLT_ROUNDS_(Op, DAG); |
| case ISD::SET_ROUNDING: |
| return LowerSET_ROUNDING(Op, DAG); |
| case ISD::MUL: |
| return LowerMUL(Op, DAG); |
| case ISD::MULHS: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED, |
| /*OverrideNEON=*/true); |
| case ISD::MULHU: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED, |
| /*OverrideNEON=*/true); |
| case ISD::INTRINSIC_WO_CHAIN: |
| return LowerINTRINSIC_WO_CHAIN(Op, DAG); |
| case ISD::ATOMIC_STORE: |
| if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) { |
| assert(Subtarget->hasLSE2()); |
| return LowerStore128(Op, DAG); |
| } |
| return SDValue(); |
| case ISD::STORE: |
| return LowerSTORE(Op, DAG); |
| case ISD::MSTORE: |
| return LowerFixedLengthVectorMStoreToSVE(Op, DAG); |
| case ISD::MGATHER: |
| return LowerMGATHER(Op, DAG); |
| case ISD::MSCATTER: |
| return LowerMSCATTER(Op, DAG); |
| case ISD::VECREDUCE_SEQ_FADD: |
| return LowerVECREDUCE_SEQ_FADD(Op, DAG); |
| case ISD::VECREDUCE_ADD: |
| case ISD::VECREDUCE_AND: |
| case ISD::VECREDUCE_OR: |
| case ISD::VECREDUCE_XOR: |
| case ISD::VECREDUCE_SMAX: |
| case ISD::VECREDUCE_SMIN: |
| case ISD::VECREDUCE_UMAX: |
| case ISD::VECREDUCE_UMIN: |
| case ISD::VECREDUCE_FADD: |
| case ISD::VECREDUCE_FMAX: |
| case ISD::VECREDUCE_FMIN: |
| return LowerVECREDUCE(Op, DAG); |
| case ISD::ATOMIC_LOAD_SUB: |
| return LowerATOMIC_LOAD_SUB(Op, DAG); |
| case ISD::ATOMIC_LOAD_AND: |
| return LowerATOMIC_LOAD_AND(Op, DAG); |
| case ISD::DYNAMIC_STACKALLOC: |
| return LowerDYNAMIC_STACKALLOC(Op, DAG); |
| case ISD::VSCALE: |
| return LowerVSCALE(Op, DAG); |
| case ISD::ANY_EXTEND: |
| case ISD::SIGN_EXTEND: |
| case ISD::ZERO_EXTEND: |
| return LowerFixedLengthVectorIntExtendToSVE(Op, DAG); |
| case ISD::SIGN_EXTEND_INREG: { |
| // Only custom lower when ExtraVT has a legal byte based element type. |
| EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); |
| EVT ExtraEltVT = ExtraVT.getVectorElementType(); |
| if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) && |
| (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64)) |
| return SDValue(); |
| |
| return LowerToPredicatedOp(Op, DAG, |
| AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU); |
| } |
| case ISD::TRUNCATE: |
| return LowerTRUNCATE(Op, DAG); |
| case ISD::MLOAD: |
| return LowerMLOAD(Op, DAG); |
| case ISD::LOAD: |
| if (useSVEForFixedLengthVectorVT(Op.getValueType())) |
| return LowerFixedLengthVectorLoadToSVE(Op, DAG); |
| return LowerLOAD(Op, DAG); |
| case ISD::ADD: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED); |
| case ISD::AND: |
| return LowerToScalableOp(Op, DAG); |
| case ISD::SUB: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::SUB_PRED); |
| case ISD::FMAXIMUM: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED); |
| case ISD::FMAXNUM: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED); |
| case ISD::FMINIMUM: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED); |
| case ISD::FMINNUM: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED); |
| case ISD::VSELECT: |
| return LowerFixedLengthVectorSelectToSVE(Op, DAG); |
| case ISD::ABS: |
| return LowerABS(Op, DAG); |
| case ISD::BITREVERSE: |
| return LowerBitreverse(Op, DAG); |
| case ISD::BSWAP: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU); |
| case ISD::CTLZ: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU, |
| /*OverrideNEON=*/true); |
| case ISD::CTTZ: |
| return LowerCTTZ(Op, DAG); |
| case ISD::VECTOR_SPLICE: |
| return LowerVECTOR_SPLICE(Op, DAG); |
| } |
| } |
| |
| bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const { |
| return !Subtarget->useSVEForFixedLengthVectors(); |
| } |
| |
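| // Decide whether a fixed-length vector type should be lowered using SVE, |
| // based on its element type, its size relative to the minimum SVE vector |
| // length, and whether NEON-sized vectors may be overridden. |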
| bool AArch64TargetLowering::useSVEForFixedLengthVectorVT( |
| EVT VT, bool OverrideNEON) const { |
| if (!Subtarget->useSVEForFixedLengthVectors()) |
| return false; |
| |
| if (!VT.isFixedLengthVector()) |
| return false; |
| |
| // Don't use SVE for vectors we cannot scalarize if required. |
| switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { |
| // Fixed length predicates should be promoted to i8. |
| // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work. |
| case MVT::i1: |
| default: |
| return false; |
| case MVT::i8: |
| case MVT::i16: |
| case MVT::i32: |
| case MVT::i64: |
| case MVT::f16: |
| case MVT::f32: |
| case MVT::f64: |
| break; |
| } |
| |
| // All SVE implementations support NEON sized vectors. |
| if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector())) |
| return true; |
| |
| // Ensure NEON MVTs only belong to a single register class. |
| if (VT.getFixedSizeInBits() <= 128) |
| return false; |
| |
| // Don't use SVE for types that don't fit. |
| if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits()) |
| return false; |
| |
| // TODO: Perhaps an artificial restriction, but worth having whilst getting |
| // the base fixed length SVE support in place. |
| if (!VT.isPow2VectorType()) |
| return false; |
| |
| return true; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Calling Convention Implementation |
| //===----------------------------------------------------------------------===// |
| |
| /// Selects the correct CCAssignFn for a given CallingConvention value. |
| CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, |
| bool IsVarArg) const { |
| switch (CC) { |
| default: |
| report_fatal_error("Unsupported calling convention."); |
| case CallingConv::WebKit_JS: |
| return CC_AArch64_WebKit_JS; |
| case CallingConv::GHC: |
| return CC_AArch64_GHC; |
| case CallingConv::C: |
| case CallingConv::Fast: |
| case CallingConv::PreserveMost: |
| case CallingConv::CXX_FAST_TLS: |
| case CallingConv::Swift: |
| case CallingConv::SwiftTail: |
| case CallingConv::Tail: |
| if (Subtarget->isTargetWindows() && IsVarArg) |
| return CC_AArch64_Win64_VarArg; |
| if (!Subtarget->isTargetDarwin()) |
| return CC_AArch64_AAPCS; |
| if (!IsVarArg) |
| return CC_AArch64_DarwinPCS; |
| return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg |
| : CC_AArch64_DarwinPCS_VarArg; |
| case CallingConv::Win64: |
| return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; |
| case CallingConv::CFGuard_Check: |
| return CC_AArch64_Win64_CFGuard_Check; |
| case CallingConv::AArch64_VectorCall: |
| case CallingConv::AArch64_SVE_VectorCall: |
| return CC_AArch64_AAPCS; |
| } |
| } |
| |
| CCAssignFn * |
| AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const { |
| return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS |
| : RetCC_AArch64_AAPCS; |
| } |
| |
| SDValue AArch64TargetLowering::LowerFormalArguments( |
| SDValue Chain, CallingConv::ID CallConv, bool isVarArg, |
| const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, |
| SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { |
| MachineFunction &MF = DAG.getMachineFunction(); |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| bool IsWin64 = |
| Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); |
| |
| // Assign locations to all of the incoming arguments. |
| SmallVector<CCValAssign, 16> ArgLocs; |
| DenseMap<unsigned, SDValue> CopiedRegs; |
| CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); |
| |
| // At this point, Ins[].VT may already be promoted to i32. To correctly |
| // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and |
| // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. |
| // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here |
| // we use a special version of AnalyzeFormalArguments to pass in ValVT and |
| // LocVT. |
| unsigned NumArgs = Ins.size(); |
| Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); |
| unsigned CurArgIdx = 0; |
| for (unsigned i = 0; i != NumArgs; ++i) { |
| MVT ValVT = Ins[i].VT; |
| if (Ins[i].isOrigArg()) { |
| std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx); |
| CurArgIdx = Ins[i].getOrigArgIndex(); |
| |
| // Get type of the original argument. |
| EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), |
| /*AllowUnknown*/ true); |
| MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; |
| // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. |
| if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) |
| ValVT = MVT::i8; |
| else if (ActualMVT == MVT::i16) |
| ValVT = MVT::i16; |
| } |
| bool UseVarArgCC = false; |
| if (IsWin64) |
| UseVarArgCC = isVarArg; |
| CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC); |
| bool Res = |
| AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); |
| assert(!Res && "Call operand has unhandled type"); |
| (void)Res; |
| } |
| SmallVector<SDValue, 16> ArgValues; |
| unsigned ExtraArgLocs = 0; |
| for (unsigned i = 0, e = Ins.size(); i != e; ++i) { |
| CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; |
| |
| if (Ins[i].Flags.isByVal()) { |
| // Byval is used for HFAs in the PCS, but the system should work in a |
| // non-compliant manner for larger structs. |
| EVT PtrVT = getPointerTy(DAG.getDataLayout()); |
| int Size = Ins[i].Flags.getByValSize(); |
| unsigned NumRegs = (Size + 7) / 8; |
| |
| // FIXME: This works on big-endian for composite byvals, which are the |
| // common case. It should also work for fundamental types. |
| unsigned FrameIdx = |
| MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); |
| SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); |
| InVals.push_back(FrameIdxN); |
| |
| continue; |
| } |
| |
| if (Ins[i].Flags.isSwiftAsync()) |
| MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true); |
| |
| SDValue ArgValue; |
| if (VA.isRegLoc()) { |
| // Arguments stored in registers. |
| EVT RegVT = VA.getLocVT(); |
| const TargetRegisterClass *RC; |
| |
| if (RegVT == MVT::i32) |
| RC = &AArch64::GPR32RegClass; |
| else if (RegVT == MVT::i64) |
| RC = &AArch64::GPR64RegClass; |
| else if (RegVT == MVT::f16 || RegVT == MVT::bf16) |
| RC = &AArch64::FPR16RegClass; |
| else if (RegVT == MVT::f32) |
| RC = &AArch64::FPR32RegClass; |
| else if (RegVT == MVT::f64 || RegVT.is64BitVector()) |
| RC = &AArch64::FPR64RegClass; |
| else if (RegVT == MVT::f128 || RegVT.is128BitVector()) |
| RC = &AArch64::FPR128RegClass; |
| else if (RegVT.isScalableVector() && |
| RegVT.getVectorElementType() == MVT::i1) |
| RC = &AArch64::PPRRegClass; |
| else if (RegVT.isScalableVector()) |
| RC = &AArch64::ZPRRegClass; |
| else |
| llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); |
| |
| // Transform the arguments in physical registers into virtual ones. |
| unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); |
| ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); |
| |
| // If this is an 8, 16 or 32-bit value, it is really passed promoted |
| // to 64 bits. Insert an assert[sz]ext to capture this, then |
| // truncate to the right size. |
| switch (VA.getLocInfo()) { |
| default: |
| llvm_unreachable("Unknown loc info!"); |
| case CCValAssign::Full: |
| break; |
| case CCValAssign::Indirect: |
| assert(VA.getValVT().isScalableVector() && |
| "Only scalable vectors can be passed indirectly"); |
| break; |
| case CCValAssign::BCvt: |
| ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); |
| break; |
| case CCValAssign::AExt: |
| case CCValAssign::SExt: |
| case CCValAssign::ZExt: |
| break; |
| case CCValAssign::AExtUpper: |
| ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue, |
| DAG.getConstant(32, DL, RegVT)); |
| ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT()); |
| break; |
| } |
| } else { // VA.isRegLoc() |
| assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); |
| unsigned ArgOffset = VA.getLocMemOffset(); |
| unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect |
| ? VA.getLocVT().getSizeInBits() |
| : VA.getValVT().getSizeInBits()) / 8; |
| |
| uint32_t BEAlign = 0; |
| if (!Subtarget->isLittleEndian() && ArgSize < 8 && |
| !Ins[i].Flags.isInConsecutiveRegs()) |
| BEAlign = 8 - ArgSize; |
| |
| int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); |
| |
| // Create load nodes to retrieve arguments from the stack. |
| SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); |
| |
| // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT. |
| ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; |
| MVT MemVT = VA.getValVT(); |
| |
| switch (VA.getLocInfo()) { |
| default: |
| break; |
| case CCValAssign::Trunc: |
| case CCValAssign::BCvt: |
| MemVT = VA.getLocVT(); |
| break; |
| case CCValAssign::Indirect: |
| assert(VA.getValVT().isScalableVector() && |
| "Only scalable vectors can be passed indirectly"); |
| MemVT = VA.getLocVT(); |
| break; |
| case CCValAssign::SExt: |
| ExtType = ISD::SEXTLOAD; |
| break; |
| case CCValAssign::ZExt: |
| ExtType = ISD::ZEXTLOAD; |
| break; |
| case CCValAssign::AExt: |
| ExtType = ISD::EXTLOAD; |
| break; |
| } |
| |
| ArgValue = |
| DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, |
| MachinePointerInfo::getFixedStack(MF, FI), MemVT); |
| } |
| |
| if (VA.getLocInfo() == CCValAssign::Indirect) { |
| assert(VA.getValVT().isScalableVector() && |
| "Only scalable vectors can be passed indirectly"); |
| |
| uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize(); |
| unsigned NumParts = 1; |
| if (Ins[i].Flags.isInConsecutiveRegs()) { |
| assert(!Ins[i].Flags.isInConsecutiveRegsLast()); |
| while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) |
| ++NumParts; |
| } |
| |
| MVT PartLoad = VA.getValVT(); |
| SDValue Ptr = ArgValue; |
| |
| // Ensure we generate all loads for each tuple part, whilst updating the |
| // pointer after each load correctly using vscale. |
| while (NumParts > 0) { |
| ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo()); |
| InVals.push_back(ArgValue); |
| NumParts--; |
| if (NumParts > 0) { |
| SDValue BytesIncrement = DAG.getVScale( |
| DL, Ptr.getValueType(), |
| APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize)); |
| SDNodeFlags Flags; |
| Flags.setNoUnsignedWrap(true); |
| Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, |
| BytesIncrement, Flags); |
| ExtraArgLocs++; |
| i++; |
| } |
| } |
| } else { |
| if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer()) |
| ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(), |
| ArgValue, DAG.getValueType(MVT::i32)); |
| |
| // i1 arguments are zero-extended to i8 by the caller. Emit a |
| // hint to reflect this. |
| if (Ins[i].isOrigArg()) { |
| Argument *OrigArg = MF.getFunction().getArg(Ins[i].getOrigArgIndex()); |
| if (OrigArg->getType()->isIntegerTy(1)) { |
| if (!Ins[i].Flags.isZExt()) { |
| ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL, |
| ArgValue.getValueType(), ArgValue); |
| } |
| } |
| } |
| |
| InVals.push_back(ArgValue); |
| } |
| } |
| assert((ArgLocs.size() + ExtraArgLocs) == Ins.size()); |
| |
| // varargs |
| AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
| if (isVarArg) { |
| if (!Subtarget->isTargetDarwin() || IsWin64) { |
| // The AAPCS variadic function ABI is identical to the non-variadic |
| // one. As a result there may be more arguments in registers and we should |
| // save them for future reference. |
| // Win64 variadic functions also pass arguments in registers, but all float |
| // arguments are passed in integer registers. |
| saveVarArgRegisters(CCInfo, DAG, DL, Chain); |
| } |
| |
| // This will point to the next argument passed via stack. |
| unsigned StackOffset = CCInfo.getNextStackOffset(); |
| // We currently pass all varargs at 8-byte alignment, or 4 for ILP32 |
| StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8); |
| FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); |
| |
| if (MFI.hasMustTailInVarArgFunc()) { |
| SmallVector<MVT, 2> RegParmTypes; |
| RegParmTypes.push_back(MVT::i64); |
| RegParmTypes.push_back(MVT::f128); |
| // Compute the set of forwarded registers. The rest are scratch. |
| SmallVectorImpl<ForwardedRegister> &Forwards = |
| FuncInfo->getForwardedMustTailRegParms(); |
| CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, |
| CC_AArch64_AAPCS); |
| |
| // Conservatively forward X8, since it might be used for aggregate return. |
| if (!CCInfo.isAllocated(AArch64::X8)) { |
| unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass); |
| Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64)); |
| } |
| } |
| } |
| |
| // On Windows, InReg pointers must be returned, so record the pointer in a |
| // virtual register at the start of the function so it can be returned in the |
| // epilogue. |
| if (IsWin64) { |
| for (unsigned I = 0, E = Ins.size(); I != E; ++I) { |
| if (Ins[I].Flags.isInReg()) { |
| assert(!FuncInfo->getSRetReturnReg()); |
| |
| MVT PtrTy = getPointerTy(DAG.getDataLayout()); |
| Register Reg = |
| MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); |
| FuncInfo->setSRetReturnReg(Reg); |
| |
| SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]); |
| Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain); |
| break; |
| } |
| } |
| } |
| |
| unsigned StackArgSize = CCInfo.getNextStackOffset(); |
| bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; |
| if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { |
| // This is a non-standard ABI so by fiat I say we're allowed to make full |
| // use of the stack area to be popped, which must be aligned to 16 bytes in |
| // any case: |
| StackArgSize = alignTo(StackArgSize, 16); |
| |
| // If we're expected to restore the stack (e.g. fastcc) then we'll be adding |
| // a multiple of 16. |
| FuncInfo->setArgumentStackToRestore(StackArgSize); |
| |
| // This realignment carries over to the available bytes below. Our own |
| // callers will guarantee the space is free by giving an aligned value to |
| // CALLSEQ_START. |
| } |
| // Even if we're not expected to free up the space, it's useful to know how |
| // much is there while considering tail calls (because we can reuse it). |
| FuncInfo->setBytesInStackArgArea(StackArgSize); |
| |
| if (Subtarget->hasCustomCallingConv()) |
| Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); |
| |
| return Chain; |
| } |
| |
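| /// Spill the remaining argument registers of a variadic function to their |
| /// save areas so va_arg can find them, recording the frame indices and |
| /// sizes in AArch64FunctionInfo. |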
| void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, |
| SelectionDAG &DAG, |
| const SDLoc &DL, |
| SDValue &Chain) const { |
| MachineFunction &MF = DAG.getMachineFunction(); |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
| auto PtrVT = getPointerTy(DAG.getDataLayout()); |
| bool IsWin64 = |
| Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); |
| |
| SmallVector<SDValue, 8> MemOps; |
| |
| static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, |
| AArch64::X3, AArch64::X4, AArch64::X5, |
| AArch64::X6, AArch64::X7 }; |
| static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); |
| unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); |
| |
| unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); |
| int GPRIdx = 0; |
| if (GPRSaveSize != 0) { |
| if (IsWin64) { |
| GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false); |
| if (GPRSaveSize & 15) |
| // The extra size here, if triggered, will always be 8. |
| MFI.CreateFixedObject(16 - (GPRSaveSize & 15), |
| -(int)alignTo(GPRSaveSize, 16), false); |
| } else |
| GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false); |
| |
| SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); |
| |
| for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { |
| unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); |
| SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); |
| SDValue Store = |
| DAG.getStore(Val.getValue(1), DL, Val, FIN, |
| IsWin64 ? MachinePointerInfo::getFixedStack( |
| MF, GPRIdx, (i - FirstVariadicGPR) * 8) |
| : MachinePointerInfo::getStack(MF, i * 8)); |
| MemOps.push_back(Store); |
| FIN = |
| DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); |
| } |
| } |
| FuncInfo->setVarArgsGPRIndex(GPRIdx); |
| FuncInfo->setVarArgsGPRSize(GPRSaveSize); |
| |
| if (Subtarget->hasFPARMv8() && !IsWin64) { |
| static const MCPhysReg FPRArgRegs[] = { |
| AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, |
| AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; |
| static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); |
| unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); |
| |
| unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); |
| int FPRIdx = 0; |
| if (FPRSaveSize != 0) { |
| FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false); |
| |
| SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); |
| |
| for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { |
| unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); |
| SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); |
| |
| SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, |
| MachinePointerInfo::getStack(MF, i * 16)); |
| MemOps.push_back(Store); |
| FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, |
| DAG.getConstant(16, DL, PtrVT)); |
| } |
| } |
| FuncInfo->setVarArgsFPRIndex(FPRIdx); |
| FuncInfo->setVarArgsFPRSize(FPRSaveSize); |
| } |
| |
| if (!MemOps.empty()) { |
| Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); |
| } |
| } |
| |
| /// LowerCallResult - Lower the result values of a call into the |
| /// appropriate copies out of appropriate physical registers. |
| SDValue AArch64TargetLowering::LowerCallResult( |
| SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, |
| const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, |
| SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, |
| SDValue ThisVal) const { |
| CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); |
| // Assign locations to each value returned by this call. |
| SmallVector<CCValAssign, 16> RVLocs; |
| DenseMap<unsigned, SDValue> CopiedRegs; |
| CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, |
| *DAG.getContext()); |
| CCInfo.AnalyzeCallResult(Ins, RetCC); |
| |
| // Copy all of the result registers out of their specified physreg. |
| for (unsigned i = 0; i != RVLocs.size(); ++i) { |
| CCValAssign VA = RVLocs[i]; |
| |
| // Pass 'this' value directly from the argument to return value, to avoid |
| // reg unit interference |
| if (i == 0 && isThisReturn) { |
| assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && |
| "unexpected return calling convention register assignment"); |
| InVals.push_back(ThisVal); |
| continue; |
| } |
| |
| // Avoid copying a physreg twice since RegAllocFast is incompetent and only |
| // allows one use of a physreg per block. |
| SDValue Val = CopiedRegs.lookup(VA.getLocReg()); |
| if (!Val) { |
| Val = |
| DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); |
| Chain = Val.getValue(1); |
| InFlag = Val.getValue(2); |
| CopiedRegs[VA.getLocReg()] = Val; |
| } |
| |
| switch (VA.getLocInfo()) { |
| default: |
| llvm_unreachable("Unknown loc info!"); |
| case CCValAssign::Full: |
| break; |
| case CCValAssign::BCvt: |
| Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); |
| break; |
| case CCValAssign::AExtUpper: |
| Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val, |
| DAG.getConstant(32, DL, VA.getLocVT())); |
| LLVM_FALLTHROUGH; |
| case CCValAssign::AExt: |
| LLVM_FALLTHROUGH; |
| case CCValAssign::ZExt: |
| Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT()); |
| break; |
| } |
| |
| InVals.push_back(Val); |
| } |
| |
| return Chain; |
| } |
| |
| /// Return true if the calling convention is one that we can guarantee TCO for. |
| static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) { |
| return (CC == CallingConv::Fast && GuaranteeTailCalls) || |
| CC == CallingConv::Tail || CC == CallingConv::SwiftTail; |
| } |
| |
| /// Return true if we might ever do TCO for calls with this calling convention. |
| static bool mayTailCallThisCC(CallingConv::ID CC) { |
| switch (CC) { |
| case CallingConv::C: |
| case CallingConv::AArch64_SVE_VectorCall: |
| case CallingConv::PreserveMost: |
| case CallingConv::Swift: |
| case CallingConv::SwiftTail: |
| case CallingConv::Tail: |
| case CallingConv::Fast: |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
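| /// Decide whether a call is eligible for tail call optimization: the |
| /// calling conventions must be compatible, and byval or (on Windows) inreg |
| /// arguments prevent reuse of the caller's frame. |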
| bool AArch64TargetLowering::isEligibleForTailCallOptimization( |
| SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, |
| const SmallVectorImpl<ISD::OutputArg> &Outs, |
| const SmallVectorImpl<SDValue> &OutVals, |
| const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { |
| if (!mayTailCallThisCC(CalleeCC)) |
| return false; |
| |
| MachineFunction &MF = DAG.getMachineFunction(); |
| const Function &CallerF = MF.getFunction(); |
| CallingConv::ID CallerCC = CallerF.getCallingConv(); |
| |
| // Functions using the C or Fast calling convention that have an SVE signature |
| // preserve more registers and should assume the SVE_VectorCall CC. |
| // The check for matching callee-saved regs will determine whether it is |
| // eligible for TCO. |
| if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) && |
| AArch64RegisterInfo::hasSVEArgsOrReturn(&MF)) |
| CallerCC = CallingConv::AArch64_SVE_VectorCall; |
| |
| bool CCMatch = CallerCC == CalleeCC; |
| |
| // When using the Windows calling convention on a non-windows OS, we want |
| // to back up and restore X18 in such functions; we can't do a tail call |
| // from those functions. |
| if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() && |
| CalleeCC != CallingConv::Win64) |
| return false; |
| |
| // Byval parameters hand the function a pointer directly into the stack area |
| // we want to reuse during a tail call. Working around this *is* possible (see |
| // X86) but less efficient and uglier in LowerCall. |
| for (Function::const_arg_iterator i = CallerF.arg_begin(), |
| e = CallerF.arg_end(); |
| i != e; ++i) { |
| if (i->hasByValAttr()) |
| return false; |
| |
| // On Windows, "inreg" attributes signify non-aggregate indirect returns. |
| // In this case, it is necessary to save/restore X0 in the callee. Tail |
| // call opt interferes with this. So we disable tail call opt when the |
| // caller has an argument with "inreg" attribute. |
| |
| // FIXME: Check whether the callee also has an "inreg" argument. |
| if (i->hasInRegAttr()) |
| return false; |
| } |
| |
| if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt)) |
| return CCMatch; |
| |
| // Externally-defined functions with weak linkage should not be |
| // tail-called on AArch64 when the OS does not support dynamic |
| // pre-emption of symbols, as the AAELF spec requires normal calls |
| // to undefined weak functions to be replaced with a NOP or jump to the |
| // next instruction. The behaviour of branch instructions in this |
| // situation (as used for tail calls) is implementation-defined, so we |
| // cannot rely on the linker replacing the tail call with a return. |
| if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { |
| const GlobalValue *GV = G->getGlobal(); |
| const Triple &TT = getTargetMachine().getTargetTriple(); |
| if (GV->hasExternalWeakLinkage() && |
| (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) |
| return false; |
| } |
| |
| // Now we search for cases where we can use a tail call without changing the |
| // ABI. Sibcall is used in some places (particularly gcc) to refer to this |
| // concept. |
| |
| // I want anyone implementing a new calling convention to think long and hard |
| // about this assert. |
| assert((!isVarArg || CalleeCC == CallingConv::C) && |
| "Unexpected variadic calling convention"); |
| |
| LLVMContext &C = *DAG.getContext(); |
| if (isVarArg && !Outs.empty()) { |
| // At least two cases here: if caller is fastcc then we can't have any |
| // memory arguments (we'd be expected to clean up the stack afterwards). If |
| // caller is C then we could potentially use its argument area. |
| |
| // FIXME: for now we take the most conservative of these in both cases: |
| // disallow all variadic memory operands. |
| SmallVector<CCValAssign, 16> ArgLocs; |
| CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); |
| |
| CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); |
| for (const CCValAssign &ArgLoc : ArgLocs) |
| if (!ArgLoc.isRegLoc()) |
| return false; |
| } |
| |
| // Check that the call results are passed in the same way. |
| if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, |
| CCAssignFnForCall(CalleeCC, isVarArg), |
| CCAssignFnForCall(CallerCC, isVarArg))) |
| return false; |
| // The callee has to preserve all registers the caller needs to preserve. |
| const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); |
| const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); |
| if (!CCMatch) { |
| const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); |
| if (Subtarget->hasCustomCallingConv()) { |
| TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved); |
| TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved); |
| } |
| if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) |
| return false; |
| } |
| |
| // Nothing more to check if the callee is taking no arguments |
| if (Outs.empty()) |
| return true; |
| |
| SmallVector<CCValAssign, 16> ArgLocs; |
| CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); |
| |
| CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); |
| |
| const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
| |
| // If any of the arguments is passed indirectly, it must be SVE, so the |
| // 'getBytesInStackArgArea' is not sufficient to determine whether we need to |
| // allocate space on the stack. That is why we check this explicitly here: |
| // if any argument is passed indirectly, the call cannot be a tail call. |
| if (llvm::any_of(ArgLocs, [](CCValAssign &A) { |
| assert((A.getLocInfo() != CCValAssign::Indirect || |
| A.getValVT().isScalableVector()) && |
| "Expected value to be scalable"); |
| return A.getLocInfo() == CCValAssign::Indirect; |
| })) |
| return false; |
| |
| // If the stack arguments for this call do not fit into our own save area then |
| // the call cannot be made tail. |
| if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) |
| return false; |
| |
| const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) |
| return false; |
| |
| return true; |
| } |
| |
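| /// addTokenForArgument - Build a TokenFactor that forces any in-flight loads |
| /// from the caller's incoming stack slot \p ClobberedFI to complete before a |
| /// tail call overwrites that slot. For example (illustrative), if a caller |
| /// argument lives at [sp, #16] and the tail call stores a new outgoing |
| /// argument to the same offset, the load of the old value must be chained in |
| /// first or it would read the clobbered data. |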
| SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, |
| SelectionDAG &DAG, |
| MachineFrameInfo &MFI, |
| int ClobberedFI) const { |
| SmallVector<SDValue, 8> ArgChains; |
| int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); |
| int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; |
| |
| // Include the original chain at the beginning of the list. When this is |
| // used by target LowerCall hooks, this helps legalize find the |
| // CALLSEQ_BEGIN node. |
| ArgChains.push_back(Chain); |
| |
| // Add a chain value for each stack-argument load that overlaps the slot |
| // being clobbered. |
| for (SDNode *U : DAG.getEntryNode().getNode()->uses()) |
| if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) |
| if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) |
| if (FI->getIndex() < 0) { |
| int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); |
| int64_t InLastByte = InFirstByte; |
| InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; |
| |
| if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || |
| (FirstByte <= InFirstByte && InFirstByte <= LastByte)) |
| ArgChains.push_back(SDValue(L, 1)); |
| } |
| |
| // Build a tokenfactor for all the chains. |
| return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); |
| } |
| |
| bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, |
| bool TailCallOpt) const { |
| return (CallCC == CallingConv::Fast && TailCallOpt) || |
| CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail; |
| } |
| |
| // Check if the value is zero-extended from i1 to i8 |
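| // The test below asks whether bits [7:1] of the argument are known to be |
| // zero (RequiredZero == 0xFE). For example, a value produced by a SETCC or |
| // by an explicit (and x, 0x1) already satisfies this, so no extra |
| // zero-extension is needed to pass it as an AAPCS i1 argument. |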
| static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) { |
| unsigned SizeInBits = Arg.getValueType().getSizeInBits(); |
| if (SizeInBits < 8) |
| return false; |
| |
| APInt RequiredZero(SizeInBits, 0xFE); |
| KnownBits Bits = DAG.computeKnownBits(Arg, 4); |
| bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero; |
| return ZExtBool; |
| } |
| |
| /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, |
| /// and add input and output parameter nodes. |
| SDValue |
| AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, |
| SmallVectorImpl<SDValue> &InVals) const { |
| SelectionDAG &DAG = CLI.DAG; |
| SDLoc &DL = CLI.DL; |
| SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; |
| SmallVector<SDValue, 32> &OutVals = CLI.OutVals; |
| SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; |
| SDValue Chain = CLI.Chain; |
| SDValue Callee = CLI.Callee; |
| bool &IsTailCall = CLI.IsTailCall; |
| CallingConv::ID CallConv = CLI.CallConv; |
| bool IsVarArg = CLI.IsVarArg; |
| |
| MachineFunction &MF = DAG.getMachineFunction(); |
| MachineFunction::CallSiteInfo CSInfo; |
| bool IsThisReturn = false; |
| |
| AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
| bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; |
| bool IsSibCall = false; |
| bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CallConv); |
| |
| // Check callee args/returns for SVE registers and set calling convention |
| // accordingly. |
| if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) { |
| bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){ |
| return Out.VT.isScalableVector(); |
| }); |
| bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){ |
| return In.VT.isScalableVector(); |
| }); |
| |
| if (CalleeInSVE || CalleeOutSVE) |
| CallConv = CallingConv::AArch64_SVE_VectorCall; |
| } |
| |
| if (IsTailCall) { |
| // Check if it's really possible to do a tail call. |
| IsTailCall = isEligibleForTailCallOptimization( |
| Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); |
| |
| // A sibling call is one where we're under the usual C ABI and not planning |
| // to change that but can still do a tail call: |
| if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail && |
| CallConv != CallingConv::SwiftTail) |
| IsSibCall = true; |
| |
| if (IsTailCall) |
| ++NumTailCalls; |
| } |
| |
| if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) |
| report_fatal_error("failed to perform tail call elimination on a call " |
| "site marked musttail"); |
| |
| // Analyze operands of the call, assigning locations to each operand. |
| SmallVector<CCValAssign, 16> ArgLocs; |
| CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); |
| |
| if (IsVarArg) { |
| // Handle fixed and variadic arguments differently: variadic (non-fixed) |
| // arguments are assigned using the vararg calling convention, which on |
| // Darwin passes them on the stack. |
| unsigned NumArgs = Outs.size(); |
| |
| for (unsigned i = 0; i != NumArgs; ++i) { |
| MVT ArgVT = Outs[i].VT; |
| if (!Outs[i].IsFixed && ArgVT.isScalableVector()) |
| report_fatal_error("Passing SVE types to variadic functions is " |
| "currently not supported"); |
| |
| ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; |
| bool UseVarArgCC = !Outs[i].IsFixed; |
| // On Windows, the fixed arguments in a vararg call are passed in GPRs |
| // too, so use the vararg CC to force them to integer registers. |
| if (IsCalleeWin64) |
| UseVarArgCC = true; |
| CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC); |
| bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); |
| assert(!Res && "Call operand has unhandled type"); |
| (void)Res; |
| } |
| } else { |
| // At this point, Outs[].VT may already be promoted to i32. To correctly |
| // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and |
| // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. |
| // Since AnalyzeCallOperands uses Outs[].VT for both ValVT and LocVT, here |
| // we use a special version of AnalyzeCallOperands to pass in ValVT and |
| // LocVT. |
| unsigned NumArgs = Outs.size(); |
| for (unsigned i = 0; i != NumArgs; ++i) { |
| MVT ValVT = Outs[i].VT; |
| // Get type of the original argument. |
| EVT ActualVT = getValueType(DAG.getDataLayout(), |
| CLI.getArgs()[Outs[i].OrigArgIndex].Ty, |
| /*AllowUnknown*/ true); |
| MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; |
| ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; |
| // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. |
| if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) |
| ValVT = MVT::i8; |
| else if (ActualMVT == MVT::i16) |
| ValVT = MVT::i16; |
| |
| CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); |
| bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); |
| assert(!Res && "Call operand has unhandled type"); |
| (void)Res; |
| } |
| } |
| |
| // Get a count of how many bytes are to be pushed on the stack. |
| unsigned NumBytes = CCInfo.getNextStackOffset(); |
| |
| if (IsSibCall) { |
| // Since we're not changing the ABI to make this a tail call, the memory |
| // operands are already available in the caller's incoming argument space. |
| NumBytes = 0; |
| } |
| |
| // FPDiff is the byte offset of the call's argument area from the callee's. |
| // Stores to callee stack arguments will be placed in FixedStackSlots offset |
| // by this amount for a tail call. In a sibling call it must be 0 because the |
| // caller will deallocate the entire stack and the callee still expects its |
| // arguments to begin at SP+0. Completely unused for non-tail calls. |
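| // |
| // Worked example (illustrative): if the caller reserved 16 bytes of incoming |
| // argument space but this tail call needs 32 bytes of outgoing stack |
| // arguments, FPDiff is -16; the callee's stack stores are created as fixed |
| // objects offset by -16 and TailCallReservedStack grows to 16 bytes. |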
| int FPDiff = 0; |
| |
| if (IsTailCall && !IsSibCall) { |
| unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); |
| |
| // Since callee will pop argument stack as a tail call, we must keep the |
| // popped size 16-byte aligned. |
| NumBytes = alignTo(NumBytes, 16); |
| |
| // FPDiff will be negative if this tail call requires more space than we |
| // would automatically have in our incoming argument space. Positive if we |
| // can actually shrink the stack. |
| FPDiff = NumReusableBytes - NumBytes; |
| |
| // Update the required reserved area if this is the tail call requiring the |
| // most argument stack space. |
| if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff) |
| FuncInfo->setTailCallReservedStack(-FPDiff); |
| |
| // The stack pointer must be 16-byte aligned at all times it's used for a |
| // memory operation, which in practice means at *all* times and in |
| // particular across call boundaries. Therefore our own arguments started at |
| // a 16-byte aligned SP and the delta applied for the tail call should |
| // satisfy the same constraint. |
| assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); |
| } |
| |
| // Adjust the stack pointer for the new arguments... |
| // These operations are automatically eliminated by the prolog/epilog pass |
| if (!IsSibCall) |
| Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL); |
| |
| SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, |
| getPointerTy(DAG.getDataLayout())); |
| |
| SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; |
| SmallSet<unsigned, 8> RegsUsed; |
| SmallVector<SDValue, 8> MemOpChains; |
| auto PtrVT = getPointerTy(DAG.getDataLayout()); |
| |
| if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) { |
| const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); |
| for (const auto &F : Forwards) { |
| SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT); |
| RegsToPass.emplace_back(F.PReg, Val); |
| } |
| } |
| |
| // Walk the register/memloc assignments, inserting copies/loads. |
| unsigned ExtraArgLocs = 0; |
| for (unsigned i = 0, e = Outs.size(); i != e; ++i) { |
| CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; |
| SDValue Arg = OutVals[i]; |
| ISD::ArgFlagsTy Flags = Outs[i].Flags; |
| |
| // Promote the value if needed. |
| switch (VA.getLocInfo()) { |
| default: |
| llvm_unreachable("Unknown loc info!"); |
| case CCValAssign::Full: |
| break; |
| case CCValAssign::SExt: |
| Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); |
| break; |
| case CCValAssign::ZExt: |
| Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); |
| break; |
| case CCValAssign::AExt: |
| if (Outs[i].ArgVT == MVT::i1) { |
| // AAPCS requires i1 to be zero-extended to 8-bits by the caller. |
| // |
| // Check if we actually have to do this, because the value may |
| // already be zero-extended. |
| // |
| // We cannot just emit a (zext i8 (trunc (assert-zext i8))) |
| // and rely on DAGCombiner to fold this, because the following |
| // (anyext i32) is combined with (zext i8) in DAG.getNode: |
| // |
| // (ext (zext x)) -> (zext x) |
| // |
| // This will give us (zext i32), which we cannot remove, so |
| // try to check this beforehand. |
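| // For example, an i1 produced by a SETCC already has all upper bits known |
| // zero, so checkZExtBool lets us skip the explicit trunc/zext pair. |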
| if (!checkZExtBool(Arg, DAG)) { |
| Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); |
| Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); |
| } |
| } |
| Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); |
| break; |
| case CCValAssign::AExtUpper: |
| assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); |
| Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); |
| Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, |
| DAG.getConstant(32, DL, VA.getLocVT())); |
| break; |
| case CCValAssign::BCvt: |
| Arg = DAG.getBitcast(VA.getLocVT(), Arg); |
| break; |
| case CCValAssign::Trunc: |
| Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); |
| break; |
| case CCValAssign::FPExt: |
| Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); |
| break; |
| case CCValAssign::Indirect: |
| assert(VA.getValVT().isScalableVector() && |
| "Only scalable vectors can be passed indirectly"); |
| |
| uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize(); |
| uint64_t PartSize = StoreSize; |
| unsigned NumParts = 1; |
| if (Outs[i].Flags.isInConsecutiveRegs()) { |
| assert(!Outs[i].Flags.isInConsecutiveRegsLast()); |
| while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) |
| ++NumParts; |
| StoreSize *= NumParts; |
| } |
| |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext()); |
| Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); |
| int FI = MFI.CreateStackObject(StoreSize, Alignment, false); |
| MFI.setStackID(FI, TargetStackID::ScalableVector); |
| |
| MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI); |
| SDValue Ptr = DAG.getFrameIndex( |
| FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); |
| SDValue SpillSlot = Ptr; |
| |
| // Ensure we generate all stores for each tuple part, whilst updating the |
| // pointer after each store correctly using vscale. |
| while (NumParts) { |
| Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI); |
| NumParts--; |
| if (NumParts > 0) { |
| SDValue BytesIncrement = DAG.getVScale( |
| DL, Ptr.getValueType(), |
| APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize)); |
| SDNodeFlags Flags; |
| Flags.setNoUnsignedWrap(true); |
| |
| MPI = MachinePointerInfo(MPI.getAddrSpace()); |
| Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, |
| BytesIncrement, Flags); |
| ExtraArgLocs++; |
| i++; |
| } |
| } |
| |
| Arg = SpillSlot; |
| break; |
| } |
| |
| if (VA.isRegLoc()) { |
| if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && |
| Outs[0].VT == MVT::i64) { |
| assert(VA.getLocVT() == MVT::i64 && |
| "unexpected calling convention register assignment"); |
| assert(!Ins.empty() && Ins[0].VT == MVT::i64 && |
| "unexpected use of 'returned'"); |
| IsThisReturn = true; |
| } |
| if (RegsUsed.count(VA.getLocReg())) { |
| // If this register has already been used then we're trying to pack |
| // parts of an [N x i32] into an X-register. The extension type will |
| // take care of putting the two halves in the right place but we have to |
| // combine them. |
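| // For example (illustrative), a split [2 x i32] assigned to one X register |
| // arrives as a low half (AExt, bits [31:0]) and a high half (AExtUpper, |
| // already shifted into bits [63:32]), so OR-ing the pieces reassembles the |
| // full 64-bit value. |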
| SDValue &Bits = |
| llvm::find_if(RegsToPass, |
| [=](const std::pair<unsigned, SDValue> &Elt) { |
| return Elt.first == VA.getLocReg(); |
| }) |
| ->second; |
| Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); |
| // Call site info is used for function's parameter entry value |
| // tracking. For now we track only simple cases when parameter |
| // is transferred through whole register. |
| llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) { |
| return ArgReg.Reg == VA.getLocReg(); |
| }); |
| } else { |
| RegsToPass.emplace_back(VA.getLocReg(), Arg); |
| RegsUsed.insert(VA.getLocReg()); |
| const TargetOptions &Options = DAG.getTarget().Options; |
| if (Options.EmitCallSiteInfo) |
| CSInfo.emplace_back(VA.getLocReg(), i); |
| } |
| } else { |
| assert(VA.isMemLoc()); |
| |
| SDValue DstAddr; |
| MachinePointerInfo DstInfo; |
| |
| // FIXME: This works on big-endian for composite byvals, which are the |
| // common case. It should also work for fundamental types too. |
| uint32_t BEAlign = 0; |
| unsigned OpSize; |
| if (VA.getLocInfo() == CCValAssign::Indirect || |
| VA.getLocInfo() == CCValAssign::Trunc) |
| OpSize = VA.getLocVT().getFixedSizeInBits(); |
| else |
| OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 |
| : VA.getValVT().getSizeInBits(); |
| OpSize = (OpSize + 7) / 8; |
| if (!Subtarget->isLittleEndian() && !Flags.isByVal() && |
| !Flags.isInConsecutiveRegs()) { |
| if (OpSize < 8) |
| BEAlign = 8 - OpSize; |
| } |
| unsigned LocMemOffset = VA.getLocMemOffset(); |
| int32_t Offset = LocMemOffset + BEAlign; |
| SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); |
| PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); |
| |
| if (IsTailCall) { |
| Offset = Offset + FPDiff; |
| int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); |
| |
| DstAddr = DAG.getFrameIndex(FI, PtrVT); |
| DstInfo = MachinePointerInfo::getFixedStack(MF, FI); |
| |
| // Make sure any stack arguments overlapping with where we're storing |
| // are loaded before this eventual operation. Otherwise they'll be |
| // clobbered. |
| Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); |
| } else { |
| DstAddr = PtrOff; |
| DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); |
| } |
| |
| if (Outs[i].Flags.isByVal()) { |
| SDValue SizeNode = |
| DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); |
| SDValue Cpy = DAG.getMemcpy( |
| Chain, DL, DstAddr, Arg, SizeNode, |
| Outs[i].Flags.getNonZeroByValAlign(), |
| /*isVol = */ false, /*AlwaysInline = */ false, |
| /*isTailCall = */ false, DstInfo, MachinePointerInfo()); |
| |
| MemOpChains.push_back(Cpy); |
| } else { |
| // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already |
| // promoted to a legal register type i32, we should truncate Arg back to |
| // i1/i8/i16. |
| if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || |
| VA.getValVT() == MVT::i16) |
| Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); |
| |
| SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); |
| MemOpChains.push_back(Store); |
| } |
| } |
| } |
| |
| if (!MemOpChains.empty()) |
| Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); |
| |
| // Build a sequence of copy-to-reg nodes chained together with token chain |
| // and flag operands which copy the outgoing args into the appropriate regs. |
| SDValue InFlag; |
| for (auto &RegToPass : RegsToPass) { |
| Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, |
| RegToPass.second, InFlag); |
| InFlag = Chain.getValue(1); |
| } |
| |
| // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every |
| // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol |
| // node so that legalize doesn't hack it. |
| if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) { |
| auto GV = G->getGlobal(); |
| unsigned OpFlags = |
| Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()); |
| if (OpFlags & AArch64II::MO_GOT) { |
| Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); |
| Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); |
| } else { |
| Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); |
| } |
| } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { |
| if (getTargetMachine().getCodeModel() == CodeModel::Large && |
| Subtarget->isTargetMachO()) { |
| const char *Sym = S->getSymbol(); |
| Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); |
| Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); |
| } else { |
| const char *Sym = S->getSymbol(); |
| Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); |
| } |
| } |
| |
| // We don't usually want to end the call-sequence here because we would tidy |
| // the frame up *after* the call, however in the ABI-changing tail-call case |
| // we've carefully laid out the parameters so that when sp is reset they'll be |
| // in the correct location. |
| if (IsTailCall && !IsSibCall) { |
| Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), |
| DAG.getIntPtrConstant(0, DL, true), InFlag, DL); |
| InFlag = Chain.getValue(1); |
| } |
| |
| std::vector<SDValue> Ops; |
| Ops.push_back(Chain); |
| Ops.push_back(Callee); |
| |
| if (IsTailCall) { |
| // Each tail call may have to adjust the stack by a different amount, so |
| // this information must travel along with the operation for eventual |
| // consumption by emitEpilogue. |
| Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); |
| } |
| |
| // Add argument registers to the end of the list so that they are known live |
| // into the call. |
| for (auto &RegToPass : RegsToPass) |
| Ops.push_back(DAG.getRegister(RegToPass.first, |
| RegToPass.second.getValueType())); |
| |
| // Add a register mask operand representing the call-preserved registers. |
| const uint32_t *Mask; |
| const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); |
| if (IsThisReturn) { |
| // For 'this' returns, use the X0-preserving mask if applicable |
| Mask = TRI->getThisReturnPreservedMask(MF, CallConv); |
| if (!Mask) { |
| IsThisReturn = false; |
| Mask = TRI->getCallPreservedMask(MF, CallConv); |
| } |
| } else |
| Mask = TRI->getCallPreservedMask(MF, CallConv); |
| |
| if (Subtarget->hasCustomCallingConv()) |
| TRI->UpdateCustomCallPreservedMask(MF, &Mask); |
| |
| if (TRI->isAnyArgRegReserved(MF)) |
| TRI->emitReservedArgRegCallError(MF); |
| |
| assert(Mask && "Missing call preserved mask for calling convention"); |
| Ops.push_back(DAG.getRegisterMask(Mask)); |
| |
| if (InFlag.getNode()) |
| Ops.push_back(InFlag); |
| |
| SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); |
| |
| // If we're doing a tail call, use a TC_RETURN here rather than an |
| // actual call instruction. |
| if (IsTailCall) { |
| MF.getFrameInfo().setHasTailCall(); |
| SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); |
| DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); |
| return Ret; |
| } |
| |
| unsigned CallOpc = AArch64ISD::CALL; |
| // Calls with operand bundle "clang.arc.attachedcall" are special. They should |
| // be expanded to the call, directly followed by a special marker sequence. |
| // Use the CALL_RVMARKER to do that. |
| if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) { |
| assert(!IsTailCall && |
| "tail calls cannot be marked with clang.arc.attachedcall"); |
| CallOpc = AArch64ISD::CALL_RVMARKER; |
| } |
| |
| // Returns a chain and a flag for retval copy to use. |
| Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops); |
| DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); |
| InFlag = Chain.getValue(1); |
| DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); |
| |
| uint64_t CalleePopBytes = |
| DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; |
| |
| Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), |
| DAG.getIntPtrConstant(CalleePopBytes, DL, true), |
| InFlag, DL); |
| if (!Ins.empty()) |
| InFlag = Chain.getValue(1); |
| |
| // Handle result values, copying them out of physregs into vregs that we |
| // return. |
| return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, |
| InVals, IsThisReturn, |
| IsThisReturn ? OutVals[0] : SDValue()); |
| } |
| |
| bool AArch64TargetLowering::CanLowerReturn( |
| CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, |
| const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { |
| CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); |
| SmallVector<CCValAssign, 16> RVLocs; |
| CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); |
| return CCInfo.CheckReturn(Outs, RetCC); |
| } |
| |
| SDValue |
| AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, |
| bool isVarArg, |
| const SmallVectorImpl<ISD::OutputArg> &Outs, |
| const SmallVectorImpl<SDValue> &OutVals, |
| const SDLoc &DL, SelectionDAG &DAG) const { |
| auto &MF = DAG.getMachineFunction(); |
| auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
| |
| CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); |
| SmallVector<CCValAssign, 16> RVLocs; |
| CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); |
| CCInfo.AnalyzeReturn(Outs, RetCC); |
| |
| // Copy the result values into the output registers. |
| SDValue Flag; |
| SmallVector<std::pair<unsigned, SDValue>, 4> RetVals; |
| SmallSet<unsigned, 4> RegsUsed; |
| for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); |
| ++i, ++realRVLocIdx) { |
| CCValAssign &VA = RVLocs[i]; |
| assert(VA.isRegLoc() && "Can only return in registers!"); |
| SDValue Arg = OutVals[realRVLocIdx]; |
| |
| switch (VA.getLocInfo()) { |
| default: |
| llvm_unreachable("Unknown loc info!"); |
| case CCValAssign::Full: |
| if (Outs[i].ArgVT == MVT::i1) { |
| // AAPCS requires i1 to be zero-extended to i8 by the producer of the |
| // value. This is strictly redundant on Darwin (which uses "zeroext |
| // i1"), but will be optimised out before ISel. |
| Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); |
| Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); |
| } |
| break; |
| case CCValAssign::BCvt: |
| Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); |
| break; |
| case CCValAssign::AExt: |
| case CCValAssign::ZExt: |
| Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); |
| break; |
| case CCValAssign::AExtUpper: |
| assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); |
| Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); |
| Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, |
| DAG.getConstant(32, DL, VA.getLocVT())); |
| break; |
| } |
| |
| if (RegsUsed.count(VA.getLocReg())) { |
| SDValue &Bits = |
| llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) { |
| return Elt.first == VA.getLocReg(); |
| })->second; |
| Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); |
| } else { |
| RetVals.emplace_back(VA.getLocReg(), Arg); |
| RegsUsed.insert(VA.getLocReg()); |
| } |
| } |
| |
| SmallVector<SDValue, 4> RetOps(1, Chain); |
| for (auto &RetVal : RetVals) { |
| Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag); |
| Flag = Chain.getValue(1); |
| RetOps.push_back( |
| DAG.getRegister(RetVal.first, RetVal.second.getValueType())); |
| } |
| |
| // Windows AArch64 ABIs require that for returning structs by value we copy |
| // the sret argument into X0 for the return. |
| // We saved the argument into a virtual register in the entry block, |
| // so now we copy the value out and into X0. |
| if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { |
| SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg, |
| getPointerTy(MF.getDataLayout())); |
| |
| unsigned RetValReg = AArch64::X0; |
| Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag); |
| Flag = Chain.getValue(1); |
| |
| RetOps.push_back( |
| DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); |
| } |
| |
| const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); |
| const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF); |
| if (I) { |
| for (; *I; ++I) { |
| if (AArch64::GPR64RegClass.contains(*I)) |
| RetOps.push_back(DAG.getRegister(*I, MVT::i64)); |
| else if (AArch64::FPR64RegClass.contains(*I)) |
| RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); |
| else |
| llvm_unreachable("Unexpected register class in CSRsViaCopy!"); |
| } |
| } |
| |
| RetOps[0] = Chain; // Update chain. |
| |
| // Add the flag if we have it. |
| if (Flag.getNode()) |
| RetOps.push_back(Flag); |
| |
| return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Other Lowering Code |
| //===----------------------------------------------------------------------===// |
| |
| SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty, |
| SelectionDAG &DAG, |
| unsigned Flag) const { |
| return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, |
| N->getOffset(), Flag); |
| } |
| |
| SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, |
| SelectionDAG &DAG, |
| unsigned Flag) const { |
| return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag); |
| } |
| |
| SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, |
| SelectionDAG &DAG, |
| unsigned Flag) const { |
| return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(), |
| N->getOffset(), Flag); |
| } |
| |
| SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty, |
| SelectionDAG &DAG, |
| unsigned Flag) const { |
| return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag); |
| } |
| |
| // (loadGOT sym) |
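| // Roughly expands to: |
| //   adrp xN, :got:sym |
| //   ldr  xN, [xN, :got_lo12:sym] |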
| template <class NodeTy> |
| SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG, |
| unsigned Flags) const { |
| LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n"); |
| SDLoc DL(N); |
| EVT Ty = getPointerTy(DAG.getDataLayout()); |
| SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags); |
| // FIXME: Once remat is capable of dealing with instructions with register |
| // operands, expand this into two nodes instead of using a wrapper node. |
| return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr); |
| } |
| |
| // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym)) |
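| // Roughly expands to: |
| //   movz xN, #:abs_g3:sym |
| //   movk xN, #:abs_g2_nc:sym |
| //   movk xN, #:abs_g1_nc:sym |
| //   movk xN, #:abs_g0_nc:sym |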
| template <class NodeTy> |
| SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG, |
| unsigned Flags) const { |
| LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n"); |
| SDLoc DL(N); |
| EVT Ty = getPointerTy(DAG.getDataLayout()); |
| const unsigned char MO_NC = AArch64II::MO_NC; |
| return DAG.getNode( |
| AArch64ISD::WrapperLarge, DL, Ty, |
| getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags), |
| getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags), |
| getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags), |
| getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags)); |
| } |
| |
| // (addlow (adrp %hi(sym)) %lo(sym)) |
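| // Roughly expands to: |
| //   adrp xN, sym |
| //   add  xN, xN, :lo12:sym |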
| template <class NodeTy> |
| SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, |
| unsigned Flags) const { |
| LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n"); |
| SDLoc DL(N); |
| EVT Ty = getPointerTy(DAG.getDataLayout()); |
| SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags); |
| SDValue Lo = getTargetNode(N, Ty, DAG, |
| AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags); |
| SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi); |
| return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo); |
| } |
| |
| // (adr sym) |
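| // Roughly expands to a single pc-relative "adr xN, sym", which only reaches |
| // +/-1MiB and is therefore limited to the tiny code model. |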
| template <class NodeTy> |
| SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG, |
| unsigned Flags) const { |
| LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n"); |
| SDLoc DL(N); |
| EVT Ty = getPointerTy(DAG.getDataLayout()); |
| SDValue Sym = getTargetNode(N, Ty, DAG, Flags); |
| return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym); |
| } |
| |
| SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, |
| SelectionDAG &DAG) const { |
| GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); |
| const GlobalValue *GV = GN->getGlobal(); |
| unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); |
| |
| if (OpFlags != AArch64II::MO_NO_FLAG) |
| assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && |
| "unexpected offset in global node"); |
| |
| // This also catches the large code model case for Darwin, and tiny code |
| // model with got relocations. |
| if ((OpFlags & AArch64II::MO_GOT) != 0) { |
| return getGOT(GN, DAG, OpFlags); |
| } |
| |
| SDValue Result; |
| if (getTargetMachine().getCodeModel() == CodeModel::Large) { |
| Result = getAddrLarge(GN, DAG, OpFlags); |
| } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { |
| Result = getAddrTiny(GN, DAG, OpFlags); |
| } else { |
| Result = getAddr(GN, DAG, OpFlags); |
| } |
| EVT PtrVT = getPointerTy(DAG.getDataLayout()); |
| SDLoc DL(GN); |
| if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB)) |
| Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, |
| MachinePointerInfo::getGOT(DAG.getMachineFunction())); |
| return Result; |
| } |
| |
| /// Convert a TLS address reference into the correct sequence of loads |
| /// and calls to compute the variable's address (for Darwin, currently) and |
| /// return an SDValue containing the final node. |
| |
| /// Darwin only has one TLS scheme which must be capable of dealing with the |
| /// fully general situation, in the worst case. This means: |
| /// + "extern __thread" declaration. |
| /// + Defined in a possibly unknown dynamic library. |
| /// |
| /// The general system is that each __thread variable has a [3 x i64] descriptor |
| /// which contains information used by the runtime to calculate the address. The |
| /// only part of this the compiler needs to know about is the first xword, which |
| /// contains a function pointer that must be called with the address of the |
| /// entire descriptor in "x0". |
| /// |
| /// Since this descriptor may be in a different unit, in general even the |
| /// descriptor must be accessed via an indirect load. The "ideal" code sequence |
| /// is: |
| /// adrp x0, _var@TLVPPAGE |
| /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor |
| /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, |
| /// ; the function pointer |
| /// blr x1 ; Uses descriptor address in x0 |
| /// ; Address of _var is now in x0. |
| /// |
| /// If the address of _var's descriptor *is* known to the linker, then it can |
| /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for |
| /// a slight efficiency gain. |
| SDValue |
| AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, |
| SelectionDAG &DAG) const { |
| assert(Subtarget->isTargetDarwin() && |
| "This function expects a Darwin target"); |
| |
| SDLoc DL(Op); |
| MVT PtrVT = getPointerTy(DAG.getDataLayout()); |
| MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout()); |
| const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); |
| |
| SDValue TLVPAddr = |
| DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); |
| SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); |
| |
| // The first entry in the descriptor is a function pointer that we must call |
| // to obtain the address of the variable. |
| SDValue Chain = DAG.getEntryNode(); |
| SDValue FuncTLVGet = DAG.getLoad( |
| PtrMemVT, DL, Chain, DescAddr, |
| MachinePointerInfo::getGOT(DAG.getMachineFunction()), |
| Align(PtrMemVT.getSizeInBits() / 8), |
| MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); |
| Chain = FuncTLVGet.getValue(1); |
| |
| // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer. |
| FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT); |
| |
| MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); |
| MFI.setAdjustsStack(true); |
| |
| // TLS calls preserve all registers except those that absolutely must be |
| // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be |
| // silly). |
| const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); |
| const uint32_t *Mask = TRI->getTLSCallPreservedMask(); |
| if (Subtarget->hasCustomCallingConv()) |
| TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask); |
| |
| // Finally, we can make the call. This is just a degenerate version of a |
| // normal AArch64 call node: x0 takes the address of the descriptor, and |
| // returns the address of the variable in this thread. |
| Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); |
| Chain = |
| DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), |
| Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), |
| DAG.getRegisterMask(Mask), Chain.getValue(1)); |
| return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); |
| } |
| |
| /// Convert a thread-local variable reference into a sequence of instructions to |
| /// compute the variable's address for the local exec TLS model of ELF targets. |
| /// The sequence depends on the maximum TLS area size. |
| SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV, |
| SDValue ThreadBase, |
| const SDLoc &DL, |
| SelectionDAG &DAG) const { |
| EVT PtrVT = getPointerTy(DAG.getDataLayout()); |
| SDValue TPOff, Addr; |
| |
| switch (DAG.getTarget().Options.TLSSize) { |
| default: |
| llvm_unreachable("Unexpected TLS size"); |
| |
| case 12: { |
| // mrs x0, TPIDR_EL0 |
| // add x0, x0, :tprel_lo12:a |
| SDValue Var = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF); |
| return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, |
| Var, |
| DAG.getTargetConstant(0, DL, MVT::i32)), |
| 0); |
| } |
| |
| case 24: { |
| // mrs x0, TPIDR_EL0 |
| // add x0, x0, :tprel_hi12:a |
| // add x0, x0, :tprel_lo12_nc:a |
| SDValue HiVar = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); |
| SDValue LoVar = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, |
| AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); |
| Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, |
| HiVar, |
| DAG.getTargetConstant(0, DL, MVT::i32)), |
| 0); |
| return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr, |
| LoVar, |
| DAG.getTargetConstant(0, DL, MVT::i32)), |
| 0); |
| } |
| |
| case 32: { |
| // mrs x1, TPIDR_EL0 |
| // movz x0, #:tprel_g1:a |
| // movk x0, #:tprel_g0_nc:a |
| // add x0, x1, x0 |
| SDValue HiVar = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1); |
| SDValue LoVar = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, |
| AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); |
| TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, |
| DAG.getTargetConstant(16, DL, MVT::i32)), |
| 0); |
| TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, |
| DAG.getTargetConstant(0, DL, MVT::i32)), |
| 0); |
| return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); |
| } |
| |
| case 48: { |
| // mrs x1, TPIDR_EL0 |
| // movz x0, #:tprel_g2:a |
| // movk x0, #:tprel_g1_nc:a |
| // movk x0, #:tprel_g0_nc:a |
| // add x0, x1, x0 |
| SDValue HiVar = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2); |
| SDValue MiVar = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, |
| AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC); |
| SDValue LoVar = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, |
| AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); |
| TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, |
| DAG.getTargetConstant(32, DL, MVT::i32)), |
| 0); |
| TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar, |
| DAG.getTargetConstant(16, DL, MVT::i32)), |
| 0); |
| TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, |
| DAG.getTargetConstant(0, DL, MVT::i32)), |
| 0); |
| return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); |
| } |
| } |
| } |
| |
| /// When accessing thread-local variables under either the general-dynamic or |
| /// local-dynamic system, we make a "TLS-descriptor" call. The variable will |
| /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry |
| /// is a function pointer to carry out the resolution. |
| /// |
| /// The sequence is: |
| /// adrp x0, :tlsdesc:var |
| /// ldr x1, [x0, #:tlsdesc_lo12:var] |
| /// add x0, x0, #:tlsdesc_lo12:var |
| /// .tlsdesccall var |
| /// blr x1 |
| /// (TPIDR_EL0 offset now in x0) |
| /// |
| /// The above sequence must be produced unscheduled, to enable the linker to |
| /// optimize/relax this sequence. |
| /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the |
| /// above sequence, and expanded really late in the compilation flow, to ensure |
| /// the sequence is produced as per above. |
| SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, |
| const SDLoc &DL, |
| SelectionDAG &DAG) const { |
| EVT PtrVT = getPointerTy(DAG.getDataLayout()); |
| |
| SDValue Chain = DAG.getEntryNode(); |
| SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); |
| |
| Chain = |
| DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr}); |
| SDValue Glue = Chain.getValue(1); |
| |
| return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); |
| } |
| |
| SDValue |
| AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, |
| SelectionDAG &DAG) const { |
| assert(Subtarget->isTargetELF() && "This function expects an ELF target"); |
| |
| const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); |
| |
| TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); |
| |
| if (!EnableAArch64ELFLocalDynamicTLSGeneration) { |
| if (Model == TLSModel::LocalDynamic) |
| Model = TLSModel::GeneralDynamic; |
| } |
| |
| if (getTargetMachine().getCodeModel() == CodeModel::Large && |
| Model != TLSModel::LocalExec) |
| report_fatal_error("ELF TLS only supported in small memory model or " |
| "in local exec TLS model"); |
| // Different choices can be made for the maximum size of the TLS area for a |
| // module. For the small code model, the default TLS size is 16MiB and the |
| // maximum TLS size is 4GiB. |
| // FIXME: add tiny and large code model support for TLS access models other |
| // than local exec. We currently generate the same code as small for tiny, |
| // which may be larger than needed. |
| |
| SDValue TPOff; |
| EVT PtrVT = getPointerTy(DAG.getDataLayout()); |
| SDLoc DL(Op); |
| const GlobalValue *GV = GA->getGlobal(); |
| |
| SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); |
| |
| if (Model == TLSModel::LocalExec) { |
| return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG); |
| } else if (Model == TLSModel::InitialExec) { |
| TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); |
| TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); |
| } else if (Model == TLSModel::LocalDynamic) { |
| // Local-dynamic accesses proceed in two phases. A general-dynamic TLS |
| // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate |
| // the beginning of the module's TLS region, followed by a DTPREL offset |
| // calculation. |
| |
| // These accesses will need deduplicating if there's more than one. |
| AArch64FunctionInfo *MFI = |
| DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); |
| MFI->incNumLocalDynamicTLSAccesses(); |
| |
| // The call needs a relocation too for linker relaxation. It doesn't make |
| // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of |
| // the address. |
| SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, |
| AArch64II::MO_TLS); |
| |
| // Now we can calculate the offset from TPIDR_EL0 to this module's |
| // thread-local area. |
| TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); |
| |
| // Now use :dtprel_whatever: operations to calculate this variable's offset |
| // in its thread-storage area. |
| SDValue HiVar = DAG.getTargetGlobalAddress( |
| GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); |
| SDValue LoVar = DAG.getTargetGlobalAddress( |
| GV, DL, MVT::i64, 0, |
| AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); |
| |
| TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, |
| DAG.getTargetConstant(0, DL, MVT::i32)), |
| 0); |
| TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, |
| DAG.getTargetConstant(0, DL, MVT::i32)), |
| 0); |
| } else if (Model == TLSModel::GeneralDynamic) { |
| // The call needs a relocation too for linker relaxation. It doesn't make |
| // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of |
| // the address. |
| SDValue SymAddr = |
| DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); |
| |
| // Finally we can make a call to calculate the offset from tpidr_el0. |
| TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); |
| } else |
| llvm_unreachable("Unsupported ELF TLS access model"); |
| |
| return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); |
| } |
| |
| SDValue |
| AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op, |
| SelectionDAG &DAG) const { |
| assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); |
| |
| SDValue Chain = DAG.getEntryNode(); |
| EVT PtrVT = getPointerTy(DAG.getDataLayout()); |
| SDLoc DL(Op); |
| |
| SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64); |
| |
| // Load the ThreadLocalStoragePointer from the TEB |
| // A pointer to the TLS array is located at offset 0x58 from the TEB. |
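| // Conceptually, the address we compute is (illustrative): |
| //   Var = TEB->ThreadLocalStoragePointer[_tls_index] + <offset of Var in .tls> |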
| SDValue TLSArray = |
| DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL)); |
| TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); |
| Chain = TLSArray.getValue(1); |
| |
| // Load the TLS index from the C runtime. |
| // This does the same as getAddr(), but without having a GlobalAddressSDNode. |
| // This also does the same as LOADgot, but using a generic i32 load, |
| // while LOADgot only loads i64. |
| SDValue TLSIndexHi = |
| DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE); |
| SDValue TLSIndexLo = DAG.getTargetExternalSymbol( |
| "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); |
| SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi); |
| SDValue TLSIndex = |
| DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo); |
| TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo()); |
| Chain = TLSIndex.getValue(1); |
| |
| // The pointer to the thread's TLS data area is stored in the TLS array at |
| // the slot indexed by the TLS index, i.e. at an offset of TLS Index * 8. |
| TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex); |
| SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, |
| DAG.getConstant(3, DL, PtrVT)); |
| SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, |
| DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), |
| MachinePointerInfo()); |
| Chain = TLS.getValue(1); |
| |
| const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); |
| const GlobalValue *GV = GA->getGlobal(); |
| SDValue TGAHi = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); |
| SDValue TGALo = DAG.getTargetGlobalAddress( |
| GV, DL, PtrVT, 0, |
| AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); |
| |
| // Add the offset from the start of the .tls section (section base). |
| SDValue Addr = |
| SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi, |
| DAG.getTargetConstant(0, DL, MVT::i32)), |
| 0); |
| Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo); |
| return Addr; |
| } |
| |
| SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, |
| SelectionDAG &DAG) const { |
| const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); |
| if (DAG.getTarget().useEmulatedTLS()) |
| return LowerToTLSEmulatedModel(GA, DAG); |
| |
| if (Subtarget->isTargetDarwin()) |
| return LowerDarwinGlobalTLSAddress(Op, DAG); |
| if (Subtarget->isTargetELF()) |
| return LowerELFGlobalTLSAddress(Op, DAG); |
| if (Subtarget->isTargetWindows()) |
| return LowerWindowsGlobalTLSAddress(Op, DAG); |
| |
| llvm_unreachable("Unexpected platform trying to use TLS"); |
| } |
| |
| // Looks through \param Val to determine the bit that can be used to |
| // check the sign of the value. It returns the unextended value and |
| // the sign bit position. |
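| // For example, for (sign_extend_inreg i64 %x, i8) the sign can be tested on |
| // bit 7 of %x rather than bit 63 of the extended value. |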
| std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) { |
| if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG) |
| return {Val.getOperand(0), |
| cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() - |
| 1}; |
| |
| if (Val.getOpcode() == ISD::SIGN_EXTEND) |
| return {Val.getOperand(0), |
| Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1}; |
| |
| return {Val, Val.getValueSizeInBits() - 1}; |
| } |
| |
| SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { |
| SDValue Chain = Op.getOperand(0); |
| ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); |
| SDValue LHS = Op.getOperand(2); |
| SDValue RHS = Op.getOperand(3); |
| SDValue Dest = Op.getOperand(4); |
| SDLoc dl(Op); |
| |
| MachineFunction &MF = DAG.getMachineFunction(); |
| // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions |
| // will not be produced, as they are conditional branch instructions that do |
| // not set flags. |
| bool ProduceNonFlagSettingCondBr = |
| !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); |
| |
| // Handle f128 first, since lowering it will result in comparing the return |
| // value of a libcall against zero, which is just what the rest of LowerBR_CC |
| // is expecting to deal with. |
| if (LHS.getValueType() == MVT::f128) { |
| softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); |
| |
| // If softenSetCCOperands returned a scalar, we need to compare the result |
| // against zero to select between true and false values. |
| if (!RHS.getNode()) { |
| RHS = DAG.getConstant(0, dl, LHS.getValueType()); |
| CC = ISD::SETNE; |
| } |
| } |
| |
| // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch |
| // instruction. |
| if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && |
| (CC == ISD::SETEQ || CC == ISD::SETNE)) { |
| // Only lower legal XALUO ops. |
| if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) |
| return SDValue(); |
| |
| // The actual operation with overflow check. |
| AArch64CC::CondCode OFCC; |
| SDValue Value, Overflow; |
| std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); |
| |
| if (CC == ISD::SETNE) |
| OFCC = getInvertedCondCode(OFCC); |
| SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32); |
| |
| return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, |
| Overflow); |
| } |
| |
| if (LHS.getValueType().isInteger()) { |
| assert((LHS.getValueType() == RHS.getValueType()) && |
| (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); |
| |
| // If the RHS of the comparison is zero, we can potentially fold this |
| // to a specialized branch. |
| const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS); |
| if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) { |
| if (CC == ISD::SETEQ) { |
| // See if we can use a TBZ to fold in an AND as well. |
| // TBZ has a smaller branch displacement than CBZ. If the offset is |
| // out of bounds, a late MI-layer pass rewrites branches. |
| // 403.gcc is an example that hits this case. |
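| // For example (illustrative): |
| //   (brcond (seteq (and x, 8), 0), dest) -> (TBZ x, #3, dest) |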
| if (LHS.getOpcode() == ISD::AND && |
| isa<ConstantSDNode>(LHS.getOperand(1)) && |
| isPowerOf2_64(LHS.getConstantOperandVal(1))) { |
| SDValue Test = LHS.getOperand(0); |
| uint64_t Mask = LHS.getConstantOperandVal(1); |
| return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, |
| DAG.getConstant(Log2_64(Mask), dl, MVT::i64), |
| Dest); |
| } |
| |
| return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); |
| } else if (CC == ISD::SETNE) { |
| // See if we can use a TBZ to fold in an AND as well. |
| // TBZ has a smaller branch displacement than CBZ. If the offset is |
| // out of bounds, a late MI-layer pass rewrites branches. |
| // 403.gcc is an example that hits this case. |
| if (LHS.getOpcode() == ISD::AND && |
| isa<ConstantSDNode>(LHS.getOperand(1)) && |
| isPowerOf2_64(LHS.getConstantOperandVal(1))) { |
| SDValue Test = LHS.getOperand(0); |
| uint64_t Mask = LHS.getConstantOperandVal(1); |
| return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, |
| DAG.getConstant(Log2_64(Mask), dl, MVT::i64), |
| Dest); |
| } |
| |
| return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); |
| } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { |
| // Don't combine AND since emitComparison converts the AND to an ANDS |
| // (a.k.a. TST) and the test in the test bit and branch instruction |
| // becomes redundant. This would also increase register pressure. |
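| // For example (illustrative), for an i64 x: |
| //   (brcond (setlt x, 0), dest) -> (TBNZ x, #63, dest) |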
| uint64_t SignBitPos; |
| std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); |
| return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, |
| DAG.getConstant(SignBitPos, dl, MVT::i64), Dest); |
| } |
| } |
| if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && |
| LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) { |
| // Don't combine AND since emitComparison converts the AND to an ANDS |
| // (a.k.a. TST) and the test in the test bit and branch instruction |
| // becomes redundant. This would also increase register pressure. |
| uint64_t SignBitPos; |
| std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); |
| return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, |
| DAG.getConstant(SignBitPos, dl, MVT::i64), Dest); |
| } |
| |
| SDValue CCVal; |
| SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); |
| return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, |
| Cmp); |
| } |
| |
| assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 || |
| LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); |
| |
| // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally |
| // clean. Some of them require two branches to implement. |
| SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); |
| AArch64CC::CondCode CC1, CC2; |
| changeFPCCToAArch64CC(CC, CC1, CC2); |
| SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); |
| SDValue BR1 = |
| DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); |
| if (CC2 != AArch64CC::AL) { |
| SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); |
| return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, |
| Cmp); |
| } |
| |
| return BR1; |
| } |
| |
| SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, |
| SelectionDAG &DAG) const { |
| EVT VT = Op.getValueType(); |
| SDLoc DL(Op); |
| |
| SDValue In1 = Op.getOperand(0); |
| SDValue In2 = Op.getOperand(1); |
| EVT SrcVT = In2.getValueType(); |
| |
| if (VT.isScalableVector()) { |
| if (VT != SrcVT) |
| return SDValue(); |
| |
| // copysign(x,y) -> (y & SIGN_MASK) | (x & ~SIGN_MASK) |
| // |
| // A possible alternative sequence involves using FNEG_MERGE_PASSTHRU; |
| // maybe useful for copysign operations with mismatched VTs. |
| // |
| // IntVT here is chosen so it's a legal type with the same element width |
| // as the input. |
| EVT IntVT = |
| getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger()); |
| unsigned NumBits = VT.getScalarSizeInBits(); |
| SDValue SignMask = DAG.getConstant(APInt::getSignMask(NumBits), DL, IntVT); |
| SDValue InvSignMask = DAG.getNOT(DL, SignMask, IntVT); |
| SDValue Sign = DAG.getNode(ISD::AND, DL, IntVT, SignMask, |
| getSVESafeBitCast(IntVT, In2, DAG)); |
| SDValue Magnitude = DAG.getNode(ISD::AND, DL, IntVT, InvSignMask, |
| getSVESafeBitCast(IntVT, In1, DAG)); |
| SDValue IntResult = DAG.getNode(ISD::OR, DL, IntVT, Sign, Magnitude); |
| return getSVESafeBitCast(VT, IntResult, DAG); |
| } |
| |
| if (SrcVT.bitsLT(VT)) |
| In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); |
| else if (SrcVT.bitsGT(VT)) |
| In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); |
| |
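| // For the remaining NEON and scalar cases, build a vector whose elements |
| // have only the sign bit set and use BIT (bitwise insert if true) to copy |
| // the sign bit of In2 into In1; scalars are first moved into the low lane |
| // of a vector register. As a rough illustration, an f32 copysign typically |
| // selects to something like: |
| //   movi v2.4s, #128, lsl #24    // sign-bit mask |
| //   bit  v0.16b, v1.16b, v2.16b  // insert the sign of v1 into v0 |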
| EVT VecVT; |
| uint64_t EltMask; |
| SDValue VecVal1, VecVal2; |
| |
| auto setVecVal = [&] (int Idx) { |
| if (!VT.isVector()) { |
| VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, |
| DAG.getUNDEF(VecVT), In1); |
| VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, |
| DAG.getUNDEF(VecVT), In2); |
| } else { |
| VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); |
| VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); |
| } |
| }; |
| |
| if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { |
| VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); |
| EltMask = 0x80000000ULL; |
| setVecVal(AArch64::ssub); |
| } else if (VT == MVT::f64 || VT == MVT::v2f64) { |
| VecVT = MVT::v2i64; |
| |
| // We want to materialize a mask with the high bit set, but the AdvSIMD |
| // immediate moves cannot materialize that in a single instruction for |
| // 64-bit elements. Instead, materialize zero and then negate it. |
| EltMask = 0; |
| |
| setVecVal(AArch64::dsub); |
| } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) { |
| VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16); |
| EltMask = 0x8000ULL; |
| setVecVal(AArch64::hsub); |
| } else { |
| llvm_unreachable("Invalid type for copysign!"); |
| } |
| |
| SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT); |
| |
| // If we couldn't materialize the mask above, then the mask vector will be |
| // the zero vector, and we need to negate it here. |
| if (VT == MVT::f64 || VT == MVT::v2f64) { |
| BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); |
| BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); |
| BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); |
| } |
| |
| SDValue Sel = |
| DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); |
| |
| if (VT == MVT::f16) |
| return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel); |
| if (VT == MVT::f32) |
| return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); |
| else if (VT == MVT::f64) |
| return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); |
| else |
| return DAG.getNode(ISD::BITCAST, DL, VT, Sel); |
| } |
| |
| SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { |
| if (DAG.getMachineFunction().getFunction().hasFnAttribute( |
| Attribute::NoImplicitFloat)) |
| return SDValue(); |
| |
| if (!Subtarget->hasNEON()) |
| return SDValue(); |
| |
| // While there is no integer popcount instruction, it can |
| // be more efficiently lowered to the following sequence that uses |
| // AdvSIMD registers/instructions as long as the copies to/from |
| // the AdvSIMD registers are cheap. |
| // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd |
| // CNT V0.8B, V0.8B // 8xbyte pop-counts |
| // ADDV B0, V0.8B // sum 8xbyte pop-counts |
| // UMOV X0, V0.B[0] // copy byte result back to integer reg |
| SDValue Val = Op.getOperand(0); |
| SDLoc DL(Op); |
| EVT VT = Op.getValueType(); |
| |
| if (VT == MVT::i32 || VT == MVT::i64) { |
| if (VT == MVT::i32) |
| Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); |
| Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); |
| |
| SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); |
| SDValue UaddLV = DAG.getNode( |
| ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, |
| DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); |
| |
| if (VT == MVT::i64) |
| UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); |
| return UaddLV; |
| } else if (VT == MVT::i128) { |
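| // For i128, count bits per byte across the full 16-byte vector and sum |
| // the byte counts with UADDLV, mirroring the scalar sequence above. |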
| Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val); |
| |
| SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val); |
| SDValue UaddLV = DAG.getNode( |
| ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, |
| DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); |
| |
| return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV); |
| } |
| |
| if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU); |
| |
| assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || |
| VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && |
| "Unexpected type for custom ctpop lowering"); |
| |
| EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; |
| Val = DAG.getBitcast(VT8Bit, Val); |
| Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val); |
| |
| // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. |
| unsigned EltSize = 8; |
| unsigned NumElts = VT.is64BitVector() ? 8 : 16; |
| while (EltSize != VT.getScalarSizeInBits()) { |
| EltSize *= 2; |
| NumElts /= 2; |
| MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); |
| Val = DAG.getNode( |
| ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, |
| DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val); |
| } |
| |
| return Val; |
| } |
| |
| SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { |
| EVT VT = Op.getValueType(); |
| assert(VT.isScalableVector() || |
| useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)); |
| |
| SDLoc DL(Op); |
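| // There is no direct count-trailing-zeros instruction here, so use the |
| // identity cttz(x) == ctlz(bitreverse(x)); both BITREVERSE and CTLZ have |
| // predicated SVE lowerings (RBIT and CLZ). |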
| SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0)); |
| return DAG.getNode(ISD::CTLZ, DL, VT, RBIT); |
| } |
| |
| SDValue AArch64TargetLowering::LowerMinMax(SDValue Op, |
| SelectionDAG &DAG) const { |
| |
| EVT VT = Op.getValueType(); |
| SDLoc DL(Op); |
| unsigned Opcode = Op.getOpcode(); |
| ISD::CondCode CC; |
| switch (Opcode) { |
| default: |
| llvm_unreachable("Wrong instruction"); |
| case ISD::SMAX: |
| CC = ISD::SETGT; |
| break; |
| case ISD::SMIN: |
| CC = ISD::SETLT; |
| break; |
| case ISD::UMAX: |
| CC = ISD::SETUGT; |
| break; |
| case ISD::UMIN: |
| CC = ISD::SETULT; |
| break; |
| } |
| |
| if (VT.isScalableVector() || |
| useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) { |
| switch (Opcode) { |
| default: |
| llvm_unreachable("Wrong instruction"); |
| case ISD::SMAX: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED, |
| /*OverrideNEON=*/true); |
| case ISD::SMIN: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED, |
| /*OverrideNEON=*/true); |
| case ISD::UMAX: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED, |
| /*OverrideNEON=*/true); |
| case ISD::UMIN: |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED, |
| /*OverrideNEON=*/true); |
| } |
| } |
| |
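| // Otherwise expand to a compare followed by a select. |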
| SDValue Op0 = Op.getOperand(0); |
| SDValue Op1 = Op.getOperand(1); |
| SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC); |
| return DAG.getSelect(DL, VT, Cond, Op0, Op1); |
| } |
| |
| SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op, |
| SelectionDAG &DAG) const { |
| EVT VT = Op.getValueType(); |
| |
| if (VT.isScalableVector() || |
| useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU, |
| true); |
| |
| SDLoc DL(Op); |
| SDValue REVB; |
| MVT VST; |
| |
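| // For NEON, first reverse the bytes within each element (REV32/REV64 on |
| // byte vectors), then reverse the bits within each byte with the vector |
| // BITREVERSE (RBIT); together this bit-reverses every element. |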
| switch (VT.getSimpleVT().SimpleTy) { |
| default: |
| llvm_unreachable("Invalid type for bitreverse!"); |
| |
| case MVT::v2i32: { |
| VST = MVT::v8i8; |
| REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0)); |
| |
| break; |
| } |
| |
| case MVT::v4i32: { |
| VST = MVT::v16i8; |
| REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0)); |
| |
| break; |
| } |
| |
| case MVT::v1i64: { |
| VST = MVT::v8i8; |
| REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0)); |
| |
| break; |
| } |
| |
| case MVT::v2i64: { |
| VST = MVT::v16i8; |
| REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0)); |
| |
| break; |
| } |
| } |
| |
| return DAG.getNode(AArch64ISD::NVCAST, DL, VT, |
| DAG.getNode(ISD::BITREVERSE, DL, VST, REVB)); |
| } |
| |
| SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { |
| |
| if (Op.getValueType().isVector()) |
| return LowerVSETCC(Op, DAG); |
| |
| bool IsStrict = Op->isStrictFPOpcode(); |
| bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; |
| unsigned OpNo = IsStrict ? 1 : 0; |
| SDValue Chain; |
| if (IsStrict) |
| Chain = Op.getOperand(0); |
| SDValue LHS = Op.getOperand(OpNo + 0); |
| SDValue RHS = Op.getOperand(OpNo + 1); |
| ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get(); |
| SDLoc dl(Op); |
| |
| // We chose ZeroOrOneBooleanContents, so use zero and one. |
| EVT VT = Op.getValueType(); |
| SDValue TVal = DAG.getConstant(1, dl, VT); |
| SDValue FVal = DAG.getConstant(0, dl, VT); |
| |
| // Handle f128 first, since one possible outcome is a normal integer |
| // comparison which gets picked up by the next if statement. |
| if (LHS.getValueType() == MVT::f128) { |
| softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain, |
| IsSignaling); |
| |
| // If softenSetCCOperands returned a scalar, use it. |
| if (!RHS.getNode()) { |
| assert(LHS.getValueType() == Op.getValueType() && |
| "Unexpected setcc expansion!"); |
| return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS; |
| } |
| } |
| |
| if (LHS.getValueType().isInteger()) { |
| SDValue CCVal; |
| SDValue Cmp = getAArch64Cmp( |
| LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl); |
| |
| // Note that we inverted the condition above, so we reverse the order of |
| // the true and false operands here. This will allow the setcc to be |
| // matched to a single CSINC instruction. |
| SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); |
| return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; |
| } |
| |
| // Now we know we're dealing with FP values. |
| assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || |
| LHS.getValueType() == MVT::f64); |
| |
| // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead |
| // and do the comparison. |
| SDValue Cmp; |
| if (IsStrict) |
| Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling); |
| else |
| Cmp = emitComparison(LHS, RHS, CC, dl, DAG); |
| |
| AArch64CC::CondCode CC1, CC2; |
| changeFPCCToAArch64CC(CC, CC1, CC2); |
| SDValue Res; |
| if (CC2 == AArch64CC::AL) { |
| changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1, |
| CC2); |
| SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); |
| |
| // Note that we inverted the condition above, so we reverse the order of |
| // the true and false operands here. This will allow the setcc to be |
| // matched to a single CSINC instruction. |
| Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); |
| } else { |
| // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't |
| // totally clean. Some of them require two CSELs to implement. When that |
| // happens, as it does here, we emit the first CSEL and then a second one |
| // using the output of the first as its RHS. We're effectively OR'ing the |
| // two CC's together. |
| |
| // FIXME: It would be nice if we could match the two CSELs to two CSINCs. |
| SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); |
| SDValue CS1 = |
| DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); |
| |
| SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); |
| Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); |
| } |
| return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res; |
| } |
| |
| SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, |
| SDValue RHS, SDValue TVal, |
| SDValue FVal, const SDLoc &dl, |
| SelectionDAG &DAG) const { |
| // Handle f128 first, because it will result in a comparison of some RTLIB |
| // call result against zero. |
| if (LHS.getValueType() == MVT::f128) { |
| softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); |
| |
| // If softenSetCCOperands returned a scalar, we need to compare the result |
| // against zero to select between true and false values. |
| if (!RHS.getNode()) { |
| RHS = DAG.getConstant(0, dl, LHS.getValueType()); |
| CC = ISD::SETNE; |
| } |
| } |
| |
| // Also handle f16, for which we need to do a f32 comparison. |
| if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { |
| LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); |
| RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); |
| } |
| |
| // Next, handle integers. |
| if (LHS.getValueType().isInteger()) { |
| assert((LHS.getValueType() == RHS.getValueType()) && |
| (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); |
| |
| ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); |
| ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); |
| ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS); |
| // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform |
| // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the |
| // supported types. |
| if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal && |
| CTVal->isOne() && CFVal->isAllOnes() && |
| LHS.getValueType() == TVal.getValueType()) { |
| EVT VT = LHS.getValueType(); |
| SDValue Shift = |
| DAG.getNode(ISD::SRA, dl, VT, LHS, |
| DAG.getConstant(VT.getSizeInBits() - 1, dl, VT)); |
| return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT)); |
| } |
| |
| unsigned Opcode = AArch64ISD::CSEL; |
| |
| // If both the TVal and the FVal are constants, see if we can swap them in |
| // order to form a CSINV or CSINC out of them. |
| if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) { |
| std::swap(TVal, FVal); |
| std::swap(CTVal, CFVal); |
| CC = ISD::getSetCCInverse(CC, LHS.getValueType()); |
| } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) { |
| std::swap(TVal, FVal); |
| std::swap(CTVal, CFVal); |
| CC = ISD::getSetCCInverse(CC, LHS.getValueType()); |
| } else if (TVal.getOpcode() == ISD::XOR) { |
| // If TVal is a NOT we want to swap TVal and FVal so that we can match |
| // with a CSINV rather than a CSEL. |
| if (isAllOnesConstant(TVal.getOperand(1))) { |
| std::swap(TVal, FVal); |
| std::swap(CTVal, CFVal); |
| CC = ISD::getSetCCInverse(CC, LHS.getValueType()); |
| } |
| } else if (TVal.getOpcode() == ISD::SUB) { |
| // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so |
| // that we can match with a CSNEG rather than a CSEL. |
| if (isNullConstant(TVal.getOperand(0))) { |
| std::swap(TVal, FVal); |
| std::swap(CTVal, CFVal); |
| CC = ISD::getSetCCInverse(CC, LHS.getValueType()); |
| } |
| } else if (CTVal && CFVal) { |
| const int64_t TrueVal = CTVal->getSExtValue(); |
| const int64_t FalseVal = CFVal->getSExtValue(); |
| bool Swap = false; |
| |
| // If both TVal and FVal are constants, see if FVal is the |
| // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC |
| // instead of a CSEL in that case. |
| if (TrueVal == ~FalseVal) { |
| Opcode = AArch64ISD::CSINV; |
| } else if (FalseVal > std::numeric_limits<int64_t>::min() && |
| TrueVal == -FalseVal) { |
| Opcode = AArch64ISD::CSNEG; |
| } else if (TVal.getValueType() == MVT::i32) { |
| // If our operands are only 32-bit wide, make sure we use 32-bit |
| // arithmetic for the check whether we can use CSINC. This ensures that |
| // the addition in the check will wrap around properly in case there is |
| // an overflow (which would not be the case if we do the check with |
| // 64-bit arithmetic). |
| const uint32_t TrueVal32 = CTVal->getZExtValue(); |
| const uint32_t FalseVal32 = CFVal->getZExtValue(); |
| |
| if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { |
| Opcode = AArch64ISD::CSINC; |
| |
| if (TrueVal32 > FalseVal32) { |
| Swap = true; |
| } |
| } |
| // 64-bit check whether we can use CSINC. |
| } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { |
| Opcode = AArch64ISD::CSINC; |
| |
| if (TrueVal > FalseVal) { |
| Swap = true; |
| } |
| } |
| |
| // Swap TVal and FVal if necessary. |
| if (Swap) { |
| std::swap(TVal, FVal); |
| std::swap(CTVal, CFVal); |
| CC = ISD::getSetCCInverse(CC, LHS.getValueType()); |
| } |
| |
| if (Opcode != AArch64ISD::CSEL) { |
| // Drop FVal since we can get its value by simply inverting/negating |
| // TVal. |
| FVal = TVal; |
| } |
| } |
| |
| // Avoid materializing a constant when possible by reusing a known value in |
| // a register. However, don't perform this optimization if the known value |
| // is one, zero or negative one in the case of a CSEL. We can always |
| // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the |
| // FVal, respectively. |
| ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS); |
| if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() && |
| !RHSVal->isZero() && !RHSVal->isAllOnes()) { |
| AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); |
| // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to |
| // "a != C ? x : a" to avoid materializing C. |
| if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ) |
| TVal = LHS; |
| else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE) |
| FVal = LHS; |
| } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) { |
| assert (CTVal && CFVal && "Expected constant operands for CSNEG."); |
| // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to |
| // avoid materializing C. |
| AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); |
| if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) { |
| Opcode = AArch64ISD::CSINV; |
| TVal = LHS; |
| FVal = DAG.getConstant(0, dl, FVal.getValueType()); |
| } |
| } |
| |
| SDValue CCVal; |
| SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); |
| EVT VT = TVal.getValueType(); |
| return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); |
| } |
| |
| // Now we know we're dealing with FP values. |
| assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || |
| LHS.getValueType() == MVT::f64); |
| assert(LHS.getValueType() == RHS.getValueType()); |
| EVT VT = TVal.getValueType(); |
| SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); |
| |
| // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally |
| // clean. Some of them require two CSELs to implement. |
| AArch64CC::CondCode CC1, CC2; |
| changeFPCCToAArch64CC(CC, CC1, CC2); |
| |
| if (DAG.getTarget().Options.UnsafeFPMath) { |
| // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and |
| // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0. |
| ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS); |
| if (RHSVal && RHSVal->isZero()) { |
| ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal); |
| ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal); |
| |
| if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) && |
| CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType()) |
| TVal = LHS; |
| else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) && |
| CFVal && CFVal->isZero() && |
| FVal.getValueType() == LHS.getValueType()) |
| FVal = LHS; |
| } |
| } |
| |
| // Emit first, and possibly only, CSEL. |
| SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); |
| SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); |
| |
| // If we need a second CSEL, emit it, using the output of the first as the |
| // RHS. We're effectively OR'ing the two CC's together. |
| if (CC2 != AArch64CC::AL) { |
| SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); |
| return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); |
| } |
| |
| // Otherwise, return the output of the first CSEL. |
| return CS1; |
| } |
| |
| SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op, |
| SelectionDAG &DAG) const { |
| EVT Ty = Op.getValueType(); |
| auto Idx = Op.getConstantOperandAPInt(2); |
| |
| // This will select to an EXT instruction, which has a maximum immediate |
| // value of 255, hence 2048 bits is the largest vector we can lower. |
| if (Idx.sge(-1) && Idx.slt(2048 / Ty.getVectorElementType().getSizeInBits())) |
| return Op; |
| |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, |
| SelectionDAG &DAG) const { |
| ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); |
| SDValue LHS = Op.getOperand(0); |
| SDValue RHS = Op.getOperand(1); |
| SDValue TVal = Op.getOperand(2); |
| SDValue FVal = Op.getOperand(3); |
| SDLoc DL(Op); |
| return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); |
| } |
| |
| SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDValue CCVal = Op->getOperand(0); |
| SDValue TVal = Op->getOperand(1); |
| SDValue FVal = Op->getOperand(2); |
| SDLoc DL(Op); |
| |
| EVT Ty = Op.getValueType(); |
| if (Ty.isScalableVector()) { |
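| // For scalable vectors, splat the scalar condition into a predicate |
| // vector and let VSELECT do the selection lane-wise. |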
| SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal); |
| MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount()); |
| SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC); |
| return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal); |
| } |
| |
| if (useSVEForFixedLengthVectorVT(Ty)) { |
| // FIXME: Ideally this would be the same as above using i1 types, however |
| // for the moment we can't deal with fixed i1 vector types properly, so |
| // instead extend the predicate to a result type sized integer vector. |
| MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits()); |
| MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount()); |
| SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT); |
| SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal); |
| return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal); |
| } |
| |
| // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select |
| // instruction. |
| if (ISD::isOverflowIntrOpRes(CCVal)) { |
| // Only lower legal XALUO ops. |
| if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) |
| return SDValue(); |
| |
| AArch64CC::CondCode OFCC; |
| SDValue Value, Overflow; |
| std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); |
| SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); |
| |
| return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, |
| CCVal, Overflow); |
| } |
| |
| // Lower it the same way as we would lower a SELECT_CC node. |
| ISD::CondCode CC; |
| SDValue LHS, RHS; |
| if (CCVal.getOpcode() == ISD::SETCC) { |
| LHS = CCVal.getOperand(0); |
| RHS = CCVal.getOperand(1); |
| CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get(); |
| } else { |
| LHS = CCVal; |
| RHS = DAG.getConstant(0, DL, CCVal.getValueType()); |
| CC = ISD::SETNE; |
| } |
| return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); |
| } |
| |
| SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, |
| SelectionDAG &DAG) const { |
| // Jump table entries are PC-relative offsets. No additional tweaking is |
| // necessary here; just get the address of the jump table. |
| JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); |
| |
| if (getTargetMachine().getCodeModel() == CodeModel::Large && |
| !Subtarget->isTargetMachO()) { |
| return getAddrLarge(JT, DAG); |
| } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { |
| return getAddrTiny(JT, DAG); |
| } |
| return getAddr(JT, DAG); |
| } |
| |
| SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op, |
| SelectionDAG &DAG) const { |
| // Lower the jump-table branch: the JumpTableDest32 pseudo computes the |
| // destination address from the jump table base and its 4-byte entry, and |
| // we then branch to it indirectly. |
| SDLoc DL(Op); |
| SDValue JT = Op.getOperand(1); |
| SDValue Entry = Op.getOperand(2); |
| int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex(); |
| |
| auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); |
| AFI->setJumpTableEntryInfo(JTI, 4, nullptr); |
| |
| SDNode *Dest = |
| DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT, |
| Entry, DAG.getTargetJumpTable(JTI, MVT::i32)); |
| return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0), |
| SDValue(Dest, 0)); |
| } |
| |
| SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, |
| SelectionDAG &DAG) const { |
| ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); |
| |
| if (getTargetMachine().getCodeModel() == CodeModel::Large) { |
| // Use the GOT for the large code model on iOS. |
| if (Subtarget->isTargetMachO()) { |
| return getGOT(CP, DAG); |
| } |
| return getAddrLarge(CP, DAG); |
| } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { |
| return getAddrTiny(CP, DAG); |
| } else { |
| return getAddr(CP, DAG); |
| } |
| } |
| |
| SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, |
| SelectionDAG &DAG) const { |
| BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op); |
| if (getTargetMachine().getCodeModel() == CodeModel::Large && |
| !Subtarget->isTargetMachO()) { |
| return getAddrLarge(BA, DAG); |
| } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { |
| return getAddrTiny(BA, DAG); |
| } |
| return getAddr(BA, DAG); |
| } |
| |
| SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, |
| SelectionDAG &DAG) const { |
| AArch64FunctionInfo *FuncInfo = |
| DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); |
| |
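| // Darwin uses a single-pointer va_list, so va_start simply stores the |
| // address of the first variadic argument slot on the stack. |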
| SDLoc DL(Op); |
| SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), |
| getPointerTy(DAG.getDataLayout())); |
| FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout())); |
| const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); |
| return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), |
| MachinePointerInfo(SV)); |
| } |
| |
| SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, |
| SelectionDAG &DAG) const { |
| AArch64FunctionInfo *FuncInfo = |
| DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); |
| |
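| // Windows also uses a single-pointer va_list. Point it at the spilled |
| // variadic GPR save area when one exists, otherwise at the first stack |
| // argument. |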
| SDLoc DL(Op); |
| SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0 |
| ? FuncInfo->getVarArgsGPRIndex() |
| : FuncInfo->getVarArgsStackIndex(), |
| getPointerTy(DAG.getDataLayout())); |
| const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); |
| return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), |
| MachinePointerInfo(SV)); |
| } |
| |
| SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, |
| SelectionDAG &DAG) const { |
| // The layout of the va_list struct is specified in the AArch64 Procedure Call |
| // Standard, section B.3. |
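| // For LP64 the initialized struct is laid out as: |
| //   struct va_list { |
| //     void *__stack;   // next stack argument            (offset  0) |
| //     void *__gr_top;  // end of the GPR save area       (offset  8) |
| //     void *__vr_top;  // end of the FPR/SIMD save area  (offset 16) |
| //     int   __gr_offs; // negated size of the GPR area   (offset 24) |
| //     int   __vr_offs; // negated size of the FPR area   (offset 28) |
| //   }; |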
| MachineFunction &MF = DAG.getMachineFunction(); |
| AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
| unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; |
| auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); |
| auto PtrVT = getPointerTy(DAG.getDataLayout()); |
| SDLoc DL(Op); |
| |
| SDValue Chain = Op.getOperand(0); |
| SDValue VAList = Op.getOperand(1); |
| const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); |
| SmallVector<SDValue, 4> MemOps; |
| |
| // void *__stack at offset 0 |
| unsigned Offset = 0; |
| SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); |
| Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT); |
| MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, |
| MachinePointerInfo(SV), Align(PtrSize))); |
| |
| // void *__gr_top at offset 8 (4 on ILP32) |
| Offset += PtrSize; |
| int GPRSize = FuncInfo->getVarArgsGPRSize(); |
| if (GPRSize > 0) { |
| SDValue GRTop, GRTopAddr; |
| |
| GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, |
| DAG.getConstant(Offset, DL, PtrVT)); |
| |
| GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); |
| GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, |
| DAG.getConstant(GPRSize, DL, PtrVT)); |
| GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT); |
| |
| MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, |
| MachinePointerInfo(SV, Offset), |
| Align(PtrSize))); |
| } |
| |
| // void *__vr_top at offset 16 (8 on ILP32) |
| Offset += PtrSize; |
| int FPRSize = FuncInfo->getVarArgsFPRSize(); |
| if (FPRSize > 0) { |
| SDValue VRTop, VRTopAddr; |
| VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, |
| DAG.getConstant(Offset, DL, PtrVT)); |
| |
| VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); |
| VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, |
| DAG.getConstant(FPRSize, DL, PtrVT)); |
| VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT); |
| |
| MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, |
| MachinePointerInfo(SV, Offset), |
| Align(PtrSize))); |
| } |
| |
| // int __gr_offs at offset 24 (12 on ILP32) |
| Offset += PtrSize; |
| SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, |
| DAG.getConstant(Offset, DL, PtrVT)); |
| MemOps.push_back( |
| DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), |
| GROffsAddr, MachinePointerInfo(SV, Offset), Align(4))); |
| |
| // int __vr_offs at offset 28 (16 on ILP32) |
| Offset += 4; |
| SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, |
| DAG.getConstant(Offset, DL, PtrVT)); |
| MemOps.push_back( |
| DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), |
| VROffsAddr, MachinePointerInfo(SV, Offset), Align(4))); |
| |
| return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); |
| } |
| |
| SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, |
| SelectionDAG &DAG) const { |
| MachineFunction &MF = DAG.getMachineFunction(); |
| |
| if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv())) |
| return LowerWin64_VASTART(Op, DAG); |
| else if (Subtarget->isTargetDarwin()) |
| return LowerDarwin_VASTART(Op, DAG); |
| else |
| return LowerAAPCS_VASTART(Op, DAG); |
| } |
| |
| SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, |
| SelectionDAG &DAG) const { |
| // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single |
| // pointer. |
| SDLoc DL(Op); |
| unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; |
| unsigned VaListSize = |
| (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) |
| ? PtrSize |
| : Subtarget->isTargetILP32() ? 20 : 32; |
| const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); |
| const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); |
| |
| return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), |
| DAG.getConstant(VaListSize, DL, MVT::i32), |
| Align(PtrSize), false, false, false, |
| MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); |
| } |
| |
| SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { |
| assert(Subtarget->isTargetDarwin() && |
| "automatic va_arg instruction only works on Darwin"); |
| |
| const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); |
| EVT VT = Op.getValueType(); |
| SDLoc DL(Op); |
| SDValue Chain = Op.getOperand(0); |
| SDValue Addr = Op.getOperand(1); |
| MaybeAlign Align(Op.getConstantOperandVal(3)); |
| unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8; |
| auto PtrVT = getPointerTy(DAG.getDataLayout()); |
| auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); |
| SDValue VAList = |
| DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V)); |
| Chain = VAList.getValue(1); |
| VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT); |
| |
| if (VT.isScalableVector()) |
| report_fatal_error("Passing SVE types to variadic functions is " |
| "currently not supported"); |
| |
| if (Align && *Align > MinSlotSize) { |
| VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, |
| DAG.getConstant(Align->value() - 1, DL, PtrVT)); |
| VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, |
| DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT)); |
| } |
| |
| Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); |
| unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); |
| |
| // Scalar integer and FP values smaller than 64 bits are implicitly extended |
| // up to 64 bits. At the very least, we have to increase the striding of the |
| // vaargs list to match this, and for FP values we need to introduce |
| // FP_ROUND nodes as well. |
| if (VT.isInteger() && !VT.isVector()) |
| ArgSize = std::max(ArgSize, MinSlotSize); |
| bool NeedFPTrunc = false; |
| if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { |
| ArgSize = 8; |
| NeedFPTrunc = true; |
| } |
| |
| // Increment the pointer, VAList, to the next vaarg |
| SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, |
| DAG.getConstant(ArgSize, DL, PtrVT)); |
| VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT); |
| |
| // Store the incremented VAList to the legalized pointer |
| SDValue APStore = |
| DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); |
| |
| // Load the actual argument out of the pointer VAList |
| if (NeedFPTrunc) { |
| // Load the value as an f64. |
| SDValue WideFP = |
| DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo()); |
| // Round the value down to an f32. |
| SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), |
| DAG.getIntPtrConstant(1, DL)); |
| SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; |
| // Merge the rounded value with the chain output of the load. |
| return DAG.getMergeValues(Ops, DL); |
| } |
| |
| return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo()); |
| } |
| |
| SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, |
| SelectionDAG &DAG) const { |
| MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); |
| MFI.setFrameAddressIsTaken(true); |
| |
| EVT VT = Op.getValueType(); |
| SDLoc DL(Op); |
| unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); |
| SDValue FrameAddr = |
| DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64); |
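| // For deeper frames, follow the chain of saved frame pointers: each frame |
| // record stores the caller's FP at offset 0. |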
| while (Depth--) |
| FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, |
| MachinePointerInfo()); |
| |
| if (Subtarget->isTargetILP32()) |
| FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr, |
| DAG.getValueType(VT)); |
| |
| return FrameAddr; |
| } |
| |
| SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op, |
| SelectionDAG &DAG) const { |
| MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); |
| |
| EVT VT = getPointerTy(DAG.getDataLayout()); |
| SDLoc DL(Op); |
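| // SPONENTRY returns the stack pointer as it was on entry to the function. |
| // Model this as the address of a fixed stack object at offset 0, which |
| // resolves to exactly that value. |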
| int FI = MFI.CreateFixedObject(4, 0, false); |
| return DAG.getFrameIndex(FI, VT); |
| } |
| |
| #define GET_REGISTER_MATCHER |
| #include "AArch64GenAsmMatcher.inc" |
| |
| // FIXME? Maybe this could be a TableGen attribute on some registers and |
| // this table could be generated automatically from RegInfo. |
| Register AArch64TargetLowering:: |
| getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { |
| Register Reg = MatchRegisterName(RegName); |
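| // Named X registers are only usable if they have been reserved from |
| // register allocation (e.g. via the reserve-xN subtarget feature). |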
| if (AArch64::X1 <= Reg && Reg <= AArch64::X28) { |
| const MCRegisterInfo *MRI = Subtarget->getRegisterInfo(); |
| unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false); |
| if (!Subtarget->isXRegisterReserved(DwarfRegNum)) |
| Reg = 0; |
| } |
| if (Reg) |
| return Reg; |
| report_fatal_error(Twine("Invalid register name \"" |
| + StringRef(RegName) + "\".")); |
| } |
| |
| SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op, |
| SelectionDAG &DAG) const { |
| DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true); |
| |
| EVT VT = Op.getValueType(); |
| SDLoc DL(Op); |
| |
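| // The return address is saved in the frame record immediately above the |
| // frame pointer, so its address is simply FP + 8. |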
| SDValue FrameAddr = |
| DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); |
| SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); |
| |
| return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset); |
| } |
| |
| SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, |
| SelectionDAG &DAG) const { |
| MachineFunction &MF = DAG.getMachineFunction(); |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| MFI.setReturnAddressIsTaken(true); |
| |
| EVT VT = Op.getValueType(); |
| SDLoc DL(Op); |
| unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); |
| SDValue ReturnAddress; |
| if (Depth) { |
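| // For non-zero depths, walk up the chain of frame records and load the |
| // saved LR, which sits 8 bytes above the saved FP. |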
| SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); |
| SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); |
| ReturnAddress = DAG.getLoad( |
| VT, DL, DAG.getEntryNode(), |
| DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo()); |
| } else { |
| // Return LR, which contains the return address. Mark it as an implicit |
| // live-in. |
| unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); |
| ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); |
| } |
| |
| // The XPACLRI instruction assembles to a hint-space instruction before |
| // Armv8.3-A, so it can be safely used on any pre-Armv8.3-A architecture. |
| // On Armv8.3-A and onwards, XPACI is available, so use that instead. |
| SDNode *St; |
| if (Subtarget->hasPAuth()) { |
| St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress); |
| } else { |
| // XPACLRI operates on LR therefore we must move the operand accordingly. |
| SDValue Chain = |
| DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress); |
| St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain); |
| } |
| return SDValue(St, 0); |
| } |
| |
| /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two |
| /// i64 values and take a 2 x i64 value to shift plus a shift amount. |
| SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDValue Lo, Hi; |
| expandShiftParts(Op.getNode(), Lo, Hi, DAG); |
| return DAG.getMergeValues({Lo, Hi}, SDLoc(Op)); |
| } |
| |
| bool AArch64TargetLowering::isOffsetFoldingLegal( |
| const GlobalAddressSDNode *GA) const { |
| // Offsets are folded in the DAG combine rather than here so that we can |
| // intelligently choose an offset based on the uses. |
| return false; |
| } |
| |
| bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, |
| bool OptForSize) const { |
| bool IsLegal = false; |
| // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases, |
| // and for the 16-bit case when the target has full fp16 support. |
| // FIXME: We should be able to handle f128 as well with a clever lowering. |
| const APInt ImmInt = Imm.bitcastToAPInt(); |
| if (VT == MVT::f64) |
| IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero(); |
| else if (VT == MVT::f32) |
| IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero(); |
| else if (VT == MVT::f16 && Subtarget->hasFullFP16()) |
| IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero(); |
| // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to |
| // generate that fmov. |
| |
| // If we cannot materialize the value in an immediate field for fmov, check |
| // whether it can be encoded as the immediate operand of a logical |
| // instruction. The immediate value will be created with either MOVZ, MOVN, |
| // or ORR. |
| if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) { |
| // The cost is actually exactly the same for mov+fmov vs. adrp+ldr; |
| // however the mov+fmov sequence is always better because of the reduced |
| // cache pressure. The timings are still the same if you consider |
| // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the |
| // movw+movk is fused). So we limit the expansion to at most 2 instructions. |
| SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; |
| AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), |
| Insn); |
| unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2)); |
| IsLegal = Insn.size() <= Limit; |
| } |
| |
| LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString() |
| << " imm value: "; Imm.dump();); |
| return IsLegal; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // AArch64 Optimization Hooks |
| //===----------------------------------------------------------------------===// |
| |
| static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, |
| SDValue Operand, SelectionDAG &DAG, |
| int &ExtraSteps) { |
| EVT VT = Operand.getValueType(); |
| if ((ST->hasNEON() && |
| (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 || |
| VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 || |
| VT == MVT::v4f32)) || |
| (ST->hasSVE() && |
| (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) { |
| if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) |
| // For the reciprocal estimates, convergence is quadratic, so the number |
| // of digits is doubled after each iteration. In ARMv8, the accuracy of |
| // the initial estimate is 2^-8. Thus the number of extra steps to refine |
| // the result for float (23 mantissa bits) is 2 and for double (52 |
| // mantissa bits) is 3. |
| ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2; |
| |
| return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); |
| } |
| |
| return SDValue(); |
| } |
| |
| SDValue |
| AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG, |
| const DenormalMode &Mode) const { |
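| // The only input that needs special handling is +/-0.0: refining the |
| // initial FRSQRTE estimate would compute 0 * inf = NaN, so return a test |
| // for zero that the sqrt expansion can use to select the correct result |
| // directly. |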
| SDLoc DL(Op); |
| EVT VT = Op.getValueType(); |
| EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); |
| SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); |
| return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); |
| } |
| |
| SDValue |
| AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op, |
| SelectionDAG &DAG) const { |
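| // When the zero test above fires, the square root of +/-0.0 is the input |
| // itself, so no further fixup is needed. |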
| return Op; |
| } |
| |
| SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, |
| SelectionDAG &DAG, int Enabled, |
| int &ExtraSteps, |
| bool &UseOneConst, |
| bool Reciprocal) const { |
| if (Enabled == ReciprocalEstimate::Enabled || |
| (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt())) |
| if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand, |
| DAG, ExtraSteps)) { |
| SDLoc DL(Operand); |
| EVT VT = Operand.getValueType(); |
| |
| SDNodeFlags Flags; |
| Flags.setAllowReassociation(true); |
| |
| // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) |
| // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) |
| for (int i = ExtraSteps; i > 0; --i) { |
| SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, |
| Flags); |
| Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags); |
| Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); |
| } |
| if (!Reciprocal) |
| Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags); |
| |
| ExtraSteps = 0; |
| return Estimate; |
| } |
| |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, |
| SelectionDAG &DAG, int Enabled, |
| int &ExtraSteps) const { |
| if (Enabled == ReciprocalEstimate::Enabled) |
| if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, |
| DAG, ExtraSteps)) { |
| SDLoc DL(Operand); |
| EVT VT = Operand.getValueType(); |
| |
| SDNodeFlags Flags; |
| Flags.setAllowReassociation(true); |
| |
| // Newton reciprocal iteration: E * (2 - X * E) |
| // AArch64 reciprocal iteration instruction: (2 - M * N) |
| for (int i = ExtraSteps; i > 0; --i) { |
| SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, |
| Estimate, Flags); |
| Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); |
| } |
| |
| ExtraSteps = 0; |
| return Estimate; |
| } |
| |
| return SDValue(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // AArch64 Inline Assembly Support |
| //===----------------------------------------------------------------------===// |
| |
| // Table of Constraints |
| // TODO: This is the current set of constraints supported by ARM for the |
| // compiler; not all of them may make sense. |
| // |
| // r - A general register |
| // w - An FP/SIMD register of some size in the range v0-v31 |
| // x - An FP/SIMD register of some size in the range v0-v15 |
| // I - Constant that can be used with an ADD instruction |
| // J - Constant that can be used with a SUB instruction |
| // K - Constant that can be used with a 32-bit logical instruction |
| // L - Constant that can be used with a 64-bit logical instruction |
| // M - Constant that can be used as a 32-bit MOV immediate |
| // N - Constant that can be used as a 64-bit MOV immediate |
| // Q - A memory reference with base register and no offset |
| // S - A symbolic address |
| // Y - Floating point constant zero |
| // Z - Integer constant zero |
| // |
| // Note that general register operands will be output using their 64-bit x |
| // register name, whatever the size of the variable, unless the asm operand |
| // is prefixed by the %w modifier. Floating-point and SIMD register operands |
| // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or |
| // %q modifier. |
| const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { |
| // At this point, we have to lower this constraint to something else, so we |
| // lower it to an "r" or "w". However, by doing this we will force the result |
| // to be in a register, while the X constraint is much more permissive. |
| // |
| // Although we are correct (we are free to emit anything, without |
| // constraints), we might break use cases that would expect us to be more |
| // efficient and emit something else. |
| if (!Subtarget->hasFPARMv8()) |
| return "r"; |
| |
| if (ConstraintVT.isFloatingPoint()) |
| return "w"; |
| |
| if (ConstraintVT.isVector() && |
| (ConstraintVT.getSizeInBits() == 64 || |
| ConstraintVT.getSizeInBits() == 128)) |
| return "w"; |
| |
| return "r"; |
| } |
| |
| enum PredicateConstraint { |
| Upl, |
| Upa, |
| Invalid |
| }; |
| |
| static PredicateConstraint parsePredicateConstraint(StringRef Constraint) { |
| PredicateConstraint P = PredicateConstraint::Invalid; |
| if (Constraint == "Upa") |
| P = PredicateConstraint::Upa; |
| if (Constraint == "Upl") |
| P = PredicateConstraint::Upl; |
| return P; |
| } |
| |
| /// getConstraintType - Given a constraint letter, return the type of |
| /// constraint it is for this target. |
| AArch64TargetLowering::ConstraintType |
| AArch64TargetLowering::getConstraintType(StringRef Constraint) const { |
| if (Constraint.size() == 1) { |
| switch (Constraint[0]) { |
| default: |
| break; |
| case 'x': |
| case 'w': |
| case 'y': |
| return C_RegisterClass; |
| // An address with a single base register. Due to the way we |
| // currently handle addresses it is the same as 'r'. |
| case 'Q': |
| return C_Memory; |
| case 'I': |
| case 'J': |
| case 'K': |
| case 'L': |
| case 'M': |
| case 'N': |
| case 'Y': |
| case 'Z': |
| return C_Immediate; |
| case 'z': |
| case 'S': // A symbolic address |
| return C_Other; |
| } |
| } else if (parsePredicateConstraint(Constraint) != |
| PredicateConstraint::Invalid) |
| return C_RegisterClass; |
| return TargetLowering::getConstraintType(Constraint); |
| } |
| |
| /// Examine constraint type and operand type and determine a weight value. |
| /// This object must already have been set up with the operand type |
| /// and the current alternative constraint selected. |
| TargetLowering::ConstraintWeight |
| AArch64TargetLowering::getSingleConstraintMatchWeight( |
| AsmOperandInfo &info, const char *constraint) const { |
| ConstraintWeight weight = CW_Invalid; |
| Value *CallOperandVal = info.CallOperandVal; |
| // If we don't have a value, we can't do a match, |
| // but allow it at the lowest weight. |
| if (!CallOperandVal) |
| return CW_Default; |
| Type *type = CallOperandVal->getType(); |
| // Look at the constraint type. |
| switch (*constraint) { |
| default: |
| weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); |
| break; |
| case 'x': |
| case 'w': |
| case 'y': |
| if (type->isFloatingPointTy() || type->isVectorTy()) |
| weight = CW_Register; |
| break; |
| case 'z': |
| weight = CW_Constant; |
| break; |
| case 'U': |
| if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid) |
| weight = CW_Register; |
| break; |
| } |
| return weight; |
| } |
| |
| std::pair<unsigned, const TargetRegisterClass *> |
| AArch64TargetLowering::getRegForInlineAsmConstraint( |
| const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { |
| if (Constraint.size() == 1) { |
| switch (Constraint[0]) { |
| case 'r': |
| if (VT.isScalableVector()) |
| return std::make_pair(0U, nullptr); |
| if (Subtarget->hasLS64() && VT.getSizeInBits() == 512) |
| return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass); |
| if (VT.getFixedSizeInBits() == 64) |
| return std::make_pair(0U, &AArch64::GPR64commonRegClass); |
| return std::make_pair(0U, &AArch64::GPR32commonRegClass); |
| case 'w': { |
| if (!Subtarget->hasFPARMv8()) |
| break; |
| if (VT.isScalableVector()) { |
| if (VT.getVectorElementType() != MVT::i1) |
| return std::make_pair(0U, &AArch64::ZPRRegClass); |
| return std::make_pair(0U, nullptr); |
| } |
| uint64_t VTSize = VT.getFixedSizeInBits(); |
| if (VTSize == 16) |
| return std::make_pair(0U, &AArch64::FPR16RegClass); |
| if (VTSize == 32) |
| return std::make_pair(0U, &AArch64::FPR32RegClass); |
| if (VTSize == 64) |
| return std::make_pair(0U, &AArch64::FPR64RegClass); |
| if (VTSize == 128) |
| return std::make_pair(0U, &AArch64::FPR128RegClass); |
| break; |
| } |
| // The instructions that this constraint is designed for can |
| // only take 128-bit registers so just use that regclass. |
| case 'x': |
| if (!Subtarget->hasFPARMv8()) |
| break; |
| if (VT.isScalableVector()) |
| return std::make_pair(0U, &AArch64::ZPR_4bRegClass); |
| if (VT.getSizeInBits() == 128) |
| return std::make_pair(0U, &AArch64::FPR128_loRegClass); |
| break; |
| case 'y': |
| if (!Subtarget->hasFPARMv8()) |
| break; |
| if (VT.isScalableVector()) |
| return std::make_pair(0U, &AArch64::ZPR_3bRegClass); |
| break; |
| } |
| } else { |
| PredicateConstraint PC = parsePredicateConstraint(Constraint); |
| if (PC != PredicateConstraint::Invalid) { |
| if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1) |
| return std::make_pair(0U, nullptr); |
| bool restricted = (PC == PredicateConstraint::Upl); |
| return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass) |
| : std::make_pair(0U, &AArch64::PPRRegClass); |
| } |
| } |
| if (StringRef("{cc}").equals_insensitive(Constraint)) |
| return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); |
| |
| // Use the default implementation in TargetLowering to convert the register |
| // constraint into a member of a register class. |
| std::pair<unsigned, const TargetRegisterClass *> Res; |
| Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); |
| |
| // Not found as a standard register? |
| if (!Res.second) { |
| unsigned Size = Constraint.size(); |
| if ((Size == 4 || Size == 5) && Constraint[0] == '{' && |
| tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { |
| int RegNo; |
| bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); |
| if (!Failed && RegNo >= 0 && RegNo <= 31) { |
| // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size. |
| // By default we'll emit v0-v31 for this unless there's a modifier, in |
| // which case the correctly sized register is emitted instead. |
| if (VT != MVT::Other && VT.getSizeInBits() == 64) { |
| Res.first = AArch64::FPR64RegClass.getRegister(RegNo); |
| Res.second = &AArch64::FPR64RegClass; |
| } else { |
| Res.first = AArch64::FPR128RegClass.getRegister(RegNo); |
| Res.second = &AArch64::FPR128RegClass; |
| } |
| } |
| } |
| } |
| |
| if (Res.second && !Subtarget->hasFPARMv8() && |
| !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) && |
| !AArch64::GPR64allRegClass.hasSubClassEq(Res.second)) |
| return std::make_pair(0U, nullptr); |
| |
| return Res; |
| } |
| |
| EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL, |
| llvm::Type *Ty, |
| bool AllowUnknown) const { |
| if (Subtarget->hasLS64() && Ty->isIntegerTy(512)) |
| return EVT(MVT::i64x8); |
| |
| return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown); |
| } |
| |
| /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops |
| /// vector. If it is invalid, don't add anything to Ops. |
| void AArch64TargetLowering::LowerAsmOperandForConstraint( |
| SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, |
| SelectionDAG &DAG) const { |
| SDValue Result; |
| |
| // Currently only support length 1 constraints. |
| if (Constraint.length() != 1) |
| return; |
| |
| char ConstraintLetter = Constraint[0]; |
| switch (ConstraintLetter) { |
| default: |
| break; |
| |
| // This set of constraints deals with valid constants for various instructions. |
| // Validate and return a target constant for them if we can. |
| case 'z': { |
| // 'z' maps to xzr or wzr so it needs an input of 0. |
| if (!isNullConstant(Op)) |
| return; |
| |
| if (Op.getValueType() == MVT::i64) |
| Result = DAG.getRegister(AArch64::XZR, MVT::i64); |
| else |
| Result = DAG.getRegister(AArch64::WZR, MVT::i32); |
| break; |
| } |
| case 'S': { |
| // An absolute symbolic address or label reference. |
| if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) { |
| Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op), |
| GA->getValueType(0)); |
| } else if (const BlockAddressSDNode *BA = |
| dyn_cast<BlockAddressSDNode>(Op)) { |
| Result = |
| DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0)); |
| } else |
| return; |
| break; |
| } |
| |
| case 'I': |
| case 'J': |
| case 'K': |
| case 'L': |
| case 'M': |
| case 'N': |
| ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); |
| if (!C) |
| return; |
| |
| // Grab the value and do some validation. |
| uint64_t CVal = C->getZExtValue(); |
| switch (ConstraintLetter) { |
| // The I constraint applies only to simple ADD or SUB immediate operands: |
| // i.e. 0 to 4095 with optional shift by 12 |
| // The J constraint applies only to ADD or SUB immediates that would be |
| // valid when negated, i.e. if [an add pattern] were to be output as a SUB |
| // instruction [or vice versa], in other words -1 to -4095 with optional |
| // left shift by 12. |
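// For example, 4095 and 0xfff000 (4095 << 12) satisfy 'I', while -1 and -4095
// satisfy 'J'.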
| case 'I': |
| if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) |
| break; |
| return; |
| case 'J': { |
| uint64_t NVal = -C->getSExtValue(); |
| if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { |
| CVal = C->getSExtValue(); |
| break; |
| } |
| return; |
| } |
| // The K and L constraints apply *only* to logical immediates, including |
| // what used to be the MOVI alias for ORR (though the MOVI alias has now |
| // been removed and MOV should be used). So these constraints have to |
| // distinguish between bit patterns that are valid 32-bit or 64-bit |
| // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but |
| // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice |
| // versa. |
| case 'K': |
| if (AArch64_AM::isLogicalImmediate(CVal, 32)) |
| break; |
| return; |
| case 'L': |
| if (AArch64_AM::isLogicalImmediate(CVal, 64)) |
| break; |
| return; |
| // The M and N constraints are a superset of K and L respectively, for use |
| // with the MOV (immediate) alias. As well as the logical immediates they |
| // also match 32 or 64-bit immediates that can be loaded either using a |
// *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
| // (M) or 64-bit 0x1234000000000000 (N) etc. |
| // As a note some of this code is liberally stolen from the asm parser. |
| case 'M': { |
| if (!isUInt<32>(CVal)) |
| return; |
| if (AArch64_AM::isLogicalImmediate(CVal, 32)) |
| break; |
| if ((CVal & 0xFFFF) == CVal) |
| break; |
| if ((CVal & 0xFFFF0000ULL) == CVal) |
| break; |
| uint64_t NCVal = ~(uint32_t)CVal; |
| if ((NCVal & 0xFFFFULL) == NCVal) |
| break; |
| if ((NCVal & 0xFFFF0000ULL) == NCVal) |
| break; |
| return; |
| } |
| case 'N': { |
| if (AArch64_AM::isLogicalImmediate(CVal, 64)) |
| break; |
| if ((CVal & 0xFFFFULL) == CVal) |
| break; |
| if ((CVal & 0xFFFF0000ULL) == CVal) |
| break; |
| if ((CVal & 0xFFFF00000000ULL) == CVal) |
| break; |
| if ((CVal & 0xFFFF000000000000ULL) == CVal) |
| break; |
| uint64_t NCVal = ~CVal; |
| if ((NCVal & 0xFFFFULL) == NCVal) |
| break; |
| if ((NCVal & 0xFFFF0000ULL) == NCVal) |
| break; |
| if ((NCVal & 0xFFFF00000000ULL) == NCVal) |
| break; |
| if ((NCVal & 0xFFFF000000000000ULL) == NCVal) |
| break; |
| return; |
| } |
| default: |
| return; |
| } |
| |
| // All assembler immediates are 64-bit integers. |
| Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64); |
| break; |
| } |
| |
| if (Result.getNode()) { |
| Ops.push_back(Result); |
| return; |
| } |
| |
| return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // AArch64 Advanced SIMD Support |
| //===----------------------------------------------------------------------===// |
| |
| /// WidenVector - Given a value in the V64 register class, produce the |
| /// equivalent value in the V128 register class. |
| static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { |
| EVT VT = V64Reg.getValueType(); |
| unsigned NarrowSize = VT.getVectorNumElements(); |
| MVT EltTy = VT.getVectorElementType().getSimpleVT(); |
| MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); |
| SDLoc DL(V64Reg); |
| |
| return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), |
| V64Reg, DAG.getConstant(0, DL, MVT::i64)); |
| } |
| |
| /// getExtFactor - Determine the adjustment factor for the position when |
| /// generating an "extract from vector registers" instruction. |
| static unsigned getExtFactor(SDValue &V) { |
| EVT EltType = V.getValueType().getVectorElementType(); |
| return EltType.getSizeInBits() / 8; |
| } |
| |
| /// NarrowVector - Given a value in the V128 register class, produce the |
| /// equivalent value in the V64 register class. |
| static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { |
| EVT VT = V128Reg.getValueType(); |
| unsigned WideSize = VT.getVectorNumElements(); |
| MVT EltTy = VT.getVectorElementType().getSimpleVT(); |
| MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); |
| SDLoc DL(V128Reg); |
| |
| return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); |
| } |
| |
| // Gather data to see if the operation can be modelled as a |
| // shuffle in combination with VEXTs. |
| SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, |
| SelectionDAG &DAG) const { |
| assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); |
| LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n"); |
| SDLoc dl(Op); |
| EVT VT = Op.getValueType(); |
| assert(!VT.isScalableVector() && |
| "Scalable vectors cannot be used with ISD::BUILD_VECTOR"); |
| unsigned NumElts = VT.getVectorNumElements(); |
| |
| struct ShuffleSourceInfo { |
| SDValue Vec; |
| unsigned MinElt; |
| unsigned MaxElt; |
| |
| // We may insert some combination of BITCASTs and VEXT nodes to force Vec to |
| // be compatible with the shuffle we intend to construct. As a result |
| // ShuffleVec will be some sliding window into the original Vec. |
| SDValue ShuffleVec; |
| |
// Code should guarantee that element i in Vec starts at element "WindowBase +
// i * WindowScale" in ShuffleVec.
| int WindowBase; |
| int WindowScale; |
| |
| ShuffleSourceInfo(SDValue Vec) |
| : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0), |
| ShuffleVec(Vec), WindowBase(0), WindowScale(1) {} |
| |
| bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } |
| }; |
| |
| // First gather all vectors used as an immediate source for this BUILD_VECTOR |
| // node. |
| SmallVector<ShuffleSourceInfo, 2> Sources; |
| for (unsigned i = 0; i < NumElts; ++i) { |
| SDValue V = Op.getOperand(i); |
| if (V.isUndef()) |
| continue; |
| else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || |
| !isa<ConstantSDNode>(V.getOperand(1))) { |
| LLVM_DEBUG( |
| dbgs() << "Reshuffle failed: " |
| "a shuffle can only come from building a vector from " |
| "various elements of other vectors, provided their " |
| "indices are constant\n"); |
| return SDValue(); |
| } |
| |
| // Add this element source to the list if it's not already there. |
| SDValue SourceVec = V.getOperand(0); |
| auto Source = find(Sources, SourceVec); |
| if (Source == Sources.end()) |
| Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); |
| |
| // Update the minimum and maximum lane number seen. |
| unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); |
| Source->MinElt = std::min(Source->MinElt, EltNo); |
| Source->MaxElt = std::max(Source->MaxElt, EltNo); |
| } |
| |
| if (Sources.size() > 2) { |
| LLVM_DEBUG( |
| dbgs() << "Reshuffle failed: currently only do something sane when at " |
| "most two source vectors are involved\n"); |
| return SDValue(); |
| } |
| |
| // Find out the smallest element size among result and two sources, and use |
| // it as element size to build the shuffle_vector. |
| EVT SmallestEltTy = VT.getVectorElementType(); |
| for (auto &Source : Sources) { |
| EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); |
| if (SrcEltTy.bitsLT(SmallestEltTy)) { |
| SmallestEltTy = SrcEltTy; |
| } |
| } |
| unsigned ResMultiplier = |
| VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits(); |
| uint64_t VTSize = VT.getFixedSizeInBits(); |
| NumElts = VTSize / SmallestEltTy.getFixedSizeInBits(); |
| EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); |
| |
| // If the source vector is too wide or too narrow, we may nevertheless be able |
| // to construct a compatible shuffle either by concatenating it with UNDEF or |
| // extracting a suitable range of elements. |
| for (auto &Src : Sources) { |
| EVT SrcVT = Src.ShuffleVec.getValueType(); |
| |
| uint64_t SrcVTSize = SrcVT.getFixedSizeInBits(); |
| if (SrcVTSize == VTSize) |
| continue; |
| |
| // This stage of the search produces a source with the same element type as |
| // the original, but with a total width matching the BUILD_VECTOR output. |
| EVT EltVT = SrcVT.getVectorElementType(); |
| unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); |
| EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); |
| |
| if (SrcVTSize < VTSize) { |
| assert(2 * SrcVTSize == VTSize); |
// We can pad out the smaller vector for free by concatenating it with UNDEF,
// so do that and carry on with the shuffle.
| Src.ShuffleVec = |
| DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, |
| DAG.getUNDEF(Src.ShuffleVec.getValueType())); |
| continue; |
| } |
| |
| if (SrcVTSize != 2 * VTSize) { |
| LLVM_DEBUG( |
| dbgs() << "Reshuffle failed: result vector too small to extract\n"); |
| return SDValue(); |
| } |
| |
| if (Src.MaxElt - Src.MinElt >= NumSrcElts) { |
| LLVM_DEBUG( |
| dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n"); |
| return SDValue(); |
| } |
| |
| if (Src.MinElt >= NumSrcElts) { |
| // The extraction can just take the second half |
| Src.ShuffleVec = |
| DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, |
| DAG.getConstant(NumSrcElts, dl, MVT::i64)); |
| Src.WindowBase = -NumSrcElts; |
| } else if (Src.MaxElt < NumSrcElts) { |
| // The extraction can just take the first half |
| Src.ShuffleVec = |
| DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, |
| DAG.getConstant(0, dl, MVT::i64)); |
| } else { |
| // An actual VEXT is needed |
| SDValue VEXTSrc1 = |
| DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, |
| DAG.getConstant(0, dl, MVT::i64)); |
| SDValue VEXTSrc2 = |
| DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, |
| DAG.getConstant(NumSrcElts, dl, MVT::i64)); |
| unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); |
| |
| if (!SrcVT.is64BitVector()) { |
| LLVM_DEBUG( |
| dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT " |
| "for SVE vectors."); |
| return SDValue(); |
| } |
| |
| Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, |
| VEXTSrc2, |
| DAG.getConstant(Imm, dl, MVT::i32)); |
| Src.WindowBase = -Src.MinElt; |
| } |
| } |
| |
| // Another possible incompatibility occurs from the vector element types. We |
| // can fix this by bitcasting the source vectors to the same type we intend |
| // for the shuffle. |
| for (auto &Src : Sources) { |
| EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); |
| if (SrcEltTy == SmallestEltTy) |
| continue; |
| assert(ShuffleVT.getVectorElementType() == SmallestEltTy); |
| Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); |
| Src.WindowScale = |
| SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits(); |
| Src.WindowBase *= Src.WindowScale; |
| } |
| |
| // Final check before we try to actually produce a shuffle. |
| LLVM_DEBUG(for (auto Src |
| : Sources) |
| assert(Src.ShuffleVec.getValueType() == ShuffleVT);); |
| |
| // The stars all align, our next step is to produce the mask for the shuffle. |
| SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); |
| int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); |
| for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { |
| SDValue Entry = Op.getOperand(i); |
| if (Entry.isUndef()) |
| continue; |
| |
| auto Src = find(Sources, Entry.getOperand(0)); |
| int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); |
| |
| // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit |
| // trunc. So only std::min(SrcBits, DestBits) actually get defined in this |
| // segment. |
| EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); |
| int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(), |
| VT.getScalarSizeInBits()); |
| int LanesDefined = BitsDefined / BitsPerShuffleLane; |
| |
| // This source is expected to fill ResMultiplier lanes of the final shuffle, |
| // starting at the appropriate offset. |
| int *LaneMask = &Mask[i * ResMultiplier]; |
| |
| int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; |
| ExtractBase += NumElts * (Src - Sources.begin()); |
| for (int j = 0; j < LanesDefined; ++j) |
| LaneMask[j] = ExtractBase + j; |
| } |
| |
| // Final check before we try to produce nonsense... |
| if (!isShuffleMaskLegal(Mask, ShuffleVT)) { |
| LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n"); |
| return SDValue(); |
| } |
| |
| SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; |
| for (unsigned i = 0; i < Sources.size(); ++i) |
| ShuffleOps[i] = Sources[i].ShuffleVec; |
| |
| SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], |
| ShuffleOps[1], Mask); |
| SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); |
| |
| LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump(); |
| dbgs() << "Reshuffle, creating node: "; V.dump();); |
| |
| return V; |
| } |
| |
| // check if an EXT instruction can handle the shuffle mask when the |
| // vector sources of the shuffle are the same. |
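// For example, on a 4-element vector the single-source mask <1, 2, 3, 0>
// is an EXT with Imm == 1 (extraction starts at element 1 and wraps around).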
| static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { |
| unsigned NumElts = VT.getVectorNumElements(); |
| |
| // Assume that the first shuffle index is not UNDEF. Fail if it is. |
| if (M[0] < 0) |
| return false; |
| |
| Imm = M[0]; |
| |
| // If this is a VEXT shuffle, the immediate value is the index of the first |
| // element. The other shuffle indices must be the successive elements after |
| // the first one. |
| unsigned ExpectedElt = Imm; |
| for (unsigned i = 1; i < NumElts; ++i) { |
| // Increment the expected index. If it wraps around, just follow it |
| // back to index zero and keep going. |
| ++ExpectedElt; |
| if (ExpectedElt == NumElts) |
| ExpectedElt = 0; |
| |
| if (M[i] < 0) |
| continue; // ignore UNDEF indices |
| if (ExpectedElt != static_cast<unsigned>(M[i])) |
| return false; |
| } |
| |
| return true; |
| } |
| |
/// Check if a vector shuffle corresponds to a DUP instruction with a larger
/// element width than the vector lane type. If that is the case, the function
/// returns true and writes the value of the DUP instruction's lane operand into
/// DupLaneOp.
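/// For example, the v8i16 mask <4, 5, 6, 7, 4, 5, 6, 7> with BlockSize == 64
/// duplicates 64-bit lane 1 of the source (DupLaneOp == 1).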
| static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize, |
| unsigned &DupLaneOp) { |
| assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && |
| "Only possible block sizes for wide DUP are: 16, 32, 64"); |
| |
| if (BlockSize <= VT.getScalarSizeInBits()) |
| return false; |
| if (BlockSize % VT.getScalarSizeInBits() != 0) |
| return false; |
| if (VT.getSizeInBits() % BlockSize != 0) |
| return false; |
| |
| size_t SingleVecNumElements = VT.getVectorNumElements(); |
| size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits(); |
| size_t NumBlocks = VT.getSizeInBits() / BlockSize; |
| |
| // We are looking for masks like |
| // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element |
// might be replaced by 'undefined'. BlockElts will eventually contain the
// lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
// for the above examples).
| SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1); |
| for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++) |
| for (size_t I = 0; I < NumEltsPerBlock; I++) { |
| int Elt = M[BlockIndex * NumEltsPerBlock + I]; |
| if (Elt < 0) |
| continue; |
| // For now we don't support shuffles that use the second operand |
| if ((unsigned)Elt >= SingleVecNumElements) |
| return false; |
| if (BlockElts[I] < 0) |
| BlockElts[I] = Elt; |
| else if (BlockElts[I] != Elt) |
| return false; |
| } |
| |
| // We found a candidate block (possibly with some undefs). It must be a |
| // sequence of consecutive integers starting with a value divisible by |
// NumEltsPerBlock, with some values possibly replaced by undefs.
| |
| // Find first non-undef element |
| auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; }); |
| assert(FirstRealEltIter != BlockElts.end() && |
| "Shuffle with all-undefs must have been caught by previous cases, " |
| "e.g. isSplat()"); |
| if (FirstRealEltIter == BlockElts.end()) { |
| DupLaneOp = 0; |
| return true; |
| } |
| |
| // Index of FirstRealElt in BlockElts |
| size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin(); |
| |
| if ((unsigned)*FirstRealEltIter < FirstRealIndex) |
| return false; |
| // BlockElts[0] must have the following value if it isn't undef: |
| size_t Elt0 = *FirstRealEltIter - FirstRealIndex; |
| |
| // Check the first element |
| if (Elt0 % NumEltsPerBlock != 0) |
| return false; |
| // Check that the sequence indeed consists of consecutive integers (modulo |
| // undefs) |
| for (size_t I = 0; I < NumEltsPerBlock; I++) |
| if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I) |
| return false; |
| |
| DupLaneOp = Elt0 / NumEltsPerBlock; |
| return true; |
| } |
| |
| // check if an EXT instruction can handle the shuffle mask when the |
| // vector sources of the shuffle are different. |
| static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, |
| unsigned &Imm) { |
| // Look for the first non-undef element. |
| const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; }); |
| |
// Benefit from APInt to handle overflow when calculating the expected element.
| unsigned NumElts = VT.getVectorNumElements(); |
| unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); |
| APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); |
| // The following shuffle indices must be the successive elements after the |
| // first real element. |
| const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(), |
| [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;}); |
| if (FirstWrongElt != M.end()) |
| return false; |
| |
| // The index of an EXT is the first element if it is not UNDEF. |
| // Watch out for the beginning UNDEFs. The EXT index should be the expected |
| // value of the first element. E.g. |
| // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. |
| // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. |
| // ExpectedElt is the last mask index plus 1. |
| Imm = ExpectedElt.getZExtValue(); |
| |
// There are two different cases that require reversing the input vectors.
| // For example, for vector <4 x i32> we have the following cases, |
| // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) |
| // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) |
| // For both cases, we finally use mask <5, 6, 7, 0>, which requires |
| // to reverse two input vectors. |
| if (Imm < NumElts) |
| ReverseEXT = true; |
| else |
| Imm -= NumElts; |
| |
| return true; |
| } |
| |
| /// isREVMask - Check if a vector shuffle corresponds to a REV |
| /// instruction with the specified blocksize. (The order of the elements |
| /// within each block of the vector is reversed.) |
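/// For example, the v8i16 mask <3, 2, 1, 0, 7, 6, 5, 4> matches REV64: each
/// 64-bit block (four 16-bit lanes) is reversed.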
| static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { |
| assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && |
| "Only possible block sizes for REV are: 16, 32, 64"); |
| |
| unsigned EltSz = VT.getScalarSizeInBits(); |
| if (EltSz == 64) |
| return false; |
| |
| unsigned NumElts = VT.getVectorNumElements(); |
| unsigned BlockElts = M[0] + 1; |
| // If the first shuffle index is UNDEF, be optimistic. |
| if (M[0] < 0) |
| BlockElts = BlockSize / EltSz; |
| |
| if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) |
| return false; |
| |
| for (unsigned i = 0; i < NumElts; ++i) { |
| if (M[i] < 0) |
| continue; // ignore UNDEF indices |
| if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) |
| return false; |
| } |
| |
| return true; |
| } |
| |
| static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { |
| unsigned NumElts = VT.getVectorNumElements(); |
| if (NumElts % 2 != 0) |
| return false; |
| WhichResult = (M[0] == 0 ? 0 : 1); |
| unsigned Idx = WhichResult * NumElts / 2; |
| for (unsigned i = 0; i != NumElts; i += 2) { |
| if ((M[i] >= 0 && (unsigned)M[i] != Idx) || |
| (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) |
| return false; |
| Idx += 1; |
| } |
| |
| return true; |
| } |
| |
| static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { |
| unsigned NumElts = VT.getVectorNumElements(); |
| WhichResult = (M[0] == 0 ? 0 : 1); |
| for (unsigned i = 0; i != NumElts; ++i) { |
| if (M[i] < 0) |
| continue; // ignore UNDEF indices |
| if ((unsigned)M[i] != 2 * i + WhichResult) |
| return false; |
| } |
| |
| return true; |
| } |
| |
| static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { |
| unsigned NumElts = VT.getVectorNumElements(); |
| if (NumElts % 2 != 0) |
| return false; |
| WhichResult = (M[0] == 0 ? 0 : 1); |
| for (unsigned i = 0; i < NumElts; i += 2) { |
| if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || |
| (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) |
| return false; |
| } |
| return true; |
| } |
| |
| /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of |
| /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". |
| /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. |
| static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { |
| unsigned NumElts = VT.getVectorNumElements(); |
| if (NumElts % 2 != 0) |
| return false; |
| WhichResult = (M[0] == 0 ? 0 : 1); |
| unsigned Idx = WhichResult * NumElts / 2; |
| for (unsigned i = 0; i != NumElts; i += 2) { |
| if ((M[i] >= 0 && (unsigned)M[i] != Idx) || |
| (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) |
| return false; |
| Idx += 1; |
| } |
| |
| return true; |
| } |
| |
| /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of |
| /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". |
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
| static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { |
| unsigned Half = VT.getVectorNumElements() / 2; |
| WhichResult = (M[0] == 0 ? 0 : 1); |
| for (unsigned j = 0; j != 2; ++j) { |
| unsigned Idx = WhichResult; |
| for (unsigned i = 0; i != Half; ++i) { |
| int MIdx = M[i + j * Half]; |
| if (MIdx >= 0 && (unsigned)MIdx != Idx) |
| return false; |
| Idx += 2; |
| } |
| } |
| |
| return true; |
| } |
| |
| /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of |
| /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". |
| /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. |
| static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { |
| unsigned NumElts = VT.getVectorNumElements(); |
| if (NumElts % 2 != 0) |
| return false; |
| WhichResult = (M[0] == 0 ? 0 : 1); |
| for (unsigned i = 0; i < NumElts; i += 2) { |
| if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || |
| (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) |
| return false; |
| } |
| return true; |
| } |
| |
| static bool isINSMask(ArrayRef<int> M, int NumInputElements, |
| bool &DstIsLeft, int &Anomaly) { |
| if (M.size() != static_cast<size_t>(NumInputElements)) |
| return false; |
| |
| int NumLHSMatch = 0, NumRHSMatch = 0; |
| int LastLHSMismatch = -1, LastRHSMismatch = -1; |
| |
| for (int i = 0; i < NumInputElements; ++i) { |
| if (M[i] == -1) { |
| ++NumLHSMatch; |
| ++NumRHSMatch; |
| continue; |
| } |
| |
| if (M[i] == i) |
| ++NumLHSMatch; |
| else |
| LastLHSMismatch = i; |
| |
| if (M[i] == i + NumInputElements) |
| ++NumRHSMatch; |
| else |
| LastRHSMismatch = i; |
| } |
| |
| if (NumLHSMatch == NumInputElements - 1) { |
| DstIsLeft = true; |
| Anomaly = LastLHSMismatch; |
| return true; |
| } else if (NumRHSMatch == NumInputElements - 1) { |
| DstIsLeft = false; |
| Anomaly = LastRHSMismatch; |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) { |
| if (VT.getSizeInBits() != 128) |
| return false; |
| |
| unsigned NumElts = VT.getVectorNumElements(); |
| |
| for (int I = 0, E = NumElts / 2; I != E; I++) { |
| if (Mask[I] != I) |
| return false; |
| } |
| |
| int Offset = NumElts / 2; |
| for (int I = NumElts / 2, E = NumElts; I != E; I++) { |
| if (Mask[I] != I + SplitLHS * Offset) |
| return false; |
| } |
| |
| return true; |
| } |
| |
| static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { |
| SDLoc DL(Op); |
| EVT VT = Op.getValueType(); |
| SDValue V0 = Op.getOperand(0); |
| SDValue V1 = Op.getOperand(1); |
| ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); |
| |
| if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || |
| VT.getVectorElementType() != V1.getValueType().getVectorElementType()) |
| return SDValue(); |
| |
| bool SplitV0 = V0.getValueSizeInBits() == 128; |
| |
| if (!isConcatMask(Mask, VT, SplitV0)) |
| return SDValue(); |
| |
| EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); |
| if (SplitV0) { |
| V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, |
| DAG.getConstant(0, DL, MVT::i64)); |
| } |
| if (V1.getValueSizeInBits() == 128) { |
| V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, |
| DAG.getConstant(0, DL, MVT::i64)); |
| } |
| return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); |
| } |
| |
| /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit |
| /// the specified operations to build the shuffle. |
| static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, |
| SDValue RHS, SelectionDAG &DAG, |
| const SDLoc &dl) { |
| unsigned OpNum = (PFEntry >> 26) & 0x0F; |
| unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); |
| unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); |
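// LHSID and RHSID are themselves indices into PerfectShuffleTable: each one
// encodes a 4-element mask in base 9, with digit value 8 meaning undef (the
// same encoding the caller uses to build PFTableIndex).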
| |
| enum { |
| OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> |
| OP_VREV, |
| OP_VDUP0, |
| OP_VDUP1, |
| OP_VDUP2, |
| OP_VDUP3, |
| OP_VEXT1, |
| OP_VEXT2, |
| OP_VEXT3, |
| OP_VUZPL, // VUZP, left result |
| OP_VUZPR, // VUZP, right result |
| OP_VZIPL, // VZIP, left result |
| OP_VZIPR, // VZIP, right result |
| OP_VTRNL, // VTRN, left result |
| OP_VTRNR // VTRN, right result |
| }; |
| |
| if (OpNum == OP_COPY) { |
| if (LHSID == (1 * 9 + 2) * 9 + 3) |
| return LHS; |
| assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); |
| return RHS; |
| } |
| |
| SDValue OpLHS, OpRHS; |
| OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); |
| OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); |
| EVT VT = OpLHS.getValueType(); |
| |
| switch (OpNum) { |
| default: |
| llvm_unreachable("Unknown shuffle opcode!"); |
| case OP_VREV: |
| // VREV divides the vector in half and swaps within the half. |
| if (VT.getVectorElementType() == MVT::i32 || |
| VT.getVectorElementType() == MVT::f32) |
| return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); |
| // vrev <4 x i16> -> REV32 |
| if (VT.getVectorElementType() == MVT::i16 || |
| VT.getVectorElementType() == MVT::f16 || |
| VT.getVectorElementType() == MVT::bf16) |
| return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); |
| // vrev <4 x i8> -> REV16 |
| assert(VT.getVectorElementType() == MVT::i8); |
| return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); |
| case OP_VDUP0: |
| case OP_VDUP1: |
| case OP_VDUP2: |
| case OP_VDUP3: { |
| EVT EltTy = VT.getVectorElementType(); |
| unsigned Opcode; |
| if (EltTy == MVT::i8) |
| Opcode = AArch64ISD::DUPLANE8; |
| else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16) |
| Opcode = AArch64ISD::DUPLANE16; |
| else if (EltTy == MVT::i32 || EltTy == MVT::f32) |
| Opcode = AArch64ISD::DUPLANE32; |
| else if (EltTy == MVT::i64 || EltTy == MVT::f64) |
| Opcode = AArch64ISD::DUPLANE64; |
| else |
| llvm_unreachable("Invalid vector element type?"); |
| |
| if (VT.getSizeInBits() == 64) |
| OpLHS = WidenVector(OpLHS, DAG); |
| SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64); |
| return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); |
| } |
| case OP_VEXT1: |
| case OP_VEXT2: |
| case OP_VEXT3: { |
| unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); |
| return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, |
| DAG.getConstant(Imm, dl, MVT::i32)); |
| } |
| case OP_VUZPL: |
| return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, |
| OpRHS); |
| case OP_VUZPR: |
| return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, |
| OpRHS); |
| case OP_VZIPL: |
| return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, |
| OpRHS); |
| case OP_VZIPR: |
| return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, |
| OpRHS); |
| case OP_VTRNL: |
| return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, |
| OpRHS); |
| case OP_VTRNR: |
| return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, |
| OpRHS); |
| } |
| } |
| |
| static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, |
| SelectionDAG &DAG) { |
| // Check to see if we can use the TBL instruction. |
| SDValue V1 = Op.getOperand(0); |
| SDValue V2 = Op.getOperand(1); |
| SDLoc DL(Op); |
| |
| EVT EltVT = Op.getValueType().getVectorElementType(); |
| unsigned BytesPerElt = EltVT.getSizeInBits() / 8; |
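// TBL operates on bytes, so expand each shuffle-mask element into BytesPerElt
// consecutive byte indices; e.g. for v8i16 (BytesPerElt == 2), mask element 3
// becomes byte indices 6 and 7.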
| |
| SmallVector<SDValue, 8> TBLMask; |
| for (int Val : ShuffleMask) { |
| for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { |
| unsigned Offset = Byte + Val * BytesPerElt; |
| TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); |
| } |
| } |
| |
| MVT IndexVT = MVT::v8i8; |
| unsigned IndexLen = 8; |
| if (Op.getValueSizeInBits() == 128) { |
| IndexVT = MVT::v16i8; |
| IndexLen = 16; |
| } |
| |
| SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); |
| SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); |
| |
| SDValue Shuffle; |
| if (V2.getNode()->isUndef()) { |
| if (IndexLen == 8) |
| V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); |
| Shuffle = DAG.getNode( |
| ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, |
| DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, |
| DAG.getBuildVector(IndexVT, DL, |
| makeArrayRef(TBLMask.data(), IndexLen))); |
| } else { |
| if (IndexLen == 8) { |
| V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); |
| Shuffle = DAG.getNode( |
| ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, |
| DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, |
| DAG.getBuildVector(IndexVT, DL, |
| makeArrayRef(TBLMask.data(), IndexLen))); |
| } else { |
| // FIXME: We cannot, for the moment, emit a TBL2 instruction because we |
| // cannot currently represent the register constraints on the input |
| // table registers. |
| // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, |
| // DAG.getBuildVector(IndexVT, DL, &TBLMask[0], |
| // IndexLen)); |
| Shuffle = DAG.getNode( |
| ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, |
| DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, |
| V2Cst, DAG.getBuildVector(IndexVT, DL, |
| makeArrayRef(TBLMask.data(), IndexLen))); |
| } |
| } |
| return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); |
| } |
| |
| static unsigned getDUPLANEOp(EVT EltType) { |
| if (EltType == MVT::i8) |
| return AArch64ISD::DUPLANE8; |
| if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16) |
| return AArch64ISD::DUPLANE16; |
| if (EltType == MVT::i32 || EltType == MVT::f32) |
| return AArch64ISD::DUPLANE32; |
| if (EltType == MVT::i64 || EltType == MVT::f64) |
| return AArch64ISD::DUPLANE64; |
| |
| llvm_unreachable("Invalid vector element type?"); |
| } |
| |
| static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, |
| unsigned Opcode, SelectionDAG &DAG) { |
| // Try to eliminate a bitcasted extract subvector before a DUPLANE. |
| auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) { |
| // Match: dup (bitcast (extract_subv X, C)), LaneC |
| if (BitCast.getOpcode() != ISD::BITCAST || |
| BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR) |
| return false; |
| |
| // The extract index must align in the destination type. That may not |
| // happen if the bitcast is from narrow to wide type. |
| SDValue Extract = BitCast.getOperand(0); |
| unsigned ExtIdx = Extract.getConstantOperandVal(1); |
| unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits(); |
| unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth; |
| unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits(); |
| if (ExtIdxInBits % CastedEltBitWidth != 0) |
| return false; |
| |
| // Update the lane value by offsetting with the scaled extract index. |
| LaneC += ExtIdxInBits / CastedEltBitWidth; |
| |
| // Determine the casted vector type of the wide vector input. |
| // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC' |
| // Examples: |
| // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3 |
| // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5 |
| unsigned SrcVecNumElts = |
| Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth; |
| CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(), |
| SrcVecNumElts); |
| return true; |
| }; |
| MVT CastVT; |
| if (getScaledOffsetDup(V, Lane, CastVT)) { |
| V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0)); |
| } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) { |
| // The lane is incremented by the index of the extract. |
| // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3 |
| auto VecVT = V.getOperand(0).getValueType(); |
| if (VecVT.isFixedLengthVector() && VecVT.getFixedSizeInBits() <= 128) { |
| Lane += V.getConstantOperandVal(1); |
| V = V.getOperand(0); |
| } |
| } else if (V.getOpcode() == ISD::CONCAT_VECTORS) { |
| // The lane is decremented if we are splatting from the 2nd operand. |
| // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1 |
| unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; |
| Lane -= Idx * VT.getVectorNumElements() / 2; |
| V = WidenVector(V.getOperand(Idx), DAG); |
| } else if (VT.getSizeInBits() == 64) { |
| // Widen the operand to 128-bit register with undef. |
| V = WidenVector(V, DAG); |
| } |
| return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64)); |
| } |
| |
// Return true if we can derive a new (half-length) shuffle mask from the
// parameter mask array: every pair of adjacent mask values must be contiguous
// and start at an even index (undef entries are tolerated).
| static bool isWideTypeMask(ArrayRef<int> M, EVT VT, |
| SmallVectorImpl<int> &NewMask) { |
| unsigned NumElts = VT.getVectorNumElements(); |
| if (NumElts % 2 != 0) |
| return false; |
| |
| NewMask.clear(); |
| for (unsigned i = 0; i < NumElts; i += 2) { |
| int M0 = M[i]; |
| int M1 = M[i + 1]; |
| |
| // If both elements are undef, new mask is undef too. |
| if (M0 == -1 && M1 == -1) { |
| NewMask.push_back(-1); |
| continue; |
| } |
| |
| if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) { |
| NewMask.push_back(M1 / 2); |
| continue; |
| } |
| |
| if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) { |
| NewMask.push_back(M0 / 2); |
| continue; |
| } |
| |
| NewMask.clear(); |
| return false; |
| } |
| |
| assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!"); |
| return true; |
| } |
| |
| // Try to widen element type to get a new mask value for a better permutation |
| // sequence, so that we can use NEON shuffle instructions, such as zip1/2, |
| // UZP1/2, TRN1/2, REV, INS, etc. |
| // For example: |
| // shufflevector <4 x i32> %a, <4 x i32> %b, |
| // <4 x i32> <i32 6, i32 7, i32 2, i32 3> |
| // is equivalent to: |
| // shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1> |
| // Finally, we can get: |
| // mov v0.d[0], v1.d[1] |
| static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) { |
| SDLoc DL(Op); |
| EVT VT = Op.getValueType(); |
| EVT ScalarVT = VT.getVectorElementType(); |
| unsigned ElementSize = ScalarVT.getFixedSizeInBits(); |
| SDValue V0 = Op.getOperand(0); |
| SDValue V1 = Op.getOperand(1); |
| ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); |
| |
// When combining adjacent elements, e.g. two i16's -> i32 or two i32's -> i64,
// we need to make sure the wider element type is legal. Thus, ElementSize
// must not be larger than 32 bits, and the i1 type is also excluded.
| if (ElementSize > 32 || ElementSize == 1) |
| return SDValue(); |
| |
| SmallVector<int, 8> NewMask; |
| if (isWideTypeMask(Mask, VT, NewMask)) { |
| MVT NewEltVT = VT.isFloatingPoint() |
| ? MVT::getFloatingPointVT(ElementSize * 2) |
| : MVT::getIntegerVT(ElementSize * 2); |
| MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); |
| if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { |
| V0 = DAG.getBitcast(NewVT, V0); |
| V1 = DAG.getBitcast(NewVT, V1); |
| return DAG.getBitcast(VT, |
| DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask)); |
| } |
| } |
| |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc dl(Op); |
| EVT VT = Op.getValueType(); |
| |
| ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); |
| |
| if (useSVEForFixedLengthVectorVT(VT)) |
| return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG); |
| |
| // Convert shuffles that are directly supported on NEON to target-specific |
| // DAG nodes, instead of keeping them as shuffles and matching them again |
| // during code selection. This is more efficient and avoids the possibility |
| // of inconsistencies between legalization and selection. |
| ArrayRef<int> ShuffleMask = SVN->getMask(); |
| |
| SDValue V1 = Op.getOperand(0); |
| SDValue V2 = Op.getOperand(1); |
| |
| assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!"); |
| assert(ShuffleMask.size() == VT.getVectorNumElements() && |
| "Unexpected VECTOR_SHUFFLE mask size!"); |
| |
| if (SVN->isSplat()) { |
| int Lane = SVN->getSplatIndex(); |
| // If this is undef splat, generate it via "just" vdup, if possible. |
| if (Lane == -1) |
| Lane = 0; |
| |
| if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) |
| return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(), |
| V1.getOperand(0)); |
| // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- |
| // constant. If so, we can just reference the lane's definition directly. |
| if (V1.getOpcode() == ISD::BUILD_VECTOR && |
| !isa<ConstantSDNode>(V1.getOperand(Lane))) |
| return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane)); |
| |
| // Otherwise, duplicate from the lane of the input vector. |
| unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); |
| return constructDup(V1, Lane, dl, VT, Opcode, DAG); |
| } |
| |
| // Check if the mask matches a DUP for a wider element |
| for (unsigned LaneSize : {64U, 32U, 16U}) { |
| unsigned Lane = 0; |
| if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) { |
| unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64 |
| : LaneSize == 32 ? AArch64ISD::DUPLANE32 |
| : AArch64ISD::DUPLANE16; |
| // Cast V1 to an integer vector with required lane size |
| MVT NewEltTy = MVT::getIntegerVT(LaneSize); |
| unsigned NewEltCount = VT.getSizeInBits() / LaneSize; |
| MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount); |
| V1 = DAG.getBitcast(NewVecTy, V1); |
// Construct the DUP instruction
| V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG); |
| // Cast back to the original type |
| return DAG.getBitcast(VT, V1); |
| } |
| } |
| |
| if (isREVMask(ShuffleMask, VT, 64)) |
| return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2); |
| if (isREVMask(ShuffleMask, VT, 32)) |
| return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2); |
| if (isREVMask(ShuffleMask, VT, 16)) |
| return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2); |
| |
| if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) || |
| (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) && |
| ShuffleVectorInst::isReverseMask(ShuffleMask)) { |
| SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1); |
| return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev, |
| DAG.getConstant(8, dl, MVT::i32)); |
| } |
| |
| bool ReverseEXT = false; |
| unsigned Imm; |
| if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { |
| if (ReverseEXT) |
| std::swap(V1, V2); |
| Imm *= getExtFactor(V1); |
| return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, |
| DAG.getConstant(Imm, dl, MVT::i32)); |
| } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) { |
| Imm *= getExtFactor(V1); |
| return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, |
| DAG.getConstant(Imm, dl, MVT::i32)); |
| } |
| |
| unsigned WhichResult; |
| if (isZIPMask(ShuffleMask, VT, WhichResult)) { |
| unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; |
| return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); |
| } |
| if (isUZPMask(ShuffleMask, VT, WhichResult)) { |
| unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; |
| return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); |
| } |
| if (isTRNMask(ShuffleMask, VT, WhichResult)) { |
| unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; |
| return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); |
| } |
| |
| if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { |
| unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; |
| return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); |
| } |
| if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { |
| unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; |
| return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); |
| } |
| if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { |
| unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; |
| return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); |
| } |
| |
| if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG)) |
| return Concat; |
| |
| bool DstIsLeft; |
| int Anomaly; |
| int NumInputElements = V1.getValueType().getVectorNumElements(); |
| if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { |
| SDValue DstVec = DstIsLeft ? V1 : V2; |
| SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64); |
| |
| SDValue SrcVec = V1; |
| int SrcLane = ShuffleMask[Anomaly]; |
| if (SrcLane >= NumInputElements) { |
| SrcVec = V2; |
| SrcLane -= VT.getVectorNumElements(); |
| } |
| SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64); |
| |
| EVT ScalarVT = VT.getVectorElementType(); |
| |
| if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger()) |
| ScalarVT = MVT::i32; |
| |
| return DAG.getNode( |
| ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, |
| DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), |
| DstLaneV); |
| } |
| |
| if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG)) |
| return NewSD; |
| |
| // If the shuffle is not directly supported and it has 4 elements, use |
| // the PerfectShuffle-generated table to synthesize it from other shuffles. |
| unsigned NumElts = VT.getVectorNumElements(); |
| if (NumElts == 4) { |
| unsigned PFIndexes[4]; |
| for (unsigned i = 0; i != 4; ++i) { |
| if (ShuffleMask[i] < 0) |
| PFIndexes[i] = 8; |
| else |
| PFIndexes[i] = ShuffleMask[i]; |
| } |
| |
| // Compute the index in the perfect shuffle table. |
| unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + |
| PFIndexes[2] * 9 + PFIndexes[3]; |
| unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; |
| unsigned Cost = (PFEntry >> 30); |
| |
| if (Cost <= 4) |
| return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); |
| } |
| |
| return GenerateTBL(Op, ShuffleMask, DAG); |
| } |
| |
| SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc dl(Op); |
| EVT VT = Op.getValueType(); |
| EVT ElemVT = VT.getScalarType(); |
| SDValue SplatVal = Op.getOperand(0); |
| |
| if (useSVEForFixedLengthVectorVT(VT)) |
| return LowerToScalableOp(Op, DAG); |
| |
| // Extend input splat value where needed to fit into a GPR (32b or 64b only) |
| // FPRs don't have this restriction. |
| switch (ElemVT.getSimpleVT().SimpleTy) { |
| case MVT::i1: { |
| // The only legal i1 vectors are SVE vectors, so we can use SVE-specific |
| // lowering code. |
| if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) { |
| if (ConstVal->isZero()) |
| return SDValue(DAG.getMachineNode(AArch64::PFALSE, dl, VT), 0); |
| if (ConstVal->isOne()) |
| return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all); |
| } |
| // The general case of i1. There isn't any natural way to do this, |
| // so we use some trickery with whilelo. |
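// The splat value is sign-extended to i64, so a true i1 becomes all-ones and a
// false i1 becomes zero; whilelo(0, all-ones) then produces an all-active
// predicate, while whilelo(0, 0) produces an all-inactive one.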
| SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); |
| SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal, |
| DAG.getValueType(MVT::i1)); |
| SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, |
| MVT::i64); |
| return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, |
| DAG.getConstant(0, dl, MVT::i64), SplatVal); |
| } |
| case MVT::i8: |
| case MVT::i16: |
| case MVT::i32: |
| SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32); |
| break; |
| case MVT::i64: |
| SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); |
| break; |
| case MVT::f16: |
| case MVT::bf16: |
| case MVT::f32: |
| case MVT::f64: |
| // Fine as is |
| break; |
| default: |
| report_fatal_error("Unsupported SPLAT_VECTOR input operand type"); |
| } |
| |
| return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); |
| } |
| |
| SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| |
| EVT VT = Op.getValueType(); |
| if (!isTypeLegal(VT) || !VT.isScalableVector()) |
| return SDValue(); |
| |
| // Current lowering only supports the SVE-ACLE types. |
| if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock) |
| return SDValue(); |
| |
// The DUPQ operation is independent of element type so normalise to i64s.
| SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1)); |
| SDValue Idx128 = Op.getOperand(2); |
| |
| // DUPQ can be used when idx is in range. |
| auto *CIdx = dyn_cast<ConstantSDNode>(Idx128); |
| if (CIdx && (CIdx->getZExtValue() <= 3)) { |
| SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64); |
| SDNode *DUPQ = |
| DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI); |
| return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0)); |
| } |
| |
| // The ACLE says this must produce the same result as: |
| // svtbl(data, svadd_x(svptrue_b64(), |
| // svand_x(svptrue_b64(), svindex_u64(0, 1), 1), |
| // index * 2)) |
| SDValue One = DAG.getConstant(1, DL, MVT::i64); |
| SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One); |
| |
| // create the vector 0,1,0,1,... |
| SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64); |
| SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne); |
| |
| // create the vector idx64,idx64+1,idx64,idx64+1,... |
| SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128); |
| SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64); |
| SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64); |
| |
| // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],... |
| SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask); |
| return DAG.getNode(ISD::BITCAST, DL, VT, TBL); |
| } |
| |
| |
| static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, |
| APInt &UndefBits) { |
| EVT VT = BVN->getValueType(0); |
| APInt SplatBits, SplatUndef; |
| unsigned SplatBitSize; |
| bool HasAnyUndefs; |
| if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { |
| unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; |
| |
| for (unsigned i = 0; i < NumSplats; ++i) { |
| CnstBits <<= SplatBitSize; |
| UndefBits <<= SplatBitSize; |
| CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); |
| UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); |
| } |
| |
| return true; |
| } |
| |
| return false; |
| } |
| |
| // Try 64-bit splatted SIMD immediate. |
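// ModImm type 10 covers 64-bit values in which each byte is either 0x00 or
// 0xff (the MOVI "byte mask" form), e.g. 0x00ff00ffff0000ff.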
| static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, |
| const APInt &Bits) { |
| if (Bits.getHiBits(64) == Bits.getLoBits(64)) { |
| uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); |
| EVT VT = Op.getValueType(); |
| MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64; |
| |
| if (AArch64_AM::isAdvSIMDModImmType10(Value)) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType10(Value); |
| |
| SDLoc dl(Op); |
| SDValue Mov = DAG.getNode(NewOp, dl, MovTy, |
| DAG.getConstant(Value, dl, MVT::i32)); |
| return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); |
| } |
| } |
| |
| return SDValue(); |
| } |
| |
| // Try 32-bit splatted SIMD immediate. |
| static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, |
| const APInt &Bits, |
| const SDValue *LHS = nullptr) { |
| if (Bits.getHiBits(64) == Bits.getLoBits(64)) { |
| uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); |
| EVT VT = Op.getValueType(); |
| MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; |
| bool isAdvSIMDModImm = false; |
| uint64_t Shift; |
| |
| if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType1(Value); |
| Shift = 0; |
| } |
| else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType2(Value); |
| Shift = 8; |
| } |
| else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType3(Value); |
| Shift = 16; |
| } |
| else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType4(Value); |
| Shift = 24; |
| } |
| |
| if (isAdvSIMDModImm) { |
| SDLoc dl(Op); |
| SDValue Mov; |
| |
| if (LHS) |
| Mov = DAG.getNode(NewOp, dl, MovTy, *LHS, |
| DAG.getConstant(Value, dl, MVT::i32), |
| DAG.getConstant(Shift, dl, MVT::i32)); |
| else |
| Mov = DAG.getNode(NewOp, dl, MovTy, |
| DAG.getConstant(Value, dl, MVT::i32), |
| DAG.getConstant(Shift, dl, MVT::i32)); |
| |
| return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); |
| } |
| } |
| |
| return SDValue(); |
| } |
| |
| // Try 16-bit splatted SIMD immediate. |
| static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, |
| const APInt &Bits, |
| const SDValue *LHS = nullptr) { |
| if (Bits.getHiBits(64) == Bits.getLoBits(64)) { |
| uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); |
| EVT VT = Op.getValueType(); |
| MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; |
| bool isAdvSIMDModImm = false; |
| uint64_t Shift; |
| |
| if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType5(Value); |
| Shift = 0; |
| } |
| else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType6(Value); |
| Shift = 8; |
| } |
| |
| if (isAdvSIMDModImm) { |
| SDLoc dl(Op); |
| SDValue Mov; |
| |
| if (LHS) |
| Mov = DAG.getNode(NewOp, dl, MovTy, *LHS, |
| DAG.getConstant(Value, dl, MVT::i32), |
| DAG.getConstant(Shift, dl, MVT::i32)); |
| else |
| Mov = DAG.getNode(NewOp, dl, MovTy, |
| DAG.getConstant(Value, dl, MVT::i32), |
| DAG.getConstant(Shift, dl, MVT::i32)); |
| |
| return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); |
| } |
| } |
| |
| return SDValue(); |
| } |
| |
| // Try 32-bit splatted SIMD immediate with shifted ones. |
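// This matches the MOVI/MVNI "MSL" forms (0x0000XYff and 0x00XYffff); the odd
// Shift values 264 and 272 are AArch64_AM::getShifterImm(MSL, 8) and
// getShifterImm(MSL, 16) respectively.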
| static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, |
| SelectionDAG &DAG, const APInt &Bits) { |
| if (Bits.getHiBits(64) == Bits.getLoBits(64)) { |
| uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); |
| EVT VT = Op.getValueType(); |
| MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; |
| bool isAdvSIMDModImm = false; |
| uint64_t Shift; |
| |
| if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType7(Value); |
| Shift = 264; |
| } |
| else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType8(Value); |
| Shift = 272; |
| } |
| |
| if (isAdvSIMDModImm) { |
| SDLoc dl(Op); |
| SDValue Mov = DAG.getNode(NewOp, dl, MovTy, |
| DAG.getConstant(Value, dl, MVT::i32), |
| DAG.getConstant(Shift, dl, MVT::i32)); |
| return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); |
| } |
| } |
| |
| return SDValue(); |
| } |
| |
| // Try 8-bit splatted SIMD immediate. |
| static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, |
| const APInt &Bits) { |
| if (Bits.getHiBits(64) == Bits.getLoBits(64)) { |
| uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); |
| EVT VT = Op.getValueType(); |
| MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; |
| |
| if (AArch64_AM::isAdvSIMDModImmType9(Value)) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType9(Value); |
| |
| SDLoc dl(Op); |
| SDValue Mov = DAG.getNode(NewOp, dl, MovTy, |
| DAG.getConstant(Value, dl, MVT::i32)); |
| return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); |
| } |
| } |
| |
| return SDValue(); |
| } |
| |
| // Try FP splatted SIMD immediate. |
| static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, |
| const APInt &Bits) { |
| if (Bits.getHiBits(64) == Bits.getLoBits(64)) { |
| uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); |
| EVT VT = Op.getValueType(); |
| bool isWide = (VT.getSizeInBits() == 128); |
| MVT MovTy; |
| bool isAdvSIMDModImm = false; |
| |
| if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType11(Value); |
| MovTy = isWide ? MVT::v4f32 : MVT::v2f32; |
| } |
| else if (isWide && |
| (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) { |
| Value = AArch64_AM::encodeAdvSIMDModImmType12(Value); |
| MovTy = MVT::v2f64; |
| } |
| |
| if (isAdvSIMDModImm) { |
| SDLoc dl(Op); |
| SDValue Mov = DAG.getNode(NewOp, dl, MovTy, |
| DAG.getConstant(Value, dl, MVT::i32)); |
| return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); |
| } |
| } |
| |
| return SDValue(); |
| } |
| |
| // Specialized code to quickly find if PotentialBVec is a BuildVector that |
| // consists of only the same constant int value, returned in reference arg |
// ConstVal.
| static bool isAllConstantBuildVector(const SDValue &PotentialBVec, |
| uint64_t &ConstVal) { |
| BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec); |
| if (!Bvec) |
| return false; |
| ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0)); |
| if (!FirstElt) |
| return false; |
| EVT VT = Bvec->getValueType(0); |
| unsigned NumElts = VT.getVectorNumElements(); |
| for (unsigned i = 1; i < NumElts; ++i) |
| if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt) |
| return false; |
| ConstVal = FirstElt->getZExtValue(); |
| return true; |
| } |
| |
| static unsigned getIntrinsicID(const SDNode *N) { |
| unsigned Opcode = N->getOpcode(); |
| switch (Opcode) { |
| default: |
| return Intrinsic::not_intrinsic; |
| case ISD::INTRINSIC_WO_CHAIN: { |
| unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); |
| if (IID < Intrinsic::num_intrinsics) |
| return IID; |
| return Intrinsic::not_intrinsic; |
| } |
| } |
| } |
| |
| // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), |
| // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a |
// BUILD_VECTOR with constant element C1, C2 is a constant, and:
| // - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2) |
| // - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2) |
| // The (or (lsl Y, C2), (and X, BvecC1)) case is also handled. |
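// For example, with v4i32 operands and C2 == 24, the SLI form requires the AND
// mask to be the splat of 0x00ffffff, and the node produced is
// (VSLI X, Y, 24).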
| static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { |
| EVT VT = N->getValueType(0); |
| |
| if (!VT.isVector()) |
| return SDValue(); |
| |
| SDLoc DL(N); |
| |
| SDValue And; |
| SDValue Shift; |
| |
| SDValue FirstOp = N->getOperand(0); |
| unsigned FirstOpc = FirstOp.getOpcode(); |
| SDValue SecondOp = N->getOperand(1); |
| unsigned SecondOpc = SecondOp.getOpcode(); |
| |
| // Is one of the operands an AND or a BICi? The AND may have been optimised to |
| // a BICi in order to use an immediate instead of a register. |
| // Is the other operand a shl or lshr? This will have been turned into: |
| // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift. |
| if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) && |
| (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) { |
| And = FirstOp; |
| Shift = SecondOp; |
| |
| } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) && |
| (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) { |
| And = SecondOp; |
| Shift = FirstOp; |
| } else |
| return SDValue(); |
| |
| bool IsAnd = And.getOpcode() == ISD::AND; |
| bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR; |
| |
| // Is the shift amount constant? |
| ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); |
| if (!C2node) |
| return SDValue(); |
| |
| uint64_t C1; |
| if (IsAnd) { |
| // Is the and mask vector all constant? |
| if (!isAllConstantBuildVector(And.getOperand(1), C1)) |
| return SDValue(); |
| } else { |
| // Reconstruct the corresponding AND immediate from the two BICi immediates. |
| ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1)); |
| ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2)); |
| assert(C1nodeImm && C1nodeShift); |
| C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue()); |
| } |
| |
| // Is C1 == ~(Ones(ElemSizeInBits) << C2) or |
| // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account |
| // how much one can shift elements of a particular size? |
| uint64_t C2 = C2node->getZExtValue(); |
| unsigned ElemSizeInBits = VT.getScalarSizeInBits(); |
| if (C2 > ElemSizeInBits) |
| return SDValue(); |
| |
| APInt C1AsAPInt(ElemSizeInBits, C1); |
| APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2) |
| : APInt::getLowBitsSet(ElemSizeInBits, C2); |
| if (C1AsAPInt != RequiredC1) |
| return SDValue(); |
| |
| SDValue X = And.getOperand(0); |
| SDValue Y = Shift.getOperand(0); |
| |
| unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; |
| SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1)); |
| |
| LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n"); |
| LLVM_DEBUG(N->dump(&DAG)); |
| LLVM_DEBUG(dbgs() << "into: \n"); |
| LLVM_DEBUG(ResultSLI->dump(&DAG)); |
| |
| ++NumShiftInserts; |
| return ResultSLI; |
| } |
| |
| SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, |
| SelectionDAG &DAG) const { |
| if (useSVEForFixedLengthVectorVT(Op.getValueType())) |
| return LowerToScalableOp(Op, DAG); |
| |
| // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) |
| if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) |
| return Res; |
| |
| EVT VT = Op.getValueType(); |
| |
| SDValue LHS = Op.getOperand(0); |
| BuildVectorSDNode *BVN = |
| dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode()); |
| if (!BVN) { |
| // OR commutes, so try swapping the operands. |
| LHS = Op.getOperand(1); |
| BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode()); |
| } |
| if (!BVN) |
| return Op; |
| |
| APInt DefBits(VT.getSizeInBits(), 0); |
| APInt UndefBits(VT.getSizeInBits(), 0); |
| if (resolveBuildVector(BVN, DefBits, UndefBits)) { |
| SDValue NewOp; |
| |
| if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, |
| DefBits, &LHS)) || |
| (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, |
| DefBits, &LHS))) |
| return NewOp; |
| |
| if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, |
| UndefBits, &LHS)) || |
| (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, |
| UndefBits, &LHS))) |
| return NewOp; |
| } |
| |
| // We can always fall back to a non-immediate OR. |
| return Op; |
| } |
| |
| // Normalize the operands of BUILD_VECTOR. The value of constant operands will |
| // be truncated to fit element width. |
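| // For example, in a v8i8 BUILD_VECTOR an i32 constant lane of 0x1ff is |
| // replaced by the i32 constant 0xff, keeping only the low 8 bits. |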
| static SDValue NormalizeBuildVector(SDValue Op, |
| SelectionDAG &DAG) { |
| assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); |
| SDLoc dl(Op); |
| EVT VT = Op.getValueType(); |
| EVT EltTy = VT.getVectorElementType(); |
| |
| if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) |
| return Op; |
| |
| SmallVector<SDValue, 16> Ops; |
| for (SDValue Lane : Op->ops()) { |
| // For integer vectors, type legalization would have promoted the |
| // operands already. Otherwise, if Op is a floating-point splat |
| // (with operands cast to integers), then the only possibilities |
| // are constants and UNDEFs. |
| if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) { |
| APInt LowBits(EltTy.getSizeInBits(), |
| CstLane->getZExtValue()); |
| Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); |
| } else if (Lane.getNode()->isUndef()) { |
| Lane = DAG.getUNDEF(MVT::i32); |
| } else { |
| assert(Lane.getValueType() == MVT::i32 && |
| "Unexpected BUILD_VECTOR operand type"); |
| } |
| Ops.push_back(Lane); |
| } |
| return DAG.getBuildVector(VT, dl, Ops); |
| } |
| |
| static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) { |
| EVT VT = Op.getValueType(); |
| |
| APInt DefBits(VT.getSizeInBits(), 0); |
| APInt UndefBits(VT.getSizeInBits(), 0); |
| BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); |
| if (resolveBuildVector(BVN, DefBits, UndefBits)) { |
| SDValue NewOp; |
| if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || |
| (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || |
| (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || |
| (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || |
| (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || |
| (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) |
| return NewOp; |
| |
| DefBits = ~DefBits; |
| if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || |
| (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || |
| (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) |
| return NewOp; |
| |
| DefBits = UndefBits; |
| if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || |
| (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || |
| (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || |
| (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || |
| (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || |
| (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) |
| return NewOp; |
| |
| DefBits = ~UndefBits; |
| if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || |
| (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || |
| (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) |
| return NewOp; |
| } |
| |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, |
| SelectionDAG &DAG) const { |
| EVT VT = Op.getValueType(); |
| |
| // Try to build a simple constant vector. |
| Op = NormalizeBuildVector(Op, DAG); |
| if (VT.isInteger()) { |
| // Certain vector constants, used to express things like logical NOT and |
| // arithmetic NEG, are passed through unmodified. This allows special |
| // patterns for these operations to match, which will lower these constants |
| // to whatever is proven necessary. |
| BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); |
| if (BVN->isConstant()) |
| if (ConstantSDNode *Const = BVN->getConstantSplatNode()) { |
| unsigned BitSize = VT.getVectorElementType().getSizeInBits(); |
| APInt Val(BitSize, |
| Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue()); |
| if (Val.isZero() || Val.isAllOnes()) |
| return Op; |
| } |
| } |
| |
| if (SDValue V = ConstantBuildVector(Op, DAG)) |
| return V; |
| |
| // Scan through the operands to find some interesting properties we can |
| // exploit: |
| // 1) If only one value is used, we can use a DUP, or |
| // 2) if only the low element is not undef, we can just insert that, or |
| // 3) if only one constant value is used (w/ some non-constant lanes), |
| // we can splat the constant value into the whole vector then fill |
| // in the non-constant lanes. |
| // 4) FIXME: If different constant values are used, but we can intelligently |
| // select the values we'll be overwriting for the non-constant |
| // lanes such that we can directly materialize the vector |
| // some other way (MOVI, e.g.), we can be sneaky. |
| // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP. |
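| // For example (case 5), a v4i16 whose lanes are elements <0,2,4,6> of a |
| // v8i16 vector is lowered to UZP1 of that vector's two halves, and lanes |
| // <1,3,5,7> to UZP2. |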
| SDLoc dl(Op); |
| unsigned NumElts = VT.getVectorNumElements(); |
| bool isOnlyLowElement = true; |
| bool usesOnlyOneValue = true; |
| bool usesOnlyOneConstantValue = true; |
| bool isConstant = true; |
| bool AllLanesExtractElt = true; |
| unsigned NumConstantLanes = 0; |
| unsigned NumDifferentLanes = 0; |
| unsigned NumUndefLanes = 0; |
| SDValue Value; |
| SDValue ConstantValue; |
| for (unsigned i = 0; i < NumElts; ++i) { |
| SDValue V = Op.getOperand(i); |
| if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) |
| AllLanesExtractElt = false; |
| if (V.isUndef()) { |
| ++NumUndefLanes; |
| continue; |
| } |
| if (i > 0) |
| isOnlyLowElement = false; |
| if (!isIntOrFPConstant(V)) |
| isConstant = false; |
| |
| if (isIntOrFPConstant(V)) { |
| ++NumConstantLanes; |
| if (!ConstantValue.getNode()) |
| ConstantValue = V; |
| else if (ConstantValue != V) |
| usesOnlyOneConstantValue = false; |
| } |
| |
| if (!Value.getNode()) |
| Value = V; |
| else if (V != Value) { |
| usesOnlyOneValue = false; |
| ++NumDifferentLanes; |
| } |
| } |
| |
| if (!Value.getNode()) { |
| LLVM_DEBUG( |
| dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n"); |
| return DAG.getUNDEF(VT); |
| } |
| |
| // Convert BUILD_VECTOR where all elements but the lowest are undef into |
| // SCALAR_TO_VECTOR, except when we have a single-element constant vector, as |
| // SimplifyDemandedBits will just turn that back into BUILD_VECTOR. |
| if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) { |
| LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 " |
| "SCALAR_TO_VECTOR node\n"); |
| return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); |
| } |
| |
| if (AllLanesExtractElt) { |
| SDNode *Vector = nullptr; |
| bool Even = false; |
| bool Odd = false; |
| // Check whether the extract elements match the Even pattern <0,2,4,...> or |
| // the Odd pattern <1,3,5,...>. |
| for (unsigned i = 0; i < NumElts; ++i) { |
| SDValue V = Op.getOperand(i); |
| const SDNode *N = V.getNode(); |
| if (!isa<ConstantSDNode>(N->getOperand(1))) |
| break; |
| SDValue N0 = N->getOperand(0); |
| |
| // All elements are extracted from the same vector. |
| if (!Vector) { |
| Vector = N0.getNode(); |
| // Check that the type of EXTRACT_VECTOR_ELT matches the type of |
| // BUILD_VECTOR. |
| if (VT.getVectorElementType() != |
| N0.getValueType().getVectorElementType()) |
| break; |
| } else if (Vector != N0.getNode()) { |
| Odd = false; |
| Even = false; |
| break; |
| } |
| |
| // Extracted values are either at Even indices <0,2,4,...> or at Odd |
| // indices <1,3,5,...>. |
| uint64_t Val = N->getConstantOperandVal(1); |
| if (Val == 2 * i) { |
| Even = true; |
| continue; |
| } |
| if (Val - 1 == 2 * i) { |
| Odd = true; |
| continue; |
| } |
| |
| // Something does not match: abort. |
| Odd = false; |
| Even = false; |
| break; |
| } |
| if (Even || Odd) { |
| SDValue LHS = |
| DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), |
| DAG.getConstant(0, dl, MVT::i64)); |
| SDValue RHS = |
| DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), |
| DAG.getConstant(NumElts, dl, MVT::i64)); |
| |
| if (Even && !Odd) |
| return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS, |
| RHS); |
| if (Odd && !Even) |
| return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS, |
| RHS); |
| } |
| } |
| |
| // Use DUP for non-constant splats. For FP constant splats, bitcast the |
| // lanes to integers and try again. |
| if (usesOnlyOneValue) { |
| if (!isConstant) { |
| if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || |
| Value.getValueType() != VT) { |
| LLVM_DEBUG( |
| dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n"); |
| return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); |
| } |
| |
| // This is actually a DUPLANExx operation, which keeps everything vectory. |
| |
| SDValue Lane = Value.getOperand(1); |
| Value = Value.getOperand(0); |
| if (Value.getValueSizeInBits() == 64) { |
| LLVM_DEBUG( |
| dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, " |
| "widening it\n"); |
| Value = WidenVector(Value, DAG); |
| } |
| |
| unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); |
| return DAG.getNode(Opcode, dl, VT, Value, Lane); |
| } |
| |
| if (VT.getVectorElementType().isFloatingPoint()) { |
| SmallVector<SDValue, 8> Ops; |
| EVT EltTy = VT.getVectorElementType(); |
| assert((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 || |
| EltTy == MVT::f64) && "Unsupported floating-point vector type"); |
| LLVM_DEBUG( |
| dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int " |
| "BITCASTS, and try again\n"); |
| MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits()); |
| for (unsigned i = 0; i < NumElts; ++i) |
| Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); |
| EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); |
| SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); |
| LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: "; |
| Val.dump();); |
| Val = LowerBUILD_VECTOR(Val, DAG); |
| if (Val.getNode()) |
| return DAG.getNode(ISD::BITCAST, dl, VT, Val); |
| } |
| } |
| |
| // If we need to insert a small number of different non-constant elements and |
| // the vector width is sufficiently large, prefer using DUP with the common |
| // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred, |
| // skip the constant lane handling below. |
| bool PreferDUPAndInsert = |
| !isConstant && NumDifferentLanes >= 1 && |
| NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) && |
| NumDifferentLanes >= NumConstantLanes; |
| |
| // If only one constant value was used, and it was used for more than one |
| // lane, start by splatting that value, then replace the non-constant lanes. |
| // This is better than the default, which performs a separate initialization |
| // for each lane. |
| if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) { |
| // Firstly, try to materialize the splat constant. |
| SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue), |
| Val = ConstantBuildVector(Vec, DAG); |
| if (!Val) { |
| // Otherwise, materialize the constant and splat it. |
| Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); |
| DAG.ReplaceAllUsesWith(Vec.getNode(), &Val); |
| } |
| |
| // Now insert the non-constant lanes. |
| for (unsigned i = 0; i < NumElts; ++i) { |
| SDValue V = Op.getOperand(i); |
| SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); |
| if (!isIntOrFPConstant(V)) |
| // Note that type legalization likely mucked about with the VT of the |
| // source operand, so we may have to convert it here before inserting. |
| Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); |
| } |
| return Val; |
| } |
| |
| // This will generate a load from the constant pool. |
| if (isConstant) { |
| LLVM_DEBUG( |
| dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default " |
| "expansion\n"); |
| return SDValue(); |
| } |
| |
| // Empirical tests suggest this is rarely worth it for vectors of length <= 2. |
| if (NumElts >= 4) { |
| if (SDValue shuffle = ReconstructShuffle(Op, DAG)) |
| return shuffle; |
| } |
| |
| if (PreferDUPAndInsert) { |
| // First, build a constant vector with the common element. |
| SmallVector<SDValue, 8> Ops(NumElts, Value); |
| SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG); |
| // Next, insert the elements that do not match the common value. |
| for (unsigned I = 0; I < NumElts; ++I) |
| if (Op.getOperand(I) != Value) |
| NewVector = |
| DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector, |
| Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64)); |
| |
| return NewVector; |
| } |
| |
| // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we |
| // know the default expansion would otherwise fall back on something even |
| // worse. For a vector with one or two non-undef values, that's |
| // scalar_to_vector for the elements followed by a shuffle (provided the |
| // shuffle is valid for the target) and materialization element by element |
| // on the stack followed by a load for everything else. |
| if (!isConstant && !usesOnlyOneValue) { |
| LLVM_DEBUG( |
| dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence " |
| "of INSERT_VECTOR_ELT\n"); |
| |
| SDValue Vec = DAG.getUNDEF(VT); |
| SDValue Op0 = Op.getOperand(0); |
| unsigned i = 0; |
| |
| // Use SCALAR_TO_VECTOR for lane zero to |
| // a) Avoid a RMW dependency on the full vector register, and |
| // b) Allow the register coalescer to fold away the copy if the |
| // value is already in an S or D register, and we're forced to emit an |
| // INSERT_SUBREG that we can't fold anywhere. |
| // |
| // We also allow types like i8 and i16 which are illegal scalar but legal |
| // vector element types. After type-legalization the inserted value is |
| // extended (i32) and it is safe to cast them to the vector type by ignoring |
| // the upper bits of the lowest lane (e.g. v8i8, v4i16). |
| if (!Op0.isUndef()) { |
| LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n"); |
| Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0); |
| ++i; |
| } |
| LLVM_DEBUG(if (i < NumElts) dbgs() |
| << "Creating nodes for the other vector elements:\n";); |
| for (; i < NumElts; ++i) { |
| SDValue V = Op.getOperand(i); |
| if (V.isUndef()) |
| continue; |
| SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); |
| Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); |
| } |
| return Vec; |
| } |
| |
| LLVM_DEBUG( |
| dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find " |
| "better alternative\n"); |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op, |
| SelectionDAG &DAG) const { |
| if (useSVEForFixedLengthVectorVT(Op.getValueType())) |
| return LowerFixedLengthConcatVectorsToSVE(Op, DAG); |
| |
| assert(Op.getValueType().isScalableVector() && |
| isTypeLegal(Op.getValueType()) && |
| "Expected legal scalable vector type!"); |
| |
| if (isTypeLegal(Op.getOperand(0).getValueType())) { |
| unsigned NumOperands = Op->getNumOperands(); |
| assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && |
| "Unexpected number of operands in CONCAT_VECTORS"); |
| |
| if (NumOperands == 2) |
| return Op; |
| |
| // Concat each pair of subvectors and pack into the lower half of the array. |
| SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end()); |
| while (ConcatOps.size() > 1) { |
| for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) { |
| SDValue V1 = ConcatOps[I]; |
| SDValue V2 = ConcatOps[I + 1]; |
| EVT SubVT = V1.getValueType(); |
| EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext()); |
| ConcatOps[I / 2] = |
| DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2); |
| } |
| ConcatOps.resize(ConcatOps.size() / 2); |
| } |
| return ConcatOps[0]; |
| } |
| |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, |
| SelectionDAG &DAG) const { |
| assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); |
| |
| if (useSVEForFixedLengthVectorVT(Op.getValueType())) |
| return LowerFixedLengthInsertVectorElt(Op, DAG); |
| |
| // Check for non-constant or out of range lane. |
| EVT VT = Op.getOperand(0).getValueType(); |
| |
| if (VT.getScalarType() == MVT::i1) { |
| EVT VectorVT = getPromotedVTForPredicate(VT); |
| SDLoc DL(Op); |
| SDValue ExtendedVector = |
| DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT); |
| SDValue ExtendedValue = |
| DAG.getAnyExtOrTrunc(Op.getOperand(1), DL, |
| VectorVT.getScalarType().getSizeInBits() < 32 |
| ? MVT::i32 |
| : VectorVT.getScalarType()); |
| ExtendedVector = |
| DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector, |
| ExtendedValue, Op.getOperand(2)); |
| return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT); |
| } |
| |
| ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2)); |
| if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) |
| return SDValue(); |
| |
| // Insertion/extraction are legal for V128 types. |
| if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || |
| VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || |
| VT == MVT::v8f16 || VT == MVT::v8bf16) |
| return Op; |
| |
| if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && |
| VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && |
| VT != MVT::v4bf16) |
| return SDValue(); |
| |
| // For V64 types, we perform insertion by expanding the value |
| // to a V128 type and performing the insertion on that. |
| SDLoc DL(Op); |
| SDValue WideVec = WidenVector(Op.getOperand(0), DAG); |
| EVT WideTy = WideVec.getValueType(); |
| |
| SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, |
| Op.getOperand(1), Op.getOperand(2)); |
| // Re-narrow the resultant vector. |
| return NarrowVector(Node, DAG); |
| } |
| |
| SDValue |
| AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, |
| SelectionDAG &DAG) const { |
| assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); |
| EVT VT = Op.getOperand(0).getValueType(); |
| |
| if (VT.getScalarType() == MVT::i1) { |
| // We can't directly extract from an SVE predicate; extend it first. |
| // (This isn't the only possible lowering, but it's straightforward.) |
| EVT VectorVT = getPromotedVTForPredicate(VT); |
| SDLoc DL(Op); |
| SDValue Extend = |
| DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0)); |
| MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32; |
| SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy, |
| Extend, Op.getOperand(1)); |
| return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType()); |
| } |
| |
| if (useSVEForFixedLengthVectorVT(VT)) |
| return LowerFixedLengthExtractVectorElt(Op, DAG); |
| |
| // Check for non-constant or out of range lane. |
| ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1)); |
| if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) |
| return SDValue(); |
| |
| // Insertion/extraction are legal for V128 types. |
| if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || |
| VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || |
| VT == MVT::v8f16 || VT == MVT::v8bf16) |
| return Op; |
| |
| if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && |
| VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && |
| VT != MVT::v4bf16) |
| return SDValue(); |
| |
| // For V64 types, we perform extraction by expanding the value |
| // to a V128 type and performing the extraction on that. |
| SDLoc DL(Op); |
| SDValue WideVec = WidenVector(Op.getOperand(0), DAG); |
| EVT WideTy = WideVec.getValueType(); |
| |
| EVT ExtrTy = WideTy.getVectorElementType(); |
| if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) |
| ExtrTy = MVT::i32; |
| |
| // For extractions, we just return the result directly. |
| return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, |
| Op.getOperand(1)); |
| } |
| |
| SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, |
| SelectionDAG &DAG) const { |
| assert(Op.getValueType().isFixedLengthVector() && |
| "Only cases that extract a fixed length vector are supported!"); |
| |
| EVT InVT = Op.getOperand(0).getValueType(); |
| unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); |
| unsigned Size = Op.getValueSizeInBits(); |
| |
| // If we don't have legal types yet, do nothing |
| if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) |
| return SDValue(); |
| |
| if (InVT.isScalableVector()) { |
| // This will be matched by custom code during ISelDAGToDAG. |
| if (Idx == 0 && isPackedVectorType(InVT, DAG)) |
| return Op; |
| |
| return SDValue(); |
| } |
| |
| // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. |
| if (Idx == 0 && InVT.getSizeInBits() <= 128) |
| return Op; |
| |
| // If this is extracting the upper 64-bits of a 128-bit vector, we match |
| // that directly. |
| if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 && |
| InVT.getSizeInBits() == 128) |
| return Op; |
| |
| if (useSVEForFixedLengthVectorVT(InVT)) { |
| SDLoc DL(Op); |
| |
| EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); |
| SDValue NewInVec = |
| convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); |
| |
| SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec, |
| NewInVec, DAG.getConstant(Idx, DL, MVT::i64)); |
| return convertFromScalableVector(DAG, Op.getValueType(), Splice); |
| } |
| |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, |
| SelectionDAG &DAG) const { |
| assert(Op.getValueType().isScalableVector() && |
| "Only expect to lower inserts into scalable vectors!"); |
| |
| EVT InVT = Op.getOperand(1).getValueType(); |
| unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); |
| |
| if (InVT.isScalableVector()) { |
| SDLoc DL(Op); |
| EVT VT = Op.getValueType(); |
| |
| if (!isTypeLegal(VT)) |
| return SDValue(); |
| |
| SDValue Vec0 = Op.getOperand(0); |
| SDValue Vec1 = Op.getOperand(1); |
| |
| // Ensure the subvector is half the size of the main vector. |
| if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2)) |
| return SDValue(); |
| |
| EVT WideVT; |
| SDValue ExtVec; |
| |
| if (VT.isFloatingPoint()) { |
| // The InVT type should be legal. We can safely cast the unpacked |
| // subvector from InVT -> VT. |
| WideVT = VT; |
| ExtVec = getSVESafeBitCast(VT, Vec1, DAG); |
| } else { |
| // Extend elements of smaller vector... |
| WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext())); |
| ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1); |
| } |
| |
| if (Idx == 0) { |
| SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0); |
| return DAG.getNode(AArch64ISD::UZP1, DL, VT, ExtVec, HiVec0); |
| } else if (Idx == InVT.getVectorMinNumElements()) { |
| SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0); |
| return DAG.getNode(AArch64ISD::UZP1, DL, VT, LoVec0, ExtVec); |
| } |
| |
| return SDValue(); |
| } |
| |
| // This will be matched by custom code during ISelDAGToDAG. |
| if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef()) |
| return Op; |
| |
| return SDValue(); |
| } |
| |
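| // Return true if Op splats a power-of-two constant (or the negation of one), |
| // setting SplatVal to the positive value and Negated to whether the splat |
| // was negated. |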
| static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) { |
| if (Op.getOpcode() != AArch64ISD::DUP && |
| Op.getOpcode() != ISD::SPLAT_VECTOR && |
| Op.getOpcode() != ISD::BUILD_VECTOR) |
| return false; |
| |
| if (Op.getOpcode() == ISD::BUILD_VECTOR && |
| !isAllConstantBuildVector(Op, SplatVal)) |
| return false; |
| |
| if (Op.getOpcode() != ISD::BUILD_VECTOR && |
| !isa<ConstantSDNode>(Op->getOperand(0))) |
| return false; |
| |
| SplatVal = Op->getConstantOperandVal(0); |
| if (Op.getValueType().getVectorElementType() != MVT::i64) |
| SplatVal = (int32_t)SplatVal; |
| |
| Negated = false; |
| if (isPowerOf2_64(SplatVal)) |
| return true; |
| |
| Negated = true; |
| if (isPowerOf2_64(-SplatVal)) { |
| SplatVal = -SplatVal; |
| return true; |
| } |
| |
| return false; |
| } |
| |
| SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { |
| EVT VT = Op.getValueType(); |
| SDLoc dl(Op); |
| |
| if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) |
| return LowerFixedLengthVectorIntDivideToSVE(Op, DAG); |
| |
| assert(VT.isScalableVector() && "Expected a scalable vector."); |
| |
| bool Signed = Op.getOpcode() == ISD::SDIV; |
| unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; |
| |
| bool Negated; |
| uint64_t SplatVal; |
| if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) { |
| SDValue Pg = getPredicateForScalableVector(DAG, dl, VT); |
| SDValue Res = |
| DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0), |
| DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32)); |
| if (Negated) |
| Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res); |
| |
| return Res; |
| } |
| |
| if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64) |
| return LowerToPredicatedOp(Op, DAG, PredOpcode); |
| |
| // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit |
| // operations, and truncate the result. |
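| // For example, an nxv8i16 sdiv is lowered by sign-unpacking each operand |
| // into its low and high nxv4i32 halves, dividing the halves, and packing the |
| // truncated results back together with UZP1. |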
| EVT WidenedVT; |
| if (VT == MVT::nxv16i8) |
| WidenedVT = MVT::nxv8i16; |
| else if (VT == MVT::nxv8i16) |
| WidenedVT = MVT::nxv4i32; |
| else |
| llvm_unreachable("Unexpected Custom DIV operation"); |
| |
| unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; |
| unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI; |
| SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0)); |
| SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1)); |
| SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0)); |
| SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1)); |
| SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo); |
| SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi); |
| return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi); |
| } |
| |
| bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { |
| // Currently no fixed length shuffles that require SVE are legal. |
| if (useSVEForFixedLengthVectorVT(VT)) |
| return false; |
| |
| if (VT.getVectorNumElements() == 4 && |
| (VT.is128BitVector() || VT.is64BitVector())) { |
| unsigned PFIndexes[4]; |
| for (unsigned i = 0; i != 4; ++i) { |
| if (M[i] < 0) |
| PFIndexes[i] = 8; |
| else |
| PFIndexes[i] = M[i]; |
| } |
| |
| // Compute the index in the perfect shuffle table. |
| unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + |
| PFIndexes[2] * 9 + PFIndexes[3]; |
| unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; |
| unsigned Cost = (PFEntry >> 30); |
| |
| if (Cost <= 4) |
| return true; |
| } |
| |
| bool DummyBool; |
| int DummyInt; |
| unsigned DummyUnsigned; |
| |
| return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || |
| isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || |
| isEXTMask(M, VT, DummyBool, DummyUnsigned) || |
| // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. |
| isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || |
| isZIPMask(M, VT, DummyUnsigned) || |
| isTRN_v_undef_Mask(M, VT, DummyUnsigned) || |
| isUZP_v_undef_Mask(M, VT, DummyUnsigned) || |
| isZIP_v_undef_Mask(M, VT, DummyUnsigned) || |
| isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || |
| isConcatMask(M, VT, VT.getSizeInBits() == 128)); |
| } |
| |
| /// getVShiftImm - Check if this is a valid build_vector for the immediate |
| /// operand of a vector shift operation, where all the elements of the |
| /// build_vector must have the same constant integer value. |
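| /// For example, a v4i16 splat of the constant 3 yields Cnt = 3. |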
| static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { |
| // Ignore bit_converts. |
| while (Op.getOpcode() == ISD::BITCAST) |
| Op = Op.getOperand(0); |
| BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); |
| APInt SplatBits, SplatUndef; |
| unsigned SplatBitSize; |
| bool HasAnyUndefs; |
| if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, |
| HasAnyUndefs, ElementBits) || |
| SplatBitSize > ElementBits) |
| return false; |
| Cnt = SplatBits.getSExtValue(); |
| return true; |
| } |
| |
| /// isVShiftLImm - Check if this is a valid build_vector for the immediate |
| /// operand of a vector shift left operation. That value must be in the range: |
| /// 0 <= Value < ElementBits for a left shift; or |
| /// 0 <= Value <= ElementBits for a long left shift. |
| static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { |
| assert(VT.isVector() && "vector shift count is not a vector type"); |
| int64_t ElementBits = VT.getScalarSizeInBits(); |
| if (!getVShiftImm(Op, ElementBits, Cnt)) |
| return false; |
| return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); |
| } |
| |
| /// isVShiftRImm - Check if this is a valid build_vector for the immediate |
| /// operand of a vector shift right operation. The value must be in the range: |
| /// 1 <= Value <= ElementBits for a right shift; or |
| /// 1 <= Value <= ElementBits/2 for a narrowing right shift. |
| static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { |
| assert(VT.isVector() && "vector shift count is not a vector type"); |
| int64_t ElementBits = VT.getScalarSizeInBits(); |
| if (!getVShiftImm(Op, ElementBits, Cnt)) |
| return false; |
| return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); |
| } |
| |
| SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op, |
| SelectionDAG &DAG) const { |
| EVT VT = Op.getValueType(); |
| |
| if (VT.getScalarType() == MVT::i1) { |
| // Lower i1 truncate to `(x & 1) != 0`. |
| SDLoc dl(Op); |
| EVT OpVT = Op.getOperand(0).getValueType(); |
| SDValue Zero = DAG.getConstant(0, dl, OpVT); |
| SDValue One = DAG.getConstant(1, dl, OpVT); |
| SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One); |
| return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE); |
| } |
| |
| if (!VT.isVector() || VT.isScalableVector()) |
| return SDValue(); |
| |
| if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType())) |
| return LowerFixedLengthVectorTruncateToSVE(Op, DAG); |
| |
| return SDValue(); |
| } |
| |
| SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, |
| SelectionDAG &DAG) const { |
| EVT VT = Op.getValueType(); |
| SDLoc DL(Op); |
| int64_t Cnt; |
| |
| if (!Op.getOperand(1).getValueType().isVector()) |
| return Op; |
| unsigned EltSize = VT.getScalarSizeInBits(); |
| |
| switch (Op.getOpcode()) { |
| default: |
| llvm_unreachable("unexpected shift opcode"); |
| |
| case ISD::SHL: |
| if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED); |
| |
| if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) |
| return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), |
| DAG.getConstant(Cnt, DL, MVT::i32)); |
| return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, |
| DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, |
| MVT::i32), |
| Op.getOperand(0), Op.getOperand(1)); |
| case ISD::SRA: |
| case ISD::SRL: |
| if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) { |
| unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED |
| : AArch64ISD::SRL_PRED; |
| return LowerToPredicatedOp(Op, DAG, Opc); |
| } |
| |
| // Right shift immediate |
| if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { |
| unsigned Opc = |
| (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; |
| return DAG.getNode(Opc, DL, VT, Op.getOperand(0), |
| DAG.getConstant(Cnt, DL, MVT::i32)); |
| } |
| |
| // Right shift register. Note that there is no shift-right-by-register |
| // instruction, but the shift-left-by-register instruction takes a signed |
| // value, where negative amounts specify a right shift. |
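| // For example, (srl X, splat(3)) becomes ushl(X, sub(0, splat(3))), i.e. a |
| // shift left by -3. |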
| unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl |
| : Intrinsic::aarch64_neon_ushl; |
| // negate the shift amount |
| SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), |
| Op.getOperand(1)); |
| SDValue NegShiftLeft = |
| DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, |
| DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0), |
| NegShift); |
| return NegShiftLeft; |
| } |
| |
| return SDValue(); |
| } |
| |
| static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, |
| AArch64CC::CondCode CC, bool NoNans, EVT VT, |
| const SDLoc &dl, SelectionDAG &DAG) { |
| EVT SrcVT = LHS.getValueType(); |
| assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && |
| "function only supposed to emit natural comparisons"); |
| |
| BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode()); |
| APInt CnstBits(VT.getSizeInBits(), 0); |
| APInt UndefBits(VT.getSizeInBits(), 0); |
| bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits); |
| bool IsZero = IsCnst && (CnstBits == 0); |
| |
| if (SrcVT.getVectorElementType().isFloatingPoint()) { |
| switch (CC) { |
| default: |
| return SDValue(); |
| case AArch64CC::NE: { |
| SDValue Fcmeq; |
| if (IsZero) |
| Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); |
| else |
| Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); |
| return DAG.getNOT(dl, Fcmeq, VT); |
| } |
| case AArch64CC::EQ: |
| if (IsZero) |
| return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); |
| return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); |
| case AArch64CC::GE: |
| if (IsZero) |
| return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS); |
| return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS); |
| case AArch64CC::GT: |
| if (IsZero) |
| return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS); |
| return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS); |
| case AArch64CC::LS: |
| if (IsZero) |
| return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS); |
| return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS); |
| case AArch64CC::LT: |
| if (!NoNans) |
| return SDValue(); |
| // If we ignore NaNs then we can use the MI implementation. |
| LLVM_FALLTHROUGH; |
| case AArch64CC::MI: |
| if (IsZero) |
| return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); |
| return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); |
| } |
| } |
| |
| switch (CC) { |
| default: |
| return SDValue(); |
| case AArch64CC::NE: { |
| SDValue Cmeq; |
| if (IsZero) |
| Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); |
| else |
| Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); |
| return DAG.getNOT(dl, Cmeq, VT); |
| } |
| case AArch64CC::EQ: |
| if (IsZero) |
| return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); |
| return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); |
| case AArch64CC::GE: |
| if (IsZero) |
| return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); |
| return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); |
| case AArch64CC::GT: |
| if (IsZero) |
| return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); |
| return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS); |
| case AArch64CC::LE: |
| if (IsZero) |
| return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); |
| return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); |
| case AArch64CC::LS: |
| return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); |
| case AArch64CC::LO: |
| return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); |
| case AArch64CC::LT: |
| if (IsZero) |
| return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); |
| return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); |
| case AArch64CC::HI: |
| return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); |
| case AArch64CC::HS: |
| return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); |
| } |
| } |
| |
| SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, |
| SelectionDAG &DAG) const { |
| if (Op.getValueType().isScalableVector()) |
| return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO); |
| |
| if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType())) |
| return LowerFixedLengthVectorSetccToSVE(Op, DAG); |
| |
| ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); |
| SDValue LHS = Op.getOperand(0); |
| SDValue RHS = Op.getOperand(1); |
| EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger(); |
| SDLoc dl(Op); |
| |
| if (LHS.getValueType().getVectorElementType().isInteger()) { |
| assert(LHS.getValueType() == RHS.getValueType()); |
| AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); |
| SDValue Cmp = |
| EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG); |
| return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); |
| } |
| |
| const bool FullFP16 = |
| static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); |
| |
| // Make v4f16 (only) fcmp operations utilise vector instructions. |
| // v8f16 support will be a little more complicated. |
| if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) { |
| if (LHS.getValueType().getVectorNumElements() == 4) { |
| LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS); |
| RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS); |
| SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC); |
| DAG.ReplaceAllUsesWith(Op, NewSetcc); |
| CmpVT = MVT::v4i32; |
| } else |
| return SDValue(); |
| } |
| |
| assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) || |
| LHS.getValueType().getVectorElementType() != MVT::f128); |
| |
| // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally |
| // clean. Some of them require two branches to implement. |
| AArch64CC::CondCode CC1, CC2; |
| bool ShouldInvert; |
| changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); |
| |
| bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; |
| SDValue Cmp = |
| EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); |
| if (!Cmp.getNode()) |
| return SDValue(); |
| |
| if (CC2 != AArch64CC::AL) { |
| SDValue Cmp2 = |
| EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); |
| if (!Cmp2.getNode()) |
| return SDValue(); |
| |
| Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2); |
| } |
| |
| Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); |
| |
| if (ShouldInvert) |
| Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); |
| |
| return Cmp; |
| } |
| |
| static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, |
| SelectionDAG &DAG) { |
| SDValue VecOp = ScalarOp.getOperand(0); |
| auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp); |
| return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx, |
| DAG.getConstant(0, DL, MVT::i64)); |
| } |
| |
| SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDValue Src = Op.getOperand(0); |
| |
| // Try to lower fixed length reductions to SVE. |
| EVT SrcVT = Src.getValueType(); |
| bool OverrideNEON = Op.getOpcode() == ISD::VECREDUCE_AND || |
| Op.getOpcode() == ISD::VECREDUCE_OR || |
| Op.getOpcode() == ISD::VECREDUCE_XOR || |
| Op.getOpcode() == ISD::VECREDUCE_FADD || |
| (Op.getOpcode() != ISD::VECREDUCE_ADD && |
| SrcVT.getVectorElementType() == MVT::i64); |
| if (SrcVT.isScalableVector() || |
| useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) { |
| |
| if (SrcVT.getVectorElementType() == MVT::i1) |
| return LowerPredReductionToSVE(Op, DAG); |
| |
| switch (Op.getOpcode()) { |
| case ISD::VECREDUCE_ADD: |
| return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG); |
| case ISD::VECREDUCE_AND: |
| return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG); |
| case ISD::VECREDUCE_OR: |
| return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG); |
| case ISD::VECREDUCE_SMAX: |
| return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG); |
| case ISD::VECREDUCE_SMIN: |
| return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG); |
| case ISD::VECREDUCE_UMAX: |
| return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG); |
| case ISD::VECREDUCE_UMIN: |
| return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG); |
| case ISD::VECREDUCE_XOR: |
| return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG); |
| case ISD::VECREDUCE_FADD: |
| return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG); |
| case ISD::VECREDUCE_FMAX: |
| return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG); |
| case ISD::VECREDUCE_FMIN: |
| return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG); |
| default: |
| llvm_unreachable("Unhandled fixed length reduction"); |
| } |
| } |
| |
| // Lower NEON reductions. |
| SDLoc dl(Op); |
| switch (Op.getOpcode()) { |
| case ISD::VECREDUCE_ADD: |
| return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG); |
| case ISD::VECREDUCE_SMAX: |
| return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG); |
| case ISD::VECREDUCE_SMIN: |
| return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG); |
| case ISD::VECREDUCE_UMAX: |
| return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG); |
| case ISD::VECREDUCE_UMIN: |
| return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG); |
| case ISD::VECREDUCE_FMAX: { |
| return DAG.getNode( |
| ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), |
| DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32), |
| Src); |
| } |
| case ISD::VECREDUCE_FMIN: { |
| return DAG.getNode( |
| ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), |
| DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32), |
| Src); |
| } |
| default: |
| llvm_unreachable("Unhandled reduction"); |
| } |
| } |
| |
| SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op, |
| SelectionDAG &DAG) const { |
| auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget()); |
| if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics()) |
| return SDValue(); |
| |
| // LSE has an atomic load-add instruction, but not a load-sub. |
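| // Lower the subtraction as an addition of the negated operand: |
| // atomicrmw sub addr, x ==> atomicrmw add addr, (0 - x) |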
| SDLoc dl(Op); |
| MVT VT = Op.getSimpleValueType(); |
| SDValue RHS = Op.getOperand(2); |
| AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode()); |
| RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS); |
| return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(), |
| Op.getOperand(0), Op.getOperand(1), RHS, |
| AN->getMemOperand()); |
| } |
| |
| SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, |
| SelectionDAG &DAG) const { |
| auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget()); |
| if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics()) |
| return SDValue(); |
| |
| // LSE has an atomic load-clear instruction, but not a load-and. |
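| // Lower the AND as a clear (BIC) of the inverted operand: |
| // atomicrmw and addr, x ==> atomicrmw clr addr, (~x) |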
| SDLoc dl(Op); |
| MVT VT = Op.getSimpleValueType(); |
| SDValue RHS = Op.getOperand(2); |
| AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode()); |
| RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS); |
| return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(), |
| Op.getOperand(0), Op.getOperand(1), RHS, |
| AN->getMemOperand()); |
| } |
| |
| SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( |
| SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const { |
| SDLoc dl(Op); |
| EVT PtrVT = getPointerTy(DAG.getDataLayout()); |
| SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0); |
| |
| const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); |
| const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask(); |
| if (Subtarget->hasCustomCallingConv()) |
| TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask); |
| |
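| // __chkstk takes the allocation size in X15 as a count of 16-byte units, so |
| // scale the byte count down before the call and back up afterwards. |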
| Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size, |
| DAG.getConstant(4, dl, MVT::i64)); |
| Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue()); |
| Chain = |
| DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue), |
| Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64), |
| DAG.getRegisterMask(Mask), Chain.getValue(1)); |
| // To better match the actual intent, we should read the output back from X15 |
| // here (instead of potentially spilling it to the stack), but rereading Size |
| // from X15 doesn't work at -O0, since X15 is treated as undefined at this |
| // point. |
| |
| Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size, |
| DAG.getConstant(4, dl, MVT::i64)); |
| return Chain; |
| } |
| |
| SDValue |
| AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, |
| SelectionDAG &DAG) const { |
| assert(Subtarget->isTargetWindows() && |
| "Only Windows alloca probing supported"); |
| SDLoc dl(Op); |
| // Get the inputs. |
| SDNode *Node = Op.getNode(); |
| SDValue Chain = Op.getOperand(0); |
| SDValue Size = Op.getOperand(1); |
| MaybeAlign Align = |
| cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); |
| EVT VT = Node->getValueType(0); |
| |
| if (DAG.getMachineFunction().getFunction().hasFnAttribute( |
| "no-stack-arg-probe")) { |
| SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); |
| Chain = SP.getValue(1); |
| SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); |
| if (Align) |
| SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), |
| DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); |
| Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); |
| SDValue Ops[2] = {SP, Chain}; |
| return DAG.getMergeValues(Ops, dl); |
| } |
| |
| Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); |
| |
| Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG); |
| |
| SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); |
| Chain = SP.getValue(1); |
| SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); |
| if (Align) |
| SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), |
| DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); |
| Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); |
| |
| Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), |
| DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); |
| |
| SDValue Ops[2] = {SP, Chain}; |
| return DAG.getMergeValues(Ops, dl); |
| } |
| |
| SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, |
| SelectionDAG &DAG) const { |
| EVT VT = Op.getValueType(); |
| assert(VT != MVT::i64 && "Expected illegal VSCALE node"); |
| |
| SDLoc DL(Op); |
| APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue(); |
| return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)), |
| DL, VT); |
| } |
| |
| /// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics. |
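| /// For example, for st2 with two nxv4i32 data arguments, memVT is nxv8i32. |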
| template <unsigned NumVecs> |
| static bool |
| setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, |
| AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) { |
| Info.opc = ISD::INTRINSIC_VOID; |
| // Retrieve EC from first vector argument. |
| const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType()); |
| ElementCount EC = VT.getVectorElementCount(); |
| #ifndef NDEBUG |
| // Check the assumption that all input vectors are the same type. |
| for (unsigned I = 0; I < NumVecs; ++I) |
| assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) && |
| "Invalid type."); |
| #endif |
| // memVT is `NumVecs * VT`. |
| Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(), |
| EC * NumVecs); |
| Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1); |
| Info.offset = 0; |
| Info.align.reset(); |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
| } |
| |
| /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as |
| /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment |
| /// specified in the intrinsic calls. |
| bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, |
| const CallInst &I, |
| MachineFunction &MF, |
| unsigned Intrinsic) const { |
| auto &DL = I.getModule()->getDataLayout(); |
| switch (Intrinsic) { |
| case Intrinsic::aarch64_sve_st2: |
| return setInfoSVEStN<2>(*this, DL, Info, I); |
| case Intrinsic::aarch64_sve_st3: |
| return setInfoSVEStN<3>(*this, DL, Info, I); |
| case Intrinsic::aarch64_sve_st4: |
| return setInfoSVEStN<4>(*this, DL, Info, I); |
| case Intrinsic::aarch64_neon_ld2: |
| case Intrinsic::aarch64_neon_ld3: |
| case Intrinsic::aarch64_neon_ld4: |
| case Intrinsic::aarch64_neon_ld1x2: |
| case Intrinsic::aarch64_neon_ld1x3: |
| case Intrinsic::aarch64_neon_ld1x4: |
| case Intrinsic::aarch64_neon_ld2lane: |
| case Intrinsic::aarch64_neon_ld3lane: |
| case Intrinsic::aarch64_neon_ld4lane: |
| case Intrinsic::aarch64_neon_ld2r: |
| case Intrinsic::aarch64_neon_ld3r: |
| case Intrinsic::aarch64_neon_ld4r: { |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| // Conservatively set memVT to the entire set of vectors loaded. |
| uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; |
| Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); |
| Info.ptrVal = I.getArgOperand(I.arg_size() - 1); |
| Info.offset = 0; |
| Info.align.reset(); |
| // volatile loads with NEON intrinsics not supported |
| Info.flags = MachineMemOperand::MOLoad; |
| return true; |
| } |
| case Intrinsic::aarch64_neon_st2: |
| case Intrinsic::aarch64_neon_st3: |
| case Intrinsic::aarch64_neon_st4: |
| case Intrinsic::aarch64_neon_st1x2: |
| case Intrinsic::aarch64_neon_st1x3: |
| case Intrinsic::aarch64_neon_st1x4: |
| case Intrinsic::aarch64_neon_st2lane: |
| case Intrinsic::aarch64_neon_st3lane: |
| case Intrinsic::aarch64_neon_st4lane: { |
| Info.opc = ISD::INTRINSIC_VOID; |
| // Conservatively set memVT to the entire set of vectors stored. |
| unsigned NumElts = 0; |
| for (const Value *Arg : I.args()) { |
| Type *ArgTy = Arg->getType(); |
| if (!ArgTy->isVectorTy()) |
| break; |
| NumElts += DL.getTypeSizeInBits(ArgTy) / 64; |
| } |
| Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); |
| Info.ptrVal = I.getArgOperand(I.arg_size() - 1); |
| Info.offset = 0; |
| Info.align.reset(); |
| // volatile stores with NEON intrinsics not supported |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
| } |
| case Intrinsic::aarch64_ldaxr: |
| case Intrinsic::aarch64_ldxr: { |
| PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(PtrTy->getElementType()); |
| Info.ptrVal = I.getArgOperand(0); |
| Info.offset = 0; |
| Info.align = DL.getABITypeAlign(PtrTy->getElementType()); |
| Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; |
| return true; |
| } |
| case Intrinsic::aarch64_stlxr: |
| case Intrinsic::aarch64_stxr: { |
| PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(PtrTy->getElementType()); |
| Info.ptrVal = I.getArgOperand(1); |
| Info.offset = 0; |
| Info.align = DL.getABITypeAlign(PtrTy->getElementType()); |
| Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; |
| return true; |
| } |
| case Intrinsic::aarch64_ldaxp: |
| case Intrinsic::aarch64_ldxp: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::i128; |
| Info.ptrVal = I.getArgOperand(0); |
| Info.offset = 0; |
| Info.align = Align(16); |
| Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; |
| return true; |
| case Intrinsic::aarch64_stlxp: |
| case Intrinsic::aarch64_stxp: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::i128; |
| Info.ptrVal = I.getArgOperand(2); |
| Info.offset = 0; |
| Info.align = Align(16); |
| Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; |
| return true; |
| case Intrinsic::aarch64_sve_ldnt1: { |
| PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getType()); |
| Info.ptrVal = I.getArgOperand(1); |
| Info.offset = 0; |
| Info.align = DL.getABITypeAlign(PtrTy->getElementType()); |
| Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal; |
| return true; |
| } |
| case Intrinsic::aarch64_sve_stnt1: { |
| PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType()); |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getOperand(0)->getType()); |
| Info.ptrVal = I.getArgOperand(2); |
| Info.offset = 0; |
| Info.align = DL.getABITypeAlign(PtrTy->getElementType()); |
| Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal; |
| return true; |
| } |
| default: |
| break; |
| } |
| |
| return false; |
| } |
| |
| bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load, |
| ISD::LoadExtType ExtTy, |
| EVT NewVT) const { |
| // TODO: This may be worth removing. Check regression tests for diffs. |
| if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT)) |
| return false; |
| |
| // If we're reducing the load width in order to avoid having to use an extra |
| // instruction to do extension then it's probably a good idea. |
| if (ExtTy != ISD::NON_EXTLOAD) |
| return true; |
| // Don't reduce load width if it would prevent us from combining a shift into |
| // the offset. |
| MemSDNode *Mem = dyn_cast<MemSDNode>(Load); |
| assert(Mem); |
| const SDValue &Base = Mem->getBasePtr(); |
| if (Base.getOpcode() == ISD::ADD && |
| Base.getOperand(1).getOpcode() == ISD::SHL && |
| Base.getOperand(1).hasOneUse() && |
| Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) { |
| // The shift can be combined if it matches the size of the value being |
| // loaded (and so reducing the width would make it not match). |
| uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1); |
| uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8; |
| if (ShiftAmount == Log2_32(LoadBytes)) |
| return false; |
| } |
| // We have no reason to disallow reducing the load width, so allow it. |
| return true; |
| } |
| |
| // Truncation from a 64-bit GPR to a 32-bit GPR is free. |
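| // For example, an i64 value held in an X register can be used as an i32 |
| // simply by reading the corresponding W sub-register; no instruction is |
| // emitted for the truncate. |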
| bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { |
| if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) |
| return false; |
| uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedSize(); |
| uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedSize(); |
| return NumBits1 > NumBits2; |
| } |
| bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { |
| if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) |
| return false; |
| uint64_t NumBits1 = VT1.getFixedSizeInBits(); |
| uint64_t NumBits2 = VT2.getFixedSizeInBits(); |
| return NumBits1 > NumBits2; |
| } |
| |
| /// Check if it is profitable to hoist an instruction from then/else into if. |
| /// Not profitable if I and its user can form an FMA instruction, |
| /// because we prefer FMSUB/FMADD. |
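| /// For example, %m = fmul double %a, %b with the single user |
| /// %s = fsub double %c, %m can be selected as FMSUB; hoisting the fmul into |
| /// a different block from its user would prevent that fusion. |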
| bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { |
| if (I->getOpcode() != Instruction::FMul) |
| return true; |
| |
| if (!I->hasOneUse()) |
| return true; |
| |
| Instruction *User = I->user_back(); |
| |
| if (User && |
| !(User->getOpcode() == Instruction::FSub || |
| User->getOpcode() == Instruction::FAdd)) |
| return true; |
| |
| const TargetOptions &Options = getTargetMachine().Options; |
| const Function *F = I->getFunction(); |
| const DataLayout &DL = F->getParent()->getDataLayout(); |
| Type *Ty = User->getOperand(0)->getType(); |
| |
| return !(isFMAFasterThanFMulAndFAdd(*F, Ty) && |
| isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && |
| (Options.AllowFPOpFusion == FPOpFusion::Fast || |
| Options.UnsafeFPMath)); |
| } |
| |
| // All 32-bit GPR operations implicitly zero the high-half of the corresponding |
| // 64-bit GPR. |
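| // For example, "add w0, w1, w2" also clears bits [63:32] of x0, so a |
| // following zext from i32 to i64 needs no extra instruction. |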
| bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { |
| if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) |
| return false; |
| unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); |
| unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); |
| return NumBits1 == 32 && NumBits2 == 64; |
| } |
| bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { |
| if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) |
| return false; |
| unsigned NumBits1 = VT1.getSizeInBits(); |
| unsigned NumBits2 = VT2.getSizeInBits(); |
| return NumBits1 == 32 && NumBits2 == 64; |
| } |
| |
| bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { |
| EVT VT1 = Val.getValueType(); |
| if (isZExtFree(VT1, VT2)) { |
| return true; |
| } |
| |
| if (Val.getOpcode() != ISD::LOAD) |
| return false; |
| |
| // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. |
| return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() && |
| VT2.isSimple() && !VT2.isVector() && VT2.isInteger() && |
| VT1.getSizeInBits() <= 32); |
| } |
| |
| bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { |
| if (isa<FPExtInst>(Ext)) |
| return false; |
| |
| // Vector types are not free. |
| if (Ext->getType()->isVectorTy()) |
| return false; |
| |
| for (const Use &U : Ext->uses()) { |
| // The extension is free if we can fold it with a left shift in an |
| // addressing mode or an arithmetic operation: add, sub, and cmp. |
| |
| // Is there a shift? |
| const Instruction *Instr = cast<Instruction>(U.getUser()); |
| |
| // Is this a constant shift? |
| switch (Instr->getOpcode()) { |
| case Instruction::Shl: |
| if (!isa<ConstantInt>(Instr->getOperand(1))) |
| return false; |
| break; |
| case Instruction::GetElementPtr: { |
| gep_type_iterator GTI = gep_type_begin(Instr); |
| auto &DL = Ext->getModule()->getDataLayout(); |
| std::advance(GTI, U.getOperandNo()-1); |
| Type *IdxTy = GTI.getIndexedType(); |
| // This extension will end up with a shift because of the scaling factor. |
| // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0. |
| // Get the shift amount based on the scaling factor: |
| // log2(sizeof(IdxTy)) - log2(8). |
| uint64_t ShiftAmt = |
| countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3; |
| // Is the constant foldable in the shift of the addressing mode? |
| // I.e., shift amount is between 1 and 4 inclusive. |
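| // For example, a GEP over i32 elements gives ShiftAmt == 2, which the |
| // extended-register addressing mode folds for free, e.g. |
| // "ldr w0, [x0, w1, sxtw #2]". |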
| if (ShiftAmt == 0 || ShiftAmt > 4) |
| return false; |
| break; |
| } |
| case Instruction::Trunc: |
| // Check if this is a noop. |
| // trunc(sext ty1 to ty2) to ty1. |
| if (Instr->getType() == Ext->getOperand(0)->getType()) |
| continue; |
| LLVM_FALLTHROUGH; |
| default: |
| return false; |
| } |
| |
| // At this point we can use the bfm family, so this extension is free |
| // for that use. |
| } |
| return true; |
| } |
| |
| /// Check if both Op1 and Op2 are shufflevector extracts of either the lower |
| /// or upper half of the vector elements. |
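| /// For example, both of these extract the upper half of an 8-element source |
| /// and would be matched as a pair: |
| ///   %a.hi = shuffle %a, undef, <4, 5, 6, 7> |
| ///   %b.hi = shuffle %b, undef, <4, 5, 6, 7> |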
| static bool areExtractShuffleVectors(Value *Op1, Value *Op2) { |
| auto areTypesHalfed = [](Value *FullV, Value *HalfV) { |
| auto *FullTy = FullV->getType(); |
| auto *HalfTy = HalfV->getType(); |
| return FullTy->getPrimitiveSizeInBits().getFixedSize() == |
| 2 * HalfTy->getPrimitiveSizeInBits().getFixedSize(); |
| }; |
| |
| auto extractHalf = [](Value *FullV, Value *HalfV) { |
| auto *FullVT = cast<FixedVectorType>(FullV->getType()); |
| auto *HalfVT = cast<FixedVectorType>(HalfV->getType()); |
| return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); |
| }; |
| |
| ArrayRef<int> M1, M2; |
| Value *S1Op1, *S2Op1; |
| if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) || |
| !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2)))) |
| return false; |
| |
| // Check that the operands are half as wide as the result and we extract |
| // half of the elements of the input vectors. |
| if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) || |
| !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2)) |
| return false; |
| |
| // Check that the mask extracts either the lower or upper half of the vector |
| // elements. |
| int M1Start = -1; |
| int M2Start = -1; |
| int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2; |
| if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) || |
| !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) || |
| M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2))) |
| return false; |
| |
| return true; |
| } |
| |
| /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth |
| /// of the vector elements. |
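| /// For example, (sext <8 x i8> %a to <8 x i16>) and |
| /// (zext <8 x i8> %b to <8 x i16>) both double their element width. |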
| static bool areExtractExts(Value *Ext1, Value *Ext2) { |
| auto areExtDoubled = [](Instruction *Ext) { |
| return Ext->getType()->getScalarSizeInBits() == |
| 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); |
| }; |
| |
| if (!match(Ext1, m_ZExtOrSExt(m_Value())) || |
| !match(Ext2, m_ZExtOrSExt(m_Value())) || |
| !areExtDoubled(cast<Instruction>(Ext1)) || |
| !areExtDoubled(cast<Instruction>(Ext2))) |
| return false; |
| |
| return true; |
| } |
| |
| /// Check if Op could be used with the vmull_high_p64 intrinsic. |
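| /// I.e., an extractelement of lane 1 from a <2 x i64> vector, which is the |
| /// high half consumed by the PMULL2 form. |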
| static bool isOperandOfVmullHighP64(Value *Op) { |
| Value *VectorOperand = nullptr; |
| ConstantInt *ElementIndex = nullptr; |
| return match(Op, m_ExtractElt(m_Value(VectorOperand), |
| m_ConstantInt(ElementIndex))) && |
| ElementIndex->getValue() == 1 && |
| isa<FixedVectorType>(VectorOperand->getType()) && |
| cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2; |
| } |
| |
| /// Check if Op1 and Op2 could be used with the vmull_high_p64 intrinsic. |
| static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { |
| return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2); |
| } |
| |
| static bool isSplatShuffle(Value *V) { |
| if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V)) |
| return is_splat(Shuf->getShuffleMask()); |
| return false; |
| } |
| |
| /// Check if sinking \p I's operands to I's basic block is profitable, because |
| /// the operands can be folded into a target instruction, e.g. |
| /// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2). |
| bool AArch64TargetLowering::shouldSinkOperands( |
| Instruction *I, SmallVectorImpl<Use *> &Ops) const { |
| if (!I->getType()->isVectorTy()) |
| return false; |
| |
| if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { |
| switch (II->getIntrinsicID()) { |
| case Intrinsic::aarch64_neon_smull: |
| case Intrinsic::aarch64_neon_umull: |
| if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) { |
| Ops.push_back(&II->getOperandUse(0)); |
| Ops.push_back(&II->getOperandUse(1)); |
| return true; |
| } |
| LLVM_FALLTHROUGH; |
| |
| case Intrinsic::aarch64_neon_sqdmull: |
| case Intrinsic::aarch64_neon_sqdmulh: |
| case Intrinsic::aarch64_neon_sqrdmulh: |
| // Sink splats for the lane-indexed variants. |
| if (isSplatShuffle(II->getOperand(0))) |
| Ops.push_back(&II->getOperandUse(0)); |
| if (isSplatShuffle(II->getOperand(1))) |
| Ops.push_back(&II->getOperandUse(1)); |
| return !Ops.empty(); |
| |
| case Intrinsic::aarch64_neon_pmull64: |
| if (!areOperandsOfVmullHighP64(II->getArgOperand(0), |
| II->getArgOperand(1))) |
| return false; |
| Ops.push_back(&II->getArgOperandUse(0)); |
| Ops.push_back(&II->getArgOperandUse(1)); |
| return true; |
| |
| default: |
| return false; |
| } |
| } |
| |
| switch (I->getOpcode()) { |
| case Instruction::Sub: |
| case Instruction::Add: { |
| if (!areExtractExts(I->getOperand(0), I->getOperand(1))) |
| return false; |
| |
| // If the exts' operands extract either the lower or upper elements, we |
| // can sink them too. |
| auto Ext1 = cast<Instruction>(I->getOperand(0)); |
| auto Ext2 = cast<Instruction>(I->getOperand(1)); |
| if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) { |
| Ops.push_back(&Ext1->getOperandUse(0)); |
| Ops.push_back(&Ext2->getOperandUse(0)); |
| } |
| |
| Ops.push_back(&I->getOperandUse(0)); |
| Ops.push_back(&I->getOperandUse(1)); |
| |
| return true; |
| } |
| case Instruction::Mul: { |
| bool IsProfitable = false; |
| for (auto &Op : I->operands()) { |
| // Make sure we are not already sinking this operand |
| if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) |
| continue; |
| |
| ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op); |
| if (!Shuffle || !Shuffle->isZeroEltSplat()) |
| continue; |
| |
| Value *ShuffleOperand = Shuffle->getOperand(0); |
| InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand); |
| if (!Insert) |
| continue; |
| |
| Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1)); |
| if (!OperandInstr) |
| continue; |
| |
| ConstantInt *ElementConstant = |
| dyn_cast<ConstantInt>(Insert->getOperand(2)); |
| // Check that the insertelement is inserting into element 0 |
| if (!ElementConstant || ElementConstant->getZExtValue() != 0) |
| continue; |
| |
| unsigned Opcode = OperandInstr->getOpcode(); |
| if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt) |
| continue; |
| |
| Ops.push_back(&Shuffle->getOperandUse(0)); |
| Ops.push_back(&Op); |
| IsProfitable = true; |
| } |
| |
| return IsProfitable; |
| } |
| default: |
| return false; |
| } |
| return false; |
| } |
| |
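| // Paired loads (e.g. LDP) can cover 32- or 64-bit scalar types; the required |
| // alignment reported here is 1 since, as noted below, unaligned accesses are |
| // supported. |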
| bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, |
| Align &RequiredAlignment) const { |
| if (!LoadedType.isSimple() || |
| (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) |
| return false; |
| // Cyclone supports unaligned accesses. |
| RequiredAlignment = Align(1); |
| unsigned NumBits = LoadedType.getSizeInBits(); |
| return NumBits == 32 || NumBits == 64; |
| } |
| |
| /// A helper function for determining the number of interleaved accesses we |
| /// will generate when lowering accesses of the given type. |
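| /// For example, a 512-bit <16 x i32> lowered with 128-bit NEON vectors needs |
| /// four accesses. |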
| unsigned AArch64TargetLowering::getNumInterleavedAccesses( |
| VectorType *VecTy, const DataLayout &DL, bool UseScalable) const { |
| unsigned VecSize = UseScalable ? Subtarget->getMinSVEVectorSizeInBits() : 128; |
| return std::max<unsigned>(1, (DL.getTypeSizeInBits(VecTy) + 127) / VecSize); |
| } |
| |
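| // Accesses tagged with the Falkor strided-access metadata are given a target |
| // MMO flag so later passes can recognize them. |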
| MachineMemOperand::Flags |
| AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const { |
| if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && |
| I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) |
| return MOStridedAccess; |
| return MachineMemOperand::MONone; |
| } |
| |
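| /// Check whether an interleaved access of the given fixed-length vector type |
| /// can be lowered directly, and report via \p UseScalable whether scalable |
| /// (SVE) container types should be used instead of NEON ones. |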
| bool AArch64TargetLowering::isLegalInterleavedAccessType( |
| VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const { |
| |
| unsigned VecSize = DL.getTypeSizeInBits(VecTy); |
| unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); |
| unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements(); |
| |
| UseScalable = false; |
| |
| // Ensure the number of vector elements is greater than 1. |
| if (NumElements < 2) |
| return false; |
| |
| // Ensure the element type is legal. |
| if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64) |
| return false; |
| |
| if (Subtarget->useSVEForFixedLengthVectors() && |
| (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 || |
| (VecSize < Subtarget->getMinSVEVectorSizeInBits() && |
| isPowerOf2_32(NumElements) && VecSize > 128))) { |
| UseScalable = true; |
| return true; |
| } |
| |
| // Ensure the total vector size is 64 bits or a multiple of 128 bits. Types |
| // larger than 128 bits will be split into multiple interleaved accesses. |
| return VecSize == 64 || VecSize % 128 == 0; |
| } |
| |
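| /// Map a fixed-length vector's element type to the scalable vector type |
| /// whose known-minimum size is 128 bits, e.g. float -> <vscale x 4 x float> |
| /// and i8 -> <vscale x 16 x i8>. |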
| static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) { |
| if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext())) |
| return ScalableVectorType::get(VTy->getElementType(), 2); |
| |
| if (VTy->getElementType() == Type::getFloatTy(VTy->getContext())) |
| return ScalableVectorType::get(VTy->getElementType(), 4); |
| |
| if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext())) |
| return ScalableVectorType::get(VTy->getElementType(), 8); |
| |
| if (VTy->getElementType() == Type::getHalfTy(VTy->getContext())) |
| return ScalableVectorType::get(VTy->getElementType(), 8); |
| |
| if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext())) |
| return ScalableVectorType::get(VTy->getElementType(), 2); |
| |
| if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext())) |
| return ScalableVectorType::get(VTy->getElementType(), 4); |
| |
| if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext())) |
| return ScalableVectorType::get(VTy->getElementType(), 8); |
| |
| if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext())) |
| return ScalableVectorType::get(VTy->getElementType(), 16); |
| |
| llvm_unreachable("Cannot handle input vector type"); |
| } |
| |
| /// Lower an interleaved load into a ldN intrinsic. |
| /// |
| /// E.g. Lower an interleaved load (Factor = 2): |
| /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr |
| /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements |
| /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements |
| /// |
| /// Into: |
| /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr) |
| /// %vec0 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 0 |
| /// %vec1 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 1 |
| bool AArch64TargetLowering::lowerInterleavedLoad( |
| LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, |
| ArrayRef<unsigned> Indices, unsigned Factor) const { |
| assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && |
| "Invalid interleave factor"); |
| assert(!Shuffles.empty() && "Empty shufflevector input"); |
| assert(Shuffles.size() == Indices.size() && |
| "Unmatched number of shufflevectors and indices"); |
| |
| const DataLayout &DL = LI->getModule()->getDataLayout(); |
| |
| VectorType *VTy = Shuffles[0]->getType(); |
| |
| // Skip if we do not have NEON, and skip illegal vector types. We can |
| // "legalize" wide vector types into multiple interleaved accesses as long as |
| // the vector size is divisible by 128. |
| bool UseScalable; |
| if (!Subtarget->hasNEON() || |
| !isLegalInterleavedAccessType(VTy, DL, UseScalable)) |
| return false; |
| |
| unsigned <
|