| //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| /// \file |
| /// Custom DAG lowering for SI |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "SIISelLowering.h" |
| #include "AMDGPU.h" |
| #include "AMDGPUInstrInfo.h" |
| #include "AMDGPUTargetMachine.h" |
| #include "SIMachineFunctionInfo.h" |
| #include "SIRegisterInfo.h" |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/Analysis/LegacyDivergenceAnalysis.h" |
| #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| #include "llvm/BinaryFormat/ELF.h" |
| #include "llvm/CodeGen/Analysis.h" |
| #include "llvm/CodeGen/FunctionLoweringInfo.h" |
| #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" |
| #include "llvm/CodeGen/MachineFunction.h" |
| #include "llvm/CodeGen/MachineLoopInfo.h" |
| #include "llvm/IR/DiagnosticInfo.h" |
| #include "llvm/IR/IntrinsicInst.h" |
| #include "llvm/IR/IntrinsicsAMDGPU.h" |
| #include "llvm/IR/IntrinsicsR600.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/KnownBits.h" |
| |
| using namespace llvm; |
| |
| #define DEBUG_TYPE "si-lower" |
| |
| STATISTIC(NumTailCalls, "Number of tail calls"); |
| |
| static cl::opt<bool> DisableLoopAlignment( |
| "amdgpu-disable-loop-alignment", |
| cl::desc("Do not align and prefetch loops"), |
| cl::init(false)); |
| |
| static cl::opt<bool> VGPRReserveforSGPRSpill( |
| "amdgpu-reserve-vgpr-for-sgpr-spill", |
| cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true)); |
| |
| static cl::opt<bool> UseDivergentRegisterIndexing( |
| "amdgpu-use-divergent-register-indexing", |
| cl::Hidden, |
| cl::desc("Use indirect register addressing for divergent indexes"), |
| cl::init(false)); |
| |
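| // Helpers for querying the function-level floating-point denormal mode. These |
| // are used below (e.g. in isFPExtFoldable) to decide when folding an f16 |
| // extension into a mad/fma mix instruction is safe. |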
| static bool hasFP32Denormals(const MachineFunction &MF) { |
| const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
| return Info->getMode().allFP32Denormals(); |
| } |
| |
| static bool hasFP64FP16Denormals(const MachineFunction &MF) { |
| const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
| return Info->getMode().allFP64FP16Denormals(); |
| } |
| |
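| // Return the first 32-bit SGPR that CCInfo has not yet allocated; running out |
| // of SGPRs here is treated as unreachable. |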
| static unsigned findFirstFreeSGPR(CCState &CCInfo) { |
| unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); |
| for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { |
| if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) { |
| return AMDGPU::SGPR0 + Reg; |
| } |
| } |
| llvm_unreachable("Cannot allocate sgpr"); |
| } |
| |
| SITargetLowering::SITargetLowering(const TargetMachine &TM, |
| const GCNSubtarget &STI) |
| : AMDGPUTargetLowering(TM, STI), |
| Subtarget(&STI) { |
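| // Associate each legal value type with a register class. Broadly, integer |
| // types are registered with SGPR classes and floating-point types with VGPR |
| // classes (i1 is the exception, using the special VReg_1 class); which bank a |
| // particular value actually lands in is resolved later by selection and |
| // register allocation. |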
| addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); |
| addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); |
| |
| addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); |
| addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); |
| |
| addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); |
| |
| const SIRegisterInfo *TRI = STI.getRegisterInfo(); |
| const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class(); |
| |
| addRegisterClass(MVT::f64, V64RegClass); |
| addRegisterClass(MVT::v2f32, V64RegClass); |
| |
| addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); |
| addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96)); |
| |
| addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass); |
| addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass); |
| |
| addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass); |
| addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128)); |
| |
| addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); |
| addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160)); |
| |
| addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass); |
| addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192)); |
| |
| addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass); |
| addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192)); |
| |
| addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass); |
| addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224)); |
| |
| addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); |
| addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256)); |
| |
| addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); |
| addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256)); |
| |
| addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); |
| addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512)); |
| |
| addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass); |
| addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512)); |
| |
| addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); |
| addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024)); |
| |
| if (Subtarget->has16BitInsts()) { |
| addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); |
| addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); |
| |
| // Unless there are also VOP3P operations, no operations on these types are |
| // really legal. |
| addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass); |
| addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass); |
| addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass); |
| addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); |
| } |
| |
| addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); |
| addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024)); |
| |
| computeRegisterProperties(Subtarget->getRegisterInfo()); |
| |
| // The boolean content concept here is too inflexible. Compares only ever |
| // really produce a 1-bit result. Any copy/extend from these will turn into a |
| // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as |
| // it's what most targets use. |
| setBooleanContents(ZeroOrOneBooleanContent); |
| setBooleanVectorContents(ZeroOrOneBooleanContent); |
| |
| // We need to custom lower vector loads and stores from local memory. |
| setOperationAction(ISD::LOAD, MVT::v2i32, Custom); |
| setOperationAction(ISD::LOAD, MVT::v3i32, Custom); |
| setOperationAction(ISD::LOAD, MVT::v4i32, Custom); |
| setOperationAction(ISD::LOAD, MVT::v5i32, Custom); |
| setOperationAction(ISD::LOAD, MVT::v6i32, Custom); |
| setOperationAction(ISD::LOAD, MVT::v7i32, Custom); |
| setOperationAction(ISD::LOAD, MVT::v8i32, Custom); |
| setOperationAction(ISD::LOAD, MVT::v16i32, Custom); |
| setOperationAction(ISD::LOAD, MVT::i1, Custom); |
| setOperationAction(ISD::LOAD, MVT::v32i32, Custom); |
| |
| setOperationAction(ISD::STORE, MVT::v2i32, Custom); |
| setOperationAction(ISD::STORE, MVT::v3i32, Custom); |
| setOperationAction(ISD::STORE, MVT::v4i32, Custom); |
| setOperationAction(ISD::STORE, MVT::v5i32, Custom); |
| setOperationAction(ISD::STORE, MVT::v6i32, Custom); |
| setOperationAction(ISD::STORE, MVT::v7i32, Custom); |
| setOperationAction(ISD::STORE, MVT::v8i32, Custom); |
| setOperationAction(ISD::STORE, MVT::v16i32, Custom); |
| setOperationAction(ISD::STORE, MVT::i1, Custom); |
| setOperationAction(ISD::STORE, MVT::v32i32, Custom); |
| |
| setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); |
| setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand); |
| setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); |
| setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); |
| setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); |
| setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand); |
| setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand); |
| setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand); |
| setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand); |
| setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); |
| setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand); |
| setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand); |
| setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand); |
| setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand); |
| setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand); |
| setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand); |
| |
| setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand); |
| setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand); |
| setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand); |
| setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand); |
| setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand); |
| setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand); |
| setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand); |
| |
| setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); |
| setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); |
| |
| setOperationAction(ISD::SELECT, MVT::i1, Promote); |
| setOperationAction(ISD::SELECT, MVT::i64, Custom); |
| setOperationAction(ISD::SELECT, MVT::f64, Promote); |
| AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); |
| |
| setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); |
| setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); |
| setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); |
| setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); |
| setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); |
| |
| setOperationAction(ISD::SETCC, MVT::i1, Promote); |
| setOperationAction(ISD::SETCC, MVT::v2i1, Expand); |
| setOperationAction(ISD::SETCC, MVT::v4i1, Expand); |
| AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32); |
| |
| setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); |
| setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); |
| setOperationAction(ISD::TRUNCATE, MVT::v3i32, Expand); |
| setOperationAction(ISD::FP_ROUND, MVT::v3f32, Expand); |
| setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand); |
| setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand); |
| setOperationAction(ISD::TRUNCATE, MVT::v5i32, Expand); |
| setOperationAction(ISD::FP_ROUND, MVT::v5f32, Expand); |
| setOperationAction(ISD::TRUNCATE, MVT::v6i32, Expand); |
| setOperationAction(ISD::FP_ROUND, MVT::v6f32, Expand); |
| setOperationAction(ISD::TRUNCATE, MVT::v7i32, Expand); |
| setOperationAction(ISD::FP_ROUND, MVT::v7f32, Expand); |
| setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand); |
| setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand); |
| setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand); |
| setOperationAction(ISD::FP_ROUND, MVT::v16f32, Expand); |
| |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v3i16, Custom); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); |
| |
| setOperationAction(ISD::BRCOND, MVT::Other, Custom); |
| setOperationAction(ISD::BR_CC, MVT::i1, Expand); |
| setOperationAction(ISD::BR_CC, MVT::i32, Expand); |
| setOperationAction(ISD::BR_CC, MVT::i64, Expand); |
| setOperationAction(ISD::BR_CC, MVT::f32, Expand); |
| setOperationAction(ISD::BR_CC, MVT::f64, Expand); |
| |
| setOperationAction(ISD::UADDO, MVT::i32, Legal); |
| setOperationAction(ISD::USUBO, MVT::i32, Legal); |
| |
| setOperationAction(ISD::ADDCARRY, MVT::i32, Legal); |
| setOperationAction(ISD::SUBCARRY, MVT::i32, Legal); |
| |
| setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand); |
| setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand); |
| setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand); |
| |
| #if 0 |
| setOperationAction(ISD::ADDCARRY, MVT::i64, Legal); |
| setOperationAction(ISD::SUBCARRY, MVT::i64, Legal); |
| #endif |
| |
| // We only support LOAD/STORE and vector manipulation ops for vectors |
| // with > 4 elements. |
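| // Everything not listed in the switch below defaults to Expand; the listed |
| // load/store and vector-manipulation operations keep their default handling, |
| // while INSERT_SUBVECTOR and CONCAT_VECTORS are custom lowered. |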
| for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, |
| MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, |
| MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, |
| MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, |
| MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) { |
| for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { |
| switch (Op) { |
| case ISD::LOAD: |
| case ISD::STORE: |
| case ISD::BUILD_VECTOR: |
| case ISD::BITCAST: |
| case ISD::EXTRACT_VECTOR_ELT: |
| case ISD::INSERT_VECTOR_ELT: |
| case ISD::EXTRACT_SUBVECTOR: |
| case ISD::SCALAR_TO_VECTOR: |
| break; |
| case ISD::INSERT_SUBVECTOR: |
| case ISD::CONCAT_VECTORS: |
| setOperationAction(Op, VT, Custom); |
| break; |
| default: |
| setOperationAction(Op, VT, Expand); |
| break; |
| } |
| } |
| } |
| |
| setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand); |
| |
| // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that |
| // is expanded to avoid having two separate loops in case the index is a VGPR. |
| |
| // Most operations are naturally 32-bit vector operations. We only support |
| // load and store of 64-bit element vectors, so promote their remaining vector |
| // operations to the equivalent i32 vector types (v2i64 -> v4i32, and so on). |
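| // The promotion works via bitcasts: the operation is performed on the value |
| // reinterpreted as the corresponding i32 vector type and the result is bitcast |
| // back to the original 64-bit element type. |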
| for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { |
| setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); |
| AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32); |
| |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); |
| AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32); |
| |
| setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); |
| AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32); |
| |
| setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); |
| AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); |
| } |
| |
| for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) { |
| setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); |
| AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32); |
| |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); |
| AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32); |
| |
| setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); |
| AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32); |
| |
| setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); |
| AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32); |
| } |
| |
| for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) { |
| setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); |
| AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32); |
| |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); |
| AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32); |
| |
| setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); |
| AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32); |
| |
| setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); |
| AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32); |
| } |
| |
| for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) { |
| setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); |
| AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32); |
| |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); |
| AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32); |
| |
| setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); |
| AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32); |
| |
| setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); |
| AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32); |
| } |
| |
| for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) { |
| setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); |
| AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32); |
| |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); |
| AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32); |
| |
| setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); |
| AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32); |
| |
| setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); |
| AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32); |
| } |
| |
| setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); |
| setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); |
| setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); |
| setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); |
| |
| setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom); |
| setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); |
| |
| // Avoid stack access for these. |
| // TODO: Generalize to more vector types. |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom); |
| |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom); |
| |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom); |
| |
| // Deal with vec3 vector operations when widened to vec4. |
| setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom); |
| |
| // Deal with vec5/6/7 vector operations when widened to vec8. |
| setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6i32, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6f32, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7i32, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7f32, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom); |
| |
| // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, |
| // and output demarshalling |
| setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); |
| setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); |
| |
| // We can't return success/failure, only the old value, so let LLVM add the |
| // comparison. |
| setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand); |
| setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand); |
| |
| if (Subtarget->hasFlatAddressSpace()) { |
| setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); |
| setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); |
| } |
| |
| setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); |
| setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); |
| |
| // FIXME: This should be narrowed to i32, but that only happens if i64 is |
| // illegal. |
| // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32. |
| setOperationAction(ISD::BSWAP, MVT::i64, Legal); |
| setOperationAction(ISD::BSWAP, MVT::i32, Legal); |
| |
| // This is s_memtime on SI and s_memrealtime on VI. |
| setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); |
| setOperationAction(ISD::TRAP, MVT::Other, Custom); |
| setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom); |
| |
| if (Subtarget->has16BitInsts()) { |
| setOperationAction(ISD::FPOW, MVT::f16, Promote); |
| setOperationAction(ISD::FPOWI, MVT::f16, Promote); |
| setOperationAction(ISD::FLOG, MVT::f16, Custom); |
| setOperationAction(ISD::FEXP, MVT::f16, Custom); |
| setOperationAction(ISD::FLOG10, MVT::f16, Custom); |
| } |
| |
| if (Subtarget->hasMadMacF32Insts()) |
| setOperationAction(ISD::FMAD, MVT::f32, Legal); |
| |
| if (!Subtarget->hasBFI()) { |
| // fcopysign can be done in a single instruction with BFI. |
| setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); |
| setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); |
| } |
| |
| if (!Subtarget->hasBCNT(32)) |
| setOperationAction(ISD::CTPOP, MVT::i32, Expand); |
| |
| if (!Subtarget->hasBCNT(64)) |
| setOperationAction(ISD::CTPOP, MVT::i64, Expand); |
| |
| if (Subtarget->hasFFBH()) { |
| setOperationAction(ISD::CTLZ, MVT::i32, Custom); |
| setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); |
| } |
| |
| if (Subtarget->hasFFBL()) { |
| setOperationAction(ISD::CTTZ, MVT::i32, Custom); |
| setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); |
| } |
| |
| // We only really have 32-bit BFE instructions (and 16-bit on VI). |
| // |
| // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any |
| // effort to match them now. We want this to be false for i64 cases when the |
| // extraction isn't restricted to the upper or lower half. Ideally we would |
| // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that |
| // span the midpoint are probably relatively rare, so don't worry about them |
| // for now. |
| if (Subtarget->hasBFE()) |
| setHasExtractBitsInsn(true); |
| |
| // Clamp modifier on add/sub |
| if (Subtarget->hasIntClamp()) { |
| setOperationAction(ISD::UADDSAT, MVT::i32, Legal); |
| setOperationAction(ISD::USUBSAT, MVT::i32, Legal); |
| } |
| |
| if (Subtarget->hasAddNoCarry()) { |
| setOperationAction(ISD::SADDSAT, MVT::i16, Legal); |
| setOperationAction(ISD::SSUBSAT, MVT::i16, Legal); |
| setOperationAction(ISD::SADDSAT, MVT::i32, Legal); |
| setOperationAction(ISD::SSUBSAT, MVT::i32, Legal); |
| } |
| |
| setOperationAction(ISD::FMINNUM, MVT::f32, Custom); |
| setOperationAction(ISD::FMAXNUM, MVT::f32, Custom); |
| setOperationAction(ISD::FMINNUM, MVT::f64, Custom); |
| setOperationAction(ISD::FMAXNUM, MVT::f64, Custom); |
| |
| |
| // These are really only legal for ieee_mode functions. We should be avoiding |
| // them for functions that don't have ieee_mode enabled, so just say they are |
| // legal. |
| setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal); |
| setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal); |
| setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal); |
| setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal); |
| |
| |
| if (Subtarget->haveRoundOpsF64()) { |
| setOperationAction(ISD::FTRUNC, MVT::f64, Legal); |
| setOperationAction(ISD::FCEIL, MVT::f64, Legal); |
| setOperationAction(ISD::FRINT, MVT::f64, Legal); |
| } else { |
| setOperationAction(ISD::FCEIL, MVT::f64, Custom); |
| setOperationAction(ISD::FTRUNC, MVT::f64, Custom); |
| setOperationAction(ISD::FRINT, MVT::f64, Custom); |
| setOperationAction(ISD::FFLOOR, MVT::f64, Custom); |
| } |
| |
| setOperationAction(ISD::FFLOOR, MVT::f64, Legal); |
| |
| setOperationAction(ISD::FSIN, MVT::f32, Custom); |
| setOperationAction(ISD::FCOS, MVT::f32, Custom); |
| setOperationAction(ISD::FDIV, MVT::f32, Custom); |
| setOperationAction(ISD::FDIV, MVT::f64, Custom); |
| |
| if (Subtarget->has16BitInsts()) { |
| setOperationAction(ISD::Constant, MVT::i16, Legal); |
| |
| setOperationAction(ISD::SMIN, MVT::i16, Legal); |
| setOperationAction(ISD::SMAX, MVT::i16, Legal); |
| |
| setOperationAction(ISD::UMIN, MVT::i16, Legal); |
| setOperationAction(ISD::UMAX, MVT::i16, Legal); |
| |
| setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote); |
| AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32); |
| |
| setOperationAction(ISD::ROTR, MVT::i16, Expand); |
| setOperationAction(ISD::ROTL, MVT::i16, Expand); |
| |
| setOperationAction(ISD::SDIV, MVT::i16, Promote); |
| setOperationAction(ISD::UDIV, MVT::i16, Promote); |
| setOperationAction(ISD::SREM, MVT::i16, Promote); |
| setOperationAction(ISD::UREM, MVT::i16, Promote); |
| setOperationAction(ISD::UADDSAT, MVT::i16, Legal); |
| setOperationAction(ISD::USUBSAT, MVT::i16, Legal); |
| |
| setOperationAction(ISD::BITREVERSE, MVT::i16, Promote); |
| |
| setOperationAction(ISD::CTTZ, MVT::i16, Promote); |
| setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote); |
| setOperationAction(ISD::CTLZ, MVT::i16, Promote); |
| setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote); |
| setOperationAction(ISD::CTPOP, MVT::i16, Promote); |
| |
| setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); |
| |
| setOperationAction(ISD::BR_CC, MVT::i16, Expand); |
| |
| setOperationAction(ISD::LOAD, MVT::i16, Custom); |
| |
| setTruncStoreAction(MVT::i64, MVT::i16, Expand); |
| |
| setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote); |
| AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32); |
| setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); |
| AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); |
| |
| setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom); |
| setOperationAction(ISD::FP_TO_UINT, MVT::i16, Custom); |
| |
| // F16 - Constant Actions. |
| setOperationAction(ISD::ConstantFP, MVT::f16, Legal); |
| |
| // F16 - Load/Store Actions. |
| setOperationAction(ISD::LOAD, MVT::f16, Promote); |
| AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16); |
| setOperationAction(ISD::STORE, MVT::f16, Promote); |
| AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16); |
| |
| // F16 - VOP1 Actions. |
| setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); |
| setOperationAction(ISD::FCOS, MVT::f16, Custom); |
| setOperationAction(ISD::FSIN, MVT::f16, Custom); |
| |
| setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom); |
| |
| setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote); |
| setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote); |
| setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote); |
| setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote); |
| setOperationAction(ISD::FROUND, MVT::f16, Custom); |
| |
| // F16 - VOP2 Actions. |
| setOperationAction(ISD::BR_CC, MVT::f16, Expand); |
| setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); |
| |
| setOperationAction(ISD::FDIV, MVT::f16, Custom); |
| |
| // F16 - VOP3 Actions. |
| setOperationAction(ISD::FMA, MVT::f16, Legal); |
| if (STI.hasMadF16()) |
| setOperationAction(ISD::FMAD, MVT::f16, Legal); |
| |
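| // As with the wide vector types above, default every operation on the packed |
| // 16-bit vector types to Expand and whitelist only the ops in the switch. |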
| for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) { |
| for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { |
| switch (Op) { |
| case ISD::LOAD: |
| case ISD::STORE: |
| case ISD::BUILD_VECTOR: |
| case ISD::BITCAST: |
| case ISD::EXTRACT_VECTOR_ELT: |
| case ISD::INSERT_VECTOR_ELT: |
| case ISD::INSERT_SUBVECTOR: |
| case ISD::EXTRACT_SUBVECTOR: |
| case ISD::SCALAR_TO_VECTOR: |
| break; |
| case ISD::CONCAT_VECTORS: |
| setOperationAction(Op, VT, Custom); |
| break; |
| default: |
| setOperationAction(Op, VT, Expand); |
| break; |
| } |
| } |
| } |
| |
| // v_perm_b32 can handle either of these. |
| setOperationAction(ISD::BSWAP, MVT::i16, Legal); |
| setOperationAction(ISD::BSWAP, MVT::v2i16, Legal); |
| setOperationAction(ISD::BSWAP, MVT::v4i16, Custom); |
| |
| // XXX - Do these do anything? Vector constants turn into build_vector. |
| setOperationAction(ISD::Constant, MVT::v2i16, Legal); |
| setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal); |
| |
| setOperationAction(ISD::UNDEF, MVT::v2i16, Legal); |
| setOperationAction(ISD::UNDEF, MVT::v2f16, Legal); |
| |
| setOperationAction(ISD::STORE, MVT::v2i16, Promote); |
| AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32); |
| setOperationAction(ISD::STORE, MVT::v2f16, Promote); |
| AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32); |
| |
| setOperationAction(ISD::LOAD, MVT::v2i16, Promote); |
| AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32); |
| setOperationAction(ISD::LOAD, MVT::v2f16, Promote); |
| AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32); |
| |
| setOperationAction(ISD::AND, MVT::v2i16, Promote); |
| AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32); |
| setOperationAction(ISD::OR, MVT::v2i16, Promote); |
| AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32); |
| setOperationAction(ISD::XOR, MVT::v2i16, Promote); |
| AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32); |
| |
| setOperationAction(ISD::LOAD, MVT::v4i16, Promote); |
| AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32); |
| setOperationAction(ISD::LOAD, MVT::v4f16, Promote); |
| AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32); |
| |
| setOperationAction(ISD::STORE, MVT::v4i16, Promote); |
| AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32); |
| setOperationAction(ISD::STORE, MVT::v4f16, Promote); |
| AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32); |
| |
| setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand); |
| setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); |
| setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); |
| setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); |
| |
| setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand); |
| setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand); |
| setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand); |
| |
| if (!Subtarget->hasVOP3PInsts()) { |
| setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom); |
| setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); |
| } |
| |
| setOperationAction(ISD::FNEG, MVT::v2f16, Legal); |
| // This isn't really legal, but this avoids the legalizer unrolling it (and |
| // allows matching fneg (fabs x) patterns) |
| setOperationAction(ISD::FABS, MVT::v2f16, Legal); |
| |
| setOperationAction(ISD::FMAXNUM, MVT::f16, Custom); |
| setOperationAction(ISD::FMINNUM, MVT::f16, Custom); |
| setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal); |
| setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal); |
| |
| setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom); |
| setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom); |
| |
| setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand); |
| setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand); |
| } |
| |
| if (Subtarget->hasVOP3PInsts()) { |
| setOperationAction(ISD::ADD, MVT::v2i16, Legal); |
| setOperationAction(ISD::SUB, MVT::v2i16, Legal); |
| setOperationAction(ISD::MUL, MVT::v2i16, Legal); |
| setOperationAction(ISD::SHL, MVT::v2i16, Legal); |
| setOperationAction(ISD::SRL, MVT::v2i16, Legal); |
| setOperationAction(ISD::SRA, MVT::v2i16, Legal); |
| setOperationAction(ISD::SMIN, MVT::v2i16, Legal); |
| setOperationAction(ISD::UMIN, MVT::v2i16, Legal); |
| setOperationAction(ISD::SMAX, MVT::v2i16, Legal); |
| setOperationAction(ISD::UMAX, MVT::v2i16, Legal); |
| |
| setOperationAction(ISD::UADDSAT, MVT::v2i16, Legal); |
| setOperationAction(ISD::USUBSAT, MVT::v2i16, Legal); |
| setOperationAction(ISD::SADDSAT, MVT::v2i16, Legal); |
| setOperationAction(ISD::SSUBSAT, MVT::v2i16, Legal); |
| |
| setOperationAction(ISD::FADD, MVT::v2f16, Legal); |
| setOperationAction(ISD::FMUL, MVT::v2f16, Legal); |
| setOperationAction(ISD::FMA, MVT::v2f16, Legal); |
| |
| setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal); |
| setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal); |
| |
| setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal); |
| |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); |
| |
| setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom); |
| setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); |
| |
| setOperationAction(ISD::SHL, MVT::v4i16, Custom); |
| setOperationAction(ISD::SRA, MVT::v4i16, Custom); |
| setOperationAction(ISD::SRL, MVT::v4i16, Custom); |
| setOperationAction(ISD::ADD, MVT::v4i16, Custom); |
| setOperationAction(ISD::SUB, MVT::v4i16, Custom); |
| setOperationAction(ISD::MUL, MVT::v4i16, Custom); |
| |
| setOperationAction(ISD::SMIN, MVT::v4i16, Custom); |
| setOperationAction(ISD::SMAX, MVT::v4i16, Custom); |
| setOperationAction(ISD::UMIN, MVT::v4i16, Custom); |
| setOperationAction(ISD::UMAX, MVT::v4i16, Custom); |
| |
| setOperationAction(ISD::UADDSAT, MVT::v4i16, Custom); |
| setOperationAction(ISD::SADDSAT, MVT::v4i16, Custom); |
| setOperationAction(ISD::USUBSAT, MVT::v4i16, Custom); |
| setOperationAction(ISD::SSUBSAT, MVT::v4i16, Custom); |
| |
| setOperationAction(ISD::FADD, MVT::v4f16, Custom); |
| setOperationAction(ISD::FMUL, MVT::v4f16, Custom); |
| setOperationAction(ISD::FMA, MVT::v4f16, Custom); |
| |
| setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom); |
| setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom); |
| |
| setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom); |
| setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom); |
| setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom); |
| |
| setOperationAction(ISD::FEXP, MVT::v2f16, Custom); |
| setOperationAction(ISD::SELECT, MVT::v4i16, Custom); |
| setOperationAction(ISD::SELECT, MVT::v4f16, Custom); |
| |
| if (Subtarget->hasPackedFP32Ops()) { |
| setOperationAction(ISD::FADD, MVT::v2f32, Legal); |
| setOperationAction(ISD::FMUL, MVT::v2f32, Legal); |
| setOperationAction(ISD::FMA, MVT::v2f32, Legal); |
| setOperationAction(ISD::FNEG, MVT::v2f32, Legal); |
| |
| for (MVT VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32 }) { |
| setOperationAction(ISD::FADD, VT, Custom); |
| setOperationAction(ISD::FMUL, VT, Custom); |
| setOperationAction(ISD::FMA, VT, Custom); |
| } |
| } |
| } |
| |
| setOperationAction(ISD::FNEG, MVT::v4f16, Custom); |
| setOperationAction(ISD::FABS, MVT::v4f16, Custom); |
| |
| if (Subtarget->has16BitInsts()) { |
| setOperationAction(ISD::SELECT, MVT::v2i16, Promote); |
| AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32); |
| setOperationAction(ISD::SELECT, MVT::v2f16, Promote); |
| AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32); |
| } else { |
| // Legalization hack. |
| setOperationAction(ISD::SELECT, MVT::v2i16, Custom); |
| setOperationAction(ISD::SELECT, MVT::v2f16, Custom); |
| |
| setOperationAction(ISD::FNEG, MVT::v2f16, Custom); |
| setOperationAction(ISD::FABS, MVT::v2f16, Custom); |
| } |
| |
| for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) { |
| setOperationAction(ISD::SELECT, VT, Custom); |
| } |
| |
| setOperationAction(ISD::SMULO, MVT::i64, Custom); |
| setOperationAction(ISD::UMULO, MVT::i64, Custom); |
| |
| if (Subtarget->hasMad64_32()) { |
| setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom); |
| setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom); |
| } |
| |
| setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); |
| setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); |
| setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); |
| setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); |
| setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom); |
| setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); |
| setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); |
| |
| setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); |
| setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom); |
| setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3f16, Custom); |
| setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3i16, Custom); |
| setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); |
| setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom); |
| setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom); |
| setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); |
| setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom); |
| setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom); |
| setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); |
| |
| setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); |
| setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); |
| setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); |
| setOperationAction(ISD::INTRINSIC_VOID, MVT::v3i16, Custom); |
| setOperationAction(ISD::INTRINSIC_VOID, MVT::v3f16, Custom); |
| setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); |
| setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom); |
| setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom); |
| setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); |
| setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); |
| |
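| // Arithmetic, logic and conversion nodes with target-specific combines; these |
| // are handled in SITargetLowering::PerformDAGCombine. |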
| setTargetDAGCombine(ISD::ADD); |
| setTargetDAGCombine(ISD::ADDCARRY); |
| setTargetDAGCombine(ISD::SUB); |
| setTargetDAGCombine(ISD::SUBCARRY); |
| setTargetDAGCombine(ISD::FADD); |
| setTargetDAGCombine(ISD::FSUB); |
| setTargetDAGCombine(ISD::FMINNUM); |
| setTargetDAGCombine(ISD::FMAXNUM); |
| setTargetDAGCombine(ISD::FMINNUM_IEEE); |
| setTargetDAGCombine(ISD::FMAXNUM_IEEE); |
| setTargetDAGCombine(ISD::FMA); |
| setTargetDAGCombine(ISD::SMIN); |
| setTargetDAGCombine(ISD::SMAX); |
| setTargetDAGCombine(ISD::UMIN); |
| setTargetDAGCombine(ISD::UMAX); |
| setTargetDAGCombine(ISD::SETCC); |
| setTargetDAGCombine(ISD::AND); |
| setTargetDAGCombine(ISD::OR); |
| setTargetDAGCombine(ISD::XOR); |
| setTargetDAGCombine(ISD::SINT_TO_FP); |
| setTargetDAGCombine(ISD::UINT_TO_FP); |
| setTargetDAGCombine(ISD::FCANONICALIZE); |
| setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); |
| setTargetDAGCombine(ISD::ZERO_EXTEND); |
| setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); |
| setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); |
| setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); |
| |
| // All memory operations. Some folding on the pointer operand is done to help |
| // matching the constant offsets in the addressing modes. |
| setTargetDAGCombine(ISD::LOAD); |
| setTargetDAGCombine(ISD::STORE); |
| setTargetDAGCombine(ISD::ATOMIC_LOAD); |
| setTargetDAGCombine(ISD::ATOMIC_STORE); |
| setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); |
| setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); |
| setTargetDAGCombine(ISD::ATOMIC_SWAP); |
| setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); |
| setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); |
| setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); |
| setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); |
| setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); |
| setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); |
| setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); |
| setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); |
| setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); |
| setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); |
| setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD); |
| setTargetDAGCombine(ISD::INTRINSIC_VOID); |
| setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); |
| |
| // FIXME: In other contexts we pretend this is a per-function property. |
| setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32); |
| |
| setSchedulingPreference(Sched::RegPressure); |
| } |
| |
| const GCNSubtarget *SITargetLowering::getSubtarget() const { |
| return Subtarget; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // TargetLowering queries |
| //===----------------------------------------------------------------------===// |
| |
| // v_mad_mix* support a conversion from f16 to f32. |
| // |
| // The one special case where this would also be OK to use with denormals |
| // enabled is not currently handled, so require FP32 denormals to be disabled. |
| bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, |
| EVT DestVT, EVT SrcVT) const { |
| return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || |
| (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && |
| DestVT.getScalarType() == MVT::f32 && |
| SrcVT.getScalarType() == MVT::f16 && |
| // TODO: This probably only requires no input flushing? |
| !hasFP32Denormals(DAG.getMachineFunction()); |
| } |
| |
| bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, |
| LLT DestTy, LLT SrcTy) const { |
| return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) || |
| (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) && |
| DestTy.getScalarSizeInBits() == 32 && |
| SrcTy.getScalarSizeInBits() == 16 && |
| // TODO: This probably only requires no input flushing? |
| !hasFP32Denormals(*MI.getMF()); |
| } |
| |
| bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const { |
| // SI has some legal vector types, but no legal vector operations. Say no |
| // shuffles are legal in order to prefer scalarizing some vector operations. |
| return false; |
| } |
| |
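| // For non-kernel calling conventions, 16-bit vector elements are passed packed |
| // two per 32-bit register when 16-bit instructions are available; everything |
| // else is widened or split into 32-bit registers. |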
| MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, |
| CallingConv::ID CC, |
| EVT VT) const { |
| if (CC == CallingConv::AMDGPU_KERNEL) |
| return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); |
| |
| if (VT.isVector()) { |
| EVT ScalarVT = VT.getScalarType(); |
| unsigned Size = ScalarVT.getSizeInBits(); |
| if (Size == 16) { |
| if (Subtarget->has16BitInsts()) |
| return VT.isInteger() ? MVT::v2i16 : MVT::v2f16; |
| return VT.isInteger() ? MVT::i32 : MVT::f32; |
| } |
| |
| if (Size < 16) |
| return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32; |
| return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32; |
| } |
| |
| if (VT.getSizeInBits() > 32) |
| return MVT::i32; |
| |
| return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); |
| } |
| |
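| // The register count mirrors the register type choice above: pairs of 16-bit |
| // elements share one register, 32-bit elements take one register each, and |
| // wider elements are split into 32-bit pieces. |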
| unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, |
| CallingConv::ID CC, |
| EVT VT) const { |
| if (CC == CallingConv::AMDGPU_KERNEL) |
| return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); |
| |
| if (VT.isVector()) { |
| unsigned NumElts = VT.getVectorNumElements(); |
| EVT ScalarVT = VT.getScalarType(); |
| unsigned Size = ScalarVT.getSizeInBits(); |
| |
| // FIXME: Should probably promote 8-bit vectors to i16. |
| if (Size == 16 && Subtarget->has16BitInsts()) |
| return (NumElts + 1) / 2; |
| |
| if (Size <= 32) |
| return NumElts; |
| |
| if (Size > 32) |
| return NumElts * ((Size + 31) / 32); |
| } else if (VT.getSizeInBits() > 32) |
| return (VT.getSizeInBits() + 31) / 32; |
| |
| return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); |
| } |
| |
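| // Break vector arguments into the intermediate pieces implied by the rules |
| // above: v2i16/v2f16 pairs, 32-bit scalars, or multiple i32 parts per element. |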
| unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( |
| LLVMContext &Context, CallingConv::ID CC, |
| EVT VT, EVT &IntermediateVT, |
| unsigned &NumIntermediates, MVT &RegisterVT) const { |
| if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { |
| unsigned NumElts = VT.getVectorNumElements(); |
| EVT ScalarVT = VT.getScalarType(); |
| unsigned Size = ScalarVT.getSizeInBits(); |
| // FIXME: We should fix the ABI to be the same on targets without 16-bit |
| // support, but unless we can properly handle 3-vectors, it will still be |
| // inconsistent. |
| if (Size == 16 && Subtarget->has16BitInsts()) { |
| RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16; |
| IntermediateVT = RegisterVT; |
| NumIntermediates = (NumElts + 1) / 2; |
| return NumIntermediates; |
| } |
| |
| if (Size == 32) { |
| RegisterVT = ScalarVT.getSimpleVT(); |
| IntermediateVT = RegisterVT; |
| NumIntermediates = NumElts; |
| return NumIntermediates; |
| } |
| |
| if (Size < 16 && Subtarget->has16BitInsts()) { |
| // FIXME: Should probably form v2i16 pieces |
| RegisterVT = MVT::i16; |
| IntermediateVT = ScalarVT; |
| NumIntermediates = NumElts; |
| return NumIntermediates; |
| } |
| |
| |
| if (Size != 16 && Size <= 32) { |
| RegisterVT = MVT::i32; |
| IntermediateVT = ScalarVT; |
| NumIntermediates = NumElts; |
| return NumIntermediates; |
| } |
| |
| if (Size > 32) { |
| RegisterVT = MVT::i32; |
| IntermediateVT = RegisterVT; |
| NumIntermediates = NumElts * ((Size + 31) / 32); |
| return NumIntermediates; |
| } |
| } |
| |
| return TargetLowering::getVectorTypeBreakdownForCallingConv( |
| Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); |
| } |
| |
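| // Compute the memory type for an image intrinsic's data: the IR type may name |
| // more vector elements than the dmask actually reads or writes, so clamp the |
| // element count to the number of dmask lanes. |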
| static EVT memVTFromImageData(Type *Ty, unsigned DMaskLanes) { |
| assert(DMaskLanes != 0); |
| |
| if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { |
| unsigned NumElts = std::min(DMaskLanes, VT->getNumElements()); |
| return EVT::getVectorVT(Ty->getContext(), |
| EVT::getEVT(VT->getElementType()), |
| NumElts); |
| } |
| |
| return EVT::getEVT(Ty); |
| } |
| |
| // Peek through TFE struct returns to only use the data size. |
| static EVT memVTFromImageReturn(Type *Ty, unsigned DMaskLanes) { |
| auto *ST = dyn_cast<StructType>(Ty); |
| if (!ST) |
| return memVTFromImageData(Ty, DMaskLanes); |
| |
| // Some intrinsics return an aggregate type - special case to work out the |
| // correct memVT. |
| // |
| // Only limited forms of aggregate type currently expected. |
| if (ST->getNumContainedTypes() != 2 || |
| !ST->getContainedType(1)->isIntegerTy(32)) |
| return EVT(); |
| return memVTFromImageData(ST->getContainedType(0), DMaskLanes); |
| } |
| |
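| // Describe the memory behavior of target memory intrinsics so that selection |
| // can attach an appropriate MachineMemOperand (memory type, pointer value, |
| // alignment and load/store/volatile flags). |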
| bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, |
| const CallInst &CI, |
| MachineFunction &MF, |
| unsigned IntrID) const { |
| if (const AMDGPU::RsrcIntrinsic *RsrcIntr = |
| AMDGPU::lookupRsrcIntrinsic(IntrID)) { |
| AttributeList Attr = Intrinsic::getAttributes(CI.getContext(), |
| (Intrinsic::ID)IntrID); |
| if (Attr.hasFnAttr(Attribute::ReadNone)) |
| return false; |
| |
| SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| |
| if (RsrcIntr->IsImage) { |
| Info.ptrVal = |
| MFI->getImagePSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()); |
| Info.align.reset(); |
| } else { |
| Info.ptrVal = |
| MFI->getBufferPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()); |
| } |
| |
| Info.flags = MachineMemOperand::MODereferenceable; |
| if (Attr.hasFnAttr(Attribute::ReadOnly)) { |
| unsigned DMaskLanes = 4; |
| |
| if (RsrcIntr->IsImage) { |
| const AMDGPU::ImageDimIntrinsicInfo *Intr |
| = AMDGPU::getImageDimIntrinsicInfo(IntrID); |
| const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = |
| AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); |
| |
| if (!BaseOpcode->Gather4) { |
| // If this isn't a gather, we may have excess loaded elements in the |
| // IR type. Check the dmask for the real number of elements loaded. |
| unsigned DMask |
| = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue(); |
| DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask); |
| } |
| |
| Info.memVT = memVTFromImageReturn(CI.getType(), DMaskLanes); |
| } else |
| Info.memVT = EVT::getEVT(CI.getType()); |
| |
| // FIXME: What does alignment mean for an image? |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.flags |= MachineMemOperand::MOLoad; |
| } else if (Attr.hasFnAttr(Attribute::WriteOnly)) { |
| Info.opc = ISD::INTRINSIC_VOID; |
| |
| Type *DataTy = CI.getArgOperand(0)->getType(); |
| if (RsrcIntr->IsImage) { |
| unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue(); |
| unsigned DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask); |
| Info.memVT = memVTFromImageData(DataTy, DMaskLanes); |
| } else |
| Info.memVT = EVT::getEVT(DataTy); |
| |
| Info.flags |= MachineMemOperand::MOStore; |
| } else { |
| // Atomic |
| Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID : |
| ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType()); |
| Info.flags = MachineMemOperand::MOLoad | |
| MachineMemOperand::MOStore | |
| MachineMemOperand::MODereferenceable; |
| |
| // XXX - Should this be volatile without known ordering? |
| Info.flags |= MachineMemOperand::MOVolatile; |
| } |
| return true; |
| } |
| |
| switch (IntrID) { |
| case Intrinsic::amdgcn_atomic_inc: |
| case Intrinsic::amdgcn_atomic_dec: |
| case Intrinsic::amdgcn_ds_ordered_add: |
| case Intrinsic::amdgcn_ds_ordered_swap: |
| case Intrinsic::amdgcn_ds_fadd: |
| case Intrinsic::amdgcn_ds_fmin: |
| case Intrinsic::amdgcn_ds_fmax: { |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(CI.getType()); |
| Info.ptrVal = CI.getOperand(0); |
| Info.align.reset(); |
| Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
| |
| const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4)); |
| if (!Vol->isZero()) |
| Info.flags |= MachineMemOperand::MOVolatile; |
| |
| return true; |
| } |
| case Intrinsic::amdgcn_buffer_atomic_fadd: { |
| SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); |
| Info.ptrVal = |
| MFI->getBufferPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()); |
| Info.align.reset(); |
| Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
| |
| const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4)); |
| if (!Vol || !Vol->isZero()) |
| Info.flags |= MachineMemOperand::MOVolatile; |
| |
| return true; |
| } |
| case Intrinsic::amdgcn_ds_append: |
| case Intrinsic::amdgcn_ds_consume: { |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(CI.getType()); |
| Info.ptrVal = CI.getOperand(0); |
| Info.align.reset(); |
| Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
| |
| const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1)); |
| if (!Vol->isZero()) |
| Info.flags |= MachineMemOperand::MOVolatile; |
| |
| return true; |
| } |
| case Intrinsic::amdgcn_global_atomic_csub: { |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(CI.getType()); |
| Info.ptrVal = CI.getOperand(0); |
| Info.align.reset(); |
| Info.flags = MachineMemOperand::MOLoad | |
| MachineMemOperand::MOStore | |
| MachineMemOperand::MOVolatile; |
| return true; |
| } |
| case Intrinsic::amdgcn_image_bvh_intersect_ray: { |
| SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT? |
| Info.ptrVal = |
| MFI->getImagePSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()); |
| Info.align.reset(); |
| Info.flags = MachineMemOperand::MOLoad | |
| MachineMemOperand::MODereferenceable; |
| return true; |
| } |
| case Intrinsic::amdgcn_global_atomic_fadd: |
| case Intrinsic::amdgcn_global_atomic_fmin: |
| case Intrinsic::amdgcn_global_atomic_fmax: |
| case Intrinsic::amdgcn_flat_atomic_fadd: |
| case Intrinsic::amdgcn_flat_atomic_fmin: |
| case Intrinsic::amdgcn_flat_atomic_fmax: { |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(CI.getType()); |
| Info.ptrVal = CI.getOperand(0); |
| Info.align.reset(); |
| Info.flags = MachineMemOperand::MOLoad | |
| MachineMemOperand::MOStore | |
| MachineMemOperand::MODereferenceable | |
| MachineMemOperand::MOVolatile; |
| return true; |
| } |
| case Intrinsic::amdgcn_ds_gws_init: |
| case Intrinsic::amdgcn_ds_gws_barrier: |
| case Intrinsic::amdgcn_ds_gws_sema_v: |
| case Intrinsic::amdgcn_ds_gws_sema_br: |
| case Intrinsic::amdgcn_ds_gws_sema_p: |
| case Intrinsic::amdgcn_ds_gws_sema_release_all: { |
| Info.opc = ISD::INTRINSIC_VOID; |
| |
| SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| Info.ptrVal = |
| MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()); |
| |
| // This is an abstract access, but we need to specify a type and size. |
| Info.memVT = MVT::i32; |
| Info.size = 4; |
| Info.align = Align(4); |
| |
| Info.flags = MachineMemOperand::MOStore; |
| if (IntrID == Intrinsic::amdgcn_ds_gws_barrier) |
| Info.flags = MachineMemOperand::MOLoad; |
| return true; |
| } |
| default: |
| return false; |
| } |
| } |
| |
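| // These intrinsics address memory through a plain pointer operand, so expose |
| // that operand for addressing-mode optimization. |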
| bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, |
| SmallVectorImpl<Value*> &Ops, |
| Type *&AccessTy) const { |
| switch (II->getIntrinsicID()) { |
| case Intrinsic::amdgcn_atomic_inc: |
| case Intrinsic::amdgcn_atomic_dec: |
| case Intrinsic::amdgcn_ds_ordered_add: |
| case Intrinsic::amdgcn_ds_ordered_swap: |
| case Intrinsic::amdgcn_ds_append: |
| case Intrinsic::amdgcn_ds_consume: |
| case Intrinsic::amdgcn_ds_fadd: |
| case Intrinsic::amdgcn_ds_fmin: |
| case Intrinsic::amdgcn_ds_fmax: |
| case Intrinsic::amdgcn_global_atomic_fadd: |
| case Intrinsic::amdgcn_flat_atomic_fadd: |
| case Intrinsic::amdgcn_flat_atomic_fmin: |
| case Intrinsic::amdgcn_flat_atomic_fmax: |
| case Intrinsic::amdgcn_global_atomic_csub: { |
| Value *Ptr = II->getArgOperand(0); |
| AccessTy = II->getType(); |
| Ops.push_back(Ptr); |
| return true; |
| } |
| default: |
| return false; |
| } |
| } |
| |
| bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { |
| if (!Subtarget->hasFlatInstOffsets()) { |
| // Flat instructions do not have offsets, and only have the register |
| // address. |
| return AM.BaseOffs == 0 && AM.Scale == 0; |
| } |
| |
| return AM.Scale == 0 && |
| (AM.BaseOffs == 0 || |
| Subtarget->getInstrInfo()->isLegalFLATOffset( |
| AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS, SIInstrFlags::FLAT)); |
| } |
| |
| bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const { |
| if (Subtarget->hasFlatGlobalInsts()) |
| return AM.Scale == 0 && |
| (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( |
| AM.BaseOffs, AMDGPUAS::GLOBAL_ADDRESS, |
| SIInstrFlags::FlatGlobal)); |
| |
| if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) { |
| // Assume that we will use FLAT for all global memory accesses |
| // on VI. |
| // FIXME: This assumption is currently wrong. On VI we still use |
| // MUBUF instructions for the r + i addressing mode. As currently |
| // implemented, the MUBUF instructions only work on buffers < 4GB. |
| // It may be possible to support > 4GB buffers with MUBUF instructions, |
| // by setting the stride value in the resource descriptor which would |
| // increase the size limit to (stride * 4GB). However, this is risky, |
| // because it has never been validated. |
| return isLegalFlatAddressingMode(AM); |
| } |
| |
| return isLegalMUBUFAddressingMode(AM); |
| } |
| |
| bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { |
| // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and |
| // additionally can do r + r + i with addr64. 32-bit has more addressing |
| // mode options. Depending on the resource constant, it can also do |
| // (i64 r0) + (i32 r1) * (i14 i). |
| // |
| // Private arrays end up using a scratch buffer most of the time, so also |
| // assume those use MUBUF instructions. Scratch loads / stores are currently |
| // implemented as mubuf instructions with the offen bit set, so they are |
| // slightly different from the normal addr64 mode. |
| if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs)) |
| return false; |
| |
| // FIXME: Since we can split immediate into soffset and immediate offset, |
| // would it make sense to allow any immediate? |
| |
| switch (AM.Scale) { |
| case 0: // r + i or just i, depending on HasBaseReg. |
| return true; |
| case 1: |
| return true; // We have r + r or r + i. |
| case 2: |
| if (AM.HasBaseReg) { |
| // Reject 2 * r + r. |
| return false; |
| } |
| |
| // Allow 2 * r as r + r |
| // Or 2 * r + i is allowed as r + r + i. |
| return true; |
| default: // Don't allow n * r |
| return false; |
| } |
| } |
| |
| bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, |
| const AddrMode &AM, Type *Ty, |
| unsigned AS, Instruction *I) const { |
| // No global is ever allowed as a base. |
| if (AM.BaseGV) |
| return false; |
| |
| if (AS == AMDGPUAS::GLOBAL_ADDRESS) |
| return isLegalGlobalAddressingMode(AM); |
| |
| if (AS == AMDGPUAS::CONSTANT_ADDRESS || |
| AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || |
| AS == AMDGPUAS::BUFFER_FAT_POINTER) { |
| // If the offset isn't a multiple of 4, it probably isn't going to be |
| // correctly aligned. |
| // FIXME: Can we get the real alignment here? |
| if (AM.BaseOffs % 4 != 0) |
| return isLegalMUBUFAddressingMode(AM); |
| |
| // There are no SMRD extloads, so if we have to do a small type access we |
| // will use a MUBUF load. |
| // FIXME?: We also need to do this if unaligned, but we don't know the |
| // alignment here. |
| if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4) |
| return isLegalGlobalAddressingMode(AM); |
| |
| if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { |
| // SMRD instructions have an 8-bit, dword offset on SI. |
| if (!isUInt<8>(AM.BaseOffs / 4)) |
| return false; |
| } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { |
| // On CI+, this can also be a 32-bit literal constant offset. If it fits |
| // in 8-bits, it can use a smaller encoding. |
| if (!isUInt<32>(AM.BaseOffs / 4)) |
| return false; |
| } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { |
| // On VI, these use the SMEM format and the offset is a 20-bit value in bytes. |
| if (!isUInt<20>(AM.BaseOffs)) |
| return false; |
| } else |
| llvm_unreachable("unhandled generation"); |
| |
| if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. |
| return true; |
| |
| if (AM.Scale == 1 && AM.HasBaseReg) |
| return true; |
| |
| return false; |
| |
| } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { |
| return isLegalMUBUFAddressingMode(AM); |
| } else if (AS == AMDGPUAS::LOCAL_ADDRESS || |
| AS == AMDGPUAS::REGION_ADDRESS) { |
| // Basic, single offset DS instructions allow a 16-bit unsigned immediate |
| // field. |
| // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have |
| // an 8-bit dword offset but we don't know the alignment here. |
| if (!isUInt<16>(AM.BaseOffs)) |
| return false; |
| |
| if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. |
| return true; |
| |
| if (AM.Scale == 1 && AM.HasBaseReg) |
| return true; |
| |
| return false; |
| } else if (AS == AMDGPUAS::FLAT_ADDRESS || |
| AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) { |
| // For an unknown address space, this usually means that this is for some |
| // reason being used for pure arithmetic, and not based on some addressing |
| // computation. We don't have instructions that compute pointers with any |
| // addressing modes, so treat them as having no offset like flat |
| // instructions. |
| return isLegalFlatAddressingMode(AM); |
| } |
| |
| // Assume a user alias of global for unknown address spaces. |
| return isLegalGlobalAddressingMode(AM); |
| } |
| |
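| // Limit the width of merged stores per address space: 128 bits for |
| // flat/global, the maximum private element size for scratch, and 64 bits |
| // for LDS/GDS. |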
| bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, |
| const MachineFunction &MF) const { |
| if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) { |
| return (MemVT.getSizeInBits() <= 4 * 32); |
| } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { |
| unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize(); |
| return (MemVT.getSizeInBits() <= MaxPrivateBits); |
| } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { |
| return (MemVT.getSizeInBits() <= 2 * 32); |
| } |
| return true; |
| } |
| |
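| // Shared implementation of the unaligned-access hook, taking the access |
| // size in bits. Returns whether the access is allowed at all; *IsFast, if |
| // provided, reports whether it is also expected to be fast. |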
| bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( |
| unsigned Size, unsigned AddrSpace, Align Alignment, |
| MachineMemOperand::Flags Flags, bool *IsFast) const { |
| if (IsFast) |
| *IsFast = false; |
| |
| if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || |
| AddrSpace == AMDGPUAS::REGION_ADDRESS) { |
| // Check if alignment requirements for ds_read/write instructions are |
| // disabled. |
| if (Subtarget->hasUnalignedDSAccessEnabled() && |
| !Subtarget->hasLDSMisalignedBug()) { |
| if (IsFast) |
| *IsFast = Alignment != Align(2); |
| return true; |
| } |
| |
| // Either the alignment requirements are "enabled", or there is an |
| // unaligned-LDS-access hardware bug even though the alignment requirements |
| // are "disabled". In either case, we need to check for proper alignment |
| // explicitly. |
| // |
| if (Size == 64) { |
| // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we |
| // can do a 4-byte aligned, 8-byte access in a single operation using |
| // ds_read2/write2_b32 with adjacent offsets. |
| bool AlignedBy4 = Alignment >= Align(4); |
| if (IsFast) |
| *IsFast = AlignedBy4; |
| |
| return AlignedBy4; |
| } |
| if (Size == 96) { |
| // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on |
| // gfx8 and older. |
| bool AlignedBy16 = Alignment >= Align(16); |
| if (IsFast) |
| *IsFast = AlignedBy16; |
| |
| return AlignedBy16; |
| } |
| if (Size == 128) { |
| // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on |
| // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a |
| // single operation using ds_read2/write2_b64. |
| bool AlignedBy8 = Alignment >= Align(8); |
| if (IsFast) |
| *IsFast = AlignedBy8; |
| |
| return AlignedBy8; |
| } |
| } |
| |
| if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { |
| bool AlignedBy4 = Alignment >= Align(4); |
| if (IsFast) |
| *IsFast = AlignedBy4; |
| |
| return AlignedBy4 || |
| Subtarget->enableFlatScratch() || |
| Subtarget->hasUnalignedScratchAccess(); |
| } |
| |
| // FIXME: We have to be conservative here and assume that flat operations |
| // will access scratch. If we had access to the IR function, then we |
| // could determine if any private memory was used in the function. |
| if (AddrSpace == AMDGPUAS::FLAT_ADDRESS && |
| !Subtarget->hasUnalignedScratchAccess()) { |
| bool AlignedBy4 = Alignment >= Align(4); |
| if (IsFast) |
| *IsFast = AlignedBy4; |
| |
| return AlignedBy4; |
| } |
| |
| if (Subtarget->hasUnalignedBufferAccessEnabled() && |
| !(AddrSpace == AMDGPUAS::LOCAL_ADDRESS || |
| AddrSpace == AMDGPUAS::REGION_ADDRESS)) { |
| // If we have a uniform constant load, it still requires using a slow |
| // buffer instruction if unaligned. |
| if (IsFast) { |
| // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so |
| // 2-byte alignment is worse than 1 unless doing a 2-byte access. |
| *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || |
| AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ? |
| Alignment >= Align(4) : Alignment != Align(2); |
| } |
| |
| return true; |
| } |
| |
| // Values smaller than a dword must be aligned. |
| if (Size < 32) |
| return false; |
| |
| // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the |
| // byte-address are ignored, thus forcing Dword alignment. |
| // This applies to private, global, and constant memory. |
| if (IsFast) |
| *IsFast = true; |
| |
| return Size >= 32 && Alignment >= Align(4); |
| } |
| |
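| // EVT-based wrapper: reject MVT::Other and overly wide types up front, then |
| // defer to the size-based implementation above. |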
| bool SITargetLowering::allowsMisalignedMemoryAccesses( |
| EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, |
| bool *IsFast) const { |
| if (IsFast) |
| *IsFast = false; |
| |
| // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, |
| // which isn't a simple VT. |
| // Until MVT is extended to handle this, simply check for the size and |
| // rely on the condition below: allow accesses if the size is a multiple of 4. |
| if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 && |
| VT.getStoreSize() > 16)) { |
| return false; |
| } |
| |
| return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace, |
| Alignment, Flags, IsFast); |
| } |
| |
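| // Pick a preferred type for lowering memcpy/memset-like operations: use |
| // v4i32 or v2i32 when the destination is at least dword aligned, otherwise |
| // fall back to the default. |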
| EVT SITargetLowering::getOptimalMemOpType( |
| const MemOp &Op, const AttributeList &FuncAttributes) const { |
| // FIXME: Should account for address space here. |
| |
| // The default fallback uses the private pointer size as a guess for a type to |
| // use. Make sure we switch these to 64-bit accesses. |
| |
| if (Op.size() >= 16 && |
| Op.isDstAligned(Align(4))) // XXX: Should only do for global |
| return MVT::v4i32; |
| |
| if (Op.size() >= 8 && Op.isDstAligned(Align(4))) |
| return MVT::v2i32; |
| |
| // Use the default. |
| return MVT::Other; |
| } |
| |
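| // Returns true if the memory operand's underlying IR value is an |
| // instruction tagged with "amdgpu.noclobber" metadata. |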
| bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { |
| const MemSDNode *MemNode = cast<MemSDNode>(N); |
| const Value *Ptr = MemNode->getMemOperand()->getValue(); |
| const Instruction *I = dyn_cast_or_null<Instruction>(Ptr); |
| return I && I->getMetadata("amdgpu.noclobber"); |
| } |
| |
| bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) { |
| return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS || |
| AS == AMDGPUAS::PRIVATE_ADDRESS; |
| } |
| |
| bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS, |
| unsigned DestAS) const { |
| // Flat -> private/local is a simple truncate. |
| // Flat -> global is a no-op. |
| if (SrcAS == AMDGPUAS::FLAT_ADDRESS) |
| return true; |
| |
| const GCNTargetMachine &TM = |
| static_cast<const GCNTargetMachine &>(getTargetMachine()); |
| return TM.isNoopAddrSpaceCast(SrcAS, DestAS); |
| } |
| |
| bool SITargetLowering::isMemOpUniform(const SDNode *N) const { |
| const MemSDNode *MemNode = cast<MemSDNode>(N); |
| |
| return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand()); |
| } |
| |
| TargetLoweringBase::LegalizeTypeAction |
| SITargetLowering::getPreferredVectorAction(MVT VT) const { |
| if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && |
| VT.getScalarType().bitsLE(MVT::i16)) |
| return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector; |
| return TargetLoweringBase::getPreferredVectorAction(VT); |
| } |
| |
| bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, |
| Type *Ty) const { |
| // FIXME: Could be smarter if called for vector constants. |
| return true; |
| } |
| |
| bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { |
| if (Subtarget->has16BitInsts() && VT == MVT::i16) { |
| switch (Op) { |
| case ISD::LOAD: |
| case ISD::STORE: |
| |
| // These operations are done with 32-bit instructions anyway. |
| case ISD::AND: |
| case ISD::OR: |
| case ISD::XOR: |
| case ISD::SELECT: |
| // TODO: Extensions? |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| // SimplifySetCC uses this function to determine whether or not it should |
| // create setcc with i1 operands. We don't have instructions for i1 setcc. |
| if (VT == MVT::i1 && Op == ISD::SETCC) |
| return false; |
| |
| return TargetLowering::isTypeDesirableForOp(Op, VT); |
| } |
| |
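| // Build a CONSTANT_ADDRESS pointer to byte \p Offset within the kernarg |
| // segment, based on the preloaded kernarg segment pointer. |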
| SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, |
| const SDLoc &SL, |
| SDValue Chain, |
| uint64_t Offset) const { |
| const DataLayout &DL = DAG.getDataLayout(); |
| MachineFunction &MF = DAG.getMachineFunction(); |
| const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
| |
| const ArgDescriptor *InputPtrReg; |
| const TargetRegisterClass *RC; |
| LLT ArgTy; |
| MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); |
| |
| std::tie(InputPtrReg, RC, ArgTy) = |
| Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); |
| |
| // We may not have the kernarg segment argument if we have no kernel |
| // arguments. |
| if (!InputPtrReg) |
| return DAG.getConstant(0, SL, PtrVT); |
| |
| MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); |
| SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, |
| MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); |
| |
| return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset)); |
| } |
| |
| SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, |
| const SDLoc &SL) const { |
| uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(), |
| FIRST_IMPLICIT); |
| return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset); |
| } |
| |
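| // Convert a kernel argument loaded as MemVT into the expected type VT: |
| // narrow widened vectors, assert the sign/zero extension promised by the |
| // ABI, then extend, round, or truncate to VT. |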
| SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, |
| const SDLoc &SL, SDValue Val, |
| bool Signed, |
| const ISD::InputArg *Arg) const { |
| // First, if it is a widened vector, narrow it. |
| if (VT.isVector() && |
| VT.getVectorNumElements() != MemVT.getVectorNumElements()) { |
| EVT NarrowedVT = |
| EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), |
| VT.getVectorNumElements()); |
| Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val, |
| DAG.getConstant(0, SL, MVT::i32)); |
| } |
| |
| // Then convert the vector elements or scalar value. |
| if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && |
| VT.bitsLT(MemVT)) { |
| unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext; |
| Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT)); |
| } |
| |
| if (MemVT.isFloatingPoint()) |
| Val = getFPExtOrFPRound(DAG, Val, SL, VT); |
| else if (Signed) |
| Val = DAG.getSExtOrTrunc(Val, SL, VT); |
| else |
| Val = DAG.getZExtOrTrunc(Val, SL, VT); |
| |
| return Val; |
| } |
| |
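| // Load a kernel argument of type MemVT from the kernarg segment at the |
| // given byte offset. Small, under-aligned values are loaded as part of an |
| // aligned dword and the relevant bits are shifted out. |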
| SDValue SITargetLowering::lowerKernargMemParameter( |
| SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, |
| uint64_t Offset, Align Alignment, bool Signed, |
| const ISD::InputArg *Arg) const { |
| MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); |
| |
| // Try to avoid using an extload by loading earlier than the argument address, |
| // and extracting the relevant bits. The load should hopefully be merged with |
| // the previous argument. |
| if (MemVT.getStoreSize() < 4 && Alignment < 4) { |
| // TODO: Handle align < 4 and size >= 4 (can happen with packed structs). |
| int64_t AlignDownOffset = alignDown(Offset, 4); |
| int64_t OffsetDiff = Offset - AlignDownOffset; |
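| // For example, a 2-byte argument at Offset == 6 is loaded from the dword at |
| // offset 4 and extracted with a 16-bit right shift (OffsetDiff == 2). |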
| |
| EVT IntVT = MemVT.changeTypeToInteger(); |
| |
| // TODO: If we passed in the base kernel offset we could have a better |
| // alignment than 4, but we don't really need it. |
| SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset); |
| SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4), |
| MachineMemOperand::MODereferenceable | |
| MachineMemOperand::MOInvariant); |
| |
| SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32); |
| SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt); |
| |
| SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract); |
| ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal); |
| ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg); |
| |
| return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL); |
| } |
| |
| SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); |
| SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment, |
| MachineMemOperand::MODereferenceable | |
| MachineMemOperand::MOInvariant); |
| |
| SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg); |
| return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); |
| } |
| |
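| // Load a formal argument that was assigned to a stack location: byval |
| // arguments are returned as a frame index, everything else becomes an |
| // (ext)load from a fixed frame object with the extension implied by the |
| // location info. |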
| SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, |
| const SDLoc &SL, SDValue Chain, |
| const ISD::InputArg &Arg) const { |
| MachineFunction &MF = DAG.getMachineFunction(); |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| |
| if (Arg.Flags.isByVal()) { |
| unsigned Size = Arg.Flags.getByValSize(); |
| int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false); |
| return DAG.getFrameIndex(FrameIdx, MVT::i32); |
| } |
| |
| unsigned ArgOffset = VA.getLocMemOffset(); |
| unsigned ArgSize = VA.getValVT().getStoreSize(); |
| |
| int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true); |
| |
| // Create load nodes to retrieve arguments from the stack. |
| SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); |
| SDValue ArgValue; |
| |
| // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT. |
| ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; |
| MVT MemVT = VA.getValVT(); |
| |
| switch (VA.getLocInfo()) { |
| default: |
| break; |
| case CCValAssign::BCvt: |
| MemVT = VA.getLocVT(); |
| break; |
| case CCValAssign::SExt: |
| ExtType = ISD::SEXTLOAD; |
| break; |
| case CCValAssign::ZExt: |
| ExtType = ISD::ZEXTLOAD; |
| break; |
| case CCValAssign::AExt: |
| ExtType = ISD::EXTLOAD; |
| break; |
| } |
| |
| ArgValue = DAG.getExtLoad( |
| ExtType, SL, VA.getLocVT(), Chain, FIN, |
| MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), |
| MemVT); |
| return ArgValue; |
| } |
| |
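| // Materialize a preloaded ABI input as a live-in register. If it was never |
| // allocated, return a null pointer for the kernarg segment pointer and |
| // undef for any other input. |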
| SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG, |
| const SIMachineFunctionInfo &MFI, |
| EVT VT, |
| AMDGPUFunctionArgInfo::PreloadedValue PVID) const { |
| const ArgDescriptor *Reg; |
| const TargetRegisterClass *RC; |
| LLT Ty; |
| |
| std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID); |
| if (!Reg) { |
| if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) { |
| // It's possible for a kernarg intrinsic call to appear in a kernel with |
| // no allocated segment, in which case we do not add the user sgpr |
| // argument, so just return null. |
| return DAG.getConstant(0, SDLoc(), VT); |
| } |
| |
| // It's undefined behavior if a function marked with the amdgpu-no-* |
| // attributes uses the corresponding intrinsic. |
| return DAG.getUNDEF(VT); |
| } |
| |
| return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT); |
| } |
| |
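| // For AMDGPU_PS, record which interpolation inputs are allocated and |
| // enabled, skip unused PS inputs, and collect the argument pieces to assign |
| // into Splits. |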
| static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, |
| CallingConv::ID CallConv, |
| ArrayRef<ISD::InputArg> Ins, BitVector &Skipped, |
| FunctionType *FType, |
| SIMachineFunctionInfo *Info) { |
| for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) { |
| const ISD::InputArg *Arg = &Ins[I]; |
| |
| assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) && |
| "vector type argument should have been split"); |
| |
| // First check if it's a PS input addr. |
| if (CallConv == CallingConv::AMDGPU_PS && |
| !Arg->Flags.isInReg() && PSInputNum <= 15) { |
| bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum); |
| |
| // Inconveniently only the first part of the split is marked as isSplit, |
| // so skip to the end. We only want to increment PSInputNum once for the |
| // entire split argument. |
| if (Arg->Flags.isSplit()) { |
| while (!Arg->Flags.isSplitEnd()) { |
| assert((!Arg->VT.isVector() || |
| Arg->VT.getScalarSizeInBits() == 16) && |
| "unexpected vector split in ps argument type"); |
| if (!SkipArg) |
| Splits.push_back(*Arg); |
| Arg = &Ins[++I]; |
| } |
| } |
| |
| if (SkipArg) { |
| // We can safely skip PS inputs. |
| Skipped.set(Arg->getOrigArgIndex()); |
| ++PSInputNum; |
| continue; |
| } |
| |
| Info->markPSInputAllocated(PSInputNum); |
| if (Arg->Used) |
| Info->markPSInputEnabled(PSInputNum); |
| |
| ++PSInputNum; |
| } |
| |
| Splits.push_back(*Arg); |
| } |
| } |
| |
| // Allocate special inputs passed in VGPRs. |
| void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo, |
| MachineFunction &MF, |
| const SIRegisterInfo &TRI, |
| SIMachineFunctionInfo &Info) const { |
| const LLT S32 = LLT::scalar(32); |
| MachineRegisterInfo &MRI = MF.getRegInfo(); |
| |
| if (Info.hasWorkItemIDX()) { |
| Register Reg = AMDGPU::VGPR0; |
| MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); |
| |
| CCInfo.AllocateReg(Reg); |
| unsigned Mask = (Subtarget->hasPackedTID() && |
| Info.hasWorkItemIDY()) ? 0x3ff : ~0u; |
| Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask)); |
| } |
| |
| if (Info.hasWorkItemIDY()) { |
| assert(Info.hasWorkItemIDX()); |
| if (Subtarget->hasPackedTID()) { |
| Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0, |
| 0x3ff << 10)); |
| } else { |
| unsigned Reg = AMDGPU::VGPR1; |
| MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); |
| |
| CCInfo.AllocateReg(Reg); |
| Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); |
| } |
| } |
| |
| if (Info.hasWorkItemIDZ()) { |
| assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY()); |
| if (Subtarget->hasPackedTID()) { |
| Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0, |
| 0x3ff << 20)); |
| } else { |
| unsigned Reg = AMDGPU::VGPR2; |
| MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); |
| |
| CCInfo.AllocateReg(Reg); |
| Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); |
| } |
| } |
| } |
| |
| // Try to allocate a VGPR at the end of the argument list, or if no argument |
| // VGPRs are left, allocate a stack slot. |
| // If \p Mask is given, it indicates the bitfield position in the register. |
| // If \p Arg is given, reuse it with the new \p Mask instead of allocating a |
| // new register. |
| static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u, |
| ArgDescriptor Arg = ArgDescriptor()) { |
| if (Arg.isSet()) |
| return ArgDescriptor::createArg(Arg, Mask); |
| |
| ArrayRef<MCPhysReg> ArgVGPRs |
| = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32); |
| unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs); |
| if (RegIdx == ArgVGPRs.size()) { |
| // Spill to stack required. |
| int64_t Offset = CCInfo.AllocateStack(4, Align(4)); |
| |
| return ArgDescriptor::createStack(Offset, Mask); |
| } |
| |
| unsigned Reg = ArgVGPRs[RegIdx]; |
| Reg = CCInfo.AllocateReg(Reg); |
| assert(Reg != AMDGPU::NoRegister); |
| |
| MachineFunction &MF = CCInfo.getMachineFunction(); |
| Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); |
| MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32)); |
| return ArgDescriptor::createRegister(Reg, Mask); |
| } |
| |
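| // Allocate the next unallocated register from the first 32 registers of RC |
| // for an implicit SGPR input, reporting a fatal error if they are |
| // exhausted. |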
| static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, |
| const TargetRegisterClass *RC, |
| unsigned NumArgRegs) { |
| ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32); |
| unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs); |
| if (RegIdx == ArgSGPRs.size()) |
| report_fatal_error("ran out of SGPRs for arguments"); |
| |
| unsigned Reg = ArgSGPRs[RegIdx]; |
| Reg = CCInfo.AllocateReg(Reg); |
| assert(Reg != AMDGPU::NoRegister); |
| |
| MachineFunction &MF = CCInfo.getMachineFunction(); |
| MF.addLiveIn(Reg, RC); |
| return ArgDescriptor::createRegister(Reg); |
| } |
| |
| // If this has a fixed position, we should still allocate the register in the |
| // CCInfo state. Technically we could get away with this for values passed |
| // outside of the normal argument range. |
| static void allocateFixedSGPRInputImpl(CCState &CCInfo, |
| const TargetRegisterClass *RC, |
| MCRegister Reg) { |
| Reg = CCInfo.AllocateReg(Reg); |
| assert(Reg != AMDGPU::NoRegister); |
| MachineFunction &MF = CCInfo.getMachineFunction(); |
| MF.addLiveIn(Reg, RC); |
| } |
| |
| static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) { |
| if (Arg) { |
| allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, |
| Arg.getRegister()); |
| } else |
| Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32); |
| } |
| |
| static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) { |
| if (Arg) { |
| allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, |
| Arg.getRegister()); |
| } else |
| Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); |
| } |
| |
| /// Allocate implicit function VGPR arguments at the end of allocated user |
| /// arguments. |
| void SITargetLowering::allocateSpecialInputVGPRs( |
| CCState &CCInfo, MachineFunction &MF, |
| const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { |
| const unsigned Mask = 0x3ff; |
| ArgDescriptor Arg; |
| |
| if (Info.hasWorkItemIDX()) { |
| Arg = allocateVGPR32Input(CCInfo, Mask); |
| Info.setWorkItemIDX(Arg); |
| } |
| |
| if (Info.hasWorkItemIDY()) { |
| Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg); |
| Info.setWorkItemIDY(Arg); |
| } |
| |
| if (Info.hasWorkItemIDZ()) |
| Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg)); |
| } |
| |
| /// Allocate implicit function VGPR arguments in fixed registers. |
| void SITargetLowering::allocateSpecialInputVGPRsFixed( |
| CCState &CCInfo, MachineFunction &MF, |
| const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { |
| Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31); |
| if (!Reg) |
| report_fatal_error("failed to allocate VGPR for implicit arguments"); |
| |
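| // The three workitem IDs are packed into a single VGPR: X in bits [9:0], |
| // Y in bits [19:10], and Z in bits [29:20]. |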
| const unsigned Mask = 0x3ff; |
| Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask)); |
| Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10)); |
| Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20)); |
| } |
| |
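| // Allocate the implicit SGPR inputs (dispatch pointer, queue pointer, |
| // implicit argument pointer, dispatch ID, and workgroup IDs) for a callable |
| // function, unconditionally under the fixed ABI or only when used |
| // otherwise. |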
| void SITargetLowering::allocateSpecialInputSGPRs( |
| CCState &CCInfo, |
| MachineFunction &MF, |
| const SIRegisterInfo &TRI, |
| SIMachineFunctionInfo &Info) const { |
| auto &ArgInfo = Info.getArgInfo(); |
| |
| // We need to allocate these in place regardless of their use. |
| const bool IsFixed = AMDGPUTargetMachine::EnableFixedFunctionABI; |
| |
| // TODO: Unify handling with private memory pointers. |
| if (IsFixed || Info.hasDispatchPtr()) |
| allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); |
| |
| if (IsFixed || Info.hasQueuePtr()) |
| allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); |
| |
| // Implicit arg ptr takes the place of the kernarg segment pointer. This is a |
| // constant offset from the kernarg segment. |
| if (IsFixed || Info.hasImplicitArgPtr()) |
| allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr); |
| |
| if (IsFixed || Info.hasDispatchID()) |
| allocateSGPR64Input(CCInfo, ArgInfo.DispatchID); |
| |
| // flat_scratch_init is not applicable for non-kernel functions. |
| |
| if (IsFixed || Info.hasWorkGroupIDX()) |
| allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX); |
| |
| if (IsFixed || Info.hasWorkGroupIDY()) |
| allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY); |
| |
| if (IsFixed || Info.hasWorkGroupIDZ()) |
| allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ); |
| } |
| |
| // Allocate special inputs passed in user SGPRs. |
| void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, |
| MachineFunction &MF, |
| const SIRegisterInfo &TRI, |
| SIMachineFunctionInfo &Info) const { |
| if (Info.hasImplicitBufferPtr()) { |
| Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); |
| MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); |
| CCInfo.AllocateReg(ImplicitBufferPtrReg); |
| } |
| |
| // FIXME: How should these inputs interact with inreg / custom SGPR inputs? |
| if (Info.hasPrivateSegmentBuffer()) { |
| Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI); |
| MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass); |
| CCInfo.AllocateReg(PrivateSegmentBufferReg); |
| } |
| |
| if (Info.hasDispatchPtr()) { |
| Register DispatchPtrReg = Info.addDispatchPtr(TRI); |
| MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass); |
| CCInfo.AllocateReg(DispatchPtrReg); |
| } |
| |
| if (Info.hasQueuePtr()) { |
| Register QueuePtrReg = Info.addQueuePtr(TRI); |
| MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); |
| CCInfo.AllocateReg(QueuePtrReg); |
| } |
| |
| if (Info.hasKernargSegmentPtr()) { |
| MachineRegisterInfo &MRI = MF.getRegInfo(); |
| Register InputPtrReg = Info.addKernargSegmentPtr(TRI); |
| CCInfo.AllocateReg(InputPtrReg); |
| |
| Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass); |
| MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); |
| } |
| |
| if (Info.hasDispatchID()) { |
| Register DispatchIDReg = Info.addDispatchID(TRI); |
| MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass); |
| CCInfo.AllocateReg(DispatchIDReg); |
| } |
| |
| if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) { |
| Register FlatScratchInitReg = Info.addFlatScratchInit(TRI); |
| MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); |
| CCInfo.AllocateReg(FlatScratchInitReg); |
| } |
| |
| // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read |
| // these from the dispatch pointer. |
| } |
| |
| // Allocate special input registers that are initialized per-wave. |
| void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, |
| MachineFunction &MF, |
| SIMachineFunctionInfo &Info, |
| CallingConv::ID CallConv, |
| bool IsShader) const { |
| if (Info.hasWorkGroupIDX()) { |
| Register Reg = Info.addWorkGroupIDX(); |
| MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); |
| CCInfo.AllocateReg(Reg); |
| } |
| |
| if (Info.hasWorkGroupIDY()) { |
| Register Reg = Info.addWorkGroupIDY(); |
| MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); |
| CCInfo.AllocateReg(Reg); |
| } |
| |
| if (Info.hasWorkGroupIDZ()) { |
| Register Reg = Info.addWorkGroupIDZ(); |
| MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); |
| CCInfo.AllocateReg(Reg); |
| } |
| |
| if (Info.hasWorkGroupInfo()) { |
| Register Reg = Info.addWorkGroupInfo(); |
| MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); |
| CCInfo.AllocateReg(Reg); |
| } |
| |
| if (Info.hasPrivateSegmentWaveByteOffset()) { |
| // Scratch wave offset passed in system SGPR. |
| unsigned PrivateSegmentWaveByteOffsetReg; |
| |
| if (IsShader) { |
| PrivateSegmentWaveByteOffsetReg = |
| Info.getPrivateSegmentWaveByteOffsetSystemSGPR(); |
| |
| // This is true if the scratch wave byte offset doesn't have a fixed |
| // location. |
| if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) { |
| PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); |
| Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); |
| } |
| } else |
| PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset(); |
| |
| MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); |
| CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); |
| } |
| } |
| |
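| // Decide which registers an entry function uses for scratch access: the |
| // scratch resource descriptor (when flat scratch is not enabled), the stack |
| // pointer, and, if a frame pointer is required, SGPR33 as the frame offset |
| // register. |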
| static void reservePrivateMemoryRegs(const TargetMachine &TM, |
| MachineFunction &MF, |
| const SIRegisterInfo &TRI, |
| SIMachineFunctionInfo &Info) { |
| // Now that we've figured out where the scratch register inputs are, see if |
| // we should reserve the arguments and use them directly. |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| bool HasStackObjects = MFI.hasStackObjects(); |
| const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| |
| // Record that we know we have non-spill stack objects so we don't need to |
| // check all stack objects later. |
| if (HasStackObjects) |
| Info.setHasNonSpillStackObjects(true); |
| |
| // Everything live out of a block is spilled with fast regalloc, so it's |
| // almost certain that spilling will be required. |
| if (TM.getOptLevel() == CodeGenOpt::None) |
| HasStackObjects = true; |
| |
| // For now, assume stack access is needed in any callee function, so we need |
| // the scratch registers to pass in. |
| bool RequiresStackAccess = HasStackObjects || MFI.hasCalls(); |
| |
| if (!ST.enableFlatScratch()) { |
| if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) { |
| // If we have stack objects, we unquestionably need the private buffer |
| // resource. For the Code Object V2 ABI, this will be the first 4 user |
| // SGPR inputs. We can reserve those and use them directly. |
| |
| Register PrivateSegmentBufferReg = |
| Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); |
| Info.setScratchRSrcReg(PrivateSegmentBufferReg); |
| } else { |
| unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF); |
| // We tentatively reserve the last registers (skipping the very last ones, |
| // which may contain VCC, FLAT_SCR, and XNACK). After register allocation, |
| // we'll replace these with the registers immediately after those which were |
| // really allocated. In the prologue, copies will be inserted from the |
| // argument to these reserved registers. |
| |
| // Without HSA, relocations are used for the scratch pointer and the |
| // buffer resource setup is always inserted in the prologue. Scratch wave |
| // offset is still in an input SGPR. |
| Info.setScratchRSrcReg(ReservedBufferReg); |
| } |
| } |
| |
| MachineRegisterInfo &MRI = MF.getRegInfo(); |
| |
| // For entry functions we have to set up the stack pointer if we use it, |
| // whereas non-entry functions get this "for free". This means there is no |
| // intrinsic advantage to using S32 over S34 in cases where we do not have |
| // calls but do need a frame pointer (i.e. if we are requested to have one |
| // because frame pointer elimination is disabled). To keep things simple we |
| // only ever use S32 as the call ABI stack pointer, and so using it does not |
| // imply we need a separate frame pointer. |
| // |
| // Try to use s32 as the SP, but move it if it would interfere with input |
| // arguments. This won't work with calls though. |
| // |
| // FIXME: Move SP to avoid any possible inputs, or find a way to spill input |
| // registers. |
| if (!MRI.isLiveIn(AMDGPU::SGPR32)) { |
| Info.setStackPtrOffsetReg(AMDGPU::SGPR32); |
| } else { |
| assert(AMDGPU::isShader(MF.getFunction().getCallingConv())); |
| |
| if (MFI.hasCalls()) |
| report_fatal_error("call in graphics shader with too many input SGPRs"); |
| |
| for (unsigned Reg : AMDGPU::SGPR_32RegClass) { |
| if (!MRI.isLiveIn(Reg)) { |
| Info.setStackPtrOffsetReg(Reg); |
| break; |
| } |
| } |
| |
| if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG) |
| report_fatal_error("failed to find register for SP"); |
| } |
| |
| // hasFP should be accurate for entry functions even before the frame is |
| // finalized, because it does not rely on the known stack size, only |
| // properties like whether variable sized objects are present. |
| if (ST.getFrameLowering()->hasFP(MF)) { |
| Info.setFrameOffsetReg(AMDGPU::SGPR33); |
| } |
| } |
| |
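| // Split CSR handling only applies to non-entry functions; entry functions |
| // have no callers and therefore no callee-saved registers to preserve. |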
| bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const { |
| const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); |
| return !Info->isEntryFunction(); |
| } |
| |
| void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { |
| |
| } |
| |
| void SITargetLowering::insertCopiesSplitCSR( |
| MachineBasicBlock *Entry, |
| const SmallVectorImpl<MachineBasicBlock *> &Exits) const { |
| const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); |
| |
| const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); |
| if (!IStart) |
| return; |
| |
| const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
| MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); |
| MachineBasicBlock::iterator MBBI = Entry->begin(); |
| for (const MCPhysReg *I = IStart; *I; ++I) { |
| const TargetRegisterClass *RC = nullptr; |
| if (AMDGPU::SReg_64RegClass.contains(*I)) |
| RC = &AMDGPU::SGPR_64RegClass; |
| else if (AMDGPU::SReg_32RegClass.contains(*I)) |
| RC = &AMDGPU::SGPR_32RegClass; |
| else |
| llvm_unreachable("Unexpected register class in CSRsViaCopy!"); |
| |
| Register NewVR = MRI->createVirtualRegister(RC); |
| // Create copy from CSR to a virtual register. |
| Entry->addLiveIn(*I); |
| BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) |
| .addReg(*I); |
| |
| // Insert the copy-back instructions right before the terminator. |
| for (auto *Exit : Exits) |
| BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), |
| TII->get(TargetOpcode::COPY), *I) |
| .addReg(NewVR); |
| } |
| } |
| |
| SDValue SITargetLowering::LowerFormalArguments( |
| SDValue Chain, CallingConv::ID CallConv, bool isVarArg, |
| const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, |
| SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { |
| const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); |
| |
| MachineFunction &MF = DAG.getMachineFunction(); |
| const Function &Fn = MF.getFunction(); |
| FunctionType *FType = MF.getFunction().getFunctionType(); |
| SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
| |
| if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) { |
| DiagnosticInfoUnsupported NoGraphicsHSA( |
| Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); |
| DAG.getContext()->diagnose(NoGraphicsHSA); |
| return DAG.getEntryNode(); |
| } |
| |
| Info->allocateModuleLDSGlobal(Fn.getParent()); |
| |
| SmallVector<ISD::InputArg, 16> Splits; |
| SmallVector<CCValAssign, 16> ArgLocs; |
| BitVector Skipped(Ins.size()); |
| CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, |
| *DAG.getContext()); |
| |
| bool IsGraphics = AMDGPU::isGraphics(CallConv); |
| bool IsKernel = AMDGPU::isKernel(CallConv); |
| bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv); |
| |
| if (IsGraphics) { |
| assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && |
| (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) && |
| !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && |
| !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && |
| !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && |
| !Info->hasWorkItemIDZ()); |
| } |
| |
| if (CallConv == CallingConv::AMDGPU_PS) { |
| processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info); |
| |
| // At least one interpolation mode must be enabled or else the GPU will |
| // hang. |
| // |
| // Check PSInputAddr instead of PSInputEnable. The idea is that if the user |
| // set PSInputAddr, the user wants to enable some bits after compilation |
| // based on run-time states. Since we can't know what the final PSInputEna |
| // will look like, we shouldn't do anything here, and the user should take |
| // responsibility for the correct programming. |
| // |
| // Otherwise, the following restrictions apply: |
| // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. |
| // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be |
| // enabled too. |
| if ((Info->getPSInputAddr() & 0x7F) == 0 || |
| ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) { |
| CCInfo.AllocateReg(AMDGPU::VGPR0); |
| CCInfo.AllocateReg(AMDGPU::VGPR1); |
| Info->markPSInputAllocated(0); |
| Info->markPSInputEnabled(0); |
| } |
| if (Subtarget->isAmdPalOS()) { |
| // For isAmdPalOS, the user does not enable some bits after compilation |
| // based on run-time states; the register values being generated here are |
| // the final ones set in hardware. Therefore we need to apply the |
| // workaround to PSInputAddr and PSInputEnable together. (The case where |
| // a bit is set in PSInputAddr but not PSInputEnable is where the |
| // frontend set up an input arg for a particular interpolation mode, but |
| // nothing uses that input arg. Really we should have an earlier pass |
| // that removes such an arg.) |
| unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); |
| if ((PsInputBits & 0x7F) == 0 || |
| ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1))) |
| Info->markPSInputEnabled( |
| countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined)); |
| } |
| } else if (IsKernel) { |
| assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); |
| } else { |
| Splits.append(Ins.begin(), Ins.end()); |
| } |
| |
| if (IsEntryFunc) { |
| allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); |
| allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); |
| } else { |
| // For the fixed ABI, pass workitem IDs in the last argument register. |
| if (AMDGPUTargetMachine::EnableFixedFunctionABI) |
| allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); |
| } |
| |
| if (IsKernel) { |
| analyzeFormalArgumentsCompute(CCInfo, Ins); |
| } else { |
| CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); |
| CCInfo.AnalyzeFormalArguments(Splits, AssignFn); |
| } |
| |
| SmallVector<SDValue, 16> Chains; |
| |
| // FIXME: This is the minimum kernel argument alignment. We should improve |
| // this to the maximum alignment of the arguments. |
| // |
| // FIXME: Alignment of explicit arguments totally broken with non-0 explicit |
| // kern arg offset. |
| const Align KernelArgBaseAlign = Align(16); |
| |
| for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { |
| const ISD::InputArg &Arg = Ins[i]; |
| if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) { |
| InVals.push_back(DAG.getUNDEF(Arg.VT)); |
| continue; |
| } |
| |
| CCValAssign &VA = ArgLocs[ArgIdx++]; |
| MVT VT = VA.getLocVT(); |
| |
| if (IsEntryFunc && VA.isMemLoc()) { |
| VT = Ins[i].VT; |
| EVT MemVT = VA.getLocVT(); |
| |
| const uint64_t Offset = VA.getLocMemOffset(); |
| Align Alignment = commonAlignment(KernelArgBaseAlign, Offset); |
| |
| if (Arg.Flags.isByRef()) { |
| SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset); |
| |
| const GCNTargetMachine &TM = |
| static_cast<const GCNTargetMachine &>(getTargetMachine()); |
| if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS, |
| Arg.Flags.getPointerAddrSpace())) { |
| Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS, |
| Arg.Flags.getPointerAddrSpace()); |
| } |
| |
| InVals.push_back(Ptr); |
| continue; |
| } |
| |
| SDValue Arg = lowerKernargMemParameter( |
| DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]); |
| Chains.push_back(Arg.getValue(1)); |
| |
| auto *ParamTy = |
| dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); |
| if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && |
| ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || |
| ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) { |
| // On SI local pointers are just offsets into LDS, so they are always |
| // less than 16-bits. On CI and newer they could potentially be |
| // real pointers, so we can't guarantee their size. |
| Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, |
| DAG.getValueType(MVT::i16)); |
| } |
| |
| InVals.push_back(Arg); |
| continue; |
| } else if (!IsEntryFunc && VA.isMemLoc()) { |
| SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg); |
| InVals.push_back(Val); |
| if (!Arg.Flags.isByVal()) |
| Chains.push_back(Val.getValue(1)); |
| continue; |
| } |
| |
| assert(VA.isRegLoc() && "Parameter must be in a register!"); |
| |
| Register Reg = VA.getLocReg(); |
| const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); |
| EVT ValVT = VA.getValVT(); |
| |
| Reg = MF.addLiveIn(Reg, RC); |
| SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); |
| |
| if (Arg.Flags.isSRet()) { |
| // The return object should be reasonably addressable. |
| |
| // FIXME: This helps when the return is a real sret. If it is an |
| // automatically inserted sret (i.e. CanLowerReturn returns false), an |
| // extra copy is inserted in SelectionDAGBuilder which obscures this. |
| unsigned NumBits |
| = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex(); |
| Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, |
| DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits))); |
| } |
| |
| // If this is an 8 or 16-bit value, it is really passed promoted |
| // to 32 bits. Insert an assert[sz]ext to capture this, then |
| // truncate to the right size. |
| switch (VA.getLocInfo()) { |
| case CCValAssign::Full: |
| break; |
| case CCValAssign::BCvt: |
| Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val); |
| break; |
| case CCValAssign::SExt: |
| Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, |
| DAG.getValueType(ValVT)); |
| Val = DAG.getNode( |