// blob: 56ebf9c06741a4916ea664e0c96e955dba670a7f [file] [log] [blame]
//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//
#if defined(_MSC_VER) || defined(__MINGW32__)
// Provide M_PI.
#define _USE_MATH_DEFINES
#endif
#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iterator>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
#define DEBUG_TYPE "si-lower"
// Statistic counter described as "Number of tail calls".
STATISTIC(NumTailCalls, "Number of tail calls");
// Boolean command-line option "amdgpu-vgpr-index-mode", off by default.
// When set, GPR indexing mode is used instead of movrel for vector indexing
// (per the option description below).
static cl::opt<bool> EnableVGPRIndexMode(
"amdgpu-vgpr-index-mode",
cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
cl::init(false));
// Boolean command-line option "amdgpu-disable-loop-alignment", off by default.
// When set, loops are not aligned and prefetched (per the option description
// below).
static cl::opt<bool> DisableLoopAlignment(
"amdgpu-disable-loop-alignment",
cl::desc("Do not align and prefetch loops"),
cl::init(false));
/// Scans the 32-bit SGPRs upward from SGPR0 and returns the first register
/// that \p CCInfo does not report as allocated. Running out of candidates is
/// treated as unreachable.
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  const unsigned SGPRCount = AMDGPU::SGPR_32RegClass.getNumRegs();
  unsigned Idx = 0;
  while (Idx < SGPRCount) {
    const unsigned Candidate = AMDGPU::SGPR0 + Idx;
    if (!CCInfo.isAllocated(Candidate))
      return Candidate;
    ++Idx;
  }
  llvm_unreachable("Cannot allocate sgpr");
}
/// Configures SelectionDAG lowering for the SI target.
///
/// The constructor (1) registers a register class for each value type that is
/// treated as legal, (2) declares the legalization action for individual
/// (opcode, type) pairs, (3) lists the opcodes for which target DAG combines
/// are requested, and (4) selects the scheduling preference. Several groups
/// are conditional on feature predicates of the subtarget.
///
/// Some (opcode, type) pairs are configured more than once below; later calls
/// are assumed to replace earlier ones, so statement order is significant.
///
/// \param TM  Target machine, forwarded to the AMDGPUTargetLowering base.
/// \param STI Subtarget; its address is stored in the Subtarget member and it
///            is queried for the feature predicates used below.
SITargetLowering::SITargetLowering(const TargetMachine &TM,
const GCNSubtarget &STI)
: AMDGPUTargetLowering(TM, STI),
Subtarget(&STI) {
// Register classes for the value types that are always available.
addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
// 16-bit scalars and packed 16-bit vectors only get register classes when
// the subtarget has 16-bit instructions.
if (Subtarget->has16BitInsts()) {
addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
// Unless there are also VOP3P operations, no operations are really legal.
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
}
// 32-element vectors of 32-bit values are only registered when MAI
// instructions are available.
if (Subtarget->hasMAIInsts()) {
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
}
// All register classes have been added; compute the register properties that
// depend on them.
computeRegisterProperties(Subtarget->getRegisterInfo());
// We need to custom lower vector loads and stores from local memory. The i1
// scalar type is custom lowered as well.
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
setOperationAction(ISD::LOAD, MVT::i1, Custom);
setOperationAction(ISD::LOAD, MVT::v32i32, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v3i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
setOperationAction(ISD::STORE, MVT::v5i32, Custom);
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
setOperationAction(ISD::STORE, MVT::v32i32, Custom);
// Truncating stores from vectors of 32-bit elements to 16-bit or 8-bit
// elements are expanded.
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
// Global addresses are custom lowered for both i32 and i64.
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
// SELECT: i1 and f64 are promoted (f64 is promoted to i64); i64 is custom
// lowered.
setOperationAction(ISD::SELECT, MVT::i1, Promote);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Promote);
AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
// SELECT_CC is expanded for the scalar types listed here.
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
// SETCC on i1 is promoted to i32; SETCC on v2i1/v4i1 is expanded.
setOperationAction(ISD::SETCC, MVT::i1, Promote);
setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
// SIGN_EXTEND_INREG is custom lowered for these types.
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v3i16, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
// Branches: BRCOND is custom lowered, BR_CC is expanded.
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
setOperationAction(ISD::BR_CC, MVT::i64, Expand);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
// 32-bit add/sub with overflow or carry are legal.
setOperationAction(ISD::UADDO, MVT::i32, Legal);
setOperationAction(ISD::USUBO, MVT::i32, Legal);
setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
// 64-bit shift-parts operations are expanded.
setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
// Disabled code: would mark the 64-bit ADDCARRY/SUBCARRY as legal.
#if 0
setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
#endif
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
// For each listed type: the opcodes in the first case group keep whatever
// action they already have, CONCAT_VECTORS is custom lowered, and every other
// builtin opcode is expanded.
for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
MVT::v32i32, MVT::v32f32 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
case ISD::STORE:
case ISD::BUILD_VECTOR:
case ISD::BITCAST:
case ISD::EXTRACT_VECTOR_ELT:
case ISD::INSERT_VECTOR_ELT:
case ISD::INSERT_SUBVECTOR:
case ISD::EXTRACT_SUBVECTOR:
case ISD::SCALAR_TO_VECTOR:
break;
case ISD::CONCAT_VECTORS:
setOperationAction(Op, VT, Custom);
break;
default:
setOperationAction(Op, VT, Expand);
break;
}
}
}
setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
// TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
// is expanded to avoid having two separate loops in case the index is a VGPR.
// Most operations are naturally 32-bit vector operations. We only support
// load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
}
// Shuffles of 8- and 16-element vectors of 32-bit values are expanded.
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
// Avoid stack access for these.
// TODO: Generalize to more vector types.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
// NOTE(review): INSERT_VECTOR_ELT for v4i16/v4f16 was already set to Custom a
// few lines above; the next two calls repeat that setting.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
// Deal with vec3 vector operations when widened to vec4.
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);
// Deal with vec5 vector operations when widened to vec8.
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
// and output demarshalling
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
// We can't return success/failure, only the old value,
// let LLVM add the comparison
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
// Address space casts are custom lowered only when the subtarget has a flat
// address space.
if (Subtarget->hasFlatAddressSpace()) {
setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
}
// 32-bit byte swap and bit reverse are legal.
setOperationAction(ISD::BSWAP, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
// On SI this is s_memtime and s_memrealtime on VI.
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
setOperationAction(ISD::TRAP, MVT::Other, Custom);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
// f16 log/exp/log10 are custom lowered when 16-bit instructions exist.
if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::FLOG, MVT::f16, Custom);
setOperationAction(ISD::FEXP, MVT::f16, Custom);
setOperationAction(ISD::FLOG10, MVT::f16, Custom);
}
// v_mad_f32 does not support denormals according to some sources.
if (!Subtarget->hasFP32Denormals())
setOperationAction(ISD::FMAD, MVT::f32, Legal);
if (!Subtarget->hasBFI()) {
// fcopysign can be done in a single instruction with BFI.
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
}
// CTPOP is expanded for a width when hasBCNT reports no support for it.
if (!Subtarget->hasBCNT(32))
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
if (!Subtarget->hasBCNT(64))
setOperationAction(ISD::CTPOP, MVT::i64, Expand);
// The zero-undef count-leading/trailing-zeros forms are custom lowered when
// the subtarget reports FFBH / FFBL respectively.
if (Subtarget->hasFFBH())
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
if (Subtarget->hasFFBL())
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
// We only really have 32-bit BFE instructions (and 16-bit on VI).
//
// On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
// effort to match them now. We want this to be false for i64 cases when the
// extraction isn't restricted to the upper or lower half. Ideally we would
// have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
// span the midpoint are probably relatively rare, so don't worry about them
// for now.
if (Subtarget->hasBFE())
setHasExtractBitsInsn(true);
// FMINNUM/FMAXNUM are custom lowered for f32 and f64.
setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);
// These are really only legal for ieee_mode functions. We should be avoiding
// them for functions that don't have ieee_mode enabled, so just say they are
// legal.
setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
// f64 rounding operations: legal when the subtarget has f64 round ops,
// custom lowered otherwise.
if (Subtarget->haveRoundOpsF64()) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FRINT, MVT::f64, Legal);
} else {
setOperationAction(ISD::FCEIL, MVT::f64, Custom);
setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
setOperationAction(ISD::FRINT, MVT::f64, Custom);
setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
}
// NOTE(review): this call is unconditional and comes after the else branch
// above set FFLOOR f64 to Custom, so it presumably replaces that setting on
// every subtarget — confirm this is intended.
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
setOperationAction(ISD::FSIN, MVT::f32, Custom);
setOperationAction(ISD::FCOS, MVT::f32, Custom);
setOperationAction(ISD::FDIV, MVT::f32, Custom);
setOperationAction(ISD::FDIV, MVT::f64, Custom);
// Configuration for i16, f16 and packed 16-bit vector types, applied only
// when the subtarget has 16-bit instructions.
if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::Constant, MVT::i16, Legal);
setOperationAction(ISD::SMIN, MVT::i16, Legal);
setOperationAction(ISD::SMAX, MVT::i16, Legal);
setOperationAction(ISD::UMIN, MVT::i16, Legal);
setOperationAction(ISD::UMAX, MVT::i16, Legal);
// The i16 operations below are promoted.
setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
setOperationAction(ISD::ROTR, MVT::i16, Promote);
setOperationAction(ISD::ROTL, MVT::i16, Promote);
setOperationAction(ISD::SDIV, MVT::i16, Promote);
setOperationAction(ISD::UDIV, MVT::i16, Promote);
setOperationAction(ISD::SREM, MVT::i16, Promote);
setOperationAction(ISD::UREM, MVT::i16, Promote);
setOperationAction(ISD::BSWAP, MVT::i16, Promote);
setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
setOperationAction(ISD::CTTZ, MVT::i16, Promote);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
setOperationAction(ISD::CTLZ, MVT::i16, Promote);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
setOperationAction(ISD::CTPOP, MVT::i16, Promote);
setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
setOperationAction(ISD::BR_CC, MVT::i16, Expand);
setOperationAction(ISD::LOAD, MVT::i16, Custom);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
// F16 - Constant Actions.
setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
// F16 - Load/Store Actions.
setOperationAction(ISD::LOAD, MVT::f16, Promote);
AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
setOperationAction(ISD::STORE, MVT::f16, Promote);
AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
// F16 - VOP1 Actions.
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::FCOS, MVT::f16, Promote);
setOperationAction(ISD::FSIN, MVT::f16, Promote);
setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
setOperationAction(ISD::FROUND, MVT::f16, Custom);
// F16 - VOP2 Actions.
setOperationAction(ISD::BR_CC, MVT::f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
setOperationAction(ISD::FDIV, MVT::f16, Custom);
// F16 - VOP3 Actions.
setOperationAction(ISD::FMA, MVT::f16, Legal);
// f16 FMAD is only legal without f16 denormals and when mad_f16 exists.
if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
setOperationAction(ISD::FMAD, MVT::f16, Legal);
// Packed 16-bit vectors: the opcodes in the first case group keep whatever
// action they already have, CONCAT_VECTORS is custom lowered, and every other
// builtin opcode is expanded. Individual opcodes are re-enabled further down.
for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
case ISD::STORE:
case ISD::BUILD_VECTOR:
case ISD::BITCAST:
case ISD::EXTRACT_VECTOR_ELT:
case ISD::INSERT_VECTOR_ELT:
case ISD::INSERT_SUBVECTOR:
case ISD::EXTRACT_SUBVECTOR:
case ISD::SCALAR_TO_VECTOR:
break;
case ISD::CONCAT_VECTORS:
setOperationAction(Op, VT, Custom);
break;
default:
setOperationAction(Op, VT, Expand);
break;
}
}
}
// XXX - Do these do anything? Vector constants turn into build_vector.
setOperationAction(ISD::Constant, MVT::v2i16, Legal);
setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);
// v2i16/v2f16 loads and stores, and v2i16 bitwise ops, are promoted to i32.
setOperationAction(ISD::STORE, MVT::v2i16, Promote);
AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
setOperationAction(ISD::STORE, MVT::v2f16, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
setOperationAction(ISD::AND, MVT::v2i16, Promote);
AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
setOperationAction(ISD::OR, MVT::v2i16, Promote);
AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
setOperationAction(ISD::XOR, MVT::v2i16, Promote);
AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
// v4i16/v4f16 loads and stores are promoted to v2i32.
setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
setOperationAction(ISD::STORE, MVT::v4i16, Promote);
AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
setOperationAction(ISD::STORE, MVT::v4f16, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
// Extensions producing v2i32/v2f32/v4i32 results are expanded.
setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
// Without VOP3P instructions, BUILD_VECTOR of v2i16/v2f16 is custom lowered.
if (!Subtarget->hasVOP3PInsts()) {
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
}
setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
// This isn't really legal, but this avoids the legalizer unrolling it (and
// allows matching fneg (fabs x) patterns)
setOperationAction(ISD::FABS, MVT::v2f16, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);
setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
}
// With VOP3P instructions, packed 2 x 16-bit arithmetic is legal and the
// 4 x 16-bit forms are custom lowered.
if (Subtarget->hasVOP3PInsts()) {
setOperationAction(ISD::ADD, MVT::v2i16, Legal);
setOperationAction(ISD::SUB, MVT::v2i16, Legal);
setOperationAction(ISD::MUL, MVT::v2i16, Legal);
setOperationAction(ISD::SHL, MVT::v2i16, Legal);
setOperationAction(ISD::SRL, MVT::v2i16, Legal);
setOperationAction(ISD::SRA, MVT::v2i16, Legal);
setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
setOperationAction(ISD::FADD, MVT::v2f16, Legal);
setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
setOperationAction(ISD::FMA, MVT::v2f16, Legal);
setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);
setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
// NOTE(review): EXTRACT_VECTOR_ELT for v2i16/v2f16 was already set to Custom
// unconditionally earlier in this constructor; these calls repeat it.
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
setOperationAction(ISD::SHL, MVT::v4i16, Custom);
setOperationAction(ISD::SRA, MVT::v4i16, Custom);
setOperationAction(ISD::SRL, MVT::v4i16, Custom);
setOperationAction(ISD::ADD, MVT::v4i16, Custom);
setOperationAction(ISD::SUB, MVT::v4i16, Custom);
setOperationAction(ISD::MUL, MVT::v4i16, Custom);
setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
setOperationAction(ISD::FADD, MVT::v4f16, Custom);
setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
setOperationAction(ISD::FMA, MVT::v4f16, Custom);
setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);
setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
}
// v4f16 FNEG/FABS are custom lowered regardless of subtarget features.
setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
setOperationAction(ISD::FABS, MVT::v4f16, Custom);
// SELECT on v2i16/v2f16: promoted to i32 with 16-bit instructions, custom
// lowered otherwise.
if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
} else {
// Legalization hack.
setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
setOperationAction(ISD::FABS, MVT::v2f16, Custom);
}
// SELECT is custom lowered for these vector types on every subtarget.
for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
setOperationAction(ISD::SELECT, VT, Custom);
}
// Intrinsic nodes (without chain, with chain, and void) are custom lowered
// for the types listed below.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
// Opcodes for which target-specific DAG combines are requested.
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::ADDCARRY);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::SUBCARRY);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::FMINNUM_IEEE);
setTargetDAGCombine(ISD::FMAXNUM_IEEE);
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SMIN);
setTargetDAGCombine(ISD::SMAX);
setTargetDAGCombine(ISD::UMIN);
setTargetDAGCombine(ISD::UMAX);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::FCANONICALIZE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::ATOMIC_LOAD);
setTargetDAGCombine(ISD::ATOMIC_STORE);
setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
setTargetDAGCombine(ISD::ATOMIC_SWAP);
setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
// Select the register-pressure scheduling preference.
setSchedulingPreference(Sched::RegPressure);
}
/// Accessor for the GCN subtarget pointer held by this lowering object.
const GCNSubtarget *SITargetLowering::getSubtarget() const {
  const GCNSubtarget *ST = this->Subtarget;
  return ST;
}
//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//
// v_mad_mix* support a conversion from f16 to f32.
//
// Reports whether an f16 -> f32 extension can be folded into \p Opcode: the
// opcode must be FMAD on a subtarget with mad-mix instructions, or FMA on a
// subtarget with fma-mix instructions, and FP32 denormals must be disabled.
//
// There is only one special case where this would be OK to use with denormals
// enabled, and it is not currently handled.
bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {
  // Does the subtarget have the mixed-precision instruction for this opcode?
  bool HasMixInsts = false;
  switch (Opcode) {
  case ISD::FMAD:
    HasMixInsts = Subtarget->hasMadMixInsts();
    break;
  case ISD::FMA:
    HasMixInsts = Subtarget->hasFmaMixInsts();
    break;
  default:
    break;
  }
  if (!HasMixInsts)
    return false;

  // Only an extension to f32 (scalar or per element) qualifies.
  if (DestVT.getScalarType() != MVT::f32)
    return false;

  if (Subtarget->hasFP32Denormals())
    return false;

  // ... and only when the source is f16 (scalar or per element).
  return SrcVT.getScalarType() == MVT::f16;
}
/// Always answers "no", independent of the mask and the vector type.
bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int> /*Mask*/,
                                          EVT /*VT*/) const {
  // SI has some legal vector types, but no legal vector operations. Reporting
  // every shuffle as illegal makes scalarizing some vector operations the
  // preferred choice.
  const bool AnyShuffleLegal = false;
  return AnyShuffleLegal;
}
/// Picks the register type used to pass a value of type \p VT under calling
/// convention \p CC.
///
/// AMDGPU_KERNEL always uses the generic TargetLowering answer. For other
/// conventions:
///   - vectors of 32-bit elements use their element type;
///   - vectors of wider elements, and scalars wider than 32 bits, use i32;
///   - vectors of 16-bit elements use v2i16 / v2f16 when the subtarget has
///     16-bit instructions.
/// Everything else falls back to the generic TargetLowering answer.
MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {
  if (CC != CallingConv::AMDGPU_KERNEL) {
    if (!VT.isVector()) {
      if (VT.getSizeInBits() > 32)
        return MVT::i32;
    } else {
      const EVT EltVT = VT.getScalarType();
      const unsigned EltBits = EltVT.getSizeInBits();

      if (EltBits == 32)
        return EltVT.getSimpleVT();

      if (EltBits > 32)
        return MVT::i32;

      if (EltBits == 16 && Subtarget->has16BitInsts()) {
        if (VT.isInteger())
          return MVT::v2i16;
        return MVT::v2f16;
      }
    }
  }

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
/// Counts the registers needed to pass a value of type \p VT under calling
/// convention \p CC.
///
/// AMDGPU_KERNEL always uses the generic TargetLowering answer. For other
/// conventions:
///   - vectors of 32-bit elements take one register per element;
///   - vectors of wider elements take ceil(bits / 32) registers per element;
///   - vectors of 16-bit elements take one register per element pair when
///     the subtarget has 16-bit instructions;
///   - scalars wider than 32 bits take ceil(bits / 32) registers.
/// Everything else falls back to the generic TargetLowering answer.
unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  if (CC != CallingConv::AMDGPU_KERNEL) {
    if (!VT.isVector()) {
      if (VT.getSizeInBits() > 32)
        return (VT.getSizeInBits() + 31) / 32;
    } else {
      const unsigned EltCount = VT.getVectorNumElements();
      const EVT EltVT = VT.getScalarType();
      const unsigned EltBits = EltVT.getSizeInBits();

      if (EltBits == 32)
        return EltCount;

      if (EltBits > 32)
        return EltCount * ((EltBits + 31) / 32);

      if (EltBits == 16 && Subtarget->has16BitInsts())
        return (EltCount + 1) / 2;
    }
  }

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
/// Describes how a vector of type \p VT is split into registers under calling
/// convention \p CC.
///
/// For non-kernel conventions and vector types, three element widths are
/// handled here: 32-bit elements (one register of the element type each),
/// elements wider than 32 bits (ceil(bits / 32) i32 registers each), and
/// 16-bit elements on subtargets with 16-bit instructions (one v2i16 / v2f16
/// register per element pair). In those cases \p RegisterVT and
/// \p IntermediateVT are set to the same type and \p NumIntermediates to the
/// piece count, which is also returned. All other cases are delegated to the
/// generic TargetLowering implementation without touching the out-parameters
/// here.
unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC,
    EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    const unsigned EltCount = VT.getVectorNumElements();
    const EVT EltVT = VT.getScalarType();
    const unsigned EltBits = EltVT.getSizeInBits();

    bool Handled = true;
    MVT PieceVT;
    unsigned PieceCount = 0;

    if (EltBits == 32) {
      PieceVT = EltVT.getSimpleVT();
      PieceCount = EltCount;
    } else if (EltBits > 32) {
      PieceVT = MVT::i32;
      PieceCount = EltCount * ((EltBits + 31) / 32);
    } else if (EltBits == 16 && Subtarget->has16BitInsts()) {
      // FIXME: We should fix the ABI to be the same on targets without 16-bit
      // support, but unless we can properly handle 3-vectors, it will be still
      // be inconsistent.
      PieceVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      PieceCount = (EltCount + 1) / 2;
    } else {
      Handled = false;
    }

    if (Handled) {
      RegisterVT = PieceVT;
      IntermediateVT = RegisterVT;
      NumIntermediates = PieceCount;
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
/// Compute a vector memory type for an intrinsic that returns a struct.
///
/// The struct's first member supplies the data (a scalar, or a vector whose
/// element type is used) and its second member is asserted to be an i32. The
/// result is a vector of the data element type whose element count is the
/// data element count plus extra elements (two for 16-bit elements, one
/// otherwise -- presumably to make room for the i32 member), rounded up to a
/// power of two.
static MVT memVTFromAggregate(Type *Ty) {
  // Only limited forms of aggregate type currently expected.
  assert(Ty->isStructTy() && "Expected struct type");

  // Start by assuming a scalar data member, then refine for vectors.
  Type *DataTy = Ty->getContainedType(0);
  Type *ElementType = DataTy;
  unsigned NumElts = 1;
  if (DataTy->isVectorTy()) {
    VectorType *VecComponent = cast<VectorType>(DataTy);
    ElementType = VecComponent->getElementType();
    NumElts = VecComponent->getNumElements();
  }

  assert((Ty->getContainedType(1) && Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");

  // Width in bits of one data element.
  unsigned ElementSize;
  switch (ElementType->getTypeID()) {
  case Type::HalfTyID:
    ElementSize = 16;
    break;
  case Type::FloatTyID:
    ElementSize = 32;
    break;
  case Type::IntegerTyID:
    ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
    break;
  default:
    llvm_unreachable("Unknown type!");
  }

  // Add the extra elements and round the total up to a power of two.
  const unsigned AdditionalElts = (ElementSize == 16) ? 2 : 1;
  const unsigned Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);

  return MVT::getVectorVT(MVT::getVT(ElementType, false), Pow2Elts);
}
/// Describe the memory access performed by target intrinsic \p IntrID at call
/// site \p CI by filling in \p Info (opcode, memory VT, pointer value,
/// alignment and MachineMemOperand flags).
///
/// Intrinsics found by AMDGPU::lookupRsrcIntrinsic are classified by their
/// ReadNone / ReadOnly / WriteOnly attributes; a fixed list of other
/// intrinsics is handled case by case in the switch below.
///
/// \returns true if \p Info was populated; false for ReadNone resource
/// intrinsics and for intrinsics not handled here.
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
unsigned IntrID) const {
if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
(Intrinsic::ID)IntrID);
// No memory access to describe.
if (Attr.hasFnAttribute(Attribute::ReadNone))
return false;
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// Use a pseudo source value derived from the resource operand as the
// pointer; image and buffer resources use different PSV kinds.
if (RsrcIntr->IsImage) {
Info.ptrVal = MFI->getImagePSV(
*MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
CI.getArgOperand(RsrcIntr->RsrcArg));
Info.align.reset();
} else {
Info.ptrVal = MFI->getBufferPSV(
*MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
CI.getArgOperand(RsrcIntr->RsrcArg));
}
Info.flags = MachineMemOperand::MODereferenceable;
if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
// Load: the memory type comes from the call's result type.
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType(), true);
if (Info.memVT == MVT::Other) {
// Some intrinsics return an aggregate type - special case to work out
// the correct memVT
Info.memVT = memVTFromAggregate(CI.getType());
}
Info.flags |= MachineMemOperand::MOLoad;
} else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
// Store: the memory type comes from the first argument.
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
Info.flags |= MachineMemOperand::MOStore;
} else {
// Atomic
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.flags = MachineMemOperand::MOLoad |
MachineMemOperand::MOStore |
MachineMemOperand::MODereferenceable;
// XXX - Should this be volatile without known ordering?
Info.flags |= MachineMemOperand::MOVolatile;
}
return true;
}
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
// Operand 4 must be a constant here; a non-zero value marks the access
// volatile.
const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
if (!Vol->isZero())
Info.flags |= MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::amdgcn_buffer_atomic_fadd: {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
Info.ptrVal = MFI->getBufferPSV(
*MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
CI.getArgOperand(1));
Info.align.reset();
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
// Unlike the case above, operand 4 may be non-constant here; that is
// conservatively treated the same as a non-zero (volatile) value.
const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
if (!Vol || !Vol->isZero())
Info.flags |= MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::amdgcn_global_atomic_fadd: {
Info.opc = ISD::INTRINSIC_VOID;
// The memory type is the pointee type of the pointer operand.
Info.memVT = MVT::getVT(CI.getOperand(0)->getType()
->getPointerElementType());
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
// Operand 1 must be a constant; non-zero marks the access volatile.
const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
if (!Vol->isZero())
Info.flags |= MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
case Intrinsic::amdgcn_ds_gws_sema_v:
case Intrinsic::amdgcn_ds_gws_sema_br:
case Intrinsic::amdgcn_ds_gws_sema_p:
case Intrinsic::amdgcn_ds_gws_sema_release_all: {
Info.opc = ISD::INTRINSIC_VOID;
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Info.ptrVal =
MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
// This is an abstract access, but we need to specify a type and size.
Info.memVT = MVT::i32;
Info.size = 4;
Info.align = Align(4);
// The barrier is described as a load; every other GWS operation as a store.
Info.flags = MachineMemOperand::MOStore;
if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
Info.flags = MachineMemOperand::MOLoad;
return true;
}
default:
return false;
}
}
/// For the listed atomic / DS intrinsics, report the pointer operand
/// (argument 0) in \p Ops and the accessed type (the call's result type) in
/// \p AccessTy.
///
/// \returns true if \p II is one of the handled intrinsics; otherwise false,
/// leaving \p Ops and \p AccessTy untouched.
bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
                                            SmallVectorImpl<Value*> &Ops,
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    break;
  default:
    return false;
  }

  AccessTy = II->getType();
  Ops.push_back(II->getArgOperand(0));
  return true;
}
/// Return true if \p AM can be folded into a FLAT instruction.
///
/// A scaled register is never accepted. The permitted immediate offset
/// depends on the subtarget: none without flat instruction offsets, 11
/// unsigned bits on GFX10 and later, 12 unsigned bits otherwise.
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
  // Every variant below requires an unscaled address.
  if (AM.Scale != 0)
    return false;

  // Flat instructions do not have offsets, and only have the register
  // address.
  if (!Subtarget->hasFlatInstOffsets())
    return AM.BaseOffs == 0;

  // GFX9 added a 13-bit signed offset. When using regular flat instructions,
  // the sign bit is ignored and is treated as a 12-bit unsigned offset.
  // GFX10 shrank the signed offset to 12 bits. When using regular flat
  // instructions, the sign bit is also ignored and is treated as an 11-bit
  // unsigned offset.
  if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
    return isUInt<11>(AM.BaseOffs);

  // Just r + i
  return isUInt<12>(AM.BaseOffs);
}
/// Return true if \p AM is legal for a global memory access.
///
/// With flat-global instructions, a signed 13-bit offset with no scale is
/// accepted. Otherwise the decision is delegated to the FLAT or MUBUF rules,
/// depending on the subtarget.
bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
  if (Subtarget->hasFlatGlobalInsts())
    return AM.Scale == 0 && isInt<13>(AM.BaseOffs);

  // Assume that we will use FLAT for all global memory accesses
  // on VI.
  // FIXME: This assumption is currently wrong. On VI we still use
  // MUBUF instructions for the r + i addressing mode. As currently
  // implemented, the MUBUF instructions only work on buffer < 4GB.
  // It may be possible to support > 4GB buffers with MUBUF instructions,
  // by setting the stride value in the resource descriptor which would
  // increase the size limit to (stride * 4GB). However, this is risky,
  // because it has never been validated.
  const bool AssumeFlat =
      !Subtarget->hasAddr64() || Subtarget->useFlatForGlobal();

  return AssumeFlat ? isLegalFlatAddressingMode(AM)
                    : isLegalMUBUFAddressingMode(AM);
}
/// Return true if \p AM is legal for a MUBUF / MTBUF access: the immediate
/// must fit in 12 unsigned bits, and the scale must be 0, 1, or 2 without a
/// separate base register.
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
  // additionally can do r + r + i with addr64. 32-bit has more addressing
  // mode options. Depending on the resource constant, it can also do
  // (i64 r0) + (i32 r1) * (i14 i).
  //
  // Private arrays end up using a scratch buffer most of the time, so also
  // assume those use MUBUF instructions. Scratch loads / stores are currently
  // implemented as mubuf instructions with offen bit set, so slightly
  // different than the normal addr64.
  //
  // FIXME: Since we can split immediate into soffset and immediate offset,
  // would it make sense to allow any immediate?
  if (!isUInt<12>(AM.BaseOffs))
    return false;

  switch (AM.Scale) {
  case 0: // r + i or just i, depending on HasBaseReg.
  case 1: // r + r or r + i.
    return true;
  case 2:
    // 2 * r is r + r, and 2 * r + i is r + r + i; but 2 * r + r is rejected.
    return !AM.HasBaseReg;
  default: // Don't allow n * r
    return false;
  }
}
/// Return true if addressing mode \p AM is legal for an access of type \p Ty
/// in address space \p AS.
///
/// A global value is never accepted as a base. The remaining rules are
/// selected by address space; an address space not handled here is treated
/// as unreachable.
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                             const AddrMode &AM, Type *Ty,
                                             unsigned AS, Instruction *I) const {
  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  // Shared tail of the scalar-memory and DS rules: "r + i" / "i" (Scale == 0,
  // depending on HasBaseReg), or "r + r" (Scale == 1 with a base register).
  const auto IsRegPlusImmOrReg = [&AM]() {
    return AM.Scale == 0 || (AM.Scale == 1 && AM.HasBaseReg);
  };

  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return isLegalGlobalAddressingMode(AM);

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::BUFFER_FAT_POINTER) {
    // If the offset isn't a multiple of 4, it probably isn't going to be
    // correctly aligned.
    // FIXME: Can we get the real alignment here?
    if (AM.BaseOffs % 4 != 0)
      return isLegalMUBUFAddressingMode(AM);

    // There are no SMRD extloads, so if we have to do a small type access we
    // will use a MUBUF load.
    // FIXME?: We also need to do this if unaligned, but we don't know the
    // alignment here.
    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
      return isLegalGlobalAddressingMode(AM);

    const auto Gen = Subtarget->getGeneration();
    bool OffsetFits;
    if (Gen == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      // SMRD instructions have an 8-bit, dword offset on SI.
      OffsetFits = isUInt<8>(AM.BaseOffs / 4);
    } else if (Gen == AMDGPUSubtarget::SEA_ISLANDS) {
      // On CI+, this can also be a 32-bit literal constant offset. If it fits
      // in 8-bits, it can use a smaller encoding.
      OffsetFits = isUInt<32>(AM.BaseOffs / 4);
    } else if (Gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
      OffsetFits = isUInt<20>(AM.BaseOffs);
    } else {
      llvm_unreachable("unhandled generation");
    }

    return OffsetFits && IsRegPlusImmOrReg();
  }

  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return isLegalMUBUFAddressingMode(AM);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
    // field.
    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
    // an 8-bit dword offset but we don't know the alignment here.
    return isUInt<16>(AM.BaseOffs) && IsRegPlusImmOrReg();
  }

  if (AS == AMDGPUAS::FLAT_ADDRESS ||
      AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
    // For an unknown address space, this usually means that this is for some
    // reason being used for pure arithmetic, and not based on some addressing
    // computation. We don't have instructions that compute pointers with any
    // addressing modes, so treat them as having no offset like flat
    // instructions.
    return isLegalFlatAddressingMode(AM);
  }

  llvm_unreachable("unhandled address space");
}
/// Return true if stores in address space \p AS may be merged into a single
/// store of type \p MemVT.
///
/// The limit is 128 bits for global / flat, the subtarget's maximum private
/// element size for private, and 64 bits for local / region. Other address
/// spaces are unrestricted.
bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
                                        const SelectionDAG &DAG) const {
  switch (AS) {
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::FLAT_ADDRESS:
    return MemVT.getSizeInBits() <= 4 * 32;

  case AMDGPUAS::PRIVATE_ADDRESS: {
    // getMaxPrivateElementSize() is multiplied by 8 to get a bit count.
    const unsigned MaxPrivateBits =
        8 * getSubtarget()->getMaxPrivateElementSize();
    return MemVT.getSizeInBits() <= MaxPrivateBits;
  }

  case AMDGPUAS::LOCAL_ADDRESS:
  case AMDGPUAS::REGION_ADDRESS:
    return MemVT.getSizeInBits() <= 2 * 32;

  default:
    return true;
  }
}
bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
unsigned Size, unsigned AddrSpace, unsigned Align,
MachineMemOperand::Flags Flags, bool *IsFast) const {
if (IsFast)
*IsFast = false;
if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
AddrSpace == AMDGPUAS::REGION_ADDRESS) {
// ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
// aligned, 8 byte access in a single operation using ds_read2/write2_b32
// with adjacent offsets.
bool AlignedBy4 = (Align % 4 == 0);
if (IsFast)
*IsFast = AlignedBy4;
return AlignedBy4;
}
// FIXME: We have to be conservative here and assume that flat operations
// will access scratch. If we had access to the IR function, then we
// could determine if any private memory was used in the function.
if (!Subtarget->hasUnalignedScratchAccess() &&
(AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
bool AlignedBy4 = Align >= 4;
if (IsFast)
*IsFast = AlignedBy4;
return AlignedBy4;
}
if (Subtarget->hasUnalignedBufferAccess()) {
// If we have an uniform constant load, it still requires using a slow
// buffer instruction if unaligned.
if (IsFast) {
*IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
(Align % 4 == 0) : true;
}
return true;
}
// Smaller than dword value must be aligned.
if (Size < 32)
return false;
// 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
// byte-address are ignored, thus forcing Dword alignment.
// This applies to private, global, and constant memory.
if (IsFast)
*IsFast = true;
return Size >= 32 && Align >= 4;
}
/// EVT-based entry point for the misaligned-access query.
///
/// Rejects MVT::Other and types that are both wider than 1024 bits and
/// larger than 16 bytes of store size; everything else is forwarded to
/// allowsMisalignedMemoryAccessesImpl with the type's size in bits.
/// \p IsFast, if non-null, is cleared before any decision is made.
bool SITargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
    bool *IsFast) const {
  if (IsFast)
    *IsFast = false;

  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
  // which isn't a simple VT.
  // Until MVT is extended to handle this, simply check for the size and
  // rely on the condition below: allow accesses if the size is a multiple of 4.
  if (VT == MVT::Other)
    return false;

  if (VT.getSizeInBits() > 1024 && VT.getStoreSize() > 16)
    return false;

  return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
                                            Align, Flags, IsFast);
}
/// Choose the value type used to expand a memory operation of \p Size bytes.
///
/// With a destination alignment of at least 4, returns v4i32 for sizes of 16
/// or more and v2i32 for sizes of 8 or more. Otherwise returns MVT::Other so
/// the caller falls back to the default choice. Only \p Size and \p DstAlign
/// are consulted.
EVT SITargetLowering::getOptimalMemOpType(
    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
    bool ZeroMemset, bool MemcpyStrSrc,
    const AttributeList &FuncAttributes) const {
  // FIXME: Should account for address space here.
  // The default fallback uses the private pointer size as a guess for a type to
  // use. Make sure we switch these to 64-bit accesses.
  if (DstAlign >= 4) {
    if (Size >= 16) // XXX: Should only do for global
      return MVT::v4i32;
    if (Size >= 8)
      return MVT::v2i32;
  }

  // Use the default.
  return MVT::Other;
}
/// Return true for the global, flat and constant address spaces, and for any
/// address space numbered above AMDGPUAS::MAX_AMDGPU_ADDRESS.
static bool isFlatGlobalAddrSpace(unsigned AS) {
  switch (AS) {
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::FLAT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
    return true;
  default:
    return AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
  }
}
/// A cast is a no-op when both the source and the destination address space
/// satisfy isFlatGlobalAddrSpace.
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                           unsigned DestAS) const {
  if (!isFlatGlobalAddrSpace(SrcAS))
    return false;
  return isFlatGlobalAddrSpace(DestAS);
}
bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
const Value *Ptr = MemNode->getMemOperand()->getValue();
const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
return I && I->getMetadata("amdgpu.noclobber");
}
/// Return true if casting from \p SrcAS to \p DestAS costs nothing: any cast
/// out of the flat address space, or any cast that is already a no-op.
bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
                                           unsigned DestAS) const {
  // Flat -> private/local is a simple truncate.
  // Flat -> global is no-op
  return SrcAS == AMDGPUAS::FLAT_ADDRESS ||
         isNoopAddrSpaceCast(SrcAS, DestAS);
}
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
}
/// Pick the legalization action for vector type \p VT.
///
/// Multi-element vectors whose scalar type is no wider than i16 are split
/// when the element count is a power of two and widened otherwise. All other
/// vectors use the generic preference.
TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(MVT VT) const {
  const bool IsMultiElement = VT.getVectorNumElements() != 1;
  if (!IsMultiElement || !VT.getScalarType().bitsLE(MVT::i16))
    return TargetLoweringBase::getPreferredVectorAction(VT);

  if (VT.isPow2VectorType())
    return TypeSplitVector;
  return TypeWidenVector;
}
/// Unconditionally returns true; neither \p Imm nor \p Ty is inspected.
bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
// FIXME: Could be smarter if called for vector constants.
return true;
}
/// Return true if it is desirable to perform operation \p Op on type \p VT.
///
/// i1 SETCC is never desirable. On subtargets with 16-bit instructions, only
/// a fixed list of operations is desirable on i16. Everything else defers to
/// the generic TargetLowering answer.
bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
  // SimplifySetCC uses this function to determine whether or not it should
  // create setcc with i1 operands. We don't have instructions for i1 setcc.
  if (Op == ISD::SETCC && VT == MVT::i1)
    return false;

  const bool IsNativeI16 = VT == MVT::i16 && Subtarget->has16BitInsts();
  if (!IsNativeI16)
    return TargetLowering::isTypeDesirableForOp(Op, VT);

  switch (Op) {
  case ISD::LOAD:
  case ISD::STORE:
  // These operations are done with 32-bit instructions anyway.
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SELECT:
    // TODO: Extensions?
    return true;
  default:
    return false;
  }
}
/// Build a pointer to byte \p Offset within the kernel argument segment.
///
/// The base is a copy from the live-in virtual register of the preloaded
/// KERNARG_SEGMENT_PTR, typed as a constant-address-space pointer, and the
/// offset is applied with getObjectPtrOffset.
SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
                                                   const SDLoc &SL,
                                                   SDValue Chain,
                                                   uint64_t Offset) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  // Only the descriptor is used; the register class is ignored.
  const ArgDescriptor *KernArgSegPtr;
  const TargetRegisterClass *UnusedRC;
  std::tie(KernArgSegPtr, UnusedRC) =
      FuncInfo->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const MVT PtrVT =
      getPointerTy(DAG.getDataLayout(), AMDGPUAS::CONSTANT_ADDRESS);

  SDValue SegmentBase = DAG.getCopyFromReg(
      Chain, SL, MRI.getLiveInVirtReg(KernArgSegPtr->getRegister()), PtrVT);

  return DAG.getObjectPtrOffset(SL, SegmentBase, Offset);
}
/// Return a kernarg-segment pointer to the first implicit parameter, chained
/// on the DAG entry node.
SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
                                            const SDLoc &SL) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const uint64_t FirstImplicitOffset =
      getImplicitParameterOffset(MF, FIRST_IMPLICIT);

  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(),
                                  FirstImplicitOffset);
}
/// Convert \p Val, loaded with memory type \p MemVT, to the argument type
/// \p VT.
///
/// A vector whose element count differs from \p MemVT is narrowed first.
/// If \p Arg carries a sext / zext flag and \p VT is narrower than \p MemVT,
/// a matching AssertSext / AssertZext is inserted. Finally the value is
/// converted with an FP extend / truncate for floating-point memory types, or
/// a sign / zero extend-or-truncate chosen by \p Signed. \p Arg may be null.
SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         const SDLoc &SL, SDValue Val,
                                         bool Signed,
                                         const ISD::InputArg *Arg) const {
  // First, if it is a widened vector, narrow it.
  const bool IsWidenedVector =
      VT.isVector() &&
      VT.getVectorNumElements() != MemVT.getVectorNumElements();
  if (IsWidenedVector) {
    EVT NarrowedVT =
        EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
                         VT.getVectorNumElements());
    Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
                      DAG.getConstant(0, SL, MVT::i32));
  }

  // Then convert the vector elements or scalar value.
  const bool HasExtFlag =
      Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt());
  if (HasExtFlag && VT.bitsLT(MemVT)) {
    const unsigned AssertOpc =
        Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
    Val = DAG.getNode(AssertOpc, SL, MemVT, Val, DAG.getValueType(VT));
  }

  if (MemVT.isFloatingPoint())
    return getFPExtOrFPTrunc(DAG, Val, SL, VT);

  return Signed ? DAG.getSExtOrTrunc(Val, SL, VT)
                : DAG.getZExtOrTrunc(Val, SL, VT);
}
/// Load a kernel argument of memory type \p MemVT located \p Offset bytes
/// into the kernarg segment and convert it to \p VT.
///
/// Arguments smaller than a dword with alignment below 4 are fetched by
/// loading the enclosing 4-byte-aligned dword and shifting / truncating the
/// wanted bits out of it; all others are loaded directly with \p Align.
/// Both loads are marked dereferenceable and invariant.
///
/// \returns merged values: the converted argument and the load's chain.
SDValue SITargetLowering::lowerKernargMemParameter(
SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
uint64_t Offset, unsigned Align, bool Signed,
const ISD::InputArg *Arg) const {
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
// The pointer info is built from an undef constant-address-space pointer.
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
// Try to avoid using an extload by loading earlier than the argument address,
// and extracting the relevant bits. The load should hopefully be merged with
// the previous argument.
if (MemVT.getStoreSize() < 4 && Align < 4) {
// TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
int64_t AlignDownOffset = alignDown(Offset, 4);
// Byte position of the argument inside the aligned dword.
int64_t OffsetDiff = Offset - AlignDownOffset;
EVT IntVT = MemVT.changeTypeToInteger();
// TODO: If we passed in the base kernel offset we could have a better
// alignment than 4, but we don't really need it.
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
// Shift the argument's bytes down to bit 0, then truncate to its width.
SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
}
// General case: load the argument directly from its own offset.
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
}
/// Lower an incoming argument that was assigned a stack location by \p VA.
///
/// A byval argument yields the frame index of a mutable fixed object of the
/// byval size. Any other argument is loaded from an immutable fixed object,
/// with the load's extension kind and memory type chosen from the location
/// info in \p VA.
SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
                                              const SDLoc &SL, SDValue Chain,
                                              const ISD::InputArg &Arg) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  if (Arg.Flags.isByVal()) {
    const unsigned ByValSize = Arg.Flags.getByValSize();
    const int ByValFI =
        FrameInfo.CreateFixedObject(ByValSize, VA.getLocMemOffset(), false);
    return DAG.getFrameIndex(ByValFI, MVT::i32);
  }

  const unsigned SlotOffset = VA.getLocMemOffset();
  const unsigned SlotSize = VA.getValVT().getStoreSize();
  const int FI = FrameInfo.CreateFixedObject(SlotSize, SlotOffset, true);

  // Create load nodes to retrieve arguments from the stack.
  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);

  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
  MVT MemVT = VA.getValVT();

  switch (VA.getLocInfo()) {
  case CCValAssign::SExt:
    ExtType = ISD::SEXTLOAD;
    break;
  case CCValAssign::ZExt:
    ExtType = ISD::ZEXTLOAD;
    break;
  case CCValAssign::AExt:
    ExtType = ISD::EXTLOAD;
    break;
  case CCValAssign::BCvt:
    // A bitcast location is loaded using the location type itself.
    MemVT = VA.getLocVT();
    break;
  default:
    break;
  }

  return DAG.getExtLoad(ExtType, SL, VA.getLocVT(), Chain, FIN,
                        MachinePointerInfo::getFixedStack(MF, FI), MemVT);
}
/// Return a live-in register value of type \p VT for the preloaded input
/// \p PVID, using the register and register class recorded in \p MFI.
SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
                                            const SIMachineFunctionInfo &MFI,
                                            EVT VT,
                                            AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
  const ArgDescriptor *InputDesc;
  const TargetRegisterClass *InputRC;
  std::tie(InputDesc, InputRC) = MFI.getPreloadedValue(PVID);

  return CreateLiveInRegister(DAG, InputRC, InputDesc->getRegister(), VT);
}
/// Copy the entries of \p Ins that need lowering into \p Splits.
///
/// For the AMDGPU_PS calling convention, each non-inreg argument among the
/// first 16 PS inputs is checked: if it is unused and its PS input slot is
/// not already allocated in \p Info, it is dropped and its original argument
/// index is recorded in \p Skipped. Otherwise the slot is marked allocated
/// (and enabled, if the argument is used). All other arguments are copied
/// unchanged. \p FType is not used by this function.
static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
CallingConv::ID CallConv,
ArrayRef<ISD::InputArg> Ins,
BitVector &Skipped,
FunctionType *FType,
SIMachineFunctionInfo *Info) {
// PSInputNum counts PS input slots; it advances once per (possibly split)
// argument, not once per entry of Ins.
for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
const ISD::InputArg *Arg = &Ins[I];
assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
"vector type argument should have been split");
// First check if it's a PS input addr.
if (CallConv == CallingConv::AMDGPU_PS &&
!Arg->Flags.isInReg() && PSInputNum <= 15) {
bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
// Inconveniently only the first part of the split is marked as isSplit,
// so skip to the end. We only want to increment PSInputNum once for the
// entire split argument.
if (Arg->Flags.isSplit()) {
while (!Arg->Flags.isSplitEnd()) {
assert((!Arg->VT.isVector() ||
Arg->VT.getScalarSizeInBits() == 16) &&
"unexpected vector split in ps argument type");
if (!SkipArg)
Splits.push_back(*Arg);
// Advances the outer loop index too; Arg ends on the split-end piece.
Arg = &Ins[++I];
}
}
if (SkipArg) {
// We can safely skip PS inputs.
Skipped.set(Arg->getOrigArgIndex());
++PSInputNum;
continue;
}
Info->markPSInputAllocated(PSInputNum);
if (Arg->Used)
Info->markPSInputEnabled(PSInputNum);
++PSInputNum;
}
// Push the argument itself (for a split argument, its final piece).
Splits.push_back(*Arg);
}
}
// Allocate special inputs passed in VGPRs.
//
// For each work-item ID the function uses, a fixed register (VGPR0 for X,
// VGPR1 for Y, VGPR2 for Z) is added as a 32-bit live-in, reserved in
// \p CCInfo and recorded in \p Info.
void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
                                                      MachineFunction &MF,
                                                      const SIRegisterInfo &TRI,
                                                      SIMachineFunctionInfo &Info) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const LLT S32 = LLT::scalar(32);

  // Make Reg an s32 live-in, reserve it, and return a descriptor for it.
  const auto ClaimVGPR = [&](Register Reg) {
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
    CCInfo.AllocateReg(Reg);
    return ArgDescriptor::createRegister(Reg);
  };

  if (Info.hasWorkItemIDX())
    Info.setWorkItemIDX(ClaimVGPR(AMDGPU::VGPR0));

  if (Info.hasWorkItemIDY())
    Info.setWorkItemIDY(ClaimVGPR(AMDGPU::VGPR1));

  if (Info.hasWorkItemIDZ())
    Info.setWorkItemIDZ(ClaimVGPR(AMDGPU::VGPR2));
}
// Try to allocate a VGPR at the end of the argument list, or, if no argument
// VGPRs are left, allocate a stack slot instead.
// If \p Mask is given it indicates the bitfield position in the register.
// If \p Arg is given (set), reuse it with the new \p Mask instead of
// allocating anything.
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
                                         ArgDescriptor Arg = ArgDescriptor()) {
  if (Arg.isSet())
    return ArgDescriptor::createArg(Arg, Mask);

  // Candidates are the first 32 registers of the 32-bit VGPR class.
  ArrayRef<MCPhysReg> ArgVGPRs =
      makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
  const unsigned FirstFree = CCInfo.getFirstUnallocated(ArgVGPRs);

  if (FirstFree != ArgVGPRs.size()) {
    unsigned Reg = CCInfo.AllocateReg(ArgVGPRs[FirstFree]);
    assert(Reg != AMDGPU::NoRegister);

    // Register the physical register as a 32-bit live-in of the function.
    MachineFunction &MF = CCInfo.getMachineFunction();
    Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
    return ArgDescriptor::createRegister(Reg, Mask);
  }

  // Spill to stack required.
  int64_t Offset = CCInfo.AllocateStack(4, 4);
  return ArgDescriptor::createStack(Offset, Mask);
}
/// Allocate the first free register of class \p RC for a special SGPR input.
///
/// Only the first \p NumArgRegs registers of \p RC are candidates. The chosen
/// register is reserved in \p CCInfo and added as a live-in of the function.
/// Reports a fatal error ("ran out of SGPRs for arguments") if every
/// candidate is already allocated.
///
/// \returns a register ArgDescriptor for the allocated register.
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
                                             const TargetRegisterClass *RC,
                                             unsigned NumArgRegs) {
  // Honour the caller's register budget. The candidate window was previously
  // hard-coded to 32 entries, which ignored NumArgRegs: the 64-bit caller
  // passes 16 but got 32 register pairs searched.
  assert(NumArgRegs <= RC->getNumRegs() &&
         "argument register window exceeds register class size");
  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), NumArgRegs);

  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
  if (RegIdx == ArgSGPRs.size())
    report_fatal_error("ran out of SGPRs for arguments");

  unsigned Reg = ArgSGPRs[RegIdx];
  Reg = CCInfo.AllocateReg(Reg);
  assert(Reg != AMDGPU::NoRegister);

  MachineFunction &MF = CCInfo.getMachineFunction();
  MF.addLiveIn(Reg, RC);
  return ArgDescriptor::createRegister(Reg);
}
/// Allocate a special input from the SGPR_32 register class, passing 32 as
/// the argument-register count to allocateSGPR32InputImpl.
static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
}
/// Allocate a special input from the SGPR_64 register class, passing 16 as
/// the argument-register count to allocateSGPR32InputImpl.
static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
}
/// Allocate the work-item ID inputs that the function uses.
///
/// Each ID gets a 10-bit mask: X at bit 0, Y at bit 10, Z at bit 20. The
/// descriptor produced for one ID is handed on to the next allocation, so
/// once a location has been allocated later IDs reuse it with their own mask
/// rather than allocating a new one.
void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo,
                                                 MachineFunction &MF,
                                                 const SIRegisterInfo &TRI,
                                                 SIMachineFunctionInfo &Info) const {
  const unsigned IDFieldMask = 0x3ff;

  // Stays unset until the first ID is allocated.
  ArgDescriptor Packed;

  if (Info.hasWorkItemIDX()) {
    Packed = allocateVGPR32Input(CCInfo, IDFieldMask);
    Info.setWorkItemIDX(Packed);
  }

  if (Info.hasWorkItemIDY()) {
    Packed = allocateVGPR32Input(CCInfo, IDFieldMask << 10, Packed);
    Info.setWorkItemIDY(Packed);
  }

  if (Info.hasWorkItemIDZ()) {
    const ArgDescriptor ZDesc =
        allocateVGPR32Input(CCInfo, IDFieldMask << 20, Packed);
    Info.setWorkItemIDZ(ZDesc);
  }
}
/// Allocate SGPRs for each special input the function uses and record the
/// resulting descriptors in the function's argument info.
///
/// The four pointer / ID inputs and the implicit argument pointer each take
/// a 64-bit SGPR allocation; the three work-group IDs each take a 32-bit one.
/// Allocation happens in the order written below. \p MF and \p TRI are not
/// used by this function.
void SITargetLowering::allocateSpecialInputSGPRs(
CCState &CCInfo,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const {
auto &ArgInfo = Info.getArgInfo();
// TODO: Unify handling with private memory pointers.
if (Info.hasDispatchPtr())
ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
if (Info.hasQueuePtr())
ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
if (Info.hasKernargSegmentPtr())
ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
if (Info.hasDispatchID())
ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
// flat_scratch_init is not applicable for non-kernel functions.
if (Info.hasWorkGroupIDX())
ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
if (Info.hasWorkGroupIDY())
ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
if (Info.hasWorkGroupIDZ())
ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
if (Info.hasImplicitArgPtr())
ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
}
// Allocate special inputs passed in user SGPRs.
//
// For every input the function uses, the register returned by the matching
// Info.add*() call is added as a live-in of the function and reserved in
// CCInfo. The inputs are processed in the order written below.
void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const {
if (Info.hasImplicitBufferPtr()) {
unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
if (Info.hasPrivateSegmentBuffer()) {
// The only input here that uses the 128-bit SGPR class.
unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
CCInfo.AllocateReg(PrivateSegmentBufferReg);
}
if (Info.hasDispatchPtr()) {
unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchPtrReg);
}
if (Info.hasQueuePtr()) {
unsigned QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
}
if (Info.hasKernargSegmentPtr()) {
MachineRegisterInfo &MRI = MF.getRegInfo();
Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
CCInfo.AllocateReg(InputPtrReg);
// The live-in virtual register is also given a 64-bit
// constant-address-space pointer type.
Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
}
if (Info.hasDispatchID()) {
unsigned DispatchIDReg = Info.addDispatchID(TRI);
MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchIDReg);
}
if (Info.hasFlatScratchInit()) {
unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
}
// TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
// these from the dispatch pointer.
}
// Allocate special input registers that are initialized per-wave.
//
// Each work-group ID / info input the function uses gets the register
// returned by the matching Info.add*() call, added as a 32-bit SGPR live-in
// and reserved in CCInfo. The private segment wave byte offset is handled
// last and depends on IsShader. CallConv is not used by this function.
void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
MachineFunction &MF,
SIMachineFunctionInfo &Info,
CallingConv::ID CallConv,
bool IsShader) const {
if (Info.hasWorkGroupIDX()) {
unsigned Reg = Info.addWorkGroupIDX();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info.hasWorkGroupIDY()) {
unsigned Reg = Info.addWorkGroupIDY();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info.hasWorkGroupIDZ()) {
unsigned Reg = Info.addWorkGroupIDZ();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info.hasWorkGroupInfo()) {
unsigned Reg = Info.addWorkGroupInfo();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info.hasPrivateSegmentWaveByteOffset()) {
// Scratch wave offset passed in system SGPR.
unsigned PrivateSegmentWaveByteOffsetReg;
if (IsShader) {
// Shaders: use the already-recorded system SGPR if there is one,
// otherwise pick the first free SGPR and record it.
PrivateSegmentWaveByteOffsetReg =
Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
// This is true if the scratch wave byte offset doesn't have a fixed
// location.
if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
}
} else
PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
}
}
static void reservePrivateMemoryRegs(const TargetMachine &TM,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) {
// Now that we've figured out where the scratch register inputs are, see if we
// should reserve the arguments and use them directly.
MachineFrameInfo &MFI = MF.getFrameInfo();
bool HasStackObjects = MFI.hasStackObjects();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
// Record that we know we have non-spill stack objects so we don't need to
// check all stack objects later.
if (HasStackObjects)
Info.setHasNonSpillStackObjects(true);
// Everything live out of a block is spilled with fast regalloc, so it's
// almost certain that spilling will be required.
if (TM.getOptLevel() == CodeGenOpt::None)
HasStackObjects = true;
// For now assume stack access is needed in any callee functions, so we need
// the scratch registers to pass in.
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
Register PrivateSegmentBufferReg =
Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
Info.setScratchRSrcReg(PrivateSegmentBufferReg);
} else {
unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
// We tentatively reserve the last registers (skipping the last registers
// which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
// we'll replace these with the ones immediately after those which were
// really allocated. In the prologue copies will be inserted from the
// argument to these reserved registers.
// Without HSA, relocations are used for the scratch pointer and the
// buffer resource setup is always inserted in the prologue. Scratch wave
// offset is still in an input SGPR.
Info.setScratchRSrcReg(ReservedBufferReg);
}
// hasFP should be accurate for kernels even before the frame is finalized.
if (ST.getFrameLowering()->hasFP(MF)) {
MachineRegisterInfo &MRI = MF.getRegInfo();
// Try to use s32 as the SP, but move it if it would interfere with input
// arguments. This won't work with calls though.
//
// FIXME: Move SP to avoid any possible inputs, or find a way to spill input
// registers.
if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
} else {
assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
if (MFI.hasCalls())
report_fatal_error("call in graphics shader with too many input SGPRs");
for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
if (!MRI.isLiveIn(Reg)) {
Info.setStackPtrOffsetReg(Reg);
break;
}
}
if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
report_fatal_error("failed to find register for SP");
}
if (MFI.hasCalls()) {
Info.setScratchWaveOffsetReg(AMDGPU::SGPR33);
Info.setFrameOffsetReg(AMDGPU::SGPR33);
} else {
unsigned ReservedOffsetReg =
TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
Info.setScratchWaveOffsetReg(ReservedOffsetReg);
Info.setFrameOffsetReg(ReservedOffsetReg);
}
} else if (RequiresStackAccess) {
assert(!MFI.hasCalls());
// We know there are accesses and they will be done relative to SP, so just
// pin it to the input.
//
// FIXME: Should not do this if inline asm is reading/writing these
// registers.
Register PreloadedSP = Info.getPreloadedReg(
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setStackPtrOffsetReg(PreloadedSP);
Info.setScratchWaveOffsetReg(PreloadedSP);
Info.setFrameOffsetReg(PreloadedSP);
} else {
assert(!MFI.hasCalls());
// There may not be stack access at all. There may still be spills, or
// access of a constant pointer (in which cases an extra copy will be
// emitted in the prolog).
unsigned ReservedOffsetReg
= TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
Info.setStackPtrOffsetReg(ReservedOffsetReg);
Info.setScratchWaveOffsetReg(ReservedOffsetReg);
Info.setFrameOffsetReg(ReservedOffsetReg);
}
}
/// Split CSR handling is supported for every function whose
/// SIMachineFunctionInfo does not mark it as an entry function.
bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
  const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return false;
  return true;
}
/// Split-CSR initialization hook. This implementation performs no work for
/// \p Entry; the body is empty.
void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
}
/// Preserve the callee-saved registers that are saved via copy.
///
/// Each such register is marked live-in to \p Entry and copied into a fresh
/// virtual register ahead of the instruction that was first in \p Entry; the
/// value is copied back right before the first terminator of every block in
/// \p Exits. Nothing is done when the register info reports no such list.
void SITargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
  const MCPhysReg *CSRList = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (CSRList == nullptr)
    return;

  // Select the virtual register class used to hold a copy of a given CSR.
  auto CopyClassFor = [](MCPhysReg PhysReg) -> const TargetRegisterClass * {
    if (AMDGPU::SReg_64RegClass.contains(PhysReg))
      return &AMDGPU::SGPR_64RegClass;
    if (AMDGPU::SReg_32RegClass.contains(PhysReg))
      return &AMDGPU::SGPR_32RegClass;
    llvm_unreachable("Unexpected register class in CSRsViaCopy!");
  };

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo &RegInfo = Entry->getParent()->getRegInfo();

  // Taken once up front so that all save copies land, in list order, ahead of
  // the instruction that was first in the entry block.
  MachineBasicBlock::iterator SavePoint = Entry->begin();

  // The list is terminated by a zero register.
  for (const MCPhysReg *Cursor = CSRList; *Cursor != 0; ++Cursor) {
    const MCPhysReg PhysReg = *Cursor;
    Register SavedCopy = RegInfo.createVirtualRegister(CopyClassFor(PhysReg));

    // Save: CSR -> virtual register in the entry block.
    Entry->addLiveIn(PhysReg);
    BuildMI(*Entry, SavePoint, DebugLoc(), TII->get(TargetOpcode::COPY),
            SavedCopy)
        .addReg(PhysReg);

    // Restore: virtual register -> CSR right before each exit's terminator.
    for (MachineBasicBlock *ExitMBB : Exits) {
      BuildMI(*ExitMBB, ExitMBB->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), PhysReg)
          .addReg(SavedCopy);
    }
  }
}
/// Lower the incoming formal arguments of the current function.
///
/// One SDValue is appended to \p InVals for every entry of \p Ins: undef for
/// original arguments marked in the Skipped bit vector, the result of
/// lowerKernargMemParameter for memory locations in entry functions, the
/// result of lowerStackParameter for memory locations in other functions, and
/// otherwise a CopyFromReg of the assigned register followed by the
/// assert/truncate/bitcast nodes selected by the location info.
///
/// Along the way the special input VGPRs/SGPRs are allocated in \p CCInfo, and
/// the function's argument info and stack argument area size are recorded.
///
/// \returns \p Chain when no argument load produced a chain, otherwise a
/// TokenFactor over the collected load chains. For a shader calling convention
/// on an HSA OS a diagnostic is emitted and the DAG entry node is returned.
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
MachineFunction &MF = DAG.getMachineFunction();
const Function &Fn = MF.getFunction();
FunctionType *FType = MF.getFunction().getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
// Shader calling conventions are rejected when targeting an HSA OS: report
// the problem and bail out without lowering any argument.
if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
DiagnosticInfoUnsupported NoGraphicsHSA(
Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
DAG.getContext()->diagnose(NoGraphicsHSA);
return DAG.getEntryNode();
}
SmallVector<ISD::InputArg, 16> Splits;
SmallVector<CCValAssign, 16> ArgLocs;
// Sized to Ins; consulted by original argument index in the lowering loop
// below, where a set bit makes the argument lower to undef.
BitVector Skipped(Ins.size());
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
bool IsShader = AMDGPU::isShader(CallConv);
bool IsKernel = AMDGPU::isKernel(CallConv);
bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
if (IsShader) {
processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
// At least one interpolation mode must be enabled or else the GPU will
// hang.
//
// Check PSInputAddr instead of PSInputEnable. The idea is that if the user
// set PSInputAddr, the user wants to enable some bits after the compilation
// based on run-time states. Since we can't know what the final PSInputEna
// will look like, so we shouldn't do anything here and the user should take
// responsibility for the correct programming.
//
// Otherwise, the following restrictions apply:
// - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
// - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
// enabled too.
if (CallConv == CallingConv::AMDGPU_PS) {
if ((Info->getPSInputAddr() & 0x7F) == 0 ||
((Info->getPSInputAddr() & 0xF) == 0 &&
Info->isPSInputAllocated(11))) {
// Neither restriction above is satisfied: reserve VGPR0/VGPR1 and mark PS
// input 0 as both allocated and enabled.
CCInfo.AllocateReg(AMDGPU::VGPR0);
CCInfo.AllocateReg(AMDGPU::VGPR1);
Info->markPSInputAllocated(0);
Info->markPSInputEnabled(0);
}
if (Subtarget->isAmdPalOS()) {
// For isAmdPalOS, the user does not enable some bits after compilation
// based on run-time states; the register values being generated here are
// the final ones set in hardware. Therefore we need to apply the
// workaround to PSInputAddr and PSInputEnable together. (The case where
// a bit is set in PSInputAddr but not PSInputEnable is where the
// frontend set up an input arg for a particular interpolation mode, but
// nothing uses that input arg. Really we should have an earlier pass
// that removes such an arg.)
unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
if ((PsInputBits & 0x7F) == 0 ||
((PsInputBits & 0xF) == 0 &&
(PsInputBits >> 11 & 1)))
// Enable the input at the lowest set bit of PSInputAddr.
Info->markPSInputEnabled(
countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
}
}
// None of these special inputs may be requested by a shader.
assert(!Info->hasDispatchPtr() &&
!Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
!Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
!Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
!Info->hasWorkItemIDZ());
} else if (IsKernel) {
assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
} else {
// Neither shader nor kernel: every input is handed to the calling
// convention analysis unchanged.
Splits.append(Ins.begin(), Ins.end());
}
if (IsEntryFunc) {
allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
}
// Assign a location to each argument: kernels analyze Ins directly, all
// other calling conventions analyze Splits with the assignment function
// chosen for CallConv.
if (IsKernel) {
analyzeFormalArgumentsCompute(CCInfo, Ins);
} else {
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
}
SmallVector<SDValue, 16> Chains;
// FIXME: This is the minimum kernel argument alignment. We should improve
// this to the maximum alignment of the arguments.
//
// FIXME: Alignment of explicit arguments totally broken with non-0 explicit
// kern arg offset.
const unsigned KernelArgBaseAlign = 16;
// i walks Ins while ArgIdx walks ArgLocs; ArgIdx only advances for arguments
// that are not skipped.
for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
const ISD::InputArg &Arg = Ins[i];
if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
InVals.push_back(DAG.getUNDEF(Arg.VT));
continue;
}
CCValAssign &VA = ArgLocs[ArgIdx++];
MVT VT = VA.getLocVT();
if (IsEntryFunc && VA.isMemLoc()) {
// Memory argument of an entry function: load it with
// lowerKernargMemParameter and remember the load's chain.
VT = Ins[i].VT;
EVT MemVT = VA.getLocVT();
const uint64_t Offset = VA.getLocMemOffset();
unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
// NOTE: this SDValue shadows the InputArg reference named Arg above for
// the rest of this branch.
SDValue Arg = lowerKernargMemParameter(
DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
Chains.push_back(Arg.getValue(1));
auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
// On SI local pointers are just offsets into LDS, so they are always
// less than 16-bits. On CI and newer they could potentially be
// real pointers, so we can't guarantee their size.
Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
DAG.getValueType(MVT::i16));
}
InVals.push_back(Arg);
continue;
} else if (!IsEntryFunc && VA.isMemLoc()) {
// Memory argument of a non-entry function: load it with
// lowerStackParameter. The chain is only recorded for non-byval arguments.
SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
InVals.push_back(Val);
if (!Arg.Flags.isByVal())
Chains.push_back(Val.getValue(1));
continue;
}
assert(VA.isRegLoc() && "Parameter must be in a register!");
// Register argument: make the physical register live-in and copy from the
// register returned by addLiveIn.
Register Reg = VA.getLocReg();
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
EVT ValVT = VA.getValVT();
Reg = MF.addLiveIn(Reg, RC);
SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
if (Arg.Flags.isSRet()) {
// The return object should be reasonably addressable.
// FIXME: This helps when the return is a real sret. If it is a
// automatically inserted sret (i.e. CanLowerReturn returns false), an
// extra copy is inserted in SelectionDAGBuilder which obscures this.
unsigned NumBits
= 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
}
// If this is an 8 or 16-bit value, it is really passed promoted
// to 32 bits. Insert an assert[sz]ext to capture this, then
// truncate to the right size.
switch (VA.getLocInfo()) {
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
break;
case CCValAssign::SExt:
Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
DAG.getValueType(ValVT));
Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
break;
case CCValAssign::ZExt:
Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
DAG.getValueType(ValVT));
Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
break;
case CCValAssign::AExt:
Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
break;
default:
llvm_unreachable("Unknown loc info!");
}
InVals.push_back(Val);
}
if (!IsEntryFunc) {
// Special inputs come after user arguments.
allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
}
// Start adding system SGPRs.
if (IsEntryFunc) {
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
} else {
// Mark the scratch resource, scratch wave offset and frame offset
// registers as allocated before the special input SGPRs are assigned.
CCInfo.AllocateReg(Info->getScratchRSrcReg());
CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
CCInfo.AllocateReg(Info->getFrameOffsetReg());
allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
// Publish this function's argument info through the
// AMDGPUArgumentUsageInfo analysis.
auto &ArgUsageInfo =
DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
unsigned StackArgSize = CCInfo.getNextStackOffset();
Info->setBytesInStackArgArea(StackArgSize);
// Join the chains of any argument loads; with none, pass Chain through.
return Chains.empty() ? Chain :
DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
// TODO: If return values can't fit in registers, we should return as many as
// possible in registers before passing on stack.

/// Report whether the values described by \p Outs can be returned under
/// \p CallConv. Entry-function calling conventions are always accepted; for
/// the rest the answer comes from checking \p Outs against the return
/// calling-convention assignment function.
bool SITargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  // Replacing returns with sret/stack usage doesn't make sense for shaders.
  // FIXME: Also sort of a workaround for custom vector splitting in
  // LowerReturn for shaders. Vector types should be explicitly handled by CC.
  const bool IsEntryCC = AMDGPU::isEntryFunctionCC(CallConv);
  if (!IsEntryCC) {
    SmallVector<CCValAssign, 16> ReturnLocs;
    CCState ReturnState(CallConv, IsVarArg, MF, ReturnLocs, Context);
    return ReturnState.CheckReturn(Outs,
                                   CCAssignFnForReturn(CallConv, IsVarArg));
  }
  return true;
}
/// Lower a function return into the target return node.
///
/// Kernel calling conventions are delegated to
/// AMDGPUTargetLowering::LowerReturn. Otherwise each value in \p OutVals is
/// converted as required by its assigned location and copied into its return
/// register, with the copies glued together. For non-entry functions the
/// return address (copied into a virtual register) and the callee-saved
/// registers that are preserved via copy are added as operands as well.
///
/// \param Chain     Incoming chain.
/// \param CallConv  Calling convention of the function being lowered.
/// \param isVarArg  Forwarded to the calling-convention analysis.
/// \param Outs      Descriptions of the returned values.
/// \param OutVals   The returned values, parallel to the assigned locations.
/// \param DL        Debug location used for the created nodes.
/// \param DAG       DAG in which the nodes are created.
/// \returns ENDPGM for a shader that returns nothing, RETURN_TO_EPILOG for any
/// other shader return, and RET_FLAG for the remaining functions.
SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                              bool isVarArg,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SDLoc &DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (AMDGPU::isKernel(CallConv)) {
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                             OutVals, DL, DAG);
  }

  bool IsShader = AMDGPU::isShader(CallConv);

  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 48> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));

  SDValue Flag;
  SmallVector<SDValue, 48> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)

  // Add return address for callable functions.
  if (!Info->isEntryFunction()) {
    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    SDValue ReturnAddrReg = CreateLiveInRegister(
        DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);

    SDValue ReturnAddrVirtualReg = DAG.getRegister(
        MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
        MVT::i64);
    Chain =
        DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(ReturnAddrVirtualReg);
  }

  // Copy the result values into the output registers. RVLocs and OutVals are
  // walked with the same index.
  for (unsigned I = 0, E = RVLocs.size(); I != E; ++I) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");
    // TODO: Partially return in registers if return values don't fit.
    SDValue Arg = OutVals[I];

    // Convert the value to the type of its assigned location.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    // Glue each copy to the previous one and list the register as an operand
    // of the return node.
    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  // FIXME: Does sret work properly?
  if (!Info->isEntryFunction()) {
    // Add the callee-saved registers that are preserved via copy as operands
    // of the return node. The list is terminated by a zero register.
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
    if (I) {
      for (; *I; ++I) {
        if (AMDGPU::SReg_64RegClass.contains(*I))
          RetOps.push_back(DAG.getRegister(*I, MVT::i64));
        else if (AMDGPU::SReg_32RegClass.contains(*I))
          RetOps.push_back(DAG.getRegister(*I, MVT::i32));
        else
          llvm_unreachable("Unexpected register class in CSRsViaCopy!");
      }
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  unsigned Opc = AMDGPUISD::ENDPGM;
  if (!IsWaveEnd)
    Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
/// Copy the values returned by a call out of their assigned physical
/// registers and append them, converted to their value types, to \p InVals.
///
/// The copies are threaded through \p Chain and \p InFlag. Results assigned
/// to memory are not supported and are reported as a fatal error.
///
/// \returns the chain after the last copy, or \p Chain unchanged when the
/// call produces no results.
SDValue SITargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
    SDValue ThisVal) const {
  CCAssignFn *AssignRet = CCAssignFnForReturn(CallConv, IsVarArg);

  // Compute where each returned value lives.
  SmallVector<CCValAssign, 16> ResultLocs;
  CCState ResultState(CallConv, IsVarArg, DAG.getMachineFunction(), ResultLocs,
                      *DAG.getContext());
  ResultState.AnalyzeCallResult(Ins, AssignRet);

  for (const CCValAssign &Loc : ResultLocs) {
    // Only register locations can be lowered.
    if (!Loc.isRegLoc()) {
      if (Loc.isMemLoc())
        report_fatal_error("TODO: return values in memory");
      llvm_unreachable("unknown argument location type");
    }

    // Read the physical register, then carry the chain and glue results of
    // the copy forward to the next one.
    SDValue Result = DAG.getCopyFromReg(Chain, DL, Loc.getLocReg(),
                                        Loc.getLocVT(), InFlag);
    Chain = Result.getValue(1);
    InFlag = Result.getValue(2);

    const auto ValVT = Loc.getValVT();
    switch (Loc.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::ZExt:
    case CCValAssign::SExt: {
      // Record how the value was extended, then narrow it to its value type.
      const unsigned AssertOpc = Loc.getLocInfo() == CCValAssign::ZExt
                                     ? ISD::AssertZext
                                     : ISD::AssertSext;
      Result = DAG.getNode(AssertOpc, DL, Loc.getLocVT(), Result,
                           DAG.getValueType(ValVT));
      Result = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Result);
      break;
    }
    case CCValAssign::AExt:
      Result = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Result);
      break;
    case CCValAssign::BCvt:
      Result = DAG.getNode(ISD::BITCAST, DL, ValVT, Result);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    InVals.push_back(Result);
  }

  return Chain;
}
// Add code to pass special inputs required depending on used features separate
// from the explicit user arguments present in the IR.
void SITargetLowering::passSpecialInputs(
CallLoweringInfo &CLI,
CCState &CCInfo,
const SIMachineFunctionInfo &Info,
SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
SmallVectorImpl<SDValue> &MemOpChains,
SDValue Chain) const {
// If we don't have a call site, this was a call inserted by
// legalization. These can never use special inputs.
if (!CLI.CS)
return;
const Function *CalleeFunc = CLI.CS.getCalledFunction();
assert(CalleeFunc);
SelectionDAG &DAG = CLI.DAG;
const SDLoc &DL = CLI.DL;
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
auto &ArgUsageInfo =
DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
const AMDGPUFunctionArgInfo &CalleeArgInfo
= ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
// TODO: Unify with private memory register handling. This is complicated by
// the fact that at least in kernels, the input argument is not necessarily
// in the same location as the input.
AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
AMDGPUFunctionArgInfo::DISPATCH_PTR,
AMDGPUFunctionArgInfo::QUEUE_PTR,
AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
AMDGPUFunctionArgInfo::DISPATCH_ID,
AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
};
for (auto InputID : InputRegs) {
const ArgDescriptor *OutgoingArg;
const TargetRegisterClass *ArgRC;
std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
if (!OutgoingArg)
continue;
const ArgDescriptor *IncomingArg;
const TargetRegisterClass *IncomingArgRC;
std::tie(IncomingArg, IncomingArgRC)
= CallerArgInfo.getPreloadedValue(InputID);
assert(IncomingArgRC == ArgRC);
// All special arguments are ints for now.
EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
SDValue InputReg;
if (IncomingArg) {
InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
} else {
// The implicit arg ptr is special because it doesn't have a corresponding
// input for kernels, and is computed from the kernarg segment pointer.
assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
InputReg = getImplicitArgPtr(DAG, DL);
}
if (OutgoingArg->isRegister()) {
RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
} else {
unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
SpecialArgOffset);
MemOpChains.push_back(ArgStore);
}
}
// Pack workitem IDs into a single register or pass it as is if already
// packed.
const ArgDescriptor *OutgoingArg;
const TargetRegisterClass *ArgRC;
std::tie(OutgoingArg, ArgRC) =
CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
if (!OutgoingArg)
std::tie(OutgoingArg, ArgRC) =
CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
if (!OutgoingArg)
std::tie(OutgoingArg, ArgRC) =
CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
if (!OutgoingArg)
return;
const ArgDescriptor *IncomingArgX
= CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first;
const ArgDescriptor *IncomingArgY
= CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first;
const ArgDescriptor *IncomingArgZ
= CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first;
SDValue InputReg;
SDLoc SL;
// If incoming ids are not packed we need to pack them.
if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX)
InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) {
SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
DAG.getShiftAmountConstant(10, MVT::i32, SL));
InputReg = InputReg.getNode() ?
DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
}
if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) {
SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
DAG.getShiftAmountConstant(20, MVT::i32, SL));
InputReg = InputReg.getNode() ?
DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
}
if (!InputReg.getNode()) {
// Workitem ids are already packed, any of present incoming arguments
// will carry all required fields.
ArgDescriptor IncomingArg = ArgDescriptor::createArg(
IncomingArgX ? *IncomingArgX :
IncomingArgY ? *IncomingArgY :
*IncomingArgZ, ~0u);
InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
}
if (OutgoingArg->isRegister()) {
RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
</