//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
#include "AMDGPULegalizerInfo.h"
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#define DEBUG_TYPE "amdgpu-legalinfo"
using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;
// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
"amdgpu-global-isel-new-legality",
cl::desc("Use GlobalISel desired legality, rather than try to use"
"rules compatible with selection patterns"),
cl::init(false),
cl::ReallyHidden);
static constexpr unsigned MaxRegisterSize = 1024;
// Round the number of vector elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
unsigned NElts = Ty.getNumElements();
unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}
// Round the scalar size in bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
unsigned Bits = Ty.getSizeInBits();
unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
return LLT::scalar(Pow2Bits);
}
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
if (!Ty.isVector())
return false;
const LLT EltTy = Ty.getElementType();
const unsigned EltSize = EltTy.getSizeInBits();
return Ty.getNumElements() % 2 != 0 &&
EltSize > 1 && EltSize < 32 &&
Ty.getSizeInBits() % 32 != 0;
};
}
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
return Ty.getSizeInBits() % 32 == 0;
};
}
static LegalityPredicate isWideVec16(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
const LLT EltTy = Ty.getScalarType();
return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
};
}
static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
const LLT EltTy = Ty.getElementType();
return std::make_pair(TypeIdx,
LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
};
}
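// Reduce the number of elements so each resulting piece is at most 64 bits
// wide; e.g. for <5 x s16> (80 bits) the target piece type is <3 x s16>.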
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
const LLT EltTy = Ty.getElementType();
unsigned Size = Ty.getSizeInBits();
unsigned Pieces = (Size + 63) / 64;
unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
return std::make_pair(
TypeIdx,
LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
};
}
// Increase the number of vector elements so the total size is a multiple of
// 32 bits.
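// For example, <3 x s8> (24 bits) rounds up to one 32-bit chunk and becomes
// <4 x s8>.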
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
const LLT EltTy = Ty.getElementType();
const int Size = Ty.getSizeInBits();
const int EltSize = EltTy.getSizeInBits();
const int NextMul32 = (Size + 31) / 32;
assert(EltSize < 32);
const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
};
}
static LLT getBitcastRegisterType(const LLT Ty) {
const unsigned Size = Ty.getSizeInBits();
if (Size <= 32) {
// <2 x s8> -> s16
// <4 x s8> -> s32
return LLT::scalar(Size);
}
return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
};
}
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
unsigned Size = Ty.getSizeInBits();
assert(Size % 32 == 0);
return std::make_pair(
TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
};
}
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
return [=](const LegalityQuery &Query) {
const LLT QueryTy = Query.Types[TypeIdx];
return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
};
}
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
return [=](const LegalityQuery &Query) {
const LLT QueryTy = Query.Types[TypeIdx];
return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
};
}
static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT QueryTy = Query.Types[TypeIdx];
return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
};
}
static bool isRegisterSize(unsigned Size) {
return Size % 32 == 0 && Size <= MaxRegisterSize;
}
static bool isRegisterVectorElementType(LLT EltTy) {
const int EltSize = EltTy.getSizeInBits();
return EltSize == 16 || EltSize % 32 == 0;
}
static bool isRegisterVectorType(LLT Ty) {
const int EltSize = Ty.getElementType().getSizeInBits();
return EltSize == 32 || EltSize == 64 ||
(EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
EltSize == 128 || EltSize == 256;
}
static bool isRegisterType(LLT Ty) {
if (!isRegisterSize(Ty.getSizeInBits()))
return false;
if (Ty.isVector())
return isRegisterVectorType(Ty);
return true;
}
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
return isRegisterType(Query.Types[TypeIdx]);
};
}
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT QueryTy = Query.Types[TypeIdx];
if (!QueryTy.isVector())
return false;
const LLT EltTy = QueryTy.getElementType();
return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
};
}
// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
};
}
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
bool IsLoad) {
switch (AS) {
case AMDGPUAS::PRIVATE_ADDRESS:
// FIXME: Private element size.
return ST.enableFlatScratch() ? 128 : 32;
case AMDGPUAS::LOCAL_ADDRESS:
return ST.useDS128() ? 128 : 64;
case AMDGPUAS::GLOBAL_ADDRESS:
case AMDGPUAS::CONSTANT_ADDRESS:
case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
// Treat constant and global as identical. SMRD loads are sometimes usable for
// global loads (ideally constant address space should be eliminated)
// depending on the context. Legality cannot be context dependent, but
// RegBankSelect can split the load as necessary depending on the pointer
// register bank/uniformity and if the memory is invariant or not written in a
// kernel.
return IsLoad ? 512 : 128;
default:
// Flat addresses may contextually need to be split to 32-bit parts if they
// may alias scratch depending on the subtarget.
return 128;
}
}
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
const LegalityQuery &Query) {
const LLT Ty = Query.Types[0];
// Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
unsigned RegSize = Ty.getSizeInBits();
unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
unsigned AS = Query.Types[1].getAddressSpace();
// All of these need to be custom lowered to cast the pointer operand.
if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return false;
// Do not handle extending vector loads.
if (Ty.isVector() && MemSize != RegSize)
return false;
// TODO: We should be able to widen loads if the alignment is high enough, but
// we also need to modify the memory access size.
#if 0
// Accept widening loads based on alignment.
if (IsLoad && MemSize < Size)
MemSize = std::max(MemSize, Align);
#endif
// Only 1-byte and 2-byte to 32-bit extloads are valid.
if (MemSize != RegSize && RegSize != 32)
return false;
if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
return false;
switch (MemSize) {
case 8:
case 16:
case 32:
case 64:
case 128:
break;
case 96:
if (!ST.hasDwordx3LoadStores())
return false;
break;
case 256:
case 512:
// These may contextually need to be broken down.
break;
default:
return false;
}
assert(RegSize >= MemSize);
if (AlignBits < MemSize) {
const SITargetLowering *TLI = ST.getTargetLowering();
if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
Align(AlignBits / 8)))
return false;
}
return true;
}
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
if (EnableNewLegality)
return false;
const unsigned Size = Ty.getSizeInBits();
if (Size <= 64)
return false;
if (!Ty.isVector())
return true;
LLT EltTy = Ty.getElementType();
if (EltTy.isPointer())
return true;
unsigned EltSize = EltTy.getSizeInBits();
return EltSize != 32 && EltSize != 64;
}
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
const LLT Ty = Query.Types[0];
return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
!loadStoreBitcastWorkaround(Ty);
}
/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
const LLT MemTy) {
const unsigned MemSizeInBits = MemTy.getSizeInBits();
const unsigned Size = Ty.getSizeInBits();
if (Size != MemSizeInBits)
return Size <= 32 && Ty.isVector();
if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
return true;
// Don't try to handle bitcasting vector ext loads for now.
return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
(Size <= 32 || isRegisterSize(Size)) &&
!isRegisterVectorElementType(Ty.getElementType());
}
/// Return true if we should legalize a load by widening an odd-sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
unsigned AlignInBits, unsigned AddrSpace,
unsigned Opcode) {
unsigned SizeInBits = MemoryTy.getSizeInBits();
// We don't want to widen cases that are naturally legal.
if (isPowerOf2_32(SizeInBits))
return false;
// If we have 96-bit memory operations, we shouldn't touch them. Note we may
// end up widening these for a scalar load during RegBankSelect, since there
// aren't 96-bit scalar loads.
if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
return false;
if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
return false;
// A load is known dereferenceable up to the alignment, so it's legal to widen
// to it.
//
// TODO: Could check dereferenceable for less aligned cases.
unsigned RoundedSize = NextPowerOf2(SizeInBits);
if (AlignInBits < RoundedSize)
return false;
// Do not widen if it would introduce a slow unaligned load.
const SITargetLowering *TLI = ST.getTargetLowering();
bool Fast = false;
return TLI->allowsMisalignedMemoryAccessesImpl(
RoundedSize, AddrSpace, Align(AlignInBits / 8),
MachineMemOperand::MOLoad, &Fast) &&
Fast;
}
static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
unsigned Opcode) {
if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
return false;
return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
Query.MMODescrs[0].AlignInBits,
Query.Types[1].getAddressSpace(), Opcode);
}
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const GCNTargetMachine &TM)
: ST(ST_) {
using namespace TargetOpcode;
auto GetAddrSpacePtr = [&TM](unsigned AS) {
return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
};
const LLT S1 = LLT::scalar(1);
const LLT S8 = LLT::scalar(8);
const LLT S16 = LLT::scalar(16);
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
const LLT S128 = LLT::scalar(128);
const LLT S256 = LLT::scalar(256);
const LLT S512 = LLT::scalar(512);
const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
const LLT V2S8 = LLT::fixed_vector(2, 8);
const LLT V2S16 = LLT::fixed_vector(2, 16);
const LLT V4S16 = LLT::fixed_vector(4, 16);
const LLT V2S32 = LLT::fixed_vector(2, 32);
const LLT V3S32 = LLT::fixed_vector(3, 32);
const LLT V4S32 = LLT::fixed_vector(4, 32);
const LLT V5S32 = LLT::fixed_vector(5, 32);
const LLT V6S32 = LLT::fixed_vector(6, 32);
const LLT V7S32 = LLT::fixed_vector(7, 32);
const LLT V8S32 = LLT::fixed_vector(8, 32);
const LLT V9S32 = LLT::fixed_vector(9, 32);
const LLT V10S32 = LLT::fixed_vector(10, 32);
const LLT V11S32 = LLT::fixed_vector(11, 32);
const LLT V12S32 = LLT::fixed_vector(12, 32);
const LLT V13S32 = LLT::fixed_vector(13, 32);
const LLT V14S32 = LLT::fixed_vector(14, 32);
const LLT V15S32 = LLT::fixed_vector(15, 32);
const LLT V16S32 = LLT::fixed_vector(16, 32);
const LLT V32S32 = LLT::fixed_vector(32, 32);
const LLT V2S64 = LLT::fixed_vector(2, 64);
const LLT V3S64 = LLT::fixed_vector(3, 64);
const LLT V4S64 = LLT::fixed_vector(4, 64);
const LLT V5S64 = LLT::fixed_vector(5, 64);
const LLT V6S64 = LLT::fixed_vector(6, 64);
const LLT V7S64 = LLT::fixed_vector(7, 64);
const LLT V8S64 = LLT::fixed_vector(8, 64);
const LLT V16S64 = LLT::fixed_vector(16, 64);
std::initializer_list<LLT> AllS32Vectors =
{V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
std::initializer_list<LLT> AllS64Vectors =
{V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
const LLT CodePtr = FlatPtr;
const std::initializer_list<LLT> AddrSpaces64 = {
GlobalPtr, ConstantPtr, FlatPtr
};
const std::initializer_list<LLT> AddrSpaces32 = {
LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
};
const std::initializer_list<LLT> FPTypesBase = {
S32, S64
};
const std::initializer_list<LLT> FPTypes16 = {
S32, S64, S16
};
const std::initializer_list<LLT> FPTypesPK16 = {
S32, S64, S16, V2S16
};
const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
// s1 for VCC branches, s32 for SCC branches.
getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
// TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
// elements for v3s16
getActionDefinitionsBuilder(G_PHI)
.legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
.legalFor(AllS32Vectors)
.legalFor(AllS64Vectors)
.legalFor(AddrSpaces64)
.legalFor(AddrSpaces32)
.legalIf(isPointer(0))
.clampScalar(0, S16, S256)
.widenScalarToNextPow2(0, 32)
.clampMaxNumElements(0, S32, 16)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.scalarize(0);
if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
// Full set of gfx9 features.
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32, S16, V2S16})
.minScalar(0, S16)
.clampMaxNumElements(0, S16, 2)
.widenScalarToNextMultipleOf(0, 32)
.maxScalar(0, S32)
.scalarize(0);
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
.legalFor({S32, S16, V2S16}) // Clamp modifier
.minScalarOrElt(0, S16)
.clampMaxNumElements(0, S16, 2)
.scalarize(0)
.widenScalarToNextPow2(0, 32)
.lower();
} else if (ST.has16BitInsts()) {
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32, S16})
.minScalar(0, S16)
.widenScalarToNextMultipleOf(0, 32)
.maxScalar(0, S32)
.scalarize(0);
// Technically the saturating operations require clamp bit support, but this
// was introduced at the same time as 16-bit operations.
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
.legalFor({S32, S16}) // Clamp modifier
.minScalar(0, S16)
.scalarize(0)
.widenScalarToNextPow2(0, 16)
.lower();
// We're just lowering this, but it helps get a better result to try to
// coerce to the desired type first.
getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
.minScalar(0, S16)
.scalarize(0)
.lower();
} else {
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32})
.widenScalarToNextMultipleOf(0, 32)
.clampScalar(0, S32, S32)
.scalarize(0);
if (ST.hasIntClamp()) {
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
.legalFor({S32}) // Clamp modifier.
.scalarize(0)
.minScalarOrElt(0, S32)
.lower();
} else {
// Clamp bit support was added in VI, along with 16-bit operations.
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
.minScalar(0, S32)
.scalarize(0)
.lower();
}
// FIXME: DAG expansion gets better results. The widening uses the smaller
// range values and goes for the min/max lowering directly.
getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
.minScalar(0, S32)
.scalarize(0)
.lower();
}
getActionDefinitionsBuilder(
{G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
.customFor({S32, S64})
.clampScalar(0, S32, S64)
.widenScalarToNextPow2(0, 32)
.scalarize(0);
auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
.legalFor({S32})
.maxScalar(0, S32);
if (ST.hasVOP3PInsts()) {
Mulh
.clampMaxNumElements(0, S8, 2)
.lowerFor({V2S8});
}
Mulh
.scalarize(0)
.lower();
// Report legal for any types we can handle anywhere. For the cases only legal
// on the SALU, RegBankSelect will be able to re-legalize.
getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
.legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
.clampScalar(0, S32, S64)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
.widenScalarToNextPow2(0)
.scalarize(0);
getActionDefinitionsBuilder({G_UADDO, G_USUBO,
G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
.legalFor({{S32, S1}, {S32, S32}})
.minScalar(0, S32)
// TODO: .scalarize(0)
.lower();
getActionDefinitionsBuilder(G_BITCAST)
// Don't worry about the size constraint.
.legalIf(all(isRegisterType(0), isRegisterType(1)))
.lower();
getActionDefinitionsBuilder(G_CONSTANT)
.legalFor({S1, S32, S64, S16, GlobalPtr,
LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
.legalIf(isPointer(0))
.clampScalar(0, S32, S64)
.widenScalarToNextPow2(0);
getActionDefinitionsBuilder(G_FCONSTANT)
.legalFor({S32, S64, S16})
.clampScalar(0, S16, S64);
getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
.legalIf(isRegisterType(0))
// s1 and s16 are special cases because they have legal operations on
// them, but don't really occupy registers in the normal way.
.legalFor({S1, S16})
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.clampScalarOrElt(0, S32, MaxScalar)
.widenScalarToNextPow2(0, 32)
.clampMaxNumElements(0, S32, 16);
getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
// If the amount is divergent, we have to do a wave reduction to get the
// maximum value, so this is expanded during RegBankSelect.
getActionDefinitionsBuilder(G_DYN_STACKALLOC)
.legalFor({{PrivatePtr, S32}});
getActionDefinitionsBuilder(G_GLOBAL_VALUE)
.customIf(typeIsNot(0, PrivatePtr));
getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
auto &FPOpActions = getActionDefinitionsBuilder(
{ G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
.legalFor({S32, S64});
auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
.customFor({S32, S64});
auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
.customFor({S32, S64});
if (ST.has16BitInsts()) {
if (ST.hasVOP3PInsts())
FPOpActions.legalFor({S16, V2S16});
else
FPOpActions.legalFor({S16});
TrigActions.customFor({S16});
FDIVActions.customFor({S16});
}
auto &MinNumMaxNum = getActionDefinitionsBuilder({
G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
if (ST.hasVOP3PInsts()) {
MinNumMaxNum.customFor(FPTypesPK16)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.clampMaxNumElements(0, S16, 2)
.clampScalar(0, S16, S64)
.scalarize(0);
} else if (ST.has16BitInsts()) {
MinNumMaxNum.customFor(FPTypes16)
.clampScalar(0, S16, S64)
.scalarize(0);
} else {
MinNumMaxNum.customFor(FPTypesBase)
.clampScalar(0, S32, S64)
.scalarize(0);
}
if (ST.hasVOP3PInsts())
FPOpActions.clampMaxNumElements(0, S16, 2);
FPOpActions
.scalarize(0)
.clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
TrigActions
.scalarize(0)
.clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
FDIVActions
.scalarize(0)
.clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
getActionDefinitionsBuilder({G_FNEG, G_FABS})
.legalFor(FPTypesPK16)
.clampMaxNumElements(0, S16, 2)
.scalarize(0)
.clampScalar(0, S16, S64);
if (ST.has16BitInsts()) {
getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
.legalFor({S32, S64, S16})
.scalarize(0)
.clampScalar(0, S16, S64);
} else {
getActionDefinitionsBuilder(G_FSQRT)
.legalFor({S32, S64})
.scalarize(0)
.clampScalar(0, S32, S64);
if (ST.hasFractBug()) {
getActionDefinitionsBuilder(G_FFLOOR)
.customFor({S64})
.legalFor({S32, S64})
.scalarize(0)
.clampScalar(0, S32, S64);
} else {
getActionDefinitionsBuilder(G_FFLOOR)
.legalFor({S32, S64})
.scalarize(0)
.clampScalar(0, S32, S64);
}
}
getActionDefinitionsBuilder(G_FPTRUNC)
.legalFor({{S32, S64}, {S16, S32}})
.scalarize(0)
.lower();
getActionDefinitionsBuilder(G_FPEXT)
.legalFor({{S64, S32}, {S32, S16}})
.narrowScalarFor({{S64, S16}}, changeTo(0, S32))
.scalarize(0);
getActionDefinitionsBuilder(G_FSUB)
// Use actual fsub instruction
.legalFor({S32})
// Must use fadd + fneg
.lowerFor({S64, S16, V2S16})
.scalarize(0)
.clampScalar(0, S32, S64);
// Whether this is legal depends on the floating point mode for the function.
auto &FMad = getActionDefinitionsBuilder(G_FMAD);
if (ST.hasMadF16() && ST.hasMadMacF32Insts())
FMad.customFor({S32, S16});
else if (ST.hasMadMacF32Insts())
FMad.customFor({S32});
else if (ST.hasMadF16())
FMad.customFor({S16});
FMad.scalarize(0)
.lower();
auto &FRem = getActionDefinitionsBuilder(G_FREM);
if (ST.has16BitInsts()) {
FRem.customFor({S16, S32, S64});
} else {
FRem.minScalar(0, S32)
.customFor({S32, S64});
}
FRem.scalarize(0);
// TODO: Do we need to clamp maximum bitwidth?
getActionDefinitionsBuilder(G_TRUNC)
.legalIf(isScalar(0))
.legalFor({{V2S16, V2S32}})
.clampMaxNumElements(0, S16, 2)
// Avoid scalarizing in cases that should be truly illegal. In unresolvable
// situations (like an invalid implicit use), we don't want to loop
// infinitely in the legalizer.
.fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
.alwaysLegal();
getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
.legalFor({{S64, S32}, {S32, S16}, {S64, S16},
{S32, S1}, {S64, S1}, {S16, S1}})
.scalarize(0)
.clampScalar(0, S32, S64)
.widenScalarToNextPow2(1, 32);
// TODO: Split s1->s64 during regbankselect for VALU.
auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
.legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
.lowerIf(typeIs(1, S1))
.customFor({{S32, S64}, {S64, S64}});
if (ST.has16BitInsts())
IToFP.legalFor({{S16, S16}});
IToFP.clampScalar(1, S32, S64)
.minScalar(0, S32)
.scalarize(0)
.widenScalarToNextPow2(1);
auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
.legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
.customFor({{S64, S32}, {S64, S64}})
.narrowScalarFor({{S64, S16}}, changeTo(0, S32));
if (ST.has16BitInsts())
FPToI.legalFor({{S16, S16}});
else
FPToI.minScalar(1, S32);
FPToI.minScalar(0, S32)
.widenScalarToNextPow2(0, 32)
.scalarize(0)
.lower();
// Lower roundeven into G_FRINT
getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
.scalarize(0)
.lower();
if (ST.has16BitInsts()) {
getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
.legalFor({S16, S32, S64})
.clampScalar(0, S16, S64)
.scalarize(0);
} else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
.legalFor({S32, S64})
.clampScalar(0, S32, S64)
.scalarize(0);
} else {
getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
.legalFor({S32})
.customFor({S64})
.clampScalar(0, S32, S64)
.scalarize(0);
}
getActionDefinitionsBuilder(G_PTR_ADD)
.legalIf(all(isPointer(0), sameSize(0, 1)))
.scalarize(0)
.scalarSameSizeAs(1, 0);
getActionDefinitionsBuilder(G_PTRMASK)
.legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
.scalarSameSizeAs(1, 0)
.scalarize(0);
auto &CmpBuilder =
getActionDefinitionsBuilder(G_ICMP)
// The compare output type differs based on the register bank of the output,
// so make both s1 and s32 legal.
//
// Scalar compares producing output in scc will be promoted to s32, as that
// is the allocatable register type that will be needed for the copy from
// scc. This will be promoted during RegBankSelect, and we assume something
// before that won't try to use s32 result types.
//
// Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
// bank.
.legalForCartesianProduct(
{S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
.legalForCartesianProduct(
{S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
if (ST.has16BitInsts()) {
CmpBuilder.legalFor({{S1, S16}});
}
CmpBuilder
.widenScalarToNextPow2(1)
.clampScalar(1, S32, S64)
.scalarize(0)
.legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
getActionDefinitionsBuilder(G_FCMP)
.legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
.widenScalarToNextPow2(1)
.clampScalar(1, S32, S64)
.scalarize(0);
// FIXME: fpow has a selection pattern that should move to custom lowering.
auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
if (ST.has16BitInsts())
Exp2Ops.legalFor({S32, S16});
else
Exp2Ops.legalFor({S32});
Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
Exp2Ops.scalarize(0);
auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
if (ST.has16BitInsts())
ExpOps.customFor({{S32}, {S16}});
else
ExpOps.customFor({S32});
ExpOps.clampScalar(0, MinScalarFPTy, S32)
.scalarize(0);
getActionDefinitionsBuilder(G_FPOWI)
.clampScalar(0, MinScalarFPTy, S32)
.lower();
// The 64-bit versions produce 32-bit results, but only on the SALU.
getActionDefinitionsBuilder(G_CTPOP)
.legalFor({{S32, S32}, {S32, S64}})
.clampScalar(0, S32, S32)
.clampScalar(1, S32, S64)
.scalarize(0)
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32);
// The hardware instructions return a different result on 0 than the generic
// instructions expect. The hardware produces -1, but these produce the
// bitwidth.
getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
.scalarize(0)
.clampScalar(0, S32, S32)
.clampScalar(1, S32, S64)
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32)
.custom();
// The 64-bit versions produce 32-bit results, but only on the SALU.
getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
.legalFor({{S32, S32}, {S32, S64}})
.clampScalar(0, S32, S32)
.clampScalar(1, S32, S64)
.scalarize(0)
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32);
// S64 is only legal on SALU, and needs to be broken into 32-bit elements in
// RegBankSelect.
getActionDefinitionsBuilder(G_BITREVERSE)
.legalFor({S32, S64})
.clampScalar(0, S32, S64)
.scalarize(0)
.widenScalarToNextPow2(0);
if (ST.has16BitInsts()) {
getActionDefinitionsBuilder(G_BSWAP)
.legalFor({S16, S32, V2S16})
.clampMaxNumElements(0, S16, 2)
// FIXME: Fixing non-power-of-2 before clamp is a workaround for a
// narrowScalar limitation.
.widenScalarToNextPow2(0)
.clampScalar(0, S16, S32)
.scalarize(0);
if (ST.hasVOP3PInsts()) {
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
.legalFor({S32, S16, V2S16})
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.clampMaxNumElements(0, S16, 2)
.minScalar(0, S16)
.widenScalarToNextPow2(0)
.scalarize(0)
.lower();
} else {
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
.legalFor({S32, S16})
.widenScalarToNextPow2(0)
.minScalar(0, S16)
.scalarize(0)
.lower();
}
} else {
// TODO: Should have same legality without v_perm_b32
getActionDefinitionsBuilder(G_BSWAP)
.legalFor({S32})
.lowerIf(scalarNarrowerThan(0, 32))
// FIXME: Fixing non-power-of-2 before clamp is a workaround for a
// narrowScalar limitation.
.widenScalarToNextPow2(0)
.maxScalar(0, S32)
.scalarize(0)
.lower();
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
.legalFor({S32})
.minScalar(0, S32)
.widenScalarToNextPow2(0)
.scalarize(0)
.lower();
}
getActionDefinitionsBuilder(G_INTTOPTR)
// List the common cases
.legalForCartesianProduct(AddrSpaces64, {S64})
.legalForCartesianProduct(AddrSpaces32, {S32})
.scalarize(0)
// Accept any address space as long as the size matches
.legalIf(sameSize(0, 1))
.widenScalarIf(smallerThan(1, 0),
[](const LegalityQuery &Query) {
return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
})
.narrowScalarIf(largerThan(1, 0),
[](const LegalityQuery &Query) {
return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
});
getActionDefinitionsBuilder(G_PTRTOINT)
// List the common cases
.legalForCartesianProduct(AddrSpaces64, {S64})
.legalForCartesianProduct(AddrSpaces32, {S32})
.scalarize(0)
// Accept any address space as long as the size matches
.legalIf(sameSize(0, 1))
.widenScalarIf(smallerThan(0, 1),
[](const LegalityQuery &Query) {
return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
})
.narrowScalarIf(
largerThan(0, 1),
[](const LegalityQuery &Query) {
return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
});
getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
.scalarize(0)
.custom();
const auto needToSplitMemOp = [=](const LegalityQuery &Query,
bool IsLoad) -> bool {
const LLT DstTy = Query.Types[0];
// Split vector extloads.
unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
if (MemSize < DstTy.getSizeInBits())
MemSize = std::max(MemSize, AlignBits);
if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
return true;
const LLT PtrTy = Query.Types[1];
unsigned AS = PtrTy.getAddressSpace();
if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
return true;
// Catch weird sized loads that don't evenly divide into the access sizes
// TODO: May be able to widen depending on alignment etc.
unsigned NumRegs = (MemSize + 31) / 32;
if (NumRegs == 3) {
if (!ST.hasDwordx3LoadStores())
return true;
} else {
// If the alignment allows, these should have been widened.
if (!isPowerOf2_32(NumRegs))
return true;
}
if (AlignBits < MemSize) {
const SITargetLowering *TLI = ST.getTargetLowering();
return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
Align(AlignBits / 8));
}
return false;
};
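// With unaligned buffer access enabled, a required alignment of 0 below means
// no minimum alignment is enforced for the global/constant cases.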
unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
// TODO: Refine based on subtargets which support unaligned access or 128-bit
// LDS
// TODO: Unsupported flat for SI.
for (unsigned Op : {G_LOAD, G_STORE}) {
const bool IsStore = Op == G_STORE;
auto &Actions = getActionDefinitionsBuilder(Op);
// Explicitly list some common cases.
// TODO: Does this help compile time at all?
Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
{V2S32, GlobalPtr, V2S32, GlobalAlign32},
{V4S32, GlobalPtr, V4S32, GlobalAlign32},
{S64, GlobalPtr, S64, GlobalAlign32},
{V2S64, GlobalPtr, V2S64, GlobalAlign32},
{V2S16, GlobalPtr, V2S16, GlobalAlign32},
{S32, GlobalPtr, S8, GlobalAlign8},
{S32, GlobalPtr, S16, GlobalAlign16},
{S32, LocalPtr, S32, 32},
{S64, LocalPtr, S64, 32},
{V2S32, LocalPtr, V2S32, 32},
{S32, LocalPtr, S8, 8},
{S32, LocalPtr, S16, 16},
{V2S16, LocalPtr, S32, 32},
{S32, PrivatePtr, S32, 32},
{S32, PrivatePtr, S8, 8},
{S32, PrivatePtr, S16, 16},
{V2S16, PrivatePtr, S32, 32},
{S32, ConstantPtr, S32, GlobalAlign32},
{V2S32, ConstantPtr, V2S32, GlobalAlign32},
{V4S32, ConstantPtr, V4S32, GlobalAlign32},
{S64, ConstantPtr, S64, GlobalAlign32},
{V2S32, ConstantPtr, V2S32, GlobalAlign32}});
Actions.legalIf(
[=](const LegalityQuery &Query) -> bool {
return isLoadStoreLegal(ST, Query);
});
// Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
// 64-bits.
//
// TODO: Should generalize bitcast action into coerce, which will also cover
// inserting addrspacecasts.
Actions.customIf(typeIs(1, Constant32Ptr));
// Turn any illegal element vectors into something easier to deal
// with. These will ultimately produce 32-bit scalar shifts to extract the
// parts anyway.
//
// For odd 16-bit element vectors, prefer to split those into pieces with
// 16-bit vector parts.
Actions.bitcastIf(
[=](const LegalityQuery &Query) -> bool {
return shouldBitcastLoadStoreType(ST, Query.Types[0],
Query.MMODescrs[0].MemoryTy);
}, bitcastToRegisterType(0));
if (!IsStore) {
// Widen suitably aligned loads by loading extra bytes. The standard
// legalization actions can't properly express widening memory operands.
Actions.customIf([=](const LegalityQuery &Query) -> bool {
return shouldWidenLoad(ST, Query, G_LOAD);
});
}
// FIXME: load/store narrowing should be moved to lower action
Actions
.narrowScalarIf(
[=](const LegalityQuery &Query) -> bool {
return !Query.Types[0].isVector() &&
needToSplitMemOp(Query, Op == G_LOAD);
},
[=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
const LLT DstTy = Query.Types[0];
const LLT PtrTy = Query.Types[1];
const unsigned DstSize = DstTy.getSizeInBits();
unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
// Split extloads.
if (DstSize > MemSize)
return std::make_pair(0, LLT::scalar(MemSize));
if (!isPowerOf2_32(DstSize)) {
// We're probably decomposing an odd sized store. Try to split
// to the widest type. TODO: Account for alignment. As-is it
// should be OK, since the new parts will be further legalized.
unsigned FloorSize = PowerOf2Floor(DstSize);
return std::make_pair(0, LLT::scalar(FloorSize));
}
if (DstSize > 32 && (DstSize % 32 != 0)) {
// FIXME: Need a way to specify non-extload of larger size if
// suitably aligned.
return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
}
unsigned MaxSize = maxSizeForAddrSpace(ST,
PtrTy.getAddressSpace(),
Op == G_LOAD);
if (MemSize > MaxSize)
return std::make_pair(0, LLT::scalar(MaxSize));
unsigned Align = Query.MMODescrs[0].AlignInBits;
return std::make_pair(0, LLT::scalar(Align));
})
.fewerElementsIf(
[=](const LegalityQuery &Query) -> bool {
return Query.Types[0].isVector() &&
needToSplitMemOp(Query, Op == G_LOAD);
},
[=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
const LLT DstTy = Query.Types[0];
const LLT PtrTy = Query.Types[1];
LLT EltTy = DstTy.getElementType();
unsigned MaxSize = maxSizeForAddrSpace(ST,
PtrTy.getAddressSpace(),
Op == G_LOAD);
// FIXME: Handle widened to power of 2 results better. This ends
// up scalarizing.
// FIXME: 3 element stores scalarized on SI
// Split if it's too large for the address space.
unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
if (MemSize > MaxSize) {
unsigned NumElts = DstTy.getNumElements();
unsigned EltSize = EltTy.getSizeInBits();
if (MaxSize % EltSize == 0) {
return std::make_pair(
0, LLT::scalarOrVector(
ElementCount::getFixed(MaxSize / EltSize), EltTy));
}
unsigned NumPieces = MemSize / MaxSize;
// FIXME: Refine when odd breakdowns handled
// The scalars will need to be re-legalized.
if (NumPieces == 1 || NumPieces >= NumElts ||
NumElts % NumPieces != 0)
return std::make_pair(0, EltTy);
return std::make_pair(
0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
}
// FIXME: We could probably handle weird extending loads better.
if (DstTy.getSizeInBits() > MemSize)
return std::make_pair(0, EltTy);
unsigned EltSize = EltTy.getSizeInBits();
unsigned DstSize = DstTy.getSizeInBits();
if (!isPowerOf2_32(DstSize)) {
// We're probably decomposing an odd sized store. Try to split
// to the widest type. TODO: Account for alignment. As-is it
// should be OK, since the new parts will be further legalized.
unsigned FloorSize = PowerOf2Floor(DstSize);
return std::make_pair(
0, LLT::scalarOrVector(
ElementCount::getFixed(FloorSize / EltSize), EltTy));
}
// Need to split because of alignment.
unsigned Align = Query.MMODescrs[0].AlignInBits;
if (EltSize > Align &&
(EltSize / Align < DstTy.getNumElements())) {
return std::make_pair(
0, LLT::fixed_vector(EltSize / Align, EltTy));
}
// May need relegalization for the scalars.
return std::make_pair(0, EltTy);
})
.minScalar(0, S32)
.narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
.widenScalarToNextPow2(0)
.moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
.lower();
}
// FIXME: Unaligned accesses not lowered.
auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
.legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
{S32, GlobalPtr, S16, 2 * 8},
{S32, LocalPtr, S8, 8},
{S32, LocalPtr, S16, 16},
{S32, PrivatePtr, S8, 8},
{S32, PrivatePtr, S16, 16},
{S32, ConstantPtr, S8, 8},
{S32, ConstantPtr, S16, 2 * 8}})
.legalIf(
[=](const LegalityQuery &Query) -> bool {
return isLoadStoreLegal(ST, Query);
});
if (ST.hasFlatAddressSpace()) {
ExtLoads.legalForTypesWithMemDesc(
{{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
}
// Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
// 64-bits.
//
// TODO: Should generalize bitcast action into coerce, which will also cover
// inserting addrspacecasts.
ExtLoads.customIf(typeIs(1, Constant32Ptr));
ExtLoads.clampScalar(0, S32, S32)
.widenScalarToNextPow2(0)
.lower();
auto &Atomics = getActionDefinitionsBuilder(
{G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
G_ATOMICRMW_UMIN})
.legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
{S64, GlobalPtr}, {S64, LocalPtr},
{S32, RegionPtr}, {S64, RegionPtr}});
if (ST.hasFlatAddressSpace()) {
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
if (ST.hasLDSFPAtomicAdd()) {
Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
if (ST.hasGFX90AInsts())
Atomic.legalFor({{S64, LocalPtr}});
}
if (ST.hasAtomicFaddInsts())
Atomic.legalFor({{S32, GlobalPtr}});
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
// demarshalling.
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
.customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
{S32, FlatPtr}, {S64, FlatPtr}})
.legalFor({{S32, LocalPtr}, {S64, LocalPtr},
{S32, RegionPtr}, {S64, RegionPtr}});
// TODO: Pointer types, any 32-bit or 64-bit vector
// Condition should be s32 for scalar, s1 for vector.
getActionDefinitionsBuilder(G_SELECT)
.legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
LocalPtr, FlatPtr, PrivatePtr,
LLT::fixed_vector(2, LocalPtr),
LLT::fixed_vector(2, PrivatePtr)},
{S1, S32})
.clampScalar(0, S16, S64)
.scalarize(1)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.fewerElementsIf(numElementsNotEven(0), scalarize(0))
.clampMaxNumElements(0, S32, 2)
.clampMaxNumElements(0, LocalPtr, 2)
.clampMaxNumElements(0, PrivatePtr, 2)
.scalarize(0)
.widenScalarToNextPow2(0)
.legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
// TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
// be more flexible with the shift amount type.
auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
.legalFor({{S32, S32}, {S64, S32}});
if (ST.has16BitInsts()) {
if (ST.hasVOP3PInsts()) {
Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
.clampMaxNumElements(0, S16, 2);
} else
Shifts.legalFor({{S16, S16}});
// TODO: Support 16-bit shift amounts for all types
Shifts.widenScalarIf(
[=](const LegalityQuery &Query) {
// Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
// 32-bit amount.
const LLT ValTy = Query.Types[0];
const LLT AmountTy = Query.Types[1];
return ValTy.getSizeInBits() <= 16 &&
AmountTy.getSizeInBits() < 16;
}, changeTo(1, S16));
Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
Shifts.clampScalar(1, S32, S32);
Shifts.clampScalar(0, S16, S64);
Shifts.widenScalarToNextPow2(0, 16);
getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
.minScalar(0, S16)
.scalarize(0)
.lower();
} else {
// Make sure we legalize the shift amount type first, as the general
// expansion for the shifted type will produce much worse code if it hasn't
// been truncated already.
Shifts.clampScalar(1, S32, S32);
Shifts.clampScalar(0, S32, S64);
Shifts.widenScalarToNextPow2(0, 32);
getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
.minScalar(0, S32)
.scalarize(0)
.lower();
}
Shifts.scalarize(0);
for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
unsigned IdxTypeIdx = 2;
getActionDefinitionsBuilder(Op)
.customIf([=](const LegalityQuery &Query) {
const LLT EltTy = Query.Types[EltTypeIdx];
const LLT VecTy = Query.Types[VecTypeIdx];
const LLT IdxTy = Query.Types[IdxTypeIdx];
const unsigned EltSize = EltTy.getSizeInBits();
return (EltSize == 32 || EltSize == 64) &&
VecTy.getSizeInBits() % 32 == 0 &&
VecTy.getSizeInBits() <= MaxRegisterSize &&
IdxTy.getSizeInBits() == 32;
})
.bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
bitcastToVectorElement32(VecTypeIdx))
//.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
.bitcastIf(
all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
[=](const LegalityQuery &Query) {
// For > 64-bit element types, try to turn this into a 64-bit
// element vector since we may be able to do better indexing
// if this is scalar. If not, fall back to 32.
const LLT EltTy = Query.Types[EltTypeIdx];
const LLT VecTy = Query.Types[VecTypeIdx];
const unsigned DstEltSize = EltTy.getSizeInBits();
const unsigned VecSize = VecTy.getSizeInBits();
const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
return std::make_pair(
VecTypeIdx,
LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
})
.clampScalar(EltTypeIdx, S32, S64)
.clampScalar(VecTypeIdx, S32, S64)
.clampScalar(IdxTypeIdx, S32, S32)
.clampMaxNumElements(VecTypeIdx, S32, 32)
// TODO: Clamp elements for 64-bit vectors?
// It should only be necessary with variable indexes.
// As a last resort, lower to the stack
.lower();
}
getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
.unsupportedIf([=](const LegalityQuery &Query) {
const LLT &EltTy = Query.Types[1].getElementType();
return Query.Types[0] != EltTy;
});
for (unsigned Op : {G_EXTRACT, G_INSERT}) {
unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
// FIXME: Doesn't handle extract of illegal sizes.
getActionDefinitionsBuilder(Op)
.lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
// FIXME: Multiples of 16 should not be legal.
.legalIf([=](const LegalityQuery &Query) {
const LLT BigTy = Query.Types[BigTyIdx];
const LLT LitTy = Query.Types[LitTyIdx];
return (BigTy.getSizeInBits() % 32 == 0) &&
(LitTy.getSizeInBits() % 16 == 0);
})
.widenScalarIf(
[=](const LegalityQuery &Query) {
const LLT BigTy = Query.Types[BigTyIdx];
return (BigTy.getScalarSizeInBits() < 16);
},
LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
.widenScalarIf(
[=](const LegalityQuery &Query) {
const LLT LitTy = Query.Types[LitTyIdx];
return (LitTy.getScalarSizeInBits() < 16);
},
LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
.moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
.widenScalarToNextPow2(BigTyIdx, 32);
}
auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
.legalForCartesianProduct(AllS32Vectors, {S32})
.legalForCartesianProduct(AllS64Vectors, {S64})
.clampNumElements(0, V16S32, V32S32)
.clampNumElements(0, V2S64, V16S64)
.fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
if (ST.hasScalarPackInsts()) {
BuildVector
// FIXME: Should probably widen s1 vectors straight to s32
.minScalarOrElt(0, S16)
// Widen source elements and produce a G_BUILD_VECTOR_TRUNC
.minScalar(1, S32);
getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
.legalFor({V2S16, S32})
.lower();
BuildVector.minScalarOrElt(0, S32);
} else {
BuildVector.customFor({V2S16, S16});
BuildVector.minScalarOrElt(0, S32);
getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
.customFor({V2S16, S32})
.lower();
}
BuildVector.legalIf(isRegisterType(0));
// FIXME: Clamp maximum size
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
.legalIf(all(isRegisterType(0), isRegisterType(1)))
.clampMaxNumElements(0, S32, 32)
.clampMaxNumElements(1, S16, 2) // TODO: Make 4?
.clampMaxNumElements(0, S16, 64);
// TODO: Don't fully scalarize v2s16 pieces? Or combine those out
// pre-legalize.
if (ST.hasVOP3PInsts()) {
getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
.customFor({V2S16, V2S16})
.lower();
} else
getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
// Merge/Unmerge
for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
const LLT Ty = Query.Types[TypeIdx];
if (Ty.isVector()) {
const LLT &EltTy = Ty.getElementType();
if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
return true;
if (!isPowerOf2_32(EltTy.getSizeInBits()))
return true;
}
return false;
};
auto &Builder = getActionDefinitionsBuilder(Op)
.legalIf(all(isRegisterType(0), isRegisterType(1)))
.lowerFor({{S16, V2S16}})
.lowerIf([=](const LegalityQuery &Query) {
const LLT BigTy = Query.Types[BigTyIdx];
return BigTy.getSizeInBits() == 32;
})
// Try to widen to s16 first for small types.
// TODO: Only do this on targets with legal s16 shifts
.minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
.widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
.moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
.fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
elementTypeIs(1, S16)),
changeTo(1, V2S16))
// Clamp the little scalar to s32-s512 and make it a power of 2. It's not
// worth considering the multiples of 64 since 2*192 and 2*384 are not
// valid.
.clampScalar(LitTyIdx, S32, S512)
.widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
// Break up vectors with weird elements into scalars
.fewerElementsIf(
[=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
scalarize(0))
.fewerElementsIf(
[=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
scalarize(1))
.clampScalar(BigTyIdx, S32, MaxScalar);
if (Op == G_MERGE_VALUES) {
Builder.widenScalarIf(
// TODO: Use 16-bit shifts if legal for 8-bit values?
[=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[LitTyIdx];
return Ty.getSizeInBits() < 32;
},
changeTo(LitTyIdx, S32));
}
Builder.widenScalarIf(
[=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[BigTyIdx];
return !isPowerOf2_32(Ty.getSizeInBits()) &&
Ty.getSizeInBits() % 16 != 0;
},
[=](const LegalityQuery &Query) {
// Pick the next power of 2, or a multiple of 64 over 128, whichever is
// smaller.
const LLT &Ty = Query.Types[BigTyIdx];
unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
if (NewSizeInBits >= 256) {
unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
if (RoundedTo < NewSizeInBits)
NewSizeInBits = RoundedTo;
}
return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
})
// Any vectors left are the wrong size. Scalarize them.
.scalarize(0)
.scalarize(1);
}
// S64 is only legal on SALU, and needs to be broken into 32-bit elements in
// RegBankSelect.
auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
.legalFor({{S32}, {S64}});
if (ST.hasVOP3PInsts()) {
SextInReg.lowerFor({{V2S16}})
// Prefer to reduce vector widths for 16-bit vectors before lowering, to
// get more vector shift opportunities, since we'll get those when
// expanded.
.fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
} else if (ST.has16BitInsts()) {
SextInReg.lowerFor({{S32}, {S64}, {S16}});
} else {
// Prefer to promote to s32 before lowering if we don't have 16-bit
// shifts. This avoids a lot of intermediate truncate and extend operations.
SextInReg.lowerFor({{S32}, {S64}});
}
SextInReg
.scalarize(0)
.clampScalar(0, S32, S64)
.lower();
getActionDefinitionsBuilder({G_ROTR, G_ROTL})
.scalarize(0)
.lower();
// TODO: Only try to form v2s16 with legal packed instructions.
getActionDefinitionsBuilder(G_FSHR)
.legalFor({{S32, S32}})
.lowerFor({{V2S16, V2S16}})
.fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
.scalarize(0)
.lower();
if (ST.hasVOP3PInsts()) {
getActionDefinitionsBuilder(G_FSHL)
.lowerFor({{V2S16, V2S16}})
.fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
.scalarize(0)
.lower();
} else {
getActionDefinitionsBuilder(G_FSHL)
.scalarize(0)
.lower();
}
getActionDefinitionsBuilder(G_READCYCLECOUNTER)
.legalFor({S64});
getActionDefinitionsBuilder(G_FENCE)
.alwaysLegal();
getActionDefinitionsBuilder({G_SMULO, G_UMULO})
.scalarize(0)
.minScalar(0, S32)
.lower();
getActionDefinitionsBuilder({G_SBFX, G_UBFX})
.legalFor({{S32, S32}, {S64, S32}})
.clampScalar(1, S32, S32)
.clampScalar(0, S32, S64)
.widenScalarToNextPow2(0)
.scalarize(0);
getActionDefinitionsBuilder({
// TODO: Verify V_BFI_B32 is generated from expanded bit ops
G_FCOPYSIGN,
G_ATOMIC_CMPXCHG_WITH_SUCCESS,
G_ATOMICRMW_NAND,
G_ATOMICRMW_FSUB,
G_READ_REGISTER,
G_WRITE_REGISTER,
G_SADDO, G_SSUBO,
// TODO: Implement
G_FMINIMUM, G_FMAXIMUM}).lower();
getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
.lower();
getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
.unsupported();
getLegacyLegalizerInfo().computeTables();
verify(*ST.getInstrInfo());
}
bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
MachineInstr &MI) const {
MachineIRBuilder &B = Helper.MIRBuilder;
MachineRegisterInfo &MRI = *B.getMRI();
switch (MI.getOpcode()) {
case TargetOpcode::G_ADDRSPACE_CAST:
return legalizeAddrSpaceCast(MI, MRI, B);
case TargetOpcode::G_FRINT:
return legalizeFrint(MI, MRI, B);
case TargetOpcode::G_FCEIL:
return legalizeFceil(MI, MRI, B);
case TargetOpcode::G_FREM:
return legalizeFrem(MI, MRI, B);
case TargetOpcode::G_INTRINSIC_TRUNC:
return legalizeIntrinsicTrunc(MI, MRI, B);
case TargetOpcode::G_SITOFP:
return legalizeITOFP(MI, MRI, B, true);
case TargetOpcode::G_UITOFP:
return legalizeITOFP(MI, MRI, B, false);
case TargetOpcode::G_FPTOSI:
return legalizeFPTOI(MI, MRI, B, true);
case TargetOpcode::G_FPTOUI:
return legalizeFPTOI(MI, MRI, B, false);
case TargetOpcode::G_FMINNUM:
case TargetOpcode::G_FMAXNUM:
case TargetOpcode::G_FMINNUM_IEEE:
case TargetOpcode::G_FMAXNUM_IEEE:
return legalizeMinNumMaxNum(Helper, MI);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
return legalizeExtractVectorElt(MI, MRI, B);
case TargetOpcode::G_INSERT_VECTOR_ELT:
return legalizeInsertVectorElt(MI, MRI, B);
case TargetOpcode::G_SHUFFLE_VECTOR:
return legalizeShuffleVector(MI, MRI, B);
case TargetOpcode::G_FSIN:
case TargetOpcode::G_FCOS:
return legalizeSinCos(MI, MRI, B);
case TargetOpcode::G_GLOBAL_VALUE:
return legalizeGlobalValue(MI, MRI, B);
case TargetOpcode::G_LOAD:
case TargetOpcode::G_SEXTLOAD:
case TargetOpcode::G_ZEXTLOAD:
return legalizeLoad(Helper, MI);
case TargetOpcode::G_FMAD:
return legalizeFMad(MI, MRI, B);
case TargetOpcode::G_FDIV:
return legalizeFDIV(MI, MRI, B);
case TargetOpcode::G_UDIV:
case TargetOpcode::G_UREM:
case TargetOpcode::G_UDIVREM:
return legalizeUnsignedDIV_REM(MI, MRI, B);
case TargetOpcode::G_SDIV:
case TargetOpcode::G_SREM:
case TargetOpcode::G_SDIVREM:
return legalizeSignedDIV_REM(MI, MRI, B);
case TargetOpcode::G_ATOMIC_CMPXCHG:
return legalizeAtomicCmpXChg(MI, MRI, B);
case TargetOpcode::G_FLOG:
return legalizeFlog(MI, B, numbers::ln2f);
case TargetOpcode::G_FLOG10:
return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
case TargetOpcode::G_FEXP:
return legalizeFExp(MI, B);
case TargetOpcode::G_FPOW:
return legalizeFPow(MI, B);
case TargetOpcode::G_FFLOOR:
return legalizeFFloor(MI, MRI, B);
case TargetOpcode::G_BUILD_VECTOR:
return legalizeBuildVector(MI, MRI, B);
case TargetOpcode::G_CTLZ:
case TargetOpcode::G_CTTZ:
return legalizeCTLZ_CTTZ(MI, MRI, B);
default:
return false;
}
llvm_unreachable("expected switch to return");
}
Register AMDGPULegalizerInfo::getSegmentAperture(
unsigned AS,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
MachineFunction &MF = B.getMF();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const LLT S32 = LLT::scalar(32);
assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
if (ST.hasApertureRegs()) {
// FIXME: Use inline constants (src_{shared, private}_base) instead of
// getreg.
unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
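// Pack the hwreg id, the field's bit offset, and its width minus one into the
// immediate operand expected by s_getreg_b32.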
unsigned Encoding =
AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
B.buildInstr(AMDGPU::S_GETREG_B32)
.addDef(GetReg)
.addImm(Encoding);
MRI.setType(GetReg, S32);
auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
}
Register QueuePtr = MRI.createGenericVirtualRegister(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
return Register();
// Offset into amd_queue_t for group_segment_aperture_base_hi /
// private_segment_aperture_base_hi.
uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
// TODO: can we be smarter about machine pointer info?
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
MachineMemOperand *MMO = MF.getMachineMemOperand(
PtrInfo,
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
LLT::scalar(32), commonAlignment(Align(64), StructOffset));
Register LoadAddr;
B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
MachineFunction &MF = B.getMF();
const LLT S32 = LLT::scalar(32);
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Src);
unsigned DestAS = DstTy.getAddressSpace();
unsigned SrcAS = SrcTy.getAddressSpace();
// TODO: Avoid reloading from the queue ptr for each cast, or at least each
// vector element.
assert(!DstTy.isVector());
const AMDGPUTargetMachine &TM
= static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
return true;
}
if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
// Truncate.
B.buildExtract(Dst, Src, 0);
MI.eraseFromParent();
return true;
}
if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
uint32_t AddrHiVal = Info->get32BitAddressHighBits();
// FIXME: This is a bit ugly due to creating a merge of 2 pointers to
// another pointer type. Merge operands are required to be the same type, but
// creating an extra ptrtoint would be kind of pointless.
auto HighAddr = B.buildConstant(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
B.buildMerge(Dst, {Src, HighAddr});
MI.eraseFromParent();
return true;
}
if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS);
unsigned NullVal = TM.getNullPointerValue(DestAS);
auto SegmentNull = B.buildConstant(DstTy, NullVal);
auto FlatNull = B.buildConstant(SrcTy, 0);
// Extract low 32-bits of the pointer.
auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
auto CmpRes =
B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
MI.eraseFromParent();
return true;
}
if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
return false;
if (!ST.hasFlatAddressSpace())
return false;
auto SegmentNull =
B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
auto FlatNull =
B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
if (!ApertureReg.isValid())
return false;
auto CmpRes =
B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
// Coerce the type of the low half of the result so we can use merge_values.
Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
// TODO: Should we allow mismatched types but matching sizes in merges to
// avoid the ptrtoint?
auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
MI.eraseFromParent();
return true;
}
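// Lower f64 G_FRINT using the copysign(2^52, x) add/subtract trick: adding
// and then subtracting 2^52 with the sign of x rounds x to the nearest
// integer, and inputs with magnitude >= 2^52 (already integral) are passed
// through by the final select.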
bool AMDGPULegalizerInfo::legalizeFrint(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
Register Src = MI.getOperand(1).getReg();
LLT Ty = MRI.getType(Src);
assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
auto C1 = B.buildFConstant(Ty, C1Val);
auto CopySign = B.buildFCopysign(Ty, C1, Src);
// TODO: Should this propagate fast-math-flags?
auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
auto C2 = B.buildFConstant(Ty, C2Val);
auto Fabs = B.buildFAbs(Ty, Src);
auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
MI.eraseFromParent();
return true;
}
bool AMDGPULegalizerInfo::legalizeFceil(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
const LLT S1 = LLT::scalar(1);
const LLT S64 = LLT::scalar(64);
Register Src = MI.getOperand(1).getReg();
assert(MRI.getType(Src) == S64);
// result = trunc(src)
// if (src > 0.0 && src != result)
// result += 1.0
auto Trunc = B.buildIntrinsicTrunc(S64, Src);
const auto Zero = B.buildFConstant(S64, 0.0);
const auto One = B.buildFConstant(S64, 1.0);
auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
auto And = B.buildAnd(S1, Lt0, NeTrunc);
auto Add = B.buildSelect(S64, And, One, Zero);
// TODO: Should this propagate fast-math-flags?
B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
MI.eraseFromParent();
return true;
}
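// Lower G_FREM as fma(-trunc(x / y), y, x), i.e. x - trunc(x / y) * y.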
bool AMDGPULegalizerInfo::legalizeFrem(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
Register DstReg = MI.getOperand(0).getReg();
Register Src0Reg = MI.getOperand(1).getReg();
Register Src1Reg = MI.getOperand(2).getReg();
auto Flags = MI.getFlags();
LLT Ty = MRI.getType(DstReg);
auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
auto Neg = B.buildFNeg(Ty, Trunc, Flags);
B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
MI.eraseFromParent();
return true;
}
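// Extract the 11-bit biased exponent from the high 32 bits of an f64 and
// subtract the bias (1023), returning the unbiased exponent as an s32.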
static MachineInstrBuilder extractF64Exponent(Register Hi,
MachineIRBuilder &B) {
const unsigned FractBits = 52;
const unsigned ExpBits = 11;
LLT S32 = LLT::scalar(32);
auto Const0 = B.buildConstant(S32, FractBits - 32);
auto Const1 = B.buildConstant(S32, ExpBits);
auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
.addUse(Hi)
.addUse(Const0.getReg(0))
.addUse(Const1.getReg(0));
return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}
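// Lower f64 G_INTRINSIC_TRUNC by masking off the fraction bits that lie
// below the exponent. Inputs with exponent < 0 (|x| < 1) truncate to a
// signed zero, and inputs with exponent > 51 are already integral and are
// returned unchanged.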
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
const LLT S1 = LLT::scalar(1);
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
Register Src = MI.getOperand(1).getReg();
assert(MRI.getType(Src) == S64);
// TODO: Should this use extract since the low half is unused?
auto Unmerge = B.buildUnmerge({S32, S32}, Src);
Register Hi = Unmerge.getReg(1);
// Extract the upper half, since this is where we will find the sign and
// exponent.
auto Exp = extractF64Exponent(Hi, B);
const unsigned FractBits = 52;
// Extract the sign bit.
const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
const auto Zero32 = B.buildConstant(S32, 0);
// Extend back to 64-bits.
auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
auto Shr = B.buildAShr(S64, FractMask, Exp);
auto Not = B.buildNot(S64, Shr);
auto Tmp0 = B.buildAnd(S64, Src, Not);
auto FiftyOne = B.buildConstant(S32, FractBits - 1);
auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
MI.eraseFromParent();
return true;
}
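// Lower 64-bit G_SITOFP/G_UITOFP. For an f64 result, convert the two 32-bit
// halves separately and combine them as ldexp(hi, 32) + lo. For an f32
// result, shift the source left so its significant bits occupy the high 32
// bits (collapsing the shifted-out low half into a sticky bit), convert
// that, and scale the result back with ldexp.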
bool AMDGPULegalizerInfo::legalizeITOFP(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, bool Signed) const {
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
assert(MRI.getType(Src) == S64);
auto Unmerge = B.buildUnmerge({S32, S32}, Src);
auto ThirtyTwo = B.buildConstant(S32, 32);
if (MRI.getType(Dst) == S64) {
auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
: B.buildUITOFP(S64, Unmerge.getReg(1));
auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
.addUse(CvtHi.getReg(0))
.addUse(ThirtyTwo.getReg(0));
// TODO: Should this propagate fast-math-flags?
B.buildFAdd(Dst, LdExp, CvtLo);
MI.eraseFromParent();
return true;
}
assert(MRI.getType(Dst) == S32);
auto One = B.buildConstant(S32, 1);
MachineInstrBuilder ShAmt;
if (Signed) {
auto ThirtyOne = B.buildConstant(S32, 31);
auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
/*HasSideEffects=*/false)
.addUse(Unmerge.getReg(1));
auto LS2 = B.buildSub(S32, LS, One);
ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
} else
ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
auto Norm = B.buildShl(S64, Src, ShAmt);
auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
/*HasSideEffects=*/false)
.addUse(FVal.getReg(0))
.addUse(Scale.getReg(0));
MI.eraseFromParent();
return true;
}
// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B,
bool Signed) const {
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
const LLT SrcLT = MRI.getType(Src);
assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
unsigned Flags = MI.getFlags();
// The basic idea of converting a floating point number into a pair of 32-bit
// integers is illustrated as follows:
//
// tf := trunc(val);
// hif := floor(tf * 2^-32);
// lof := tf - hif * 2^32; // lof is always positive due to floor.
// hi := fptoi(hif);
// lo := fptoi(lof);
//
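// As an illustrative example with an f64 source, val = 2^40 + 3 gives
//   tf = 2^40 + 3, hif = floor(tf * 2^-32) = 256, lof = tf - 256 * 2^32 = 3,
// so hi = 256 and lo = 3, and 256 * 2^32 + 3 reconstructs val.
//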
auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
MachineInstrBuilder Sign;
if (Signed && SrcLT == S32) {
// However, a 32-bit floating point number has only a 23-bit mantissa, which
// is not enough to hold all the significant bits of `lof` if val is
// negative. To avoid the loss of precision, we take the absolute value
// after truncating and flip the result back based on the original
// signedness.
Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
Trunc = B.buildFAbs(S32, Trunc, Flags);
}
MachineInstrBuilder K0, K1;
if (SrcLT == S64) {
K0 = B.buildFConstant(S64,
BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
K1 = B.buildFConstant(S64,
BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
} else {
K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
}
auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
: B.buildFPTOUI(S32, FloorMul);
auto Lo = B.buildFPTOUI(S32, Fma);
if (Signed && SrcLT == S32) {
// Flip the result based on the signedness, which is either all 0s or 1s.
Sign = B.buildMerge(S64, {Sign, Sign});
// r := xor({lo, hi}, sign) - sign;
B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
} else
B.buildMerge(Dst, {Lo, Hi});
MI.eraseFromParent();
return true;
}
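// The variant matching the function's ieee_mode is kept as-is; plain
// G_FMINNUM/G_FMAXNUM are expanded when IEEE mode is enabled, and the *_IEEE
// forms are not expected when it is disabled.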
bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
MachineInstr &MI) const {
MachineFunction &MF = Helper.MIRBuilder.getMF();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
// With ieee_mode disabled, the instructions have the correct behavior
// already for G_FMINNUM/G_FMAXNUM
if (!MFI->getMode().IEEE)
return !IsIEEEOp;
if (IsIEEEOp)
return true;
return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}
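// Lower G_EXTRACT_VECTOR_ELT with a constant index to a G_EXTRACT at a fixed
// bit offset; out-of-bounds indices become undef. Dynamic indices are kept
// and later selected to register indexing.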
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
// TODO: Should move some of this into LegalizerHelper.
// TODO: Promote dynamic indexing of s16 to s32
// FIXME: Artifact combiner probably should have replaced the truncated
// constant before this, so we shouldn't need
// getIConstantVRegValWithLookThrough.
Optional<ValueAndVReg> MaybeIdxVal =
getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
return true;
const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
Register Dst = MI.getOperand(0).getReg();
Register Vec = MI.getOperand(1).getReg();
LLT VecTy = MRI.getType(Vec);
LLT EltTy = VecTy.getElementType();
assert(EltTy == MRI.getType(Dst));
if (IdxVal < VecTy.getNumElements())
B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
else
B.buildUndef(Dst);
MI.eraseFromParent();
return true;
}
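// Lower G_INSERT_VECTOR_ELT with a constant index to a G_INSERT at a fixed
// bit offset; out-of-bounds indices produce undef. Dynamic indices are kept
// and later selected to register indexing.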
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
// TODO: Should move some of this into LegalizerHelper.
// TODO: Promote dynamic indexing of s16 to s32
// FIXME: Artifact combiner probably should have replaced the truncated
// constant before this, so we shouldn't need
// getIConstantVRegValWithLookThrough.
Optional<ValueAndVReg> MaybeIdxVal =
getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
return true;
int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
Register Dst = MI.getOperand(0).getReg();
Register Vec = MI.getOperand(1).getReg();
Register Ins = MI.getOperand(2).getReg();
LLT VecTy = MRI.getType(Vec);
LLT EltTy = VecTy.getElementType();
assert(EltTy == MRI.getType(Ins));
if (IdxVal < VecTy.getNumElements())
B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
else
B.buildUndef(Dst);
MI.eraseFromParent();
return true;
}
bool AMDGPULegalizerInfo::legalizeShuffleVector(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
const LLT V2S16 = LLT::fixed_vector(2, 16);
Register Dst = MI.getOperand(0).getReg();
Register Src0 = MI.getOperand(1).getReg();
LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Src0);
if (SrcTy == V2S16 && DstTy == V2S16 &&
AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
return true;
MachineIRBuilder HelperBuilder(MI);
GISelObserverWrapper DummyObserver;
LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
}
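// Lower G_FSIN/G_FCOS to the amdgcn.sin/amdgcn.cos intrinsics. The operand
// is pre-scaled by 1/(2*pi), and on subtargets with a reduced trig input
// range the scaled value is additionally passed through amdgcn.fract.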
bool AMDGPULegalizerInfo::legalizeSinCos(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
LLT Ty = MRI.getType(DstReg);
unsigned Flags = MI.getFlags();
Register TrigVal;
auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
if (ST.hasTrigReducedRange()) {
auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
.addUse(MulVal.getReg(0))
.setMIFlags(Flags).getReg(0);
} else
TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
.addUse(TrigVal)
.setMIFlags(Flags);
MI.eraseFromParent();
return true;
}
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
MachineIRBuilder &B,
const GlobalValue *GV,
int64_t Offset,
unsigned GAFlags) const {
assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
// In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
// to the following code sequence:
//
// For constant address space:
// s_getpc_b64 s[0:1]
// s_add_u32 s0, s0, $symbol
// s_addc_u32 s1, s1, 0
//
// s_getpc_b64 returns the address of the s_add_u32 instruction and then
// a fixup or relocation is emitted to replace $symbol with a literal
// constant, which is a pc-relative offset from the encoding of the $symbol
// operand to the global variable.
//
// For global address space:
// s_getpc_b64 s[0:1]
// s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
// s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
//
// s_getpc_b64 returns the address of the s_add_u32 instruction and then
// fixups or relocations are emitted to replace $symbol@*@lo and
// $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
// which is a 64-bit pc-relative offset from the encoding of the $symbol
// operand to the global variable.
//
// What we want here is an offset from the value returned by s_getpc
// (which is the address of the s_add_u32 instruction) to the global
// variable, but since the encoding of $symbol starts 4 bytes after the start
// of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
// small. This requires us to add 4 to the global variable offset in order to
// compute the correct address. Similarly for the s_addc_u32 instruction, the
// encoding of $symbol starts 12 bytes after the start of the s_add_u32
// instruction.
LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
.addDef(PCReg);
MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
if (GAFlags == SIInstrInfo::MO_NONE)
MIB.addImm(0);
else
MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
if (PtrTy.getSizeInBits() == 32)
B.buildExtract(DstReg, PCReg, 0);
return true;
}
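// Lower G_GLOBAL_VALUE. LDS/region globals are generally resolved to a
// constant offset into the kernel's LDS allocation (with special handling
// for dynamic LDS and for uses in non-kernel functions), while other globals
// are materialized pc-relative, either directly or via a load from the GOT.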
bool AMDGPULegalizerInfo::legalizeGlobalValue(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
Register DstReg = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(DstReg);
unsigned AS = Ty.getAddressSpace();
const GlobalValue *GV = MI.getOperand(1).getGlobal();
MachineFunction &MF = B.getMF();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isModuleEntryFunction() &&
!GV->getName().equals("llvm.amdgcn.module.lds")) {
const Function &Fn = MF.getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
DS_Warning);
Fn.getContext().diagnose(BadLDSDecl);
// We currently don't have a way to correctly allocate LDS objects that
// aren't directly associated with a kernel. We do force inlining of
// functions that use local objects. However, if these dead functions are
// not eliminated, we don't want a compile time error. Just emit a warning
// and a trap, since there should be no callable path here.
B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
B.buildUndef(DstReg);
MI.eraseFromParent();
return true;
}
// TODO: We could emit code to handle the initialization somewhere.
// We ignore the initializer for now and legalize it to allow selection.
// The initializer will be reported as an error during assembly emission
// anyway.
const SITargetLowering *TLI = ST.getTargetLowering();
if (!TLI->shouldUseLDSConstAddress(GV)) {
MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
return true; // Leave in place.
}
if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
Type *Ty = GV->getValueType();
// HIP uses an unsized array `extern __shared__ T s[]` (or a similar
// zero-sized type in other languages) to declare dynamic shared memory
// whose size is not known at compile time. It is allocated by the runtime
// and placed directly after the statically allocated LDS, so all such
// declarations share the same offset.
if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
// Adjust alignment for that dynamic shared memory array.
MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
LLT S32 = LLT::scalar(32);
auto Sz =
B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
B.buildIntToPtr(DstReg, Sz);
MI.eraseFromParent();
return true;
}
}
B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
*cast<GlobalVariable>(GV)));
MI.eraseFromParent();
return true;
}
const SITargetLowering *TLI = ST.getTargetLowering();
if (TLI->shouldEmitFixup(GV)) {
buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
MI.eraseFromParent();
return true;
}
if (TLI->shouldEmitPCReloc(GV)) {
buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
MI.eraseFromParent();
return true;
}
LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
MachinePointerInfo::getGOT(MF),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
LoadTy, Align(8));
buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
if (Ty.getSizeInBits() == 32) {
// Truncate if this is a 32-bit constant address.
auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
B.buildExtract(DstReg, Load, 0);
} else
B.buildLoad(DstReg, GOTAddr, *GOTMMO);
MI.eraseFromParent();
return true;
}
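// Round a scalar type up to the next power-of-2 bit width, or a vector type
// up to the next power-of-2 element count.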
static LLT widenToNextPowerOf2(LLT Ty) {
if (Ty.isVector())
return Ty.changeElementCount(
ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
}
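// Custom legalization for loads: 32-bit constant address space pointers are
// cast to 64-bit constant pointers, and non-power-of-2 sized loads are
// widened to the next power-of-2 memory size when the alignment allows it,
// with the original value then truncated or extracted from the wide load.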
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
MachineInstr &MI) const {
MachineIRBuilder &B = Helper.MIRBuilder;
MachineRegisterInfo &MRI = *B.getMRI();
GISelChangeObserver &Observer = Helper.Observer;
Register PtrReg = MI.getOperand(1).getReg();
LLT PtrTy = MRI.getType(PtrReg);
unsigned AddrSpace = PtrTy.getAddressSpace();
if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
Observer.changingInstr(MI);
MI.getOperand(1).setReg(Cast.getReg(0));
Observer.changedInstr(MI);
return true;
}
if (MI.getOpcode() != AMDGPU::G_LOAD)
return false;
Register ValReg = MI.getOperand(0).getReg();
LLT ValTy = MRI.getType(ValReg);
MachineMemOperand *MMO = *MI.memoperands_begin();
const unsigned ValSize = ValTy.getSizeInBits();
const LLT MemTy = MMO->getMemoryType();
const Align MemAlign = MMO->getAlign();
const unsigned MemSize = MemTy.getSizeInBits();
const unsigned AlignInBits = 8 * MemAlign.value();
// Widen non-power-of-2 loads to the next power-of-2 size when the alignment
// allows it.
if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
const unsigned WideMemSize = PowerOf2Ceil(MemSize);
// This was already the correct extending load result type, so just adjust
// the memory type.
if (WideMemSize == ValSize) {
MachineFunction &MF = B.getMF();
MachineMemOperand *WideMMO =
MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
Observer.changingInstr(MI);
MI.setMemRefs(MF, {WideMMO});
Observer.changedInstr(MI);
return true;
}
// Don't bother handling edge case that should probably never be produced.
if (ValSize > WideMemSize)
return false;
LLT WideTy = widenToNextPowerOf2(ValTy);
Register WideLoad;
if (!WideTy.isVector()) {
WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
B.buildTrunc(ValReg, WideLoad).getReg(0);
} else {
// Extract the subvector.
if (isRegisterType(ValTy)) {
// If this a case where G_EXTRACT is legal, use it.
// (e.g. <3 x s32> -> <4 x s32>)
WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
B.buildExtract(ValReg, WideLoad, 0);
} else {
// For cases where the widened type isn't a nice register value, unmerge
// from a widened register (e.g. <3 x s16> -> <4 x s16>)
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
B.setInsertPt(B.getMBB(), MI.getIterator());
B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
}
}
MI.eraseFromParent();
return true;
}
return false;
}
bool AMDGPULegalizerInfo::legalizeFMad(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
assert(Ty.isScalar());