| //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| /// \file |
| /// This file implements the targeting of the MachineLegalizer class for |
| /// AMDGPU. |
| /// \todo This should be generated by TableGen. |
| //===----------------------------------------------------------------------===// |
| |
| #include "AMDGPULegalizerInfo.h" |
| |
| #include "AMDGPU.h" |
| #include "AMDGPUGlobalISelUtils.h" |
| #include "AMDGPUInstrInfo.h" |
| #include "AMDGPUTargetMachine.h" |
| #include "SIMachineFunctionInfo.h" |
| #include "Utils/AMDGPUBaseInfo.h" |
| #include "llvm/ADT/ScopeExit.h" |
| #include "llvm/BinaryFormat/ELF.h" |
| #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" |
| #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" |
| #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" |
| #include "llvm/IR/DiagnosticInfo.h" |
| #include "llvm/IR/IntrinsicsAMDGPU.h" |
| |
| #define DEBUG_TYPE "amdgpu-legalinfo" |
| |
| using namespace llvm; |
| using namespace LegalizeActions; |
| using namespace LegalizeMutations; |
| using namespace LegalityPredicates; |
| using namespace MIPatternMatch; |
| |
| // Hack until load/store selection patterns support any tuple of legal types. |
| static cl::opt<bool> EnableNewLegality( |
| "amdgpu-global-isel-new-legality", |
| cl::desc("Use GlobalISel desired legality, rather than try to use" |
| "rules compatible with selection patterns"), |
| cl::init(false), |
| cl::ReallyHidden); |
| |
| static constexpr unsigned MaxRegisterSize = 1024; |
| |
| // Round the number of vector elements up to the next power of two. |
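| // For example, <3 x s16> becomes <4 x s16> and <5 x s32> becomes <8 x s32>. |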
| static LLT getPow2VectorType(LLT Ty) { |
| unsigned NElts = Ty.getNumElements(); |
| unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); |
| return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts)); |
| } |
| |
| // Round the scalar size in bits up to the next power of two. |
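| // For example, s24 becomes s32 and s65 becomes s128. |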
| static LLT getPow2ScalarType(LLT Ty) { |
| unsigned Bits = Ty.getSizeInBits(); |
| unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); |
| return LLT::scalar(Pow2Bits); |
| } |
| |
| /// \returns true if this is an odd-sized vector which should be widened by |
| /// adding an additional element. This is mostly to handle <3 x s16> -> |
| /// <4 x s16>. This excludes s1 vectors, which should always be scalarized. |
| static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { |
| return [=](const LegalityQuery &Query) { |
| const LLT Ty = Query.Types[TypeIdx]; |
| if (!Ty.isVector()) |
| return false; |
| |
| const LLT EltTy = Ty.getElementType(); |
| const unsigned EltSize = EltTy.getSizeInBits(); |
| return Ty.getNumElements() % 2 != 0 && |
| EltSize > 1 && EltSize < 32 && |
| Ty.getSizeInBits() % 32 != 0; |
| }; |
| } |
| |
| static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { |
| return [=](const LegalityQuery &Query) { |
| const LLT Ty = Query.Types[TypeIdx]; |
| return Ty.getSizeInBits() % 32 == 0; |
| }; |
| } |
| |
| static LegalityPredicate isWideVec16(unsigned TypeIdx) { |
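|   // True for vectors of 16-bit elements with more than two elements. |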
| return [=](const LegalityQuery &Query) { |
| const LLT Ty = Query.Types[TypeIdx]; |
| const LLT EltTy = Ty.getScalarType(); |
| return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; |
| }; |
| } |
| |
| static LegalizeMutation oneMoreElement(unsigned TypeIdx) { |
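|   // Add one more element of the same type, e.g. <3 x s16> -> <4 x s16>. |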
| return [=](const LegalityQuery &Query) { |
| const LLT Ty = Query.Types[TypeIdx]; |
| const LLT EltTy = Ty.getElementType(); |
| return std::make_pair(TypeIdx, |
| LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); |
| }; |
| } |
| |
| static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { |
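|   // Reduce the element count so each resulting piece is roughly 64 bits |
|   // wide, e.g. <4 x s32> -> <2 x s32>. |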
| return [=](const LegalityQuery &Query) { |
| const LLT Ty = Query.Types[TypeIdx]; |
| const LLT EltTy = Ty.getElementType(); |
| unsigned Size = Ty.getSizeInBits(); |
| unsigned Pieces = (Size + 63) / 64; |
| unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; |
| return std::make_pair( |
| TypeIdx, |
| LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); |
| }; |
| } |
| |
| // Increase the number of vector elements so that the total size reaches the |
| // next multiple of 32 bits. |
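| // For example, <3 x s8> becomes <4 x s8> and <5 x s16> becomes <6 x s16>. |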
| static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { |
| return [=](const LegalityQuery &Query) { |
| const LLT Ty = Query.Types[TypeIdx]; |
| |
| const LLT EltTy = Ty.getElementType(); |
| const int Size = Ty.getSizeInBits(); |
| const int EltSize = EltTy.getSizeInBits(); |
| const int NextMul32 = (Size + 31) / 32; |
| |
| assert(EltSize < 32); |
| |
| const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; |
| return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); |
| }; |
| } |
| |
| static LLT getBitcastRegisterType(const LLT Ty) { |
| const unsigned Size = Ty.getSizeInBits(); |
| |
| if (Size <= 32) { |
| // <2 x s8> -> s16 |
| // <4 x s8> -> s32 |
| return LLT::scalar(Size); |
| } |
| |
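|   // e.g. <4 x s16> -> <2 x s32>, <8 x s8> -> <2 x s32> |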
| return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); |
| } |
| |
| static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { |
| return [=](const LegalityQuery &Query) { |
| const LLT Ty = Query.Types[TypeIdx]; |
| return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); |
| }; |
| } |
| |
| static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { |
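|   // Bitcast to a vector of 32-bit elements covering the same total size, |
|   // e.g. <6 x s16> -> <3 x s32>. |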
| return [=](const LegalityQuery &Query) { |
| const LLT Ty = Query.Types[TypeIdx]; |
| unsigned Size = Ty.getSizeInBits(); |
| assert(Size % 32 == 0); |
| return std::make_pair( |
| TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); |
| }; |
| } |
| |
| static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { |
| return [=](const LegalityQuery &Query) { |
| const LLT QueryTy = Query.Types[TypeIdx]; |
| return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; |
| }; |
| } |
| |
| static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { |
| return [=](const LegalityQuery &Query) { |
| const LLT QueryTy = Query.Types[TypeIdx]; |
| return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; |
| }; |
| } |
| |
| static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { |
| return [=](const LegalityQuery &Query) { |
| const LLT QueryTy = Query.Types[TypeIdx]; |
| return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; |
| }; |
| } |
| |
| static bool isRegisterSize(unsigned Size) { |
| return Size % 32 == 0 && Size <= MaxRegisterSize; |
| } |
| |
| static bool isRegisterVectorElementType(LLT EltTy) { |
| const int EltSize = EltTy.getSizeInBits(); |
| return EltSize == 16 || EltSize % 32 == 0; |
| } |
| |
| static bool isRegisterVectorType(LLT Ty) { |
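|   // 32, 64, 128 and 256-bit elements are always acceptable; 16-bit elements |
|   // only in even counts. |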
| const int EltSize = Ty.getElementType().getSizeInBits(); |
| return EltSize == 32 || EltSize == 64 || |
| (EltSize == 16 && Ty.getNumElements() % 2 == 0) || |
| EltSize == 128 || EltSize == 256; |
| } |
| |
| static bool isRegisterType(LLT Ty) { |
| if (!isRegisterSize(Ty.getSizeInBits())) |
| return false; |
| |
| if (Ty.isVector()) |
| return isRegisterVectorType(Ty); |
| |
| return true; |
| } |
| |
| // Any combination of 32 or 64-bit elements up to the maximum register size, |
| // and multiples of v2s16. |
| static LegalityPredicate isRegisterType(unsigned TypeIdx) { |
| return [=](const LegalityQuery &Query) { |
| return isRegisterType(Query.Types[TypeIdx]); |
| }; |
| } |
| |
| static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { |
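|   // True only for vectors whose element type is s16 or at least 32 bits wide. |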
| return [=](const LegalityQuery &Query) { |
| const LLT QueryTy = Query.Types[TypeIdx]; |
| if (!QueryTy.isVector()) |
| return false; |
| const LLT EltTy = QueryTy.getElementType(); |
| return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; |
| }; |
| } |
| |
| // If we have a truncating store or an extending load with a data size larger |
| // than 32-bits, we need to reduce to a 32-bit type. |
| static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { |
| return [=](const LegalityQuery &Query) { |
| const LLT Ty = Query.Types[TypeIdx]; |
| return !Ty.isVector() && Ty.getSizeInBits() > 32 && |
| Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); |
| }; |
| } |
| |
| // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we |
| // handle some operations by just promoting the register during |
| // selection. There are also d16 loads on GFX9+ which preserve the high bits. |
| static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, |
| bool IsLoad) { |
| switch (AS) { |
| case AMDGPUAS::PRIVATE_ADDRESS: |
| // FIXME: Private element size. |
| return ST.enableFlatScratch() ? 128 : 32; |
| case AMDGPUAS::LOCAL_ADDRESS: |
| return ST.useDS128() ? 128 : 64; |
| case AMDGPUAS::GLOBAL_ADDRESS: |
| case AMDGPUAS::CONSTANT_ADDRESS: |
| case AMDGPUAS::CONSTANT_ADDRESS_32BIT: |
|     // Treat constant and global as identical. SMRD loads are sometimes |
|     // usable for global loads (ideally constant address space should be |
|     // eliminated) depending on the context. Legality cannot be context |
|     // dependent, but RegBankSelect can split the load as necessary depending |
|     // on the pointer register bank/uniformity and if the memory is invariant |
|     // or not written in a kernel. |
| return IsLoad ? 512 : 128; |
| default: |
| // Flat addresses may contextually need to be split to 32-bit parts if they |
| // may alias scratch depending on the subtarget. |
| return 128; |
| } |
| } |
| |
| static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, |
| const LegalityQuery &Query) { |
| const LLT Ty = Query.Types[0]; |
| |
| // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD |
| const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; |
| |
| unsigned RegSize = Ty.getSizeInBits(); |
| unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); |
| unsigned AlignBits = Query.MMODescrs[0].AlignInBits; |
| unsigned AS = Query.Types[1].getAddressSpace(); |
| |
| // All of these need to be custom lowered to cast the pointer operand. |
| if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) |
| return false; |
| |
| // Do not handle extending vector loads. |
| if (Ty.isVector() && MemSize != RegSize) |
| return false; |
| |
| // TODO: We should be able to widen loads if the alignment is high enough, but |
| // we also need to modify the memory access size. |
| #if 0 |
| // Accept widening loads based on alignment. |
|   if (IsLoad && MemSize < RegSize) |
|     MemSize = std::max(MemSize, AlignBits); |
| #endif |
| |
| // Only 1-byte and 2-byte to 32-bit extloads are valid. |
| if (MemSize != RegSize && RegSize != 32) |
| return false; |
| |
| if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) |
| return false; |
| |
| switch (MemSize) { |
| case 8: |
| case 16: |
| case 32: |
| case 64: |
| case 128: |
| break; |
| case 96: |
| if (!ST.hasDwordx3LoadStores()) |
| return false; |
| break; |
| case 256: |
| case 512: |
| // These may contextually need to be broken down. |
| break; |
| default: |
| return false; |
| } |
| |
| assert(RegSize >= MemSize); |
| |
| if (AlignBits < MemSize) { |
| const SITargetLowering *TLI = ST.getTargetLowering(); |
| if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, |
| Align(AlignBits / 8))) |
| return false; |
| } |
| |
| return true; |
| } |
| |
| // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so |
| // work around this. Eventually it should ignore the type for loads and only |
| // care about the size. Return true in cases where we will work around this |
| // for now by bitcasting. |
| static bool loadStoreBitcastWorkaround(const LLT Ty) { |
| if (EnableNewLegality) |
| return false; |
| |
| const unsigned Size = Ty.getSizeInBits(); |
| if (Size <= 64) |
| return false; |
| if (!Ty.isVector()) |
| return true; |
| |
| LLT EltTy = Ty.getElementType(); |
| if (EltTy.isPointer()) |
| return true; |
| |
| unsigned EltSize = EltTy.getSizeInBits(); |
| return EltSize != 32 && EltSize != 64; |
| } |
| |
| static bool isLoadStoreLegal(const GCNSubtarget &ST, |
|                              const LegalityQuery &Query) { |
| const LLT Ty = Query.Types[0]; |
| return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && |
| !loadStoreBitcastWorkaround(Ty); |
| } |
| |
| /// Return true if a load or store of the type should be lowered with a bitcast |
| /// to a different type. |
| static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, |
| const LLT MemTy) { |
| const unsigned MemSizeInBits = MemTy.getSizeInBits(); |
| const unsigned Size = Ty.getSizeInBits(); |
| if (Size != MemSizeInBits) |
| return Size <= 32 && Ty.isVector(); |
| |
| if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) |
| return true; |
| |
| // Don't try to handle bitcasting vector ext loads for now. |
| return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && |
| (Size <= 32 || isRegisterSize(Size)) && |
| !isRegisterVectorElementType(Ty.getElementType()); |
| } |
| |
| /// Return true if we should legalize a load by widening an odd sized memory |
| /// access up to the alignment. Note this is the case where the memory access |
| /// itself changes, not the size of the result register. |
| static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, |
| unsigned AlignInBits, unsigned AddrSpace, |
| unsigned Opcode) { |
| unsigned SizeInBits = MemoryTy.getSizeInBits(); |
| // We don't want to widen cases that are naturally legal. |
| if (isPowerOf2_32(SizeInBits)) |
| return false; |
| |
| // If we have 96-bit memory operations, we shouldn't touch them. Note we may |
| // end up widening these for a scalar load during RegBankSelect, since there |
| // aren't 96-bit scalar loads. |
| if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) |
| return false; |
| |
| if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) |
| return false; |
| |
| // A load is known dereferenceable up to the alignment, so it's legal to widen |
| // to it. |
| // |
| // TODO: Could check dereferenceable for less aligned cases. |
| unsigned RoundedSize = NextPowerOf2(SizeInBits); |
| if (AlignInBits < RoundedSize) |
| return false; |
| |
| // Do not widen if it would introduce a slow unaligned load. |
| const SITargetLowering *TLI = ST.getTargetLowering(); |
| bool Fast = false; |
| return TLI->allowsMisalignedMemoryAccessesImpl( |
| RoundedSize, AddrSpace, Align(AlignInBits / 8), |
| MachineMemOperand::MOLoad, &Fast) && |
| Fast; |
| } |
| |
| static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, |
| unsigned Opcode) { |
| if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) |
| return false; |
| |
| return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, |
| Query.MMODescrs[0].AlignInBits, |
| Query.Types[1].getAddressSpace(), Opcode); |
| } |
| |
| AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, |
| const GCNTargetMachine &TM) |
| : ST(ST_) { |
| using namespace TargetOpcode; |
| |
| auto GetAddrSpacePtr = [&TM](unsigned AS) { |
| return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); |
| }; |
| |
| const LLT S1 = LLT::scalar(1); |
| const LLT S8 = LLT::scalar(8); |
| const LLT S16 = LLT::scalar(16); |
| const LLT S32 = LLT::scalar(32); |
| const LLT S64 = LLT::scalar(64); |
| const LLT S128 = LLT::scalar(128); |
| const LLT S256 = LLT::scalar(256); |
| const LLT S512 = LLT::scalar(512); |
| const LLT MaxScalar = LLT::scalar(MaxRegisterSize); |
| |
| const LLT V2S8 = LLT::fixed_vector(2, 8); |
| const LLT V2S16 = LLT::fixed_vector(2, 16); |
| const LLT V4S16 = LLT::fixed_vector(4, 16); |
| |
| const LLT V2S32 = LLT::fixed_vector(2, 32); |
| const LLT V3S32 = LLT::fixed_vector(3, 32); |
| const LLT V4S32 = LLT::fixed_vector(4, 32); |
| const LLT V5S32 = LLT::fixed_vector(5, 32); |
| const LLT V6S32 = LLT::fixed_vector(6, 32); |
| const LLT V7S32 = LLT::fixed_vector(7, 32); |
| const LLT V8S32 = LLT::fixed_vector(8, 32); |
| const LLT V9S32 = LLT::fixed_vector(9, 32); |
| const LLT V10S32 = LLT::fixed_vector(10, 32); |
| const LLT V11S32 = LLT::fixed_vector(11, 32); |
| const LLT V12S32 = LLT::fixed_vector(12, 32); |
| const LLT V13S32 = LLT::fixed_vector(13, 32); |
| const LLT V14S32 = LLT::fixed_vector(14, 32); |
| const LLT V15S32 = LLT::fixed_vector(15, 32); |
| const LLT V16S32 = LLT::fixed_vector(16, 32); |
| const LLT V32S32 = LLT::fixed_vector(32, 32); |
| |
| const LLT V2S64 = LLT::fixed_vector(2, 64); |
| const LLT V3S64 = LLT::fixed_vector(3, 64); |
| const LLT V4S64 = LLT::fixed_vector(4, 64); |
| const LLT V5S64 = LLT::fixed_vector(5, 64); |
| const LLT V6S64 = LLT::fixed_vector(6, 64); |
| const LLT V7S64 = LLT::fixed_vector(7, 64); |
| const LLT V8S64 = LLT::fixed_vector(8, 64); |
| const LLT V16S64 = LLT::fixed_vector(16, 64); |
| |
| std::initializer_list<LLT> AllS32Vectors = |
| {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, |
| V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; |
| std::initializer_list<LLT> AllS64Vectors = |
| {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; |
| |
| const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); |
| const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); |
| const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); |
| const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); |
| const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); |
| const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); |
| const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); |
| |
| const LLT CodePtr = FlatPtr; |
| |
| const std::initializer_list<LLT> AddrSpaces64 = { |
| GlobalPtr, ConstantPtr, FlatPtr |
| }; |
| |
| const std::initializer_list<LLT> AddrSpaces32 = { |
| LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr |
| }; |
| |
| const std::initializer_list<LLT> FPTypesBase = { |
| S32, S64 |
| }; |
| |
| const std::initializer_list<LLT> FPTypes16 = { |
| S32, S64, S16 |
| }; |
| |
| const std::initializer_list<LLT> FPTypesPK16 = { |
| S32, S64, S16, V2S16 |
| }; |
| |
| const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; |
| |
| // s1 for VCC branches, s32 for SCC branches. |
| getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); |
| |
| // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more |
| // elements for v3s16 |
| getActionDefinitionsBuilder(G_PHI) |
| .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) |
| .legalFor(AllS32Vectors) |
| .legalFor(AllS64Vectors) |
| .legalFor(AddrSpaces64) |
| .legalFor(AddrSpaces32) |
| .legalIf(isPointer(0)) |
| .clampScalar(0, S16, S256) |
| .widenScalarToNextPow2(0, 32) |
| .clampMaxNumElements(0, S32, 16) |
| .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
| .scalarize(0); |
| |
| if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { |
| // Full set of gfx9 features. |
| getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) |
| .legalFor({S32, S16, V2S16}) |
| .minScalar(0, S16) |
| .clampMaxNumElements(0, S16, 2) |
| .widenScalarToNextMultipleOf(0, 32) |
| .maxScalar(0, S32) |
| .scalarize(0); |
| |
| getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) |
| .legalFor({S32, S16, V2S16}) // Clamp modifier |
| .minScalarOrElt(0, S16) |
| .clampMaxNumElements(0, S16, 2) |
| .scalarize(0) |
| .widenScalarToNextPow2(0, 32) |
| .lower(); |
| } else if (ST.has16BitInsts()) { |
| getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) |
| .legalFor({S32, S16}) |
| .minScalar(0, S16) |
| .widenScalarToNextMultipleOf(0, 32) |
| .maxScalar(0, S32) |
| .scalarize(0); |
| |
| // Technically the saturating operations require clamp bit support, but this |
| // was introduced at the same time as 16-bit operations. |
| getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) |
| .legalFor({S32, S16}) // Clamp modifier |
| .minScalar(0, S16) |
| .scalarize(0) |
| .widenScalarToNextPow2(0, 16) |
| .lower(); |
| |
| // We're just lowering this, but it helps get a better result to try to |
| // coerce to the desired type first. |
| getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) |
| .minScalar(0, S16) |
| .scalarize(0) |
| .lower(); |
| } else { |
| getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) |
| .legalFor({S32}) |
| .widenScalarToNextMultipleOf(0, 32) |
| .clampScalar(0, S32, S32) |
| .scalarize(0); |
| |
| if (ST.hasIntClamp()) { |
| getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) |
| .legalFor({S32}) // Clamp modifier. |
| .scalarize(0) |
| .minScalarOrElt(0, S32) |
| .lower(); |
| } else { |
| // Clamp bit support was added in VI, along with 16-bit operations. |
| getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) |
| .minScalar(0, S32) |
| .scalarize(0) |
| .lower(); |
| } |
| |
| // FIXME: DAG expansion gets better results. The widening uses the smaller |
| // range values and goes for the min/max lowering directly. |
| getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) |
| .minScalar(0, S32) |
| .scalarize(0) |
| .lower(); |
| } |
| |
| getActionDefinitionsBuilder( |
| {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) |
| .customFor({S32, S64}) |
| .clampScalar(0, S32, S64) |
| .widenScalarToNextPow2(0, 32) |
| .scalarize(0); |
| |
| auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) |
| .legalFor({S32}) |
| .maxScalar(0, S32); |
| |
| if (ST.hasVOP3PInsts()) { |
| Mulh |
| .clampMaxNumElements(0, S8, 2) |
| .lowerFor({V2S8}); |
| } |
| |
| Mulh |
| .scalarize(0) |
| .lower(); |
| |
| // Report legal for any types we can handle anywhere. For the cases only legal |
| // on the SALU, RegBankSelect will be able to re-legalize. |
| getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) |
| .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) |
| .clampScalar(0, S32, S64) |
| .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
| .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) |
| .widenScalarToNextPow2(0) |
| .scalarize(0); |
| |
| getActionDefinitionsBuilder({G_UADDO, G_USUBO, |
| G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) |
| .legalFor({{S32, S1}, {S32, S32}}) |
| .minScalar(0, S32) |
| // TODO: .scalarize(0) |
| .lower(); |
| |
| getActionDefinitionsBuilder(G_BITCAST) |
| // Don't worry about the size constraint. |
| .legalIf(all(isRegisterType(0), isRegisterType(1))) |
| .lower(); |
| |
| |
| getActionDefinitionsBuilder(G_CONSTANT) |
| .legalFor({S1, S32, S64, S16, GlobalPtr, |
| LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) |
| .legalIf(isPointer(0)) |
| .clampScalar(0, S32, S64) |
| .widenScalarToNextPow2(0); |
| |
| getActionDefinitionsBuilder(G_FCONSTANT) |
| .legalFor({S32, S64, S16}) |
| .clampScalar(0, S16, S64); |
| |
| getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) |
| .legalIf(isRegisterType(0)) |
| // s1 and s16 are special cases because they have legal operations on |
| // them, but don't really occupy registers in the normal way. |
| .legalFor({S1, S16}) |
| .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
| .clampScalarOrElt(0, S32, MaxScalar) |
| .widenScalarToNextPow2(0, 32) |
| .clampMaxNumElements(0, S32, 16); |
| |
| getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); |
| |
| // If the amount is divergent, we have to do a wave reduction to get the |
| // maximum value, so this is expanded during RegBankSelect. |
| getActionDefinitionsBuilder(G_DYN_STACKALLOC) |
| .legalFor({{PrivatePtr, S32}}); |
| |
| getActionDefinitionsBuilder(G_GLOBAL_VALUE) |
| .customIf(typeIsNot(0, PrivatePtr)); |
| |
| getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); |
| |
| auto &FPOpActions = getActionDefinitionsBuilder( |
| { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) |
| .legalFor({S32, S64}); |
| auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) |
| .customFor({S32, S64}); |
| auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) |
| .customFor({S32, S64}); |
| |
| if (ST.has16BitInsts()) { |
| if (ST.hasVOP3PInsts()) |
| FPOpActions.legalFor({S16, V2S16}); |
| else |
| FPOpActions.legalFor({S16}); |
| |
| TrigActions.customFor({S16}); |
| FDIVActions.customFor({S16}); |
| } |
| |
| auto &MinNumMaxNum = getActionDefinitionsBuilder({ |
| G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); |
| |
| if (ST.hasVOP3PInsts()) { |
| MinNumMaxNum.customFor(FPTypesPK16) |
| .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
| .clampMaxNumElements(0, S16, 2) |
| .clampScalar(0, S16, S64) |
| .scalarize(0); |
| } else if (ST.has16BitInsts()) { |
| MinNumMaxNum.customFor(FPTypes16) |
| .clampScalar(0, S16, S64) |
| .scalarize(0); |
| } else { |
| MinNumMaxNum.customFor(FPTypesBase) |
| .clampScalar(0, S32, S64) |
| .scalarize(0); |
| } |
| |
| if (ST.hasVOP3PInsts()) |
| FPOpActions.clampMaxNumElements(0, S16, 2); |
| |
| FPOpActions |
| .scalarize(0) |
| .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); |
| |
| TrigActions |
| .scalarize(0) |
| .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); |
| |
| FDIVActions |
| .scalarize(0) |
| .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); |
| |
| getActionDefinitionsBuilder({G_FNEG, G_FABS}) |
| .legalFor(FPTypesPK16) |
| .clampMaxNumElements(0, S16, 2) |
| .scalarize(0) |
| .clampScalar(0, S16, S64); |
| |
| if (ST.has16BitInsts()) { |
| getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) |
| .legalFor({S32, S64, S16}) |
| .scalarize(0) |
| .clampScalar(0, S16, S64); |
| } else { |
| getActionDefinitionsBuilder(G_FSQRT) |
| .legalFor({S32, S64}) |
| .scalarize(0) |
| .clampScalar(0, S32, S64); |
| |
| if (ST.hasFractBug()) { |
| getActionDefinitionsBuilder(G_FFLOOR) |
| .customFor({S64}) |
| .legalFor({S32, S64}) |
| .scalarize(0) |
| .clampScalar(0, S32, S64); |
| } else { |
| getActionDefinitionsBuilder(G_FFLOOR) |
| .legalFor({S32, S64}) |
| .scalarize(0) |
| .clampScalar(0, S32, S64); |
| } |
| } |
| |
| getActionDefinitionsBuilder(G_FPTRUNC) |
| .legalFor({{S32, S64}, {S16, S32}}) |
| .scalarize(0) |
| .lower(); |
| |
| getActionDefinitionsBuilder(G_FPEXT) |
| .legalFor({{S64, S32}, {S32, S16}}) |
| .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) |
| .scalarize(0); |
| |
| getActionDefinitionsBuilder(G_FSUB) |
| // Use actual fsub instruction |
| .legalFor({S32}) |
| // Must use fadd + fneg |
| .lowerFor({S64, S16, V2S16}) |
| .scalarize(0) |
| .clampScalar(0, S32, S64); |
| |
| // Whether this is legal depends on the floating point mode for the function. |
| auto &FMad = getActionDefinitionsBuilder(G_FMAD); |
| if (ST.hasMadF16() && ST.hasMadMacF32Insts()) |
| FMad.customFor({S32, S16}); |
| else if (ST.hasMadMacF32Insts()) |
| FMad.customFor({S32}); |
| else if (ST.hasMadF16()) |
| FMad.customFor({S16}); |
| FMad.scalarize(0) |
| .lower(); |
| |
| auto &FRem = getActionDefinitionsBuilder(G_FREM); |
| if (ST.has16BitInsts()) { |
| FRem.customFor({S16, S32, S64}); |
| } else { |
| FRem.minScalar(0, S32) |
| .customFor({S32, S64}); |
| } |
| FRem.scalarize(0); |
| |
| // TODO: Do we need to clamp maximum bitwidth? |
| getActionDefinitionsBuilder(G_TRUNC) |
| .legalIf(isScalar(0)) |
| .legalFor({{V2S16, V2S32}}) |
| .clampMaxNumElements(0, S16, 2) |
| // Avoid scalarizing in cases that should be truly illegal. In unresolvable |
| // situations (like an invalid implicit use), we don't want to infinite loop |
| // in the legalizer. |
| .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) |
| .alwaysLegal(); |
| |
| getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) |
| .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, |
| {S32, S1}, {S64, S1}, {S16, S1}}) |
| .scalarize(0) |
| .clampScalar(0, S32, S64) |
| .widenScalarToNextPow2(1, 32); |
| |
| // TODO: Split s1->s64 during regbankselect for VALU. |
| auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) |
| .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) |
| .lowerIf(typeIs(1, S1)) |
| .customFor({{S32, S64}, {S64, S64}}); |
| if (ST.has16BitInsts()) |
| IToFP.legalFor({{S16, S16}}); |
| IToFP.clampScalar(1, S32, S64) |
| .minScalar(0, S32) |
| .scalarize(0) |
| .widenScalarToNextPow2(1); |
| |
| auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) |
| .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) |
| .customFor({{S64, S32}, {S64, S64}}) |
| .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); |
| if (ST.has16BitInsts()) |
| FPToI.legalFor({{S16, S16}}); |
| else |
| FPToI.minScalar(1, S32); |
| |
| FPToI.minScalar(0, S32) |
| .widenScalarToNextPow2(0, 32) |
| .scalarize(0) |
| .lower(); |
| |
| // Lower roundeven into G_FRINT |
| getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) |
| .scalarize(0) |
| .lower(); |
| |
| if (ST.has16BitInsts()) { |
| getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) |
| .legalFor({S16, S32, S64}) |
| .clampScalar(0, S16, S64) |
| .scalarize(0); |
| } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { |
| getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) |
| .legalFor({S32, S64}) |
| .clampScalar(0, S32, S64) |
| .scalarize(0); |
| } else { |
| getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) |
| .legalFor({S32}) |
| .customFor({S64}) |
| .clampScalar(0, S32, S64) |
| .scalarize(0); |
| } |
| |
| getActionDefinitionsBuilder(G_PTR_ADD) |
| .legalIf(all(isPointer(0), sameSize(0, 1))) |
| .scalarize(0) |
| .scalarSameSizeAs(1, 0); |
| |
| getActionDefinitionsBuilder(G_PTRMASK) |
| .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) |
| .scalarSameSizeAs(1, 0) |
| .scalarize(0); |
| |
| auto &CmpBuilder = |
| getActionDefinitionsBuilder(G_ICMP) |
| // The compare output type differs based on the register bank of the output, |
| // so make both s1 and s32 legal. |
| // |
| // Scalar compares producing output in scc will be promoted to s32, as that |
| // is the allocatable register type that will be needed for the copy from |
| // scc. This will be promoted during RegBankSelect, and we assume something |
| // before that won't try to use s32 result types. |
| // |
| // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg |
| // bank. |
| .legalForCartesianProduct( |
| {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) |
| .legalForCartesianProduct( |
| {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); |
| if (ST.has16BitInsts()) { |
| CmpBuilder.legalFor({{S1, S16}}); |
| } |
| |
| CmpBuilder |
| .widenScalarToNextPow2(1) |
| .clampScalar(1, S32, S64) |
| .scalarize(0) |
| .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); |
| |
| getActionDefinitionsBuilder(G_FCMP) |
| .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) |
| .widenScalarToNextPow2(1) |
| .clampScalar(1, S32, S64) |
| .scalarize(0); |
| |
| // FIXME: fpow has a selection pattern that should move to custom lowering. |
| auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); |
| if (ST.has16BitInsts()) |
| Exp2Ops.legalFor({S32, S16}); |
| else |
| Exp2Ops.legalFor({S32}); |
| Exp2Ops.clampScalar(0, MinScalarFPTy, S32); |
| Exp2Ops.scalarize(0); |
| |
| auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); |
| if (ST.has16BitInsts()) |
| ExpOps.customFor({{S32}, {S16}}); |
| else |
| ExpOps.customFor({S32}); |
| ExpOps.clampScalar(0, MinScalarFPTy, S32) |
| .scalarize(0); |
| |
| getActionDefinitionsBuilder(G_FPOWI) |
| .clampScalar(0, MinScalarFPTy, S32) |
| .lower(); |
| |
| // The 64-bit versions produce 32-bit results, but only on the SALU. |
| getActionDefinitionsBuilder(G_CTPOP) |
| .legalFor({{S32, S32}, {S32, S64}}) |
| .clampScalar(0, S32, S32) |
| .clampScalar(1, S32, S64) |
| .scalarize(0) |
| .widenScalarToNextPow2(0, 32) |
| .widenScalarToNextPow2(1, 32); |
| |
| // The hardware instructions return a different result on 0 than the generic |
| // instructions expect. The hardware produces -1, but these produce the |
| // bitwidth. |
| getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) |
| .scalarize(0) |
| .clampScalar(0, S32, S32) |
| .clampScalar(1, S32, S64) |
| .widenScalarToNextPow2(0, 32) |
| .widenScalarToNextPow2(1, 32) |
| .custom(); |
| |
| // The 64-bit versions produce 32-bit results, but only on the SALU. |
| getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) |
| .legalFor({{S32, S32}, {S32, S64}}) |
| .clampScalar(0, S32, S32) |
| .clampScalar(1, S32, S64) |
| .scalarize(0) |
| .widenScalarToNextPow2(0, 32) |
| .widenScalarToNextPow2(1, 32); |
| |
| // S64 is only legal on SALU, and needs to be broken into 32-bit elements in |
| // RegBankSelect. |
| getActionDefinitionsBuilder(G_BITREVERSE) |
| .legalFor({S32, S64}) |
| .clampScalar(0, S32, S64) |
| .scalarize(0) |
| .widenScalarToNextPow2(0); |
| |
| if (ST.has16BitInsts()) { |
| getActionDefinitionsBuilder(G_BSWAP) |
| .legalFor({S16, S32, V2S16}) |
| .clampMaxNumElements(0, S16, 2) |
|       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a |
|       // narrowScalar limitation. |
| .widenScalarToNextPow2(0) |
| .clampScalar(0, S16, S32) |
| .scalarize(0); |
| |
| if (ST.hasVOP3PInsts()) { |
| getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) |
| .legalFor({S32, S16, V2S16}) |
| .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
| .clampMaxNumElements(0, S16, 2) |
| .minScalar(0, S16) |
| .widenScalarToNextPow2(0) |
| .scalarize(0) |
| .lower(); |
| } else { |
| getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) |
| .legalFor({S32, S16}) |
| .widenScalarToNextPow2(0) |
| .minScalar(0, S16) |
| .scalarize(0) |
| .lower(); |
| } |
| } else { |
| // TODO: Should have same legality without v_perm_b32 |
| getActionDefinitionsBuilder(G_BSWAP) |
| .legalFor({S32}) |
| .lowerIf(scalarNarrowerThan(0, 32)) |
|       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a |
|       // narrowScalar limitation. |
| .widenScalarToNextPow2(0) |
| .maxScalar(0, S32) |
| .scalarize(0) |
| .lower(); |
| |
| getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) |
| .legalFor({S32}) |
| .minScalar(0, S32) |
| .widenScalarToNextPow2(0) |
| .scalarize(0) |
| .lower(); |
| } |
| |
| getActionDefinitionsBuilder(G_INTTOPTR) |
| // List the common cases |
| .legalForCartesianProduct(AddrSpaces64, {S64}) |
| .legalForCartesianProduct(AddrSpaces32, {S32}) |
| .scalarize(0) |
| // Accept any address space as long as the size matches |
| .legalIf(sameSize(0, 1)) |
| .widenScalarIf(smallerThan(1, 0), |
| [](const LegalityQuery &Query) { |
| return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); |
| }) |
| .narrowScalarIf(largerThan(1, 0), |
| [](const LegalityQuery &Query) { |
| return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); |
| }); |
| |
| getActionDefinitionsBuilder(G_PTRTOINT) |
| // List the common cases |
| .legalForCartesianProduct(AddrSpaces64, {S64}) |
| .legalForCartesianProduct(AddrSpaces32, {S32}) |
| .scalarize(0) |
| // Accept any address space as long as the size matches |
| .legalIf(sameSize(0, 1)) |
| .widenScalarIf(smallerThan(0, 1), |
| [](const LegalityQuery &Query) { |
| return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); |
| }) |
| .narrowScalarIf( |
| largerThan(0, 1), |
| [](const LegalityQuery &Query) { |
| return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); |
| }); |
| |
| getActionDefinitionsBuilder(G_ADDRSPACE_CAST) |
| .scalarize(0) |
| .custom(); |
| |
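|   // Decide whether a memory access must be split: vector extloads, accesses |
|   // wider than the address space allows, register counts we cannot select, |
|   // and under-aligned accesses. |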
| const auto needToSplitMemOp = [=](const LegalityQuery &Query, |
| bool IsLoad) -> bool { |
| const LLT DstTy = Query.Types[0]; |
| |
| // Split vector extloads. |
| unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); |
| unsigned AlignBits = Query.MMODescrs[0].AlignInBits; |
| |
| if (MemSize < DstTy.getSizeInBits()) |
| MemSize = std::max(MemSize, AlignBits); |
| |
| if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) |
| return true; |
| |
| const LLT PtrTy = Query.Types[1]; |
| unsigned AS = PtrTy.getAddressSpace(); |
| if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) |
| return true; |
| |
| // Catch weird sized loads that don't evenly divide into the access sizes |
| // TODO: May be able to widen depending on alignment etc. |
| unsigned NumRegs = (MemSize + 31) / 32; |
| if (NumRegs == 3) { |
| if (!ST.hasDwordx3LoadStores()) |
| return true; |
| } else { |
| // If the alignment allows, these should have been widened. |
| if (!isPowerOf2_32(NumRegs)) |
| return true; |
| } |
| |
| if (AlignBits < MemSize) { |
| const SITargetLowering *TLI = ST.getTargetLowering(); |
| return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, |
| Align(AlignBits / 8)); |
| } |
| |
| return false; |
| }; |
| |
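|   // Minimum alignment in bits required for the explicitly listed global and |
|   // constant cases below; 0 means the case is legal regardless of alignment. |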
| unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; |
| unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; |
| unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; |
| |
| // TODO: Refine based on subtargets which support unaligned access or 128-bit |
| // LDS |
| // TODO: Unsupported flat for SI. |
| |
| for (unsigned Op : {G_LOAD, G_STORE}) { |
| const bool IsStore = Op == G_STORE; |
| |
| auto &Actions = getActionDefinitionsBuilder(Op); |
| // Explicitly list some common cases. |
| // TODO: Does this help compile time at all? |
| Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, |
| {V2S32, GlobalPtr, V2S32, GlobalAlign32}, |
| {V4S32, GlobalPtr, V4S32, GlobalAlign32}, |
| {S64, GlobalPtr, S64, GlobalAlign32}, |
| {V2S64, GlobalPtr, V2S64, GlobalAlign32}, |
| {V2S16, GlobalPtr, V2S16, GlobalAlign32}, |
| {S32, GlobalPtr, S8, GlobalAlign8}, |
| {S32, GlobalPtr, S16, GlobalAlign16}, |
| |
| {S32, LocalPtr, S32, 32}, |
| {S64, LocalPtr, S64, 32}, |
| {V2S32, LocalPtr, V2S32, 32}, |
| {S32, LocalPtr, S8, 8}, |
| {S32, LocalPtr, S16, 16}, |
| {V2S16, LocalPtr, S32, 32}, |
| |
| {S32, PrivatePtr, S32, 32}, |
| {S32, PrivatePtr, S8, 8}, |
| {S32, PrivatePtr, S16, 16}, |
| {V2S16, PrivatePtr, S32, 32}, |
| |
| {S32, ConstantPtr, S32, GlobalAlign32}, |
| {V2S32, ConstantPtr, V2S32, GlobalAlign32}, |
| {V4S32, ConstantPtr, V4S32, GlobalAlign32}, |
| {S64, ConstantPtr, S64, GlobalAlign32}, |
| {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); |
| Actions.legalIf( |
| [=](const LegalityQuery &Query) -> bool { |
| return isLoadStoreLegal(ST, Query); |
| }); |
| |
| // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to |
| // 64-bits. |
| // |
| // TODO: Should generalize bitcast action into coerce, which will also cover |
| // inserting addrspacecasts. |
| Actions.customIf(typeIs(1, Constant32Ptr)); |
| |
| // Turn any illegal element vectors into something easier to deal |
| // with. These will ultimately produce 32-bit scalar shifts to extract the |
| // parts anyway. |
| // |
| // For odd 16-bit element vectors, prefer to split those into pieces with |
| // 16-bit vector parts. |
| Actions.bitcastIf( |
| [=](const LegalityQuery &Query) -> bool { |
| return shouldBitcastLoadStoreType(ST, Query.Types[0], |
| Query.MMODescrs[0].MemoryTy); |
| }, bitcastToRegisterType(0)); |
| |
| if (!IsStore) { |
| // Widen suitably aligned loads by loading extra bytes. The standard |
| // legalization actions can't properly express widening memory operands. |
| Actions.customIf([=](const LegalityQuery &Query) -> bool { |
| return shouldWidenLoad(ST, Query, G_LOAD); |
| }); |
| } |
| |
| // FIXME: load/store narrowing should be moved to lower action |
| Actions |
| .narrowScalarIf( |
| [=](const LegalityQuery &Query) -> bool { |
| return !Query.Types[0].isVector() && |
| needToSplitMemOp(Query, Op == G_LOAD); |
| }, |
| [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { |
| const LLT DstTy = Query.Types[0]; |
| const LLT PtrTy = Query.Types[1]; |
| |
| const unsigned DstSize = DstTy.getSizeInBits(); |
| unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); |
| |
| // Split extloads. |
| if (DstSize > MemSize) |
| return std::make_pair(0, LLT::scalar(MemSize)); |
| |
| if (!isPowerOf2_32(DstSize)) { |
| // We're probably decomposing an odd sized store. Try to split |
| // to the widest type. TODO: Account for alignment. As-is it |
| // should be OK, since the new parts will be further legalized. |
| unsigned FloorSize = PowerOf2Floor(DstSize); |
| return std::make_pair(0, LLT::scalar(FloorSize)); |
| } |
| |
| if (DstSize > 32 && (DstSize % 32 != 0)) { |
| // FIXME: Need a way to specify non-extload of larger size if |
| // suitably aligned. |
| return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); |
| } |
| |
| unsigned MaxSize = maxSizeForAddrSpace(ST, |
| PtrTy.getAddressSpace(), |
| Op == G_LOAD); |
| if (MemSize > MaxSize) |
| return std::make_pair(0, LLT::scalar(MaxSize)); |
| |
| unsigned Align = Query.MMODescrs[0].AlignInBits; |
| return std::make_pair(0, LLT::scalar(Align)); |
| }) |
| .fewerElementsIf( |
| [=](const LegalityQuery &Query) -> bool { |
| return Query.Types[0].isVector() && |
| needToSplitMemOp(Query, Op == G_LOAD); |
| }, |
| [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { |
| const LLT DstTy = Query.Types[0]; |
| const LLT PtrTy = Query.Types[1]; |
| |
| LLT EltTy = DstTy.getElementType(); |
| unsigned MaxSize = maxSizeForAddrSpace(ST, |
| PtrTy.getAddressSpace(), |
| Op == G_LOAD); |
| |
| // FIXME: Handle widened to power of 2 results better. This ends |
| // up scalarizing. |
| // FIXME: 3 element stores scalarized on SI |
| |
| // Split if it's too large for the address space. |
| unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); |
| if (MemSize > MaxSize) { |
| unsigned NumElts = DstTy.getNumElements(); |
| unsigned EltSize = EltTy.getSizeInBits(); |
| |
| if (MaxSize % EltSize == 0) { |
| return std::make_pair( |
| 0, LLT::scalarOrVector( |
| ElementCount::getFixed(MaxSize / EltSize), EltTy)); |
| } |
| |
| unsigned NumPieces = MemSize / MaxSize; |
| |
| // FIXME: Refine when odd breakdowns handled |
| // The scalars will need to be re-legalized. |
| if (NumPieces == 1 || NumPieces >= NumElts || |
| NumElts % NumPieces != 0) |
| return std::make_pair(0, EltTy); |
| |
| return std::make_pair( |
| 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); |
| } |
| |
| // FIXME: We could probably handle weird extending loads better. |
| if (DstTy.getSizeInBits() > MemSize) |
| return std::make_pair(0, EltTy); |
| |
| unsigned EltSize = EltTy.getSizeInBits(); |
| unsigned DstSize = DstTy.getSizeInBits(); |
| if (!isPowerOf2_32(DstSize)) { |
| // We're probably decomposing an odd sized store. Try to split |
| // to the widest type. TODO: Account for alignment. As-is it |
| // should be OK, since the new parts will be further legalized. |
| unsigned FloorSize = PowerOf2Floor(DstSize); |
| return std::make_pair( |
| 0, LLT::scalarOrVector( |
| ElementCount::getFixed(FloorSize / EltSize), EltTy)); |
| } |
| |
| // Need to split because of alignment. |
| unsigned Align = Query.MMODescrs[0].AlignInBits; |
| if (EltSize > Align && |
| (EltSize / Align < DstTy.getNumElements())) { |
| return std::make_pair( |
| 0, LLT::fixed_vector(EltSize / Align, EltTy)); |
| } |
| |
| // May need relegalization for the scalars. |
| return std::make_pair(0, EltTy); |
| }) |
| .minScalar(0, S32) |
| .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) |
| .widenScalarToNextPow2(0) |
| .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) |
| .lower(); |
| } |
| |
| // FIXME: Unaligned accesses not lowered. |
| auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) |
| .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, |
| {S32, GlobalPtr, S16, 2 * 8}, |
| {S32, LocalPtr, S8, 8}, |
| {S32, LocalPtr, S16, 16}, |
| {S32, PrivatePtr, S8, 8}, |
| {S32, PrivatePtr, S16, 16}, |
| {S32, ConstantPtr, S8, 8}, |
| {S32, ConstantPtr, S16, 2 * 8}}) |
| .legalIf( |
| [=](const LegalityQuery &Query) -> bool { |
| return isLoadStoreLegal(ST, Query); |
| }); |
| |
| if (ST.hasFlatAddressSpace()) { |
| ExtLoads.legalForTypesWithMemDesc( |
| {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); |
| } |
| |
| // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to |
| // 64-bits. |
| // |
| // TODO: Should generalize bitcast action into coerce, which will also cover |
| // inserting addrspacecasts. |
| ExtLoads.customIf(typeIs(1, Constant32Ptr)); |
| |
| ExtLoads.clampScalar(0, S32, S32) |
| .widenScalarToNextPow2(0) |
| .lower(); |
| |
| auto &Atomics = getActionDefinitionsBuilder( |
| {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, |
| G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, |
| G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, |
| G_ATOMICRMW_UMIN}) |
| .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, |
| {S64, GlobalPtr}, {S64, LocalPtr}, |
| {S32, RegionPtr}, {S64, RegionPtr}}); |
| if (ST.hasFlatAddressSpace()) { |
| Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); |
| } |
| |
| auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); |
| if (ST.hasLDSFPAtomicAdd()) { |
| Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); |
| if (ST.hasGFX90AInsts()) |
| Atomic.legalFor({{S64, LocalPtr}}); |
| } |
| if (ST.hasAtomicFaddInsts()) |
| Atomic.legalFor({{S32, GlobalPtr}}); |
| |
|   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and |
|   // output demarshalling. |
| getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) |
| .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, |
| {S32, FlatPtr}, {S64, FlatPtr}}) |
| .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, |
| {S32, RegionPtr}, {S64, RegionPtr}}); |
| // TODO: Pointer types, any 32-bit or 64-bit vector |
| |
| // Condition should be s32 for scalar, s1 for vector. |
| getActionDefinitionsBuilder(G_SELECT) |
| .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, |
| LocalPtr, FlatPtr, PrivatePtr, |
| LLT::fixed_vector(2, LocalPtr), |
| LLT::fixed_vector(2, PrivatePtr)}, |
| {S1, S32}) |
| .clampScalar(0, S16, S64) |
| .scalarize(1) |
| .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
| .fewerElementsIf(numElementsNotEven(0), scalarize(0)) |
| .clampMaxNumElements(0, S32, 2) |
| .clampMaxNumElements(0, LocalPtr, 2) |
| .clampMaxNumElements(0, PrivatePtr, 2) |
| .scalarize(0) |
| .widenScalarToNextPow2(0) |
| .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); |
| |
| // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can |
| // be more flexible with the shift amount type. |
| auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) |
| .legalFor({{S32, S32}, {S64, S32}}); |
| if (ST.has16BitInsts()) { |
| if (ST.hasVOP3PInsts()) { |
| Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) |
| .clampMaxNumElements(0, S16, 2); |
| } else |
| Shifts.legalFor({{S16, S16}}); |
| |
| // TODO: Support 16-bit shift amounts for all types |
| Shifts.widenScalarIf( |
| [=](const LegalityQuery &Query) { |
| // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a |
| // 32-bit amount. |
| const LLT ValTy = Query.Types[0]; |
| const LLT AmountTy = Query.Types[1]; |
| return ValTy.getSizeInBits() <= 16 && |
| AmountTy.getSizeInBits() < 16; |
| }, changeTo(1, S16)); |
| Shifts.maxScalarIf(typeIs(0, S16), 1, S16); |
| Shifts.clampScalar(1, S32, S32); |
| Shifts.clampScalar(0, S16, S64); |
| Shifts.widenScalarToNextPow2(0, 16); |
| |
| getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) |
| .minScalar(0, S16) |
| .scalarize(0) |
| .lower(); |
| } else { |
| // Make sure we legalize the shift amount type first, as the general |
| // expansion for the shifted type will produce much worse code if it hasn't |
| // been truncated already. |
| Shifts.clampScalar(1, S32, S32); |
| Shifts.clampScalar(0, S32, S64); |
| Shifts.widenScalarToNextPow2(0, 32); |
| |
| getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) |
| .minScalar(0, S32) |
| .scalarize(0) |
| .lower(); |
| } |
| Shifts.scalarize(0); |
| |
| for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { |
| unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; |
| unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; |
| unsigned IdxTypeIdx = 2; |
| |
| getActionDefinitionsBuilder(Op) |
| .customIf([=](const LegalityQuery &Query) { |
| const LLT EltTy = Query.Types[EltTypeIdx]; |
| const LLT VecTy = Query.Types[VecTypeIdx]; |
| const LLT IdxTy = Query.Types[IdxTypeIdx]; |
| const unsigned EltSize = EltTy.getSizeInBits(); |
| return (EltSize == 32 || EltSize == 64) && |
| VecTy.getSizeInBits() % 32 == 0 && |
| VecTy.getSizeInBits() <= MaxRegisterSize && |
| IdxTy.getSizeInBits() == 32; |
| }) |
| .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), |
| bitcastToVectorElement32(VecTypeIdx)) |
| //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) |
| .bitcastIf( |
| all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), |
| [=](const LegalityQuery &Query) { |
| // For > 64-bit element types, try to turn this into a 64-bit |
| // element vector since we may be able to do better indexing |
| // if this is scalar. If not, fall back to 32. |
| const LLT EltTy = Query.Types[EltTypeIdx]; |
| const LLT VecTy = Query.Types[VecTypeIdx]; |
| const unsigned DstEltSize = EltTy.getSizeInBits(); |
| const unsigned VecSize = VecTy.getSizeInBits(); |
| |
| const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; |
| return std::make_pair( |
| VecTypeIdx, |
| LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); |
| }) |
| .clampScalar(EltTypeIdx, S32, S64) |
| .clampScalar(VecTypeIdx, S32, S64) |
| .clampScalar(IdxTypeIdx, S32, S32) |
| .clampMaxNumElements(VecTypeIdx, S32, 32) |
| // TODO: Clamp elements for 64-bit vectors? |
| // It should only be necessary with variable indexes. |
| // As a last resort, lower to the stack |
| .lower(); |
| } |
| |
| getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) |
| .unsupportedIf([=](const LegalityQuery &Query) { |
| const LLT &EltTy = Query.Types[1].getElementType(); |
| return Query.Types[0] != EltTy; |
| }); |
| |
| for (unsigned Op : {G_EXTRACT, G_INSERT}) { |
| unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; |
| unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; |
| |
| // FIXME: Doesn't handle extract of illegal sizes. |
| getActionDefinitionsBuilder(Op) |
| .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) |
| // FIXME: Multiples of 16 should not be legal. |
| .legalIf([=](const LegalityQuery &Query) { |
| const LLT BigTy = Query.Types[BigTyIdx]; |
| const LLT LitTy = Query.Types[LitTyIdx]; |
| return (BigTy.getSizeInBits() % 32 == 0) && |
| (LitTy.getSizeInBits() % 16 == 0); |
| }) |
| .widenScalarIf( |
| [=](const LegalityQuery &Query) { |
| const LLT BigTy = Query.Types[BigTyIdx]; |
| return (BigTy.getScalarSizeInBits() < 16); |
| }, |
| LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) |
| .widenScalarIf( |
| [=](const LegalityQuery &Query) { |
| const LLT LitTy = Query.Types[LitTyIdx]; |
| return (LitTy.getScalarSizeInBits() < 16); |
| }, |
| LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) |
| .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) |
| .widenScalarToNextPow2(BigTyIdx, 32); |
| |
| } |
| |
| auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) |
| .legalForCartesianProduct(AllS32Vectors, {S32}) |
| .legalForCartesianProduct(AllS64Vectors, {S64}) |
| .clampNumElements(0, V16S32, V32S32) |
| .clampNumElements(0, V2S64, V16S64) |
| .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); |
| |
| if (ST.hasScalarPackInsts()) { |
| BuildVector |
| // FIXME: Should probably widen s1 vectors straight to s32 |
| .minScalarOrElt(0, S16) |
| // Widen source elements and produce a G_BUILD_VECTOR_TRUNC |
| .minScalar(1, S32); |
| |
| getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) |
| .legalFor({V2S16, S32}) |
| .lower(); |
| BuildVector.minScalarOrElt(0, S32); |
| } else { |
| BuildVector.customFor({V2S16, S16}); |
| BuildVector.minScalarOrElt(0, S32); |
| |
| getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) |
| .customFor({V2S16, S32}) |
| .lower(); |
| } |
| |
| BuildVector.legalIf(isRegisterType(0)); |
| |
| // FIXME: Clamp maximum size |
| getActionDefinitionsBuilder(G_CONCAT_VECTORS) |
| .legalIf(all(isRegisterType(0), isRegisterType(1))) |
| .clampMaxNumElements(0, S32, 32) |
| .clampMaxNumElements(1, S16, 2) // TODO: Make 4? |
| .clampMaxNumElements(0, S16, 64); |
| |
|   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those |
|   // pre-legalize. |
| if (ST.hasVOP3PInsts()) { |
| getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) |
| .customFor({V2S16, V2S16}) |
| .lower(); |
| } else |
| getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); |
| |
| // Merge/Unmerge |
| for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { |
| unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; |
| unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; |
| |
| auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { |
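|       // Vector elements handled directly here must have a power-of-2 size |
|       // between 8 and 512 bits; vectors with other elements are scalarized. |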
| const LLT Ty = Query.Types[TypeIdx]; |
| if (Ty.isVector()) { |
| const LLT &EltTy = Ty.getElementType(); |
| if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) |
| return true; |
| if (!isPowerOf2_32(EltTy.getSizeInBits())) |
| return true; |
| } |
| return false; |
| }; |
| |
| auto &Builder = getActionDefinitionsBuilder(Op) |
| .legalIf(all(isRegisterType(0), isRegisterType(1))) |
| .lowerFor({{S16, V2S16}}) |
| .lowerIf([=](const LegalityQuery &Query) { |
| const LLT BigTy = Query.Types[BigTyIdx]; |
| return BigTy.getSizeInBits() == 32; |
| }) |
| // Try to widen to s16 first for small types. |
| // TODO: Only do this on targets with legal s16 shifts |
| .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) |
| .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) |
| .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) |
| .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), |
| elementTypeIs(1, S16)), |
| changeTo(1, V2S16)) |
|       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not |
| // worth considering the multiples of 64 since 2*192 and 2*384 are not |
| // valid. |
| .clampScalar(LitTyIdx, S32, S512) |
| .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) |
| // Break up vectors with weird elements into scalars |
| .fewerElementsIf( |
| [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, |
| scalarize(0)) |
| .fewerElementsIf( |
| [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, |
| scalarize(1)) |
| .clampScalar(BigTyIdx, S32, MaxScalar); |
| |
| if (Op == G_MERGE_VALUES) { |
| Builder.widenScalarIf( |
| // TODO: Use 16-bit shifts if legal for 8-bit values? |
| [=](const LegalityQuery &Query) { |
| const LLT Ty = Query.Types[LitTyIdx]; |
| return Ty.getSizeInBits() < 32; |
| }, |
| changeTo(LitTyIdx, S32)); |
| } |
| |
| Builder.widenScalarIf( |
| [=](const LegalityQuery &Query) { |
| const LLT Ty = Query.Types[BigTyIdx]; |
| return !isPowerOf2_32(Ty.getSizeInBits()) && |
| Ty.getSizeInBits() % 16 != 0; |
| }, |
| [=](const LegalityQuery &Query) { |
|         // Pick the next power of 2, or a multiple of 64 for sizes above 128, |
|         // whichever is smaller. |
| const LLT &Ty = Query.Types[BigTyIdx]; |
| unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); |
| if (NewSizeInBits >= 256) { |
| unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); |
| if (RoundedTo < NewSizeInBits) |
| NewSizeInBits = RoundedTo; |
| } |
| return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); |
| }) |
| // Any vectors left are the wrong size. Scalarize them. |
| .scalarize(0) |
| .scalarize(1); |
| } |
| |
| // S64 is only legal on SALU, and needs to be broken into 32-bit elements in |
| // RegBankSelect. |
| auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) |
| .legalFor({{S32}, {S64}}); |
| |
| if (ST.hasVOP3PInsts()) { |
| SextInReg.lowerFor({{V2S16}}) |
| // Prefer to reduce vector widths for 16-bit vectors before lowering, to |
| // get more vector shift opportunities, since we'll get those when |
| // expanded. |
| .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); |
| } else if (ST.has16BitInsts()) { |
| SextInReg.lowerFor({{S32}, {S64}, {S16}}); |
| } else { |
| // Prefer to promote to s32 before lowering if we don't have 16-bit |
|     // shifts. This avoids a lot of intermediate truncate and extend operations. |
| SextInReg.lowerFor({{S32}, {S64}}); |
| } |
| |
| SextInReg |
| .scalarize(0) |
| .clampScalar(0, S32, S64) |
| .lower(); |
| |
| getActionDefinitionsBuilder({G_ROTR, G_ROTL}) |
| .scalarize(0) |
| .lower(); |
| |
|   // TODO: Only try to form v2s16 with legal packed instructions. |
| getActionDefinitionsBuilder(G_FSHR) |
| .legalFor({{S32, S32}}) |
| .lowerFor({{V2S16, V2S16}}) |
| .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) |
| .scalarize(0) |
| .lower(); |
| |
| if (ST.hasVOP3PInsts()) { |
| getActionDefinitionsBuilder(G_FSHL) |
| .lowerFor({{V2S16, V2S16}}) |
| .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) |
| .scalarize(0) |
| .lower(); |
| } else { |
| getActionDefinitionsBuilder(G_FSHL) |
| .scalarize(0) |
| .lower(); |
| } |
| |
| getActionDefinitionsBuilder(G_READCYCLECOUNTER) |
| .legalFor({S64}); |
| |
| getActionDefinitionsBuilder(G_FENCE) |
| .alwaysLegal(); |
| |
| getActionDefinitionsBuilder({G_SMULO, G_UMULO}) |
| .scalarize(0) |
| .minScalar(0, S32) |
| .lower(); |
| |
| getActionDefinitionsBuilder({G_SBFX, G_UBFX}) |
| .legalFor({{S32, S32}, {S64, S32}}) |
| .clampScalar(1, S32, S32) |
| .clampScalar(0, S32, S64) |
| .widenScalarToNextPow2(0) |
| .scalarize(0); |
| |
| getActionDefinitionsBuilder({ |
| // TODO: Verify V_BFI_B32 is generated from expanded bit ops |
| G_FCOPYSIGN, |
| |
| G_ATOMIC_CMPXCHG_WITH_SUCCESS, |
| G_ATOMICRMW_NAND, |
| G_ATOMICRMW_FSUB, |
| G_READ_REGISTER, |
| G_WRITE_REGISTER, |
| |
| G_SADDO, G_SSUBO, |
| |
| // TODO: Implement |
| G_FMINIMUM, G_FMAXIMUM}).lower(); |
| |
| getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) |
| .lower(); |
| |
| getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, |
| G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, |
| G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) |
| .unsupported(); |
| |
| getLegacyLegalizerInfo().computeTables(); |
| verify(*ST.getInstrInfo()); |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, |
| MachineInstr &MI) const { |
| MachineIRBuilder &B = Helper.MIRBuilder; |
| MachineRegisterInfo &MRI = *B.getMRI(); |
| |
| switch (MI.getOpcode()) { |
| case TargetOpcode::G_ADDRSPACE_CAST: |
| return legalizeAddrSpaceCast(MI, MRI, B); |
| case TargetOpcode::G_FRINT: |
| return legalizeFrint(MI, MRI, B); |
| case TargetOpcode::G_FCEIL: |
| return legalizeFceil(MI, MRI, B); |
| case TargetOpcode::G_FREM: |
| return legalizeFrem(MI, MRI, B); |
| case TargetOpcode::G_INTRINSIC_TRUNC: |
| return legalizeIntrinsicTrunc(MI, MRI, B); |
| case TargetOpcode::G_SITOFP: |
| return legalizeITOFP(MI, MRI, B, true); |
| case TargetOpcode::G_UITOFP: |
| return legalizeITOFP(MI, MRI, B, false); |
| case TargetOpcode::G_FPTOSI: |
| return legalizeFPTOI(MI, MRI, B, true); |
| case TargetOpcode::G_FPTOUI: |
| return legalizeFPTOI(MI, MRI, B, false); |
| case TargetOpcode::G_FMINNUM: |
| case TargetOpcode::G_FMAXNUM: |
| case TargetOpcode::G_FMINNUM_IEEE: |
| case TargetOpcode::G_FMAXNUM_IEEE: |
| return legalizeMinNumMaxNum(Helper, MI); |
| case TargetOpcode::G_EXTRACT_VECTOR_ELT: |
| return legalizeExtractVectorElt(MI, MRI, B); |
| case TargetOpcode::G_INSERT_VECTOR_ELT: |
| return legalizeInsertVectorElt(MI, MRI, B); |
| case TargetOpcode::G_SHUFFLE_VECTOR: |
| return legalizeShuffleVector(MI, MRI, B); |
| case TargetOpcode::G_FSIN: |
| case TargetOpcode::G_FCOS: |
| return legalizeSinCos(MI, MRI, B); |
| case TargetOpcode::G_GLOBAL_VALUE: |
| return legalizeGlobalValue(MI, MRI, B); |
| case TargetOpcode::G_LOAD: |
| case TargetOpcode::G_SEXTLOAD: |
| case TargetOpcode::G_ZEXTLOAD: |
| return legalizeLoad(Helper, MI); |
| case TargetOpcode::G_FMAD: |
| return legalizeFMad(MI, MRI, B); |
| case TargetOpcode::G_FDIV: |
| return legalizeFDIV(MI, MRI, B); |
| case TargetOpcode::G_UDIV: |
| case TargetOpcode::G_UREM: |
| case TargetOpcode::G_UDIVREM: |
| return legalizeUnsignedDIV_REM(MI, MRI, B); |
| case TargetOpcode::G_SDIV: |
| case TargetOpcode::G_SREM: |
| case TargetOpcode::G_SDIVREM: |
| return legalizeSignedDIV_REM(MI, MRI, B); |
| case TargetOpcode::G_ATOMIC_CMPXCHG: |
| return legalizeAtomicCmpXChg(MI, MRI, B); |
| case TargetOpcode::G_FLOG: |
| return legalizeFlog(MI, B, numbers::ln2f); |
| case TargetOpcode::G_FLOG10: |
| return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); |
| case TargetOpcode::G_FEXP: |
| return legalizeFExp(MI, B); |
| case TargetOpcode::G_FPOW: |
| return legalizeFPow(MI, B); |
| case TargetOpcode::G_FFLOOR: |
| return legalizeFFloor(MI, MRI, B); |
| case TargetOpcode::G_BUILD_VECTOR: |
| return legalizeBuildVector(MI, MRI, B); |
| case TargetOpcode::G_CTLZ: |
| case TargetOpcode::G_CTTZ: |
| return legalizeCTLZ_CTTZ(MI, MRI, B); |
| default: |
| return false; |
| } |
| |
| llvm_unreachable("expected switch to return"); |
| } |
| |
| Register AMDGPULegalizerInfo::getSegmentAperture( |
| unsigned AS, |
| MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| MachineFunction &MF = B.getMF(); |
| const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| const LLT S32 = LLT::scalar(32); |
| |
| assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); |
| |
| if (ST.hasApertureRegs()) { |
| // FIXME: Use inline constants (src_{shared, private}_base) instead of |
| // getreg. |
| unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? |
| AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : |
| AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; |
| unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? |
| AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : |
| AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; |
| unsigned Encoding = |
| AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | |
| Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | |
| WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; |
| |
| Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); |
| |
| B.buildInstr(AMDGPU::S_GETREG_B32) |
| .addDef(GetReg) |
| .addImm(Encoding); |
| MRI.setType(GetReg, S32); |
| |
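| // Shift the extracted field left by its width (WidthM1 + 1 == 16) to form |
| // the 32-bit value that is used as the high half of a flat pointer into |
| // this segment (see legalizeAddrSpaceCast). |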
| auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); |
| return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); |
| } |
| |
| Register QueuePtr = MRI.createGenericVirtualRegister( |
| LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); |
| |
| if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) |
| return Register(); |
| |
| // Offset into amd_queue_t for group_segment_aperture_base_hi / |
| // private_segment_aperture_base_hi. |
| uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; |
| |
| // TODO: can we be smarter about machine pointer info? |
| MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); |
| MachineMemOperand *MMO = MF.getMachineMemOperand( |
| PtrInfo, |
| MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | |
| MachineMemOperand::MOInvariant, |
| LLT::scalar(32), commonAlignment(Align(64), StructOffset)); |
| |
| Register LoadAddr; |
| |
| B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); |
| return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| MachineFunction &MF = B.getMF(); |
| |
| const LLT S32 = LLT::scalar(32); |
| Register Dst = MI.getOperand(0).getReg(); |
| Register Src = MI.getOperand(1).getReg(); |
| |
| LLT DstTy = MRI.getType(Dst); |
| LLT SrcTy = MRI.getType(Src); |
| unsigned DestAS = DstTy.getAddressSpace(); |
| unsigned SrcAS = SrcTy.getAddressSpace(); |
| |
| // TODO: Avoid reloading from the queue ptr for each cast, or at least each |
| // vector element. |
| assert(!DstTy.isVector()); |
| |
| const AMDGPUTargetMachine &TM |
| = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); |
| |
| if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { |
| MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); |
| return true; |
| } |
| |
| if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { |
| // Truncate. |
| B.buildExtract(Dst, Src, 0); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { |
| const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
| uint32_t AddrHiVal = Info->get32BitAddressHighBits(); |
| |
| // FIXME: This is a bit ugly due to creating a merge of 2 pointers into |
| // another pointer type. Merge operands are required to be the same type, |
| // but creating an extra ptrtoint here would be kind of pointless. |
| auto HighAddr = B.buildConstant( |
| LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); |
| B.buildMerge(Dst, {Src, HighAddr}); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { |
| assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || |
| DestAS == AMDGPUAS::PRIVATE_ADDRESS); |
| unsigned NullVal = TM.getNullPointerValue(DestAS); |
| |
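| // A flat pointer is cast to a segment pointer by taking its low 32 bits, |
| // except that the flat null value must map to the segment null value. |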
| auto SegmentNull = B.buildConstant(DstTy, NullVal); |
| auto FlatNull = B.buildConstant(SrcTy, 0); |
| |
| // Extract low 32-bits of the pointer. |
| auto PtrLo32 = B.buildExtract(DstTy, Src, 0); |
| |
| auto CmpRes = |
| B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); |
| B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) |
| return false; |
| |
| if (!ST.hasFlatAddressSpace()) |
| return false; |
| |
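| // Build the flat pointer by pairing the 32-bit segment offset (low half) |
| // with the segment aperture base (high half); a null segment pointer maps |
| // to the flat null value instead. |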
| auto SegmentNull = |
| B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); |
| auto FlatNull = |
| B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); |
| |
| Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); |
| if (!ApertureReg.isValid()) |
| return false; |
| |
| auto CmpRes = |
| B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); |
| |
| // Coerce the type of the low half of the result so we can use merge_values. |
| Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); |
| |
| // TODO: Should we allow mismatched types but matching sizes in merges to |
| // avoid the ptrtoint? |
| auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); |
| B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeFrint( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| Register Src = MI.getOperand(1).getReg(); |
| LLT Ty = MRI.getType(Src); |
| assert(Ty.isScalar() && Ty.getSizeInBits() == 64); |
| |
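| // C1 is 2^52 and C2 is the largest double below 2^52. Adding and then |
| // subtracting copysign(2^52, src) rounds src to an integer in the current |
| // rounding mode; inputs whose magnitude exceeds C2 are already integral |
| // and are passed through unchanged by the final select. |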
| APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); |
| APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); |
| |
| auto C1 = B.buildFConstant(Ty, C1Val); |
| auto CopySign = B.buildFCopysign(Ty, C1, Src); |
| |
| // TODO: Should this propagate fast-math-flags? |
| auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); |
| auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); |
| |
| auto C2 = B.buildFConstant(Ty, C2Val); |
| auto Fabs = B.buildFAbs(Ty, Src); |
| |
| auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); |
| B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeFceil( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| |
| const LLT S1 = LLT::scalar(1); |
| const LLT S64 = LLT::scalar(64); |
| |
| Register Src = MI.getOperand(1).getReg(); |
| assert(MRI.getType(Src) == S64); |
| |
| // result = trunc(src) |
| // if (src > 0.0 && src != result) |
| // result += 1.0 |
| |
| auto Trunc = B.buildIntrinsicTrunc(S64, Src); |
| |
| const auto Zero = B.buildFConstant(S64, 0.0); |
| const auto One = B.buildFConstant(S64, 1.0); |
| auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); |
| auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); |
| auto And = B.buildAnd(S1, Gt0, NeTrunc); |
| auto Add = B.buildSelect(S64, And, One, Zero); |
| |
| // TODO: Should this propagate fast-math-flags? |
| B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeFrem( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| Register DstReg = MI.getOperand(0).getReg(); |
| Register Src0Reg = MI.getOperand(1).getReg(); |
| Register Src1Reg = MI.getOperand(2).getReg(); |
| auto Flags = MI.getFlags(); |
| LLT Ty = MRI.getType(DstReg); |
| |
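| // frem(x, y) == x - trunc(x / y) * y, computed here as |
| // fma(-trunc(x / y), y, x). |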
| auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); |
| auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); |
| auto Neg = B.buildFNeg(Ty, Trunc, Flags); |
| B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| static MachineInstrBuilder extractF64Exponent(Register Hi, |
| MachineIRBuilder &B) { |
| const unsigned FractBits = 52; |
| const unsigned ExpBits = 11; |
| LLT S32 = LLT::scalar(32); |
| |
| auto Const0 = B.buildConstant(S32, FractBits - 32); |
| auto Const1 = B.buildConstant(S32, ExpBits); |
| |
| auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) |
| .addUse(Hi) |
| .addUse(Const0.getReg(0)) |
| .addUse(Const1.getReg(0)); |
| |
| return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| const LLT S1 = LLT::scalar(1); |
| const LLT S32 = LLT::scalar(32); |
| const LLT S64 = LLT::scalar(64); |
| |
| Register Src = MI.getOperand(1).getReg(); |
| assert(MRI.getType(Src) == S64); |
| |
| // TODO: Should this use extract since the low half is unused? |
| auto Unmerge = B.buildUnmerge({S32, S32}, Src); |
| Register Hi = Unmerge.getReg(1); |
| |
| // Extract the upper half, since this is where we will find the sign and |
| // exponent. |
| auto Exp = extractF64Exponent(Hi, B); |
| |
| const unsigned FractBits = 52; |
| |
| // Extract the sign bit. |
| const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); |
| auto SignBit = B.buildAnd(S32, Hi, SignBitMask); |
| |
| const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); |
| |
| const auto Zero32 = B.buildConstant(S32, 0); |
| |
| // Extend back to 64-bits. |
| auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); |
| |
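| // For exponents in [0, 51], FractMask >> Exp has only the fractional |
| // mantissa bits set, so ANDing Src with its complement truncates towards |
| // zero. The selects below handle Exp < 0 (the result is +/-0.0) and |
| // Exp > 51 (Src is already integral). |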
| auto Shr = B.buildAShr(S64, FractMask, Exp); |
| auto Not = B.buildNot(S64, Shr); |
| auto Tmp0 = B.buildAnd(S64, Src, Not); |
| auto FiftyOne = B.buildConstant(S32, FractBits - 1); |
| |
| auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); |
| auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); |
| |
| auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); |
| B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeITOFP( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B, bool Signed) const { |
| |
| Register Dst = MI.getOperand(0).getReg(); |
| Register Src = MI.getOperand(1).getReg(); |
| |
| const LLT S64 = LLT::scalar(64); |
| const LLT S32 = LLT::scalar(32); |
| |
| assert(MRI.getType(Src) == S64); |
| |
| auto Unmerge = B.buildUnmerge({S32, S32}, Src); |
| auto ThirtyTwo = B.buildConstant(S32, 32); |
| |
| if (MRI.getType(Dst) == S64) { |
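| // Convert the two 32-bit halves separately and recombine: |
| // result = ldexp(itofp(hi), 32) + uitofp(lo). |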
| auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) |
| : B.buildUITOFP(S64, Unmerge.getReg(1)); |
| |
| auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); |
| auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) |
| .addUse(CvtHi.getReg(0)) |
| .addUse(ThirtyTwo.getReg(0)); |
| |
| // TODO: Should this propagate fast-math-flags? |
| B.buildFAdd(Dst, LdExp, CvtLo); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| assert(MRI.getType(Dst) == S32); |
| |
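| // For a 32-bit result, normalize the 64-bit source so its significant bits |
| // end up in the high word, convert that word, and rescale with |
| // ldexp(..., 32 - ShAmt). Nonzero low bits are folded into the high word as |
| // a sticky bit (Adjust) so rounding remains correct. |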
| auto One = B.buildConstant(S32, 1); |
| |
| MachineInstrBuilder ShAmt; |
| if (Signed) { |
| auto ThirtyOne = B.buildConstant(S32, 31); |
| auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); |
| auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); |
| auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); |
| auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, |
| /*HasSideEffects=*/false) |
| .addUse(Unmerge.getReg(1)); |
| auto LS2 = B.buildSub(S32, LS, One); |
| ShAmt = B.buildUMin(S32, LS2, MaxShAmt); |
| } else |
| ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); |
| auto Norm = B.buildShl(S64, Src, ShAmt); |
| auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); |
| auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); |
| auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); |
| auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); |
| auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); |
| B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, |
| /*HasSideEffects=*/false) |
| .addUse(FVal.getReg(0)) |
| .addUse(Scale.getReg(0)); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| // TODO: Copied from DAG implementation. Verify logic and document how this |
| // actually works. |
| bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, |
| MachineRegisterInfo &MRI, |
| MachineIRBuilder &B, |
| bool Signed) const { |
| |
| Register Dst = MI.getOperand(0).getReg(); |
| Register Src = MI.getOperand(1).getReg(); |
| |
| const LLT S64 = LLT::scalar(64); |
| const LLT S32 = LLT::scalar(32); |
| |
| const LLT SrcLT = MRI.getType(Src); |
| assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); |
| |
| unsigned Flags = MI.getFlags(); |
| |
| // The basic idea of converting a floating point number into a pair of 32-bit |
| // integers is illustrated as follows: |
| // |
| // tf := trunc(val); |
| // hif := floor(tf * 2^-32); |
| // lof := tf - hif * 2^32; // lof is always positive due to floor. |
| // hi := fptoi(hif); |
| // lo := fptoi(lof); |
| // |
| auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); |
| MachineInstrBuilder Sign; |
| if (Signed && SrcLT == S32) { |
| // However, a 32-bit floating point number has only a 23-bit mantissa, which |
| // is not enough to hold all the significant bits of `lof` if val is |
| // negative. To avoid the loss of precision, we take the absolute value |
| // after truncating and flip the result back based on the original |
| // signedness. |
| Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); |
| Trunc = B.buildFAbs(S32, Trunc, Flags); |
| } |
| MachineInstrBuilder K0, K1; |
| if (SrcLT == S64) { |
| K0 = B.buildFConstant(S64, |
| BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); |
| K1 = B.buildFConstant(S64, |
| BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); |
| } else { |
| K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); |
| K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); |
| } |
| |
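| // With K0 = 2^-32 and K1 = -2^32, FloorMul computes hif = floor(tf * 2^-32) |
| // and Fma computes lof = fma(hif, -2^32, tf) = tf - hif * 2^32. |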
| auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); |
| auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); |
| auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); |
| |
| auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) |
| : B.buildFPTOUI(S32, FloorMul); |
| auto Lo = B.buildFPTOUI(S32, Fma); |
| |
| if (Signed && SrcLT == S32) { |
| // Flip the result based on the signedness, which is either all 0s or 1s. |
| Sign = B.buildMerge(S64, {Sign, Sign}); |
| // r := xor({lo, hi}, sign) - sign; |
| B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); |
| } else |
| B.buildMerge(Dst, {Lo, Hi}); |
| MI.eraseFromParent(); |
| |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, |
| MachineInstr &MI) const { |
| MachineFunction &MF = Helper.MIRBuilder.getMF(); |
| const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| |
| const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || |
| MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; |
| |
| // With ieee_mode disabled, the instructions already have the correct |
| // behavior for G_FMINNUM/G_FMAXNUM. |
| if (!MFI->getMode().IEEE) |
| return !IsIEEEOp; |
| |
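| // In IEEE mode the IEEE variants are left as-is; the non-IEEE variants are |
| // expanded via the generic FMINNUM/FMAXNUM lowering. |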
| if (IsIEEEOp) |
| return true; |
| |
| return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeExtractVectorElt( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| // TODO: Should move some of this into LegalizerHelper. |
| |
| // TODO: Promote dynamic indexing of s16 to s32 |
| |
| // FIXME: Artifact combiner probably should have replaced the truncated |
| // constant before this, so we shouldn't need |
| // getIConstantVRegValWithLookThrough. |
| Optional<ValueAndVReg> MaybeIdxVal = |
| getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); |
| if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. |
| return true; |
| const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); |
| |
| Register Dst = MI.getOperand(0).getReg(); |
| Register Vec = MI.getOperand(1).getReg(); |
| |
| LLT VecTy = MRI.getType(Vec); |
| LLT EltTy = VecTy.getElementType(); |
| assert(EltTy == MRI.getType(Dst)); |
| |
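| // A constant in-bounds index becomes a static G_EXTRACT at the element's |
| // bit offset; a constant out-of-bounds index produces undef. |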
| if (IdxVal < VecTy.getNumElements()) |
| B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); |
| else |
| B.buildUndef(Dst); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeInsertVectorElt( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| // TODO: Should move some of this into LegalizerHelper. |
| |
| // TODO: Promote dynamic indexing of s16 to s32 |
| |
| // FIXME: Artifact combiner probably should have replaced the truncated |
| // constant before this, so we shouldn't need |
| // getIConstantVRegValWithLookThrough. |
| Optional<ValueAndVReg> MaybeIdxVal = |
| getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); |
| if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. |
| return true; |
| |
| int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); |
| Register Dst = MI.getOperand(0).getReg(); |
| Register Vec = MI.getOperand(1).getReg(); |
| Register Ins = MI.getOperand(2).getReg(); |
| |
| LLT VecTy = MRI.getType(Vec); |
| LLT EltTy = VecTy.getElementType(); |
| assert(EltTy == MRI.getType(Ins)); |
| |
| if (IdxVal < VecTy.getNumElements()) |
| B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); |
| else |
| B.buildUndef(Dst); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeShuffleVector( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| const LLT V2S16 = LLT::fixed_vector(2, 16); |
| |
| Register Dst = MI.getOperand(0).getReg(); |
| Register Src0 = MI.getOperand(1).getReg(); |
| LLT DstTy = MRI.getType(Dst); |
| LLT SrcTy = MRI.getType(Src0); |
| |
| if (SrcTy == V2S16 && DstTy == V2S16 && |
| AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) |
| return true; |
| |
| MachineIRBuilder HelperBuilder(MI); |
| GISelObserverWrapper DummyObserver; |
| LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); |
| return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeSinCos( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| |
| Register DstReg = MI.getOperand(0).getReg(); |
| Register SrcReg = MI.getOperand(1).getReg(); |
| LLT Ty = MRI.getType(DstReg); |
| unsigned Flags = MI.getFlags(); |
| |
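| // The amdgcn.sin/cos intrinsics expect an argument pre-scaled by 1/(2*pi); |
| // on subtargets with a reduced trig input range it is additionally reduced |
| // into [0, 1) with amdgcn.fract. |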
| Register TrigVal; |
| auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); |
| if (ST.hasTrigReducedRange()) { |
| auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); |
| TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) |
| .addUse(MulVal.getReg(0)) |
| .setMIFlags(Flags).getReg(0); |
| } else |
| TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); |
| |
| Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? |
| Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; |
| B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) |
| .addUse(TrigVal) |
| .setMIFlags(Flags); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, |
| MachineIRBuilder &B, |
| const GlobalValue *GV, |
| int64_t Offset, |
| unsigned GAFlags) const { |
| assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); |
| // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered |
| // to the following code sequence: |
| // |
| // For constant address space: |
| // s_getpc_b64 s[0:1] |
| // s_add_u32 s0, s0, $symbol |
| // s_addc_u32 s1, s1, 0 |
| // |
| // s_getpc_b64 returns the address of the s_add_u32 instruction and then |
| // a fixup or relocation is emitted to replace $symbol with a literal |
| // constant, which is a pc-relative offset from the encoding of the $symbol |
| // operand to the global variable. |
| // |
| // For global address space: |
| // s_getpc_b64 s[0:1] |
| // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo |
| // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi |
| // |
| // s_getpc_b64 returns the address of the s_add_u32 instruction and then |
| // fixups or relocations are emitted to replace $symbol@*@lo and |
| // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, |
| // which is a 64-bit pc-relative offset from the encoding of the $symbol |
| // operand to the global variable. |
| // |
| // What we want here is an offset from the value returned by s_getpc |
| // (which is the address of the s_add_u32 instruction) to the global |
| // variable, but since the encoding of $symbol starts 4 bytes after the start |
| // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too |
| // small. This requires us to add 4 to the global variable offset in order to |
| // compute the correct address. Similarly for the s_addc_u32 instruction, the |
| // encoding of $symbol starts 12 bytes after the start of the s_add_u32 |
| // instruction. |
| |
| LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); |
| |
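| // For a 32-bit destination pointer, compute the full 64-bit address into a |
| // temporary register and extract the low 32 bits at the end. |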
| Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : |
| B.getMRI()->createGenericVirtualRegister(ConstPtrTy); |
| |
| MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) |
| .addDef(PCReg); |
| |
| MIB.addGlobalAddress(GV, Offset + 4, GAFlags); |
| if (GAFlags == SIInstrInfo::MO_NONE) |
| MIB.addImm(0); |
| else |
| MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); |
| |
| B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); |
| |
| if (PtrTy.getSizeInBits() == 32) |
| B.buildExtract(DstReg, PCReg, 0); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeGlobalValue( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| Register DstReg = MI.getOperand(0).getReg(); |
| LLT Ty = MRI.getType(DstReg); |
| unsigned AS = Ty.getAddressSpace(); |
| |
| const GlobalValue *GV = MI.getOperand(1).getGlobal(); |
| MachineFunction &MF = B.getMF(); |
| SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| |
| if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { |
| if (!MFI->isModuleEntryFunction() && |
| !GV->getName().equals("llvm.amdgcn.module.lds")) { |
| const Function &Fn = MF.getFunction(); |
| DiagnosticInfoUnsupported BadLDSDecl( |
| Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), |
| DS_Warning); |
| Fn.getContext().diagnose(BadLDSDecl); |
| |
| // We currently don't have a way to correctly allocate LDS objects that |
| // aren't directly associated with a kernel. We do force inlining of |
| // functions that use local objects. However, if these dead functions are |
| // not eliminated, we don't want a compile time error. Just emit a warning |
| // and a trap, since there should be no callable path here. |
| B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); |
| B.buildUndef(DstReg); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| // TODO: We could emit code to handle the initialization somewhere. |
| // We ignore the initializer for now and legalize it to allow selection. |
| // The initializer will be diagnosed during assembly emission anyway. |
| const SITargetLowering *TLI = ST.getTargetLowering(); |
| if (!TLI->shouldUseLDSConstAddress(GV)) { |
| MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); |
| return true; // Leave in place. |
| } |
| |
| if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { |
| Type *Ty = GV->getValueType(); |
| // HIP uses an unsized array `extern __shared__ T s[]` (or a similar |
| // zero-sized type in other languages) to declare dynamic shared memory |
| // whose size is not known at compile time. It is allocated by the runtime |
| // and placed directly after the statically allocated ones, so they all |
| // share the same offset. |
| if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { |
| // Adjust alignment for that dynamic shared memory array. |
| MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); |
| LLT S32 = LLT::scalar(32); |
| auto Sz = |
| B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); |
| B.buildIntToPtr(DstReg, Sz); |
| MI.eraseFromParent(); |
| return true; |
| } |
| } |
| |
| B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), |
| *cast<GlobalVariable>(GV))); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| const SITargetLowering *TLI = ST.getTargetLowering(); |
| |
| if (TLI->shouldEmitFixup(GV)) { |
| buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| if (TLI->shouldEmitPCReloc(GV)) { |
| buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
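| // Otherwise go through the GOT: materialize the pc-relative address of the |
| // GOT entry and load the global's address from it. |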
| LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); |
| Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); |
| |
| LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; |
| MachineMemOperand *GOTMMO = MF.getMachineMemOperand( |
| MachinePointerInfo::getGOT(MF), |
| MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | |
| MachineMemOperand::MOInvariant, |
| LoadTy, Align(8)); |
| |
| buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); |
| |
| if (Ty.getSizeInBits() == 32) { |
| // Truncate if this is a 32-bit constant address. |
| auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); |
| B.buildExtract(DstReg, Load, 0); |
| } else |
| B.buildLoad(DstReg, GOTAddr, *GOTMMO); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| static LLT widenToNextPowerOf2(LLT Ty) { |
| if (Ty.isVector()) |
| return Ty.changeElementCount( |
| ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); |
| return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, |
| MachineInstr &MI) const { |
| MachineIRBuilder &B = Helper.MIRBuilder; |
| MachineRegisterInfo &MRI = *B.getMRI(); |
| GISelChangeObserver &Observer = Helper.Observer; |
| |
| Register PtrReg = MI.getOperand(1).getReg(); |
| LLT PtrTy = MRI.getType(PtrReg); |
| unsigned AddrSpace = PtrTy.getAddressSpace(); |
| |
| if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { |
| LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); |
| auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); |
| Observer.changingInstr(MI); |
| MI.getOperand(1).setReg(Cast.getReg(0)); |
| Observer.changedInstr(MI); |
| return true; |
| } |
| |
| if (MI.getOpcode() != AMDGPU::G_LOAD) |
| return false; |
| |
| Register ValReg = MI.getOperand(0).getReg(); |
| LLT ValTy = MRI.getType(ValReg); |
| |
| MachineMemOperand *MMO = *MI.memoperands_begin(); |
| const unsigned ValSize = ValTy.getSizeInBits(); |
| const LLT MemTy = MMO->getMemoryType(); |
| const Align MemAlign = MMO->getAlign(); |
| const unsigned MemSize = MemTy.getSizeInBits(); |
| const unsigned AlignInBits = 8 * MemAlign.value(); |
| |
| // Widen non-power-of-2 loads to the alignment if needed |
| if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { |
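| // e.g. a non-extending s96 load with sufficient alignment is widened to an |
| // s128 load; the extra loaded bits are discarded by the extract/trunc below. |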
| const unsigned WideMemSize = PowerOf2Ceil(MemSize); |
| |
| // This was already the correct extending load result type, so just adjust |
| // the memory type. |
| if (WideMemSize == ValSize) { |
| MachineFunction &MF = B.getMF(); |
| |
| MachineMemOperand *WideMMO = |
| MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); |
| Observer.changingInstr(MI); |
| MI.setMemRefs(MF, {WideMMO}); |
| Observer.changedInstr(MI); |
| return true; |
| } |
| |
| // Don't bother handling an edge case that should probably never be produced. |
| if (ValSize > WideMemSize) |
| return false; |
| |
| LLT WideTy = widenToNextPowerOf2(ValTy); |
| |
| Register WideLoad; |
| if (!WideTy.isVector()) { |
| WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); |
| B.buildTrunc(ValReg, WideLoad).getReg(0); |
| } else { |
| // Extract the subvector. |
| |
| if (isRegisterType(ValTy)) { |
| // If this a case where G_EXTRACT is legal, use it. |
| // (e.g. <3 x s32> -> <4 x s32>) |
| WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); |
| B.buildExtract(ValReg, WideLoad, 0); |
| } else { |
| // For cases where the widened type isn't a nice register value, unmerge |
| // from a widened register (e.g. <3 x s16> -> <4 x s16>) |
| B.setInsertPt(B.getMBB(), ++B.getInsertPt()); |
| WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); |
| B.setInsertPt(B.getMBB(), MI.getIterator()); |
| B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); |
| } |
| } |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeFMad( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| LLT Ty = MRI.getType(MI.getOperand(0).getReg()); |
| assert(Ty.isScalar()); |
|