| //===-- AMDGPULowerBufferFatPointers.cpp ---------------------------=// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This pass lowers operations on buffer fat pointers (addrspace 7) to |
| // operations on buffer resources (addrspace 8) and is needed for correct |
| // codegen. |
| // |
| // # Background |
| // |
| // Address space 7 (the buffer fat pointer) is a 160-bit pointer that consists |
| // of a 128-bit buffer descriptor and a 32-bit offset into that descriptor. |
| // The buffer resource part needs to be a "raw" buffer resource |
| // (it must have a stride of 0 and bounds checks must be in raw buffer mode |
| // or disabled). |
| // |
| // When these requirements are met, a buffer resource can be treated as a |
| // typical (though quite wide) pointer that follows typical LLVM pointer |
| // semantics. This allows the frontend to reason about such buffers (which are |
| // often encountered in the context of SPIR-V kernels). |
| // |
| // However, because of their non-power-of-2 size, these fat pointers cannot be |
| // present during translation to MIR (though this restriction may be lifted |
| // during the transition to GlobalISel). Therefore, this pass is needed in order |
| // to correctly implement these fat pointers. |
| // |
| // The resource intrinsics take the resource part (the address space 8 pointer) |
| // and the offset part (the 32-bit integer) as separate arguments. In addition, |
| // many users of these buffers manipulate the offset while leaving the resource |
| // part alone. For these reasons, we typically want to separate the resource |
| // and offset parts into separate variables, but combine them together when |
| // encountering cases where this is required, such as by inserting these values |
| // into aggregates or moving them to memory. |
| // |
| // Therefore, at a high level, `ptr addrspace(7) %x` becomes `ptr addrspace(8) |
| // %x.rsrc` and `i32 %x.off`, which will be combined into `{ptr addrspace(8), |
| // i32} %x = {%x.rsrc, %x.off}` if needed. Similarly, `vector<Nxp7>` becomes |
| // `{vector<Nxp8>, vector<Nxi32>}` and its component parts. |
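| // |
| // As a schematic example (a sketch, not the pass's literal output; the exact |
| // offsets and intrinsic arguments are produced by later phases): |
| // ``` |
| // %q = getelementptr i32, ptr addrspace(7) %p, i32 4 |
| // %v = load i32, ptr addrspace(7) %q |
| // ``` |
| // conceptually becomes |
| // ``` |
| // %q.off = add i32 %p.off, 16 |
| // %v = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %p.rsrc, |
| // i32 %q.off, ...) |
| // ``` |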
| // |
| // # Implementation |
| // |
| // This pass proceeds in three main phases: |
| // |
| // ## Rewriting loads and stores of p7 and memcpy()-like handling |
| // |
| // The first phase is to rewrite away all loads and stores of `ptr addrspace(7)`, |
| // including aggregates containing such pointers, to ones that use `i160`. This |
| // is handled by `StoreFatPtrsAsIntsAndExpandMemcpyVisitor`, which visits |
| // loads, stores, and allocas and, if the loaded or stored type contains `ptr |
| // addrspace(7)`, rewrites that type to one where the p7s are replaced by i160s, |
| // copying other parts of aggregates as needed. In the case of a store, each |
| // pointer is `ptrtoint`d to i160 before storing, and loaded integers are |
| // `inttoptr`d back. This same transformation is applied to vectors of pointers. |
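| // |
| // For example (a sketch): |
| // ``` |
| // store ptr addrspace(7) %p, ptr %slot |
| // ``` |
| // becomes |
| // ``` |
| // %p.int = ptrtoint ptr addrspace(7) %p to i160 |
| // store i160 %p.int, ptr %slot |
| // ``` |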
| // |
| // Such a transformation allows the later phases of the pass to not need |
| // to handle buffer fat pointers moving to and from memory, where we would |
| // have to handle the incompatibility between a `{Nxp8, Nxi32}` representation |
| // and `Nxi160` directly. Instead, that transposing action (where the vectors |
| // of resources and vectors of offsets are concatenated before being stored to |
| // memory) is handled solely through the implementations of `inttoptr` and |
| // `ptrtoint`. |
| // |
| // Atomic operations on `ptr addrspace(7)` values are not supported, as the |
| // hardware does not include a 160-bit atomic. |
| // |
| // In order to save on O(N) work and to ensure that the contents type |
| // legalizer correctly splits up wide loads, we also unconditionally lower |
| // memcpy-like intrinsics into loops here. |
| // |
| // ## Buffer contents type legalization |
| // |
| // The underlying buffer intrinsics only support types up to 128 bits long, |
| // and don't support complex types. If buffer operations were |
| // standard pointer operations that could be represented as MIR-level loads, |
| // this would be handled by the various legalization schemes in instruction |
| // selection. However, because we have to do the conversion from `load` and |
| // `store` to intrinsics at LLVM IR level, we must perform that legalization |
| // ourselves. |
| // |
| // This involves a combination of |
| // - Converting arrays to vectors where possible |
| // - Otherwise, splitting loads and stores of aggregates into loads/stores of |
| // each component. |
| // - Zero-extending values to fill a whole number of bytes |
| // - Casting values whose types don't neatly correspond to supported machine |
| //   value types (for example, an i96 or i256) into ones that do work |
| //   (like <3 x i32> and <8 x i32>, respectively), as sketched after this list |
| // - Splitting values that are too long (such as aforementioned <8 x i32>) into |
| // multiple operations. |
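| // |
| // For instance, a load of an `i96` from a buffer fat pointer is legalized |
| // roughly as follows (a sketch): |
| // ``` |
| // %v = load i96, ptr addrspace(7) %p |
| // ``` |
| // becomes |
| // ``` |
| // %v.vec = load <3 x i32>, ptr addrspace(7) %p |
| // %v = bitcast <3 x i32> %v.vec to i96 |
| // ``` |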
| // |
| // ## Type remapping |
| // |
| // We use a `ValueMapper` to mangle uses of [vectors of] buffer fat pointers |
| // to the corresponding struct type, which has a resource part and an offset |
| // part. |
| // |
| // This is done with a `BufferFatPtrToStructTypeMap` and a |
| // `FatPtrConstMaterializer`, usually by way of `setType`ing values. Constants |
| // are handled here because there isn't a good way to fix them up later. |
| // |
| // This has the downside of leaving the IR in an invalid state (for example, |
| // the instruction `getelementptr {ptr addrspace(8), i32} %p, ...` will exist), |
| // but all such invalid states will be resolved by the third phase. |
| // |
| // Functions that don't take buffer fat pointers are modified in place. Those |
| // that do take such pointers have their basic blocks moved to a new function |
| // whose arguments and return values use `{ptr addrspace(8), i32}` instead. |
| // This phase also records intrinsics so that they can be remangled or deleted |
| // later. |
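| // |
| // For example (a sketch), a function like |
| // ``` |
| // define float @f(ptr addrspace(7) %p) |
| // ``` |
| // is rewritten to |
| // ``` |
| // define float @f({ptr addrspace(8), i32} %p) |
| // ``` |
| // with the basic blocks moved over, and `%p` is later split into `%p.rsrc` |
| // and `%p.off` by the pointer-splitting phase. |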
| // |
| // ## Splitting pointer structs |
| // |
| // The meat of this pass consists of defining semantics for operations that |
| // produce or consume [vectors of] buffer fat pointers in terms of their |
| // resource and offset parts. This is accomplished through the `SplitPtrStructs` |
| // visitor. |
| // |
| // In the first pass through each function that is being lowered, the splitter |
| // inserts new instructions to implement the split-structures behavior, which is |
| // needed for correctness and performance. It records a list of "split users", |
| // instructions that are being replaced by operations on the resource and offset |
| // parts. |
| // |
| // Split users do not necessarily need to produce parts themselves (a |
| // `load float, ptr addrspace(7)` does not, for example), but, if they do not |
| // generate fat buffer pointers, they must RAUW in their replacement |
| // instructions during the initial visit. |
| // |
| // When these new instructions are created, they use the split parts recorded |
| // for their initial arguments in order to generate their replacements, creating |
| // a parallel set of instructions that does not refer to the original fat |
| // pointer values but instead to their resource and offset components. |
| // |
| // Instructions, such as `extractvalue`, that produce buffer fat pointers from |
| // sources that do not have split parts, have such parts generated using |
| // `extractvalue`. This is also the initial handling of PHI nodes, which |
| // are then cleaned up. |
| // |
| // ### Conditionals |
| // |
| // PHI nodes are initially given resource parts via `extractvalue`. However, |
| // this is not an efficient rewrite of such nodes, as, in most cases, the |
| // resource part in a conditional or loop remains constant throughout the loop |
| // and only the offset varies. Failing to optimize away these constant resources |
| // would cause additional registers to be sent around loops and might lead to |
| // waterfall loops being generated for buffer operations due to the |
| // "non-uniform" resource argument. |
| // |
| // Therefore, after all instructions have been visited, the pointer splitter |
| // post-processes all encountered conditionals. Given a PHI node or select, |
| // getPossibleRsrcRoots() collects all values that the resource parts of that |
| // conditional's input could come from as well as collecting all conditional |
| // instructions encountered during the search. If, after filtering out the |
| // initial node itself, the set of encountered conditionals is a subset of the |
| // potential roots and there is a single potential resource that isn't in the |
| // conditional set, that value is the only possible value the resource argument |
| // could have throughout the control flow. |
| // |
| // If that condition is met, then a PHI node can have its resource part changed |
| // to the singleton value and then be replaced by a PHI on the offsets. |
| // Otherwise, each PHI node is split into two, one for the resource part and one |
| // for the offset part, which replace the temporary `extractvalue` instructions |
| // that were added during the first pass. |
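| // |
| // For example, a loop PHI whose resource part turns out to be loop-invariant |
| // (a sketch): |
| // ``` |
| // %p = phi ptr addrspace(7) [ %init, %entry ], [ %p.next, %loop ] |
| // ``` |
| // becomes a single offset PHI |
| // ``` |
| // %p.off = phi i32 [ %init.off, %entry ], [ %p.next.off, %loop ] |
| // ``` |
| // with `%init.rsrc` used directly as the resource part wherever `%p` was used. |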
| // |
| // Similar logic applies to `select`, where |
| // `%z = select i1 %cond, ptr addrspace(7) %x, ptr addrspace(7) %y` |
| // can be split into `%z.rsrc = %x.rsrc` and |
| // `%z.off = select i1 %cond, i32 %x.off, i32 %y.off` |
| // if both `%x` and `%y` have the same resource part, but two `select` |
| // operations will be needed if they do not. |
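| // |
| // In the latter case (a sketch): |
| // ``` |
| // %z.rsrc = select i1 %cond, ptr addrspace(8) %x.rsrc, ptr addrspace(8) %y.rsrc |
| // %z.off = select i1 %cond, i32 %x.off, i32 %y.off |
| // ``` |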
| // |
| // ### Final processing |
| // |
| // After conditionals have been cleaned up, the IR for each function is |
| // rewritten to remove all the old instructions that have been split up. |
| // |
| // Any instruction that used to produce a buffer fat pointer (and therefore now |
| // produces a resource-and-offset struct after type remapping) is |
| // replaced as follows: |
| // 1. All debug value annotations are cloned to reflect that the resource part |
| // and offset parts are computed separately and constitute different |
| // fragments of the underlying source language variable. |
| // 2. All uses that were themselves split are replaced by a `poison` of the |
| // struct type, as they will themselves be erased soon. This rule, combined |
| // with debug handling, should leave the use lists of split instructions |
| // empty in almost all cases. |
| // 3. If a user of the original struct-valued result remains, the structure |
| // needed for the new types to work is constructed out of the newly-defined |
| // parts, and the original instruction is replaced by this structure |
| // before being erased. Instructions requiring this construction include |
| // `ret` and `insertvalue`. |
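| // |
| // For example (a sketch), a `ret ptr addrspace(7) %p` in a rewritten function |
| // now has to return `{ptr addrspace(8), i32}`, so that struct is rebuilt from |
| // `%p.rsrc` and `%p.off` with `insertvalue` instructions before the `ret`. |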
| // |
| // # Consequences |
| // |
| // This pass does not alter the CFG. |
| // |
| // Alias analysis information will become coarser, as the LLVM alias analyzer |
| // cannot handle the buffer intrinsics. Specifically, while we can determine |
| // that the following two loads do not alias: |
| // ``` |
| // %y = getelementptr i32, ptr addrspace(7) %x, i32 1 |
| // %a = load i32, ptr addrspace(7) %x |
| // %b = load i32, ptr addrspace(7) %y |
| // ``` |
| // we cannot (except through some code that runs during scheduling) determine |
| // that the rewritten loads below do not alias. |
| // ``` |
| // %y.off = add i32 %x.off, 1 |
| // %a = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %x.rsrc, i32 |
| // %x.off, ...) |
| // %b = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) |
| // %x.rsrc, i32 %y.off, ...) |
| // ``` |
| // However, existing alias information is preserved. |
| //===----------------------------------------------------------------------===// |
| |
| #include "AMDGPU.h" |
| #include "AMDGPUTargetMachine.h" |
| #include "GCNSubtarget.h" |
| #include "SIDefines.h" |
| #include "llvm/ADT/SetOperations.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/Analysis/InstSimplifyFolder.h" |
| #include "llvm/Analysis/Utils/Local.h" |
| #include "llvm/CodeGen/TargetPassConfig.h" |
| #include "llvm/IR/AttributeMask.h" |
| #include "llvm/IR/Constants.h" |
| #include "llvm/IR/DebugInfo.h" |
| #include "llvm/IR/DerivedTypes.h" |
| #include "llvm/IR/IRBuilder.h" |
| #include "llvm/IR/InstIterator.h" |
| #include "llvm/IR/InstVisitor.h" |
| #include "llvm/IR/Instructions.h" |
| #include "llvm/IR/IntrinsicInst.h" |
| #include "llvm/IR/Intrinsics.h" |
| #include "llvm/IR/IntrinsicsAMDGPU.h" |
| #include "llvm/IR/Metadata.h" |
| #include "llvm/IR/Operator.h" |
| #include "llvm/IR/PatternMatch.h" |
| #include "llvm/IR/ReplaceConstant.h" |
| #include "llvm/IR/ValueHandle.h" |
| #include "llvm/InitializePasses.h" |
| #include "llvm/Pass.h" |
| #include "llvm/Support/AMDGPUAddrSpace.h" |
| #include "llvm/Support/Alignment.h" |
| #include "llvm/Support/AtomicOrdering.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/ErrorHandling.h" |
| #include "llvm/Transforms/Utils/Cloning.h" |
| #include "llvm/Transforms/Utils/Local.h" |
| #include "llvm/Transforms/Utils/LowerMemIntrinsics.h" |
| #include "llvm/Transforms/Utils/ValueMapper.h" |
| |
| #define DEBUG_TYPE "amdgpu-lower-buffer-fat-pointers" |
| |
| using namespace llvm; |
| |
| static constexpr unsigned BufferOffsetWidth = 32; |
| |
| namespace { |
| /// Recursively replace instances of ptr addrspace(7) and vector<Nxptr |
| /// addrspace(7)> with some other type as defined by the relevant subclass. |
| class BufferFatPtrTypeLoweringBase : public ValueMapTypeRemapper { |
| DenseMap<Type *, Type *> Map; |
| |
| Type *remapTypeImpl(Type *Ty, SmallPtrSetImpl<StructType *> &Seen); |
| |
| protected: |
| virtual Type *remapScalar(PointerType *PT) = 0; |
| virtual Type *remapVector(VectorType *VT) = 0; |
| |
| const DataLayout &DL; |
| |
| public: |
| BufferFatPtrTypeLoweringBase(const DataLayout &DL) : DL(DL) {} |
| Type *remapType(Type *SrcTy) override; |
| void clear() { Map.clear(); } |
| }; |
| |
| /// Remap ptr addrspace(7) to i160 and vector<Nxptr addrspace(7)> to |
| /// vector<Nxi160> in order to correctly handle loading/storing these values |
| /// from memory. |
| class BufferFatPtrToIntTypeMap : public BufferFatPtrTypeLoweringBase { |
| using BufferFatPtrTypeLoweringBase::BufferFatPtrTypeLoweringBase; |
| |
| protected: |
| Type *remapScalar(PointerType *PT) override { return DL.getIntPtrType(PT); } |
| Type *remapVector(VectorType *VT) override { return DL.getIntPtrType(VT); } |
| }; |
| |
| /// Remap ptr addrspace(7) to {ptr addrspace(8), i32} (the resource and offset |
| /// parts of the pointer) so that we can easily rewrite operations on these |
| /// values that aren't loading them from or storing them to memory. |
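| /// For example, `<2 x ptr addrspace(7)>` maps to |
| /// `{<2 x ptr addrspace(8)>, <2 x i32>}`. |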
| class BufferFatPtrToStructTypeMap : public BufferFatPtrTypeLoweringBase { |
| using BufferFatPtrTypeLoweringBase::BufferFatPtrTypeLoweringBase; |
| |
| protected: |
| Type *remapScalar(PointerType *PT) override; |
| Type *remapVector(VectorType *VT) override; |
| }; |
| } // namespace |
| |
| // This code is adapted from the type remapper in lib/Linker/IRMover.cpp |
| Type *BufferFatPtrTypeLoweringBase::remapTypeImpl( |
| Type *Ty, SmallPtrSetImpl<StructType *> &Seen) { |
| Type **Entry = &Map[Ty]; |
| if (*Entry) |
| return *Entry; |
| if (auto *PT = dyn_cast<PointerType>(Ty)) { |
| if (PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) { |
| return *Entry = remapScalar(PT); |
| } |
| } |
| if (auto *VT = dyn_cast<VectorType>(Ty)) { |
| auto *PT = dyn_cast<PointerType>(VT->getElementType()); |
| if (PT && PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) { |
| return *Entry = remapVector(VT); |
| } |
| return *Entry = Ty; |
| } |
| // Whether the type is one that is structurally uniqued - that is, whether it |
| // is not a named struct (the only kind of type where multiple structurally |
| // identical types can have distinct `Type*`s). |
| StructType *TyAsStruct = dyn_cast<StructType>(Ty); |
| bool IsUniqued = !TyAsStruct || TyAsStruct->isLiteral(); |
| // Base case for ints, floats, opaque pointers, and so on, which don't |
| // require recursion. |
| if (Ty->getNumContainedTypes() == 0 && IsUniqued) |
| return *Entry = Ty; |
| if (!IsUniqued) { |
| // Create a dummy type for recursion purposes. |
| if (!Seen.insert(TyAsStruct).second) { |
| StructType *Placeholder = StructType::create(Ty->getContext()); |
| return *Entry = Placeholder; |
| } |
| } |
| bool Changed = false; |
| SmallVector<Type *> ElementTypes(Ty->getNumContainedTypes(), nullptr); |
| for (unsigned int I = 0, E = Ty->getNumContainedTypes(); I < E; ++I) { |
| Type *OldElem = Ty->getContainedType(I); |
| Type *NewElem = remapTypeImpl(OldElem, Seen); |
| ElementTypes[I] = NewElem; |
| Changed |= (OldElem != NewElem); |
| } |
| // Recursive calls to remapTypeImpl() may have invalidated the `Entry` pointer. |
| Entry = &Map[Ty]; |
| if (!Changed) { |
| return *Entry = Ty; |
| } |
| if (auto *ArrTy = dyn_cast<ArrayType>(Ty)) |
| return *Entry = ArrayType::get(ElementTypes[0], ArrTy->getNumElements()); |
| if (auto *FnTy = dyn_cast<FunctionType>(Ty)) |
| return *Entry = FunctionType::get(ElementTypes[0], |
| ArrayRef(ElementTypes).slice(1), |
| FnTy->isVarArg()); |
| if (auto *STy = dyn_cast<StructType>(Ty)) { |
| // Genuine opaque types don't have a remapping. |
| if (STy->isOpaque()) |
| return *Entry = Ty; |
| bool IsPacked = STy->isPacked(); |
| if (IsUniqued) |
| return *Entry = StructType::get(Ty->getContext(), ElementTypes, IsPacked); |
| SmallString<16> Name(STy->getName()); |
| STy->setName(""); |
| Type **RecursionEntry = &Map[Ty]; |
| if (*RecursionEntry) { |
| auto *Placeholder = cast<StructType>(*RecursionEntry); |
| Placeholder->setBody(ElementTypes, IsPacked); |
| Placeholder->setName(Name); |
| return *Entry = Placeholder; |
| } |
| return *Entry = StructType::create(Ty->getContext(), ElementTypes, Name, |
| IsPacked); |
| } |
| llvm_unreachable("Unknown type of type that contains elements"); |
| } |
| |
| Type *BufferFatPtrTypeLoweringBase::remapType(Type *SrcTy) { |
| SmallPtrSet<StructType *, 2> Visited; |
| return remapTypeImpl(SrcTy, Visited); |
| } |
| |
| Type *BufferFatPtrToStructTypeMap::remapScalar(PointerType *PT) { |
| LLVMContext &Ctx = PT->getContext(); |
| return StructType::get(PointerType::get(Ctx, AMDGPUAS::BUFFER_RESOURCE), |
| IntegerType::get(Ctx, BufferOffsetWidth)); |
| } |
| |
| Type *BufferFatPtrToStructTypeMap::remapVector(VectorType *VT) { |
| ElementCount EC = VT->getElementCount(); |
| LLVMContext &Ctx = VT->getContext(); |
| Type *RsrcVec = |
| VectorType::get(PointerType::get(Ctx, AMDGPUAS::BUFFER_RESOURCE), EC); |
| Type *OffVec = VectorType::get(IntegerType::get(Ctx, BufferOffsetWidth), EC); |
| return StructType::get(RsrcVec, OffVec); |
| } |
| |
| static bool isBufferFatPtrOrVector(Type *Ty) { |
| if (auto *PT = dyn_cast<PointerType>(Ty->getScalarType())) |
| return PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER; |
| return false; |
| } |
| |
| // True if the type is {ptr addrspace(8), i32} or a struct containing vectors of |
| // those types. Used to quickly skip instructions we don't need to process. |
| static bool isSplitFatPtr(Type *Ty) { |
| auto *ST = dyn_cast<StructType>(Ty); |
| if (!ST) |
| return false; |
| if (!ST->isLiteral() || ST->getNumElements() != 2) |
| return false; |
| auto *MaybeRsrc = |
| dyn_cast<PointerType>(ST->getElementType(0)->getScalarType()); |
| auto *MaybeOff = |
| dyn_cast<IntegerType>(ST->getElementType(1)->getScalarType()); |
| return MaybeRsrc && MaybeOff && |
| MaybeRsrc->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE && |
| MaybeOff->getBitWidth() == BufferOffsetWidth; |
| } |
| |
| // True if the result type or any argument types are buffer fat pointers. |
| static bool isBufferFatPtrConst(Constant *C) { |
| Type *T = C->getType(); |
| return isBufferFatPtrOrVector(T) || any_of(C->operands(), [](const Use &U) { |
| return isBufferFatPtrOrVector(U.get()->getType()); |
| }); |
| } |
| |
| namespace { |
| /// Convert [vectors of] buffer fat pointers to integers when they are read from |
| /// or stored to memory. This ensures that these pointers will have the same |
| /// memory layout as before they are lowered, even though they will no longer |
| /// have their previous layout in registers/in the program (they'll be broken |
| /// down into resource and offset parts). This has the downside of imposing |
| /// marshalling costs when reading or storing these values, but since placing |
| /// such pointers into memory is an uncommon operation at best, we feel that |
| /// this cost is acceptable for better performance in the common case. |
| class StoreFatPtrsAsIntsAndExpandMemcpyVisitor |
| : public InstVisitor<StoreFatPtrsAsIntsAndExpandMemcpyVisitor, bool> { |
| BufferFatPtrToIntTypeMap *TypeMap; |
| |
| ValueToValueMapTy ConvertedForStore; |
| |
| IRBuilder<InstSimplifyFolder> IRB; |
| |
| const TargetMachine *TM; |
| |
| // Convert all the buffer fat pointers within the input value to integers |
| // so that it can be stored in memory. |
| Value *fatPtrsToInts(Value *V, Type *From, Type *To, const Twine &Name); |
| // Convert all the i160s that need to be buffer fat pointers (as specified |
| // by the To type) into those pointers to preserve the semantics of the rest |
| // of the program. |
| Value *intsToFatPtrs(Value *V, Type *From, Type *To, const Twine &Name); |
| |
| public: |
| StoreFatPtrsAsIntsAndExpandMemcpyVisitor(BufferFatPtrToIntTypeMap *TypeMap, |
| const DataLayout &DL, |
| LLVMContext &Ctx, |
| const TargetMachine *TM) |
| : TypeMap(TypeMap), IRB(Ctx, InstSimplifyFolder(DL)), TM(TM) {} |
| bool processFunction(Function &F); |
| |
| bool visitInstruction(Instruction &I) { return false; } |
| bool visitAllocaInst(AllocaInst &I); |
| bool visitLoadInst(LoadInst &LI); |
| bool visitStoreInst(StoreInst &SI); |
| bool visitGetElementPtrInst(GetElementPtrInst &I); |
| |
| bool visitMemCpyInst(MemCpyInst &MCI); |
| bool visitMemMoveInst(MemMoveInst &MMI); |
| bool visitMemSetInst(MemSetInst &MSI); |
| bool visitMemSetPatternInst(MemSetPatternInst &MSPI); |
| }; |
| } // namespace |
| |
| Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor::fatPtrsToInts( |
| Value *V, Type *From, Type *To, const Twine &Name) { |
| if (From == To) |
| return V; |
| ValueToValueMapTy::iterator Find = ConvertedForStore.find(V); |
| if (Find != ConvertedForStore.end()) |
| return Find->second; |
| if (isBufferFatPtrOrVector(From)) { |
| Value *Cast = IRB.CreatePtrToInt(V, To, Name + ".int"); |
| ConvertedForStore[V] = Cast; |
| return Cast; |
| } |
| if (From->getNumContainedTypes() == 0) |
| return V; |
| // Structs, arrays, and other compound types. |
| Value *Ret = PoisonValue::get(To); |
| if (auto *AT = dyn_cast<ArrayType>(From)) { |
| Type *FromPart = AT->getArrayElementType(); |
| Type *ToPart = cast<ArrayType>(To)->getElementType(); |
| for (uint64_t I = 0, E = AT->getArrayNumElements(); I < E; ++I) { |
| Value *Field = IRB.CreateExtractValue(V, I); |
| Value *NewField = |
| fatPtrsToInts(Field, FromPart, ToPart, Name + "." + Twine(I)); |
| Ret = IRB.CreateInsertValue(Ret, NewField, I); |
| } |
| } else { |
| for (auto [Idx, FromPart, ToPart] : |
| enumerate(From->subtypes(), To->subtypes())) { |
| Value *Field = IRB.CreateExtractValue(V, Idx); |
| Value *NewField = |
| fatPtrsToInts(Field, FromPart, ToPart, Name + "." + Twine(Idx)); |
| Ret = IRB.CreateInsertValue(Ret, NewField, Idx); |
| } |
| } |
| ConvertedForStore[V] = Ret; |
| return Ret; |
| } |
| |
| Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor::intsToFatPtrs( |
| Value *V, Type *From, Type *To, const Twine &Name) { |
| if (From == To) |
| return V; |
| if (isBufferFatPtrOrVector(To)) { |
| Value *Cast = IRB.CreateIntToPtr(V, To, Name + ".ptr"); |
| return Cast; |
| } |
| if (From->getNumContainedTypes() == 0) |
| return V; |
| // Structs, arrays, and other compound types. |
| Value *Ret = PoisonValue::get(To); |
| if (auto *AT = dyn_cast<ArrayType>(From)) { |
| Type *FromPart = AT->getArrayElementType(); |
| Type *ToPart = cast<ArrayType>(To)->getElementType(); |
| for (uint64_t I = 0, E = AT->getArrayNumElements(); I < E; ++I) { |
| Value *Field = IRB.CreateExtractValue(V, I); |
| Value *NewField = |
| intsToFatPtrs(Field, FromPart, ToPart, Name + "." + Twine(I)); |
| Ret = IRB.CreateInsertValue(Ret, NewField, I); |
| } |
| } else { |
| for (auto [Idx, FromPart, ToPart] : |
| enumerate(From->subtypes(), To->subtypes())) { |
| Value *Field = IRB.CreateExtractValue(V, Idx); |
| Value *NewField = |
| intsToFatPtrs(Field, FromPart, ToPart, Name + "." + Twine(Idx)); |
| Ret = IRB.CreateInsertValue(Ret, NewField, Idx); |
| } |
| } |
| return Ret; |
| } |
| |
| bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::processFunction(Function &F) { |
| bool Changed = false; |
| // Process memcpy-like instructions after the main iteration because they can |
| // invalidate iterators. |
| SmallVector<WeakTrackingVH> CanBecomeLoops; |
| for (Instruction &I : make_early_inc_range(instructions(F))) { |
| if (isa<MemTransferInst, MemSetInst, MemSetPatternInst>(I)) |
| CanBecomeLoops.push_back(&I); |
| else |
| Changed |= visit(I); |
| } |
| for (WeakTrackingVH VH : make_early_inc_range(CanBecomeLoops)) { |
| Changed |= visit(cast<Instruction>(VH)); |
| } |
| ConvertedForStore.clear(); |
| return Changed; |
| } |
| |
| bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitAllocaInst(AllocaInst &I) { |
| Type *Ty = I.getAllocatedType(); |
| Type *NewTy = TypeMap->remapType(Ty); |
| if (Ty == NewTy) |
| return false; |
| I.setAllocatedType(NewTy); |
| return true; |
| } |
| |
| bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitGetElementPtrInst( |
| GetElementPtrInst &I) { |
| Type *Ty = I.getSourceElementType(); |
| Type *NewTy = TypeMap->remapType(Ty); |
| if (Ty == NewTy) |
| return false; |
| // We'll be rewriting the type `ptr addrspace(7)` out of existence soon, so |
| // make sure GEPs don't have different semantics with the new type. |
| I.setSourceElementType(NewTy); |
| I.setResultElementType(TypeMap->remapType(I.getResultElementType())); |
| return true; |
| } |
| |
| bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitLoadInst(LoadInst &LI) { |
| Type *Ty = LI.getType(); |
| Type *IntTy = TypeMap->remapType(Ty); |
| if (Ty == IntTy) |
| return false; |
| |
| IRB.SetInsertPoint(&LI); |
| auto *NLI = cast<LoadInst>(LI.clone()); |
| NLI->mutateType(IntTy); |
| NLI = IRB.Insert(NLI); |
| NLI->takeName(&LI); |
| |
| Value *CastBack = intsToFatPtrs(NLI, IntTy, Ty, NLI->getName()); |
| LI.replaceAllUsesWith(CastBack); |
| LI.eraseFromParent(); |
| return true; |
| } |
| |
| bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitStoreInst(StoreInst &SI) { |
| Value *V = SI.getValueOperand(); |
| Type *Ty = V->getType(); |
| Type *IntTy = TypeMap->remapType(Ty); |
| if (Ty == IntTy) |
| return false; |
| |
| IRB.SetInsertPoint(&SI); |
| Value *IntV = fatPtrsToInts(V, Ty, IntTy, V->getName()); |
| for (auto *Dbg : at::getAssignmentMarkers(&SI)) |
| Dbg->setValue(IntV); |
| |
| SI.setOperand(0, IntV); |
| return true; |
| } |
| |
| bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemCpyInst( |
| MemCpyInst &MCI) { |
| // TODO: Allow memcpy.p7.p3 as a synonym for the direct-to-LDS copy, which'll |
| // need loop expansion here. |
| if (MCI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER && |
| MCI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER) |
| return false; |
| llvm::expandMemCpyAsLoop(&MCI, |
| TM->getTargetTransformInfo(*MCI.getFunction())); |
| MCI.eraseFromParent(); |
| return true; |
| } |
| |
| bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemMoveInst( |
| MemMoveInst &MMI) { |
| if (MMI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER && |
| MMI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER) |
| return false; |
| report_fatal_error( |
| "memmove() on buffer descriptors is not implemented because pointer " |
| "comparison on buffer descriptors isn't implemented\n"); |
| } |
| |
| bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst( |
| MemSetInst &MSI) { |
| if (MSI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER) |
| return false; |
| llvm::expandMemSetAsLoop(&MSI); |
| MSI.eraseFromParent(); |
| return true; |
| } |
| |
| bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetPatternInst( |
| MemSetPatternInst &MSPI) { |
| if (MSPI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER) |
| return false; |
| llvm::expandMemSetPatternAsLoop(&MSPI); |
| MSPI.eraseFromParent(); |
| return true; |
| } |
| |
| namespace { |
| /// Convert loads/stores of types that the buffer intrinsics can't handle into |
| /// one or more such loads/stores that consist of legal types. |
| /// |
| /// Do this by |
| /// 1. Recursing into structs (and arrays that don't share a memory layout with |
| /// vectors) since the intrinsics can't handle complex types. |
| /// 2. Converting arrays of non-aggregate, byte-sized types into their |
| /// corresponding vectors |
| /// 3. Bitcasting unsupported types, namely overly-long scalars and byte |
| /// vectors, into vectors of supported types. |
| /// 4. Splitting up excessively long reads/writes into multiple operations. |
| /// |
| /// Note that this doesn't handle complex data structures, but, in the future, |
| /// the aggregate load splitter from SROA could be refactored to allow for that |
| /// case. |
| class LegalizeBufferContentTypesVisitor |
| : public InstVisitor<LegalizeBufferContentTypesVisitor, bool> { |
| friend class InstVisitor<LegalizeBufferContentTypesVisitor, bool>; |
| |
| IRBuilder<InstSimplifyFolder> IRB; |
| |
| const DataLayout &DL; |
| |
| /// If T is [N x U], where U is a scalar type, return the vector type |
| /// <N x U>, otherwise, return T. |
| Type *scalarArrayTypeAsVector(Type *MaybeArrayType); |
| Value *arrayToVector(Value *V, Type *TargetType, const Twine &Name); |
| Value *vectorToArray(Value *V, Type *OrigType, const Twine &Name); |
| |
| /// Break up the loads of a struct into the loads of its components |
| |
| /// Convert a vector or scalar type that can't be operated on by buffer |
| /// intrinsics to one that would be legal through bitcasts and/or truncation. |
| /// Uses the wider of i32, i16, or i8 where possible. |
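| /// For example (illustrative), an i96 maps to <3 x i32> and a <6 x i8> maps |
| /// to <3 x i16>. |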
| Type *legalNonAggregateFor(Type *T); |
| Value *makeLegalNonAggregate(Value *V, Type *TargetType, const Twine &Name); |
| Value *makeIllegalNonAggregate(Value *V, Type *OrigType, const Twine &Name); |
| |
| struct VecSlice { |
| uint64_t Index = 0; |
| uint64_t Length = 0; |
| VecSlice() = delete; |
| // Needed for some Clangs |
| VecSlice(uint64_t Index, uint64_t Length) : Index(Index), Length(Length) {} |
| }; |
| /// Return the [index, length] pairs into which `T` needs to be cut to form |
| /// legal buffer load or store operations. Clears `Slices`. Creates an empty |
| /// `Slices` for non-vector inputs and creates one slice if no slicing will be |
| /// needed. |
| void getVecSlices(Type *T, SmallVectorImpl<VecSlice> &Slices); |
| |
| Value *extractSlice(Value *Vec, VecSlice S, const Twine &Name); |
| Value *insertSlice(Value *Whole, Value *Part, VecSlice S, const Twine &Name); |
| |
| /// In most cases, return `LegalType`. However, when given an input that would |
| /// normally be a legal type for the buffer intrinsics to return but that |
| /// isn't hooked up through SelectionDAG, return a type of the same width that |
| /// can be used with the relevant intrinsics. Specifically, handle the cases: |
| /// - <1 x T> => T for all T |
| /// - <N x i8> <=> i16, i32, 2xi32, 4xi32 (as needed) |
| /// - <N x T> where T is under 32 bits and the total size is 96 bits <=> <3 x |
| /// i32> |
| Type *intrinsicTypeFor(Type *LegalType); |
| |
| bool visitLoadImpl(LoadInst &OrigLI, Type *PartType, |
| SmallVectorImpl<uint32_t> &AggIdxs, uint64_t AggByteOffset, |
| Value *&Result, const Twine &Name); |
| /// Return value is (Changed, ModifiedInPlace) |
| std::pair<bool, bool> visitStoreImpl(StoreInst &OrigSI, Type *PartType, |
| SmallVectorImpl<uint32_t> &AggIdxs, |
| uint64_t AggByteOffset, |
| const Twine &Name); |
| |
| bool visitInstruction(Instruction &I) { return false; } |
| bool visitLoadInst(LoadInst &LI); |
| bool visitStoreInst(StoreInst &SI); |
| |
| public: |
| LegalizeBufferContentTypesVisitor(const DataLayout &DL, LLVMContext &Ctx) |
| : IRB(Ctx, InstSimplifyFolder(DL)), DL(DL) {} |
| bool processFunction(Function &F); |
| }; |
| } // namespace |
| |
| Type *LegalizeBufferContentTypesVisitor::scalarArrayTypeAsVector(Type *T) { |
| ArrayType *AT = dyn_cast<ArrayType>(T); |
| if (!AT) |
| return T; |
| Type *ET = AT->getElementType(); |
| if (!ET->isSingleValueType() || isa<VectorType>(ET)) |
| report_fatal_error("loading non-scalar arrays from buffer fat pointers " |
| "should have recursed"); |
| if (!DL.typeSizeEqualsStoreSize(AT)) |
| report_fatal_error( |
| "loading padded arrays from buffer fat pinters should have recursed"); |
| return FixedVectorType::get(ET, AT->getNumElements()); |
| } |
| |
| Value *LegalizeBufferContentTypesVisitor::arrayToVector(Value *V, |
| Type *TargetType, |
| const Twine &Name) { |
| Value *VectorRes = PoisonValue::get(TargetType); |
| auto *VT = cast<FixedVectorType>(TargetType); |
| unsigned EC = VT->getNumElements(); |
| for (auto I : iota_range<unsigned>(0, EC, /*Inclusive=*/false)) { |
| Value *Elem = IRB.CreateExtractValue(V, I, Name + ".elem." + Twine(I)); |
| VectorRes = IRB.CreateInsertElement(VectorRes, Elem, I, |
| Name + ".as.vec." + Twine(I)); |
| } |
| return VectorRes; |
| } |
| |
| Value *LegalizeBufferContentTypesVisitor::vectorToArray(Value *V, |
| Type *OrigType, |
| const Twine &Name) { |
| Value *ArrayRes = PoisonValue::get(OrigType); |
| ArrayType *AT = cast<ArrayType>(OrigType); |
| unsigned EC = AT->getNumElements(); |
| for (auto I : iota_range<unsigned>(0, EC, /*Inclusive=*/false)) { |
| Value *Elem = IRB.CreateExtractElement(V, I, Name + ".elem." + Twine(I)); |
| ArrayRes = IRB.CreateInsertValue(ArrayRes, Elem, I, |
| Name + ".as.array." + Twine(I)); |
| } |
| return ArrayRes; |
| } |
| |
| Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) { |
| TypeSize Size = DL.getTypeStoreSizeInBits(T); |
| // Implicitly zero-extend to the next byte if needed |
| if (!DL.typeSizeEqualsStoreSize(T)) |
| T = IRB.getIntNTy(Size.getFixedValue()); |
| Type *ElemTy = T->getScalarType(); |
| if (isa<PointerType, ScalableVectorType>(ElemTy)) { |
| // Pointers are always big enough, and we'll let scalable vectors through to |
| // fail in codegen. |
| return T; |
| } |
| unsigned ElemSize = DL.getTypeSizeInBits(ElemTy).getFixedValue(); |
| if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= 128) { |
| // [vectors of] anything that's 16/32/64/128 bits can be cast and split into |
| // legal buffer operations. |
| return T; |
| } |
| Type *BestVectorElemType = nullptr; |
| if (Size.isKnownMultipleOf(32)) |
| BestVectorElemType = IRB.getInt32Ty(); |
| else if (Size.isKnownMultipleOf(16)) |
| BestVectorElemType = IRB.getInt16Ty(); |
| else |
| BestVectorElemType = IRB.getInt8Ty(); |
| unsigned NumCastElems = |
| Size.getFixedValue() / BestVectorElemType->getIntegerBitWidth(); |
| if (NumCastElems == 1) |
| return BestVectorElemType; |
| return FixedVectorType::get(BestVectorElemType, NumCastElems); |
| } |
| |
| Value *LegalizeBufferContentTypesVisitor::makeLegalNonAggregate( |
| Value *V, Type *TargetType, const Twine &Name) { |
| Type *SourceType = V->getType(); |
| TypeSize SourceSize = DL.getTypeSizeInBits(SourceType); |
| TypeSize TargetSize = DL.getTypeSizeInBits(TargetType); |
| if (SourceSize != TargetSize) { |
| Type *ShortScalarTy = IRB.getIntNTy(SourceSize.getFixedValue()); |
| Type *ByteScalarTy = IRB.getIntNTy(TargetSize.getFixedValue()); |
| Value *AsScalar = IRB.CreateBitCast(V, ShortScalarTy, Name + ".as.scalar"); |
| Value *Zext = IRB.CreateZExt(AsScalar, ByteScalarTy, Name + ".zext"); |
| V = Zext; |
| SourceType = ByteScalarTy; |
| } |
| return IRB.CreateBitCast(V, TargetType, Name + ".legal"); |
| } |
| |
| Value *LegalizeBufferContentTypesVisitor::makeIllegalNonAggregate( |
| Value *V, Type *OrigType, const Twine &Name) { |
| Type *LegalType = V->getType(); |
| TypeSize LegalSize = DL.getTypeSizeInBits(LegalType); |
| TypeSize OrigSize = DL.getTypeSizeInBits(OrigType); |
| if (LegalSize != OrigSize) { |
| Type *ShortScalarTy = IRB.getIntNTy(OrigSize.getFixedValue()); |
| Type *ByteScalarTy = IRB.getIntNTy(LegalSize.getFixedValue()); |
| Value *AsScalar = IRB.CreateBitCast(V, ByteScalarTy, Name + ".bytes.cast"); |
| Value *Trunc = IRB.CreateTrunc(AsScalar, ShortScalarTy, Name + ".trunc"); |
| return IRB.CreateBitCast(Trunc, OrigType, Name + ".orig"); |
| } |
| return IRB.CreateBitCast(V, OrigType, Name + ".real.ty"); |
| } |
| |
| Type *LegalizeBufferContentTypesVisitor::intrinsicTypeFor(Type *LegalType) { |
| auto *VT = dyn_cast<FixedVectorType>(LegalType); |
| if (!VT) |
| return LegalType; |
| Type *ET = VT->getElementType(); |
| // Explicitly return the element type of 1-element vectors because the |
| // underlying intrinsics don't like <1 x T> even though it's a synonym for T. |
| if (VT->getNumElements() == 1) |
| return ET; |
| if (DL.getTypeSizeInBits(LegalType) == 96 && DL.getTypeSizeInBits(ET) < 32) |
| return FixedVectorType::get(IRB.getInt32Ty(), 3); |
| if (ET->isIntegerTy(8)) { |
| switch (VT->getNumElements()) { |
| default: |
| return LegalType; // Let it crash later |
| case 1: |
| return IRB.getInt8Ty(); |
| case 2: |
| return IRB.getInt16Ty(); |
| case 4: |
| return IRB.getInt32Ty(); |
| case 8: |
| return FixedVectorType::get(IRB.getInt32Ty(), 2); |
| case 16: |
| return FixedVectorType::get(IRB.getInt32Ty(), 4); |
| } |
| } |
| return LegalType; |
| } |
| |
| void LegalizeBufferContentTypesVisitor::getVecSlices( |
| Type *T, SmallVectorImpl<VecSlice> &Slices) { |
| Slices.clear(); |
| auto *VT = dyn_cast<FixedVectorType>(T); |
| if (!VT) |
| return; |
| |
| uint64_t ElemBitWidth = |
| DL.getTypeSizeInBits(VT->getElementType()).getFixedValue(); |
| |
| uint64_t ElemsPer4Words = 128 / ElemBitWidth; |
| uint64_t ElemsPer2Words = ElemsPer4Words / 2; |
| uint64_t ElemsPerWord = ElemsPer2Words / 2; |
| uint64_t ElemsPerShort = ElemsPerWord / 2; |
| uint64_t ElemsPerByte = ElemsPerShort / 2; |
| // If the elements evenly pack into 32-bit words, we can use 3-word stores, |
| // such as for <6 x bfloat> or <3 x i32>, but we can't do this for, for |
| // example, <3 x i64>, since that's not slicing. |
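| // For example, a <7 x i32> is sliced into a 4-element part followed by a |
| // 3-element part. |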
| uint64_t ElemsPer3Words = ElemsPerWord * 3; |
| |
| uint64_t TotalElems = VT->getNumElements(); |
| uint64_t Index = 0; |
| auto TrySlice = [&](unsigned MaybeLen) { |
| if (MaybeLen > 0 && Index + MaybeLen <= TotalElems) { |
| VecSlice Slice{/*Index=*/Index, /*Length=*/MaybeLen}; |
| Slices.push_back(Slice); |
| Index += MaybeLen; |
| return true; |
| } |
| return false; |
| }; |
| while (Index < TotalElems) { |
| TrySlice(ElemsPer4Words) || TrySlice(ElemsPer3Words) || |
| TrySlice(ElemsPer2Words) || TrySlice(ElemsPerWord) || |
| TrySlice(ElemsPerShort) || TrySlice(ElemsPerByte); |
| } |
| } |
| |
| Value *LegalizeBufferContentTypesVisitor::extractSlice(Value *Vec, VecSlice S, |
| const Twine &Name) { |
| auto *VecVT = dyn_cast<FixedVectorType>(Vec->getType()); |
| if (!VecVT) |
| return Vec; |
| if (S.Length == VecVT->getNumElements() && S.Index == 0) |
| return Vec; |
| if (S.Length == 1) |
| return IRB.CreateExtractElement(Vec, S.Index, |
| Name + ".slice." + Twine(S.Index)); |
| SmallVector<int> Mask = llvm::to_vector( |
| llvm::iota_range<int>(S.Index, S.Index + S.Length, /*Inclusive=*/false)); |
| return IRB.CreateShuffleVector(Vec, Mask, Name + ".slice." + Twine(S.Index)); |
| } |
| |
| Value *LegalizeBufferContentTypesVisitor::insertSlice(Value *Whole, Value *Part, |
| VecSlice S, |
| const Twine &Name) { |
| auto *WholeVT = dyn_cast<FixedVectorType>(Whole->getType()); |
| if (!WholeVT) |
| return Part; |
| if (S.Length == WholeVT->getNumElements() && S.Index == 0) |
| return Part; |
| if (S.Length == 1) { |
| return IRB.CreateInsertElement(Whole, Part, S.Index, |
| Name + ".slice." + Twine(S.Index)); |
| } |
| int NumElems = cast<FixedVectorType>(Whole->getType())->getNumElements(); |
| |
| // Extend the slice with poisons to make the main shufflevector happy. |
| SmallVector<int> ExtPartMask(NumElems, -1); |
| for (auto [I, E] : llvm::enumerate( |
| MutableArrayRef<int>(ExtPartMask).take_front(S.Length))) { |
| E = I; |
| } |
| Value *ExtPart = IRB.CreateShuffleVector(Part, ExtPartMask, |
| Name + ".ext." + Twine(S.Index)); |
| |
| SmallVector<int> Mask = |
| llvm::to_vector(llvm::iota_range<int>(0, NumElems, /*Inclusive=*/false)); |
| for (auto [I, E] : |
| llvm::enumerate(MutableArrayRef<int>(Mask).slice(S.Index, S.Length))) |
| E = I + NumElems; |
| return IRB.CreateShuffleVector(Whole, ExtPart, Mask, |
| Name + ".parts." + Twine(S.Index)); |
| } |
| |
| bool LegalizeBufferContentTypesVisitor::visitLoadImpl( |
| LoadInst &OrigLI, Type *PartType, SmallVectorImpl<uint32_t> &AggIdxs, |
| uint64_t AggByteOff, Value *&Result, const Twine &Name) { |
| if (auto *ST = dyn_cast<StructType>(PartType)) { |
| const StructLayout *Layout = DL.getStructLayout(ST); |
| bool Changed = false; |
| for (auto [I, ElemTy, Offset] : |
| llvm::enumerate(ST->elements(), Layout->getMemberOffsets())) { |
| AggIdxs.push_back(I); |
| Changed |= visitLoadImpl(OrigLI, ElemTy, AggIdxs, |
| AggByteOff + Offset.getFixedValue(), Result, |
| Name + "." + Twine(I)); |
| AggIdxs.pop_back(); |
| } |
| return Changed; |
| } |
| if (auto *AT = dyn_cast<ArrayType>(PartType)) { |
| Type *ElemTy = AT->getElementType(); |
| if (!ElemTy->isSingleValueType() || !DL.typeSizeEqualsStoreSize(ElemTy) || |
| ElemTy->isVectorTy()) { |
| TypeSize ElemStoreSize = DL.getTypeStoreSize(ElemTy); |
| bool Changed = false; |
| for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(), |
| /*Inclusive=*/false)) { |
| AggIdxs.push_back(I); |
| Changed |= visitLoadImpl(OrigLI, ElemTy, AggIdxs, |
| AggByteOff + I * ElemStoreSize.getFixedValue(), |
| Result, Name + Twine(I)); |
| AggIdxs.pop_back(); |
| } |
| return Changed; |
| } |
| } |
| |
| // Typical case |
| |
| Type *ArrayAsVecType = scalarArrayTypeAsVector(PartType); |
| Type *LegalType = legalNonAggregateFor(ArrayAsVecType); |
| |
| SmallVector<VecSlice> Slices; |
| getVecSlices(LegalType, Slices); |
| bool HasSlices = Slices.size() > 1; |
| bool IsAggPart = !AggIdxs.empty(); |
| Value *LoadsRes; |
| if (!HasSlices && !IsAggPart) { |
| Type *LoadableType = intrinsicTypeFor(LegalType); |
| if (LoadableType == PartType) |
| return false; |
| |
| IRB.SetInsertPoint(&OrigLI); |
| auto *NLI = cast<LoadInst>(OrigLI.clone()); |
| NLI->mutateType(LoadableType); |
| NLI = IRB.Insert(NLI); |
| NLI->setName(Name + ".loadable"); |
| |
| LoadsRes = IRB.CreateBitCast(NLI, LegalType, Name + ".from.loadable"); |
| } else { |
| IRB.SetInsertPoint(&OrigLI); |
| LoadsRes = PoisonValue::get(LegalType); |
| Value *OrigPtr = OrigLI.getPointerOperand(); |
| // If we need to split something into more than one load, its legal |
| // type will be a vector (ex. an i256 load will have LegalType = <8 x i32>). |
| // But if we're already a scalar (which can happen if we're splitting up a |
| // struct), the element type will be the legal type itself. |
| Type *ElemType = LegalType->getScalarType(); |
| unsigned ElemBytes = DL.getTypeStoreSize(ElemType); |
| AAMDNodes AANodes = OrigLI.getAAMetadata(); |
| if (IsAggPart && Slices.empty()) |
| Slices.push_back(VecSlice{/*Index=*/0, /*Length=*/1}); |
| for (VecSlice S : Slices) { |
| Type *SliceType = |
| S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType; |
| int64_t ByteOffset = AggByteOff + S.Index * ElemBytes; |
| // You can't reasonably expect loads to wrap around the edge of memory. |
| Value *NewPtr = IRB.CreateGEP( |
| IRB.getInt8Ty(), OrigLI.getPointerOperand(), IRB.getInt32(ByteOffset), |
| OrigPtr->getName() + ".off.ptr." + Twine(ByteOffset), |
| GEPNoWrapFlags::noUnsignedWrap()); |
| Type *LoadableType = intrinsicTypeFor(SliceType); |
| LoadInst *NewLI = IRB.CreateAlignedLoad( |
| LoadableType, NewPtr, commonAlignment(OrigLI.getAlign(), ByteOffset), |
| Name + ".off." + Twine(ByteOffset)); |
| copyMetadataForLoad(*NewLI, OrigLI); |
| NewLI->setAAMetadata( |
| AANodes.adjustForAccess(ByteOffset, LoadableType, DL)); |
| NewLI->setAtomic(OrigLI.getOrdering(), OrigLI.getSyncScopeID()); |
| NewLI->setVolatile(OrigLI.isVolatile()); |
| Value *Loaded = IRB.CreateBitCast(NewLI, SliceType, |
| NewLI->getName() + ".from.loadable"); |
| LoadsRes = insertSlice(LoadsRes, Loaded, S, Name); |
| } |
| } |
| if (LegalType != ArrayAsVecType) |
| LoadsRes = makeIllegalNonAggregate(LoadsRes, ArrayAsVecType, Name); |
| if (ArrayAsVecType != PartType) |
| LoadsRes = vectorToArray(LoadsRes, PartType, Name); |
| |
| if (IsAggPart) |
| Result = IRB.CreateInsertValue(Result, LoadsRes, AggIdxs, Name); |
| else |
| Result = LoadsRes; |
| return true; |
| } |
| |
| bool LegalizeBufferContentTypesVisitor::visitLoadInst(LoadInst &LI) { |
| if (LI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER) |
| return false; |
| |
| SmallVector<uint32_t> AggIdxs; |
| Type *OrigType = LI.getType(); |
| Value *Result = PoisonValue::get(OrigType); |
| bool Changed = visitLoadImpl(LI, OrigType, AggIdxs, 0, Result, LI.getName()); |
| if (!Changed) |
| return false; |
| Result->takeName(&LI); |
| LI.replaceAllUsesWith(Result); |
| LI.eraseFromParent(); |
| return Changed; |
| } |
| |
| std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl( |
| StoreInst &OrigSI, Type *PartType, SmallVectorImpl<uint32_t> &AggIdxs, |
| uint64_t AggByteOff, const Twine &Name) { |
| if (auto *ST = dyn_cast<StructType>(PartType)) { |
| const StructLayout *Layout = DL.getStructLayout(ST); |
| bool Changed = false; |
| for (auto [I, ElemTy, Offset] : |
| llvm::enumerate(ST->elements(), Layout->getMemberOffsets())) { |
| AggIdxs.push_back(I); |
| Changed |= std::get<0>(visitStoreImpl(OrigSI, ElemTy, AggIdxs, |
| AggByteOff + Offset.getFixedValue(), |
| Name + "." + Twine(I))); |
| AggIdxs.pop_back(); |
| } |
| return std::make_pair(Changed, /*ModifiedInPlace=*/false); |
| } |
| if (auto *AT = dyn_cast<ArrayType>(PartType)) { |
| Type *ElemTy = AT->getElementType(); |
| if (!ElemTy->isSingleValueType() || !DL.typeSizeEqualsStoreSize(ElemTy) || |
| ElemTy->isVectorTy()) { |
| TypeSize ElemStoreSize = DL.getTypeStoreSize(ElemTy); |
| bool Changed = false; |
| for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(), |
| /*Inclusive=*/false)) { |
| AggIdxs.push_back(I); |
| Changed |= std::get<0>(visitStoreImpl( |
| OrigSI, ElemTy, AggIdxs, |
| AggByteOff + I * ElemStoreSize.getFixedValue(), Name + Twine(I))); |
| AggIdxs.pop_back(); |
| } |
| return std::make_pair(Changed, /*ModifiedInPlace=*/false); |
| } |
| } |
| |
| Value *OrigData = OrigSI.getValueOperand(); |
| Value *NewData = OrigData; |
| |
| bool IsAggPart = !AggIdxs.empty(); |
| if (IsAggPart) |
| NewData = IRB.CreateExtractValue(NewData, AggIdxs, Name); |
| |
| Type *ArrayAsVecType = scalarArrayTypeAsVector(PartType); |
| if (ArrayAsVecType != PartType) { |
| NewData = arrayToVector(NewData, ArrayAsVecType, Name); |
| } |
| |
| Type *LegalType = legalNonAggregateFor(ArrayAsVecType); |
| if (LegalType != ArrayAsVecType) { |
| NewData = makeLegalNonAggregate(NewData, LegalType, Name); |
| } |
| |
| SmallVector<VecSlice> Slices; |
| getVecSlices(LegalType, Slices); |
| bool NeedToSplit = Slices.size() > 1 || IsAggPart; |
| if (!NeedToSplit) { |
| Type *StorableType = intrinsicTypeFor(LegalType); |
| if (StorableType == PartType) |
| return std::make_pair(/*Changed=*/false, /*ModifiedInPlace=*/false); |
| NewData = IRB.CreateBitCast(NewData, StorableType, Name + ".storable"); |
| OrigSI.setOperand(0, NewData); |
| return std::make_pair(/*Changed=*/true, /*ModifiedInPlace=*/true); |
| } |
| |
| Value *OrigPtr = OrigSI.getPointerOperand(); |
| Type *ElemType = LegalType->getScalarType(); |
| if (IsAggPart && Slices.empty()) |
| Slices.push_back(VecSlice{/*Index=*/0, /*Length=*/1}); |
| unsigned ElemBytes = DL.getTypeStoreSize(ElemType); |
| AAMDNodes AANodes = OrigSI.getAAMetadata(); |
| for (VecSlice S : Slices) { |
| Type *SliceType = |
| S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType; |
| int64_t ByteOffset = AggByteOff + S.Index * ElemBytes; |
| Value *NewPtr = |
| IRB.CreateGEP(IRB.getInt8Ty(), OrigPtr, IRB.getInt32(ByteOffset), |
| OrigPtr->getName() + ".part." + Twine(S.Index), |
| GEPNoWrapFlags::noUnsignedWrap()); |
| Value *DataSlice = extractSlice(NewData, S, Name); |
| Type *StorableType = intrinsicTypeFor(SliceType); |
| DataSlice = IRB.CreateBitCast(DataSlice, StorableType, |
| DataSlice->getName() + ".storable"); |
| auto *NewSI = cast<StoreInst>(OrigSI.clone()); |
| NewSI->setAlignment(commonAlignment(OrigSI.getAlign(), ByteOffset)); |
| IRB.Insert(NewSI); |
| NewSI->setOperand(0, DataSlice); |
| NewSI->setOperand(1, NewPtr); |
| NewSI->setAAMetadata(AANodes.adjustForAccess(ByteOffset, StorableType, DL)); |
| } |
| return std::make_pair(/*Changed=*/true, /*ModifiedInPlace=*/false); |
| } |
| |
| bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) { |
| if (SI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER) |
| return false; |
| IRB.SetInsertPoint(&SI); |
| SmallVector<uint32_t> AggIdxs; |
| Value *OrigData = SI.getValueOperand(); |
| auto [Changed, ModifiedInPlace] = |
| visitStoreImpl(SI, OrigData->getType(), AggIdxs, 0, OrigData->getName()); |
| if (Changed && !ModifiedInPlace) |
| SI.eraseFromParent(); |
| return Changed; |
| } |
| |
| bool LegalizeBufferContentTypesVisitor::processFunction(Function &F) { |
| bool Changed = false; |
| // Note: memcpy-like intrinsics that touch buffer fat pointers have already |
| // been expanded into loops by an earlier phase, so only plain loads and |
| // stores need to be legalized here. |
| for (Instruction &I : make_early_inc_range(instructions(F))) { |
| Changed |= visit(I); |
| } |
| return Changed; |
| } |
| |
| /// Return the ptr addrspace(8) and i32 (resource and offset parts) in a lowered |
| /// buffer fat pointer constant. |
| static std::pair<Constant *, Constant *> |
| splitLoweredFatBufferConst(Constant *C) { |
| assert(isSplitFatPtr(C->getType()) && "Not a split fat buffer pointer"); |
| return std::make_pair(C->getAggregateElement(0u), C->getAggregateElement(1u)); |
| } |
| |
| namespace { |
| /// Handle the remapping of ptr addrspace(7) constants. |
| class FatPtrConstMaterializer final : public ValueMaterializer { |
| BufferFatPtrToStructTypeMap *TypeMap; |
| // An internal mapper that is used to recurse into the arguments of constants. |
| // While the documentation for `ValueMapper` specifies not to use it |
| // recursively, examination of the logic in mapValue() shows that it can |
| // safely be used recursively when handling constants, like it does in its own |
| // logic. |
| ValueMapper InternalMapper; |
| |
| Constant *materializeBufferFatPtrConst(Constant *C); |
| |
| public: |
| // UnderlyingMap is the value map this materializer will be filling. |
| FatPtrConstMaterializer(BufferFatPtrToStructTypeMap *TypeMap, |
| ValueToValueMapTy &UnderlyingMap) |
| : TypeMap(TypeMap), |
| InternalMapper(UnderlyingMap, RF_None, TypeMap, this) {} |
| virtual ~FatPtrConstMaterializer() = default; |
| |
| Value *materialize(Value *V) override; |
| }; |
| } // namespace |
| |
| Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) { |
| Type *SrcTy = C->getType(); |
| auto *NewTy = dyn_cast<StructType>(TypeMap->remapType(SrcTy)); |
| if (C->isNullValue()) |
| return ConstantAggregateZero::getNullValue(NewTy); |
| if (isa<PoisonValue>(C)) { |
| return ConstantStruct::get(NewTy, |
| {PoisonValue::get(NewTy->getElementType(0)), |
| PoisonValue::get(NewTy->getElementType(1))}); |
| } |
| if (isa<UndefValue>(C)) { |
| return ConstantStruct::get(NewTy, |
| {UndefValue::get(NewTy->getElementType(0)), |
| UndefValue::get(NewTy->getElementType(1))}); |
| } |
| |
| if (auto *VC = dyn_cast<ConstantVector>(C)) { |
| if (Constant *S = VC->getSplatValue()) { |
| Constant *NewS = InternalMapper.mapConstant(*S); |
| if (!NewS) |
| return nullptr; |
| auto [Rsrc, Off] = splitLoweredFatBufferConst(NewS); |
| auto EC = VC->getType()->getElementCount(); |
| return ConstantStruct::get(NewTy, {ConstantVector::getSplat(EC, Rsrc), |
| ConstantVector::getSplat(EC, Off)}); |
| } |
| SmallVector<Constant *> Rsrcs; |
| SmallVector<Constant *> Offs; |
| for (Value *Op : VC->operand_values()) { |
| auto *NewOp = dyn_cast_or_null<Constant>(InternalMapper.mapValue(*Op)); |
| if (!NewOp) |
| return nullptr; |
| auto [Rsrc, Off] = splitLoweredFatBufferConst(NewOp); |
| Rsrcs.push_back(Rsrc); |
| Offs.push_back(Off); |
| } |
| Constant *RsrcVec = ConstantVector::get(Rsrcs); |
| Constant *OffVec = ConstantVector::get(Offs); |
| return ConstantStruct::get(NewTy, {RsrcVec, OffVec}); |
| } |
| |
| if (isa<GlobalValue>(C)) |
| report_fatal_error("Global values containing ptr addrspace(7) (buffer " |
| "fat pointer) values are not supported"); |
| |
| if (isa<ConstantExpr>(C)) |
| report_fatal_error("Constant exprs containing ptr addrspace(7) (buffer " |
| "fat pointer) values should have been expanded earlier"); |
| |
| return nullptr; |
| } |
| |
| Value *FatPtrConstMaterializer::materialize(Value *V) { |
| Constant *C = dyn_cast<Constant>(V); |
| if (!C) |
| return nullptr; |
| // Structs and other types that happen to contain fat pointers get remapped |
| // by the mapValue() logic. |
| if (!isBufferFatPtrConst(C)) |
| return nullptr; |
| return materializeBufferFatPtrConst(C); |
| } |
| |
| using PtrParts = std::pair<Value *, Value *>; |
| namespace { |
| // The visitor returns the resource and offset parts for an instruction if they |
| // can be computed, or (nullptr, nullptr) for cases that don't have a meaningful |
| // value mapping. |
| class SplitPtrStructs : public InstVisitor<SplitPtrStructs, PtrParts> { |
| ValueToValueMapTy RsrcParts; |
| ValueToValueMapTy OffParts; |
| |
| // Track instructions that have been rewritten into a user of the component |
| // parts of their ptr addrspace(7) input. Instructions that produce |
| // ptr addrspace(7) values should **not** be RAUW'd before being added to this |
| // set, as that replacement will be handled in a post-visit step. However, |
| // instructions that yield values that aren't fat pointers (ex. ptrtoint) |
| // should RAUW themselves with new instructions that use the split parts |
| // of their arguments during processing. |
| DenseSet<Instruction *> SplitUsers; |
| |
| // Nodes that need a second look once we've computed the parts for all other |
| // instructions to see if, for example, we really need to phi on the resource |
| // part. |
| SmallVector<Instruction *> Conditionals; |
| // Temporary instructions produced while lowering conditionals that should be |
| // killed. |
| SmallVector<Instruction *> ConditionalTemps; |
| |
| // Subtarget info, needed for determining what cache control bits to set. |
| const TargetMachine *TM; |
| const GCNSubtarget *ST = nullptr; |
| |
| IRBuilder<InstSimplifyFolder> IRB; |
| |
| // Copy metadata between instructions if applicable. |
| void copyMetadata(Value *Dest, Value *Src); |
| |
| // Get the resource and offset parts of the value V, inserting appropriate |
| // extractvalue calls if needed. |
| PtrParts getPtrParts(Value *V); |
| |
| // Given an instruction that could produce multiple resource parts (a PHI or |
| // select), collect the set of possible values that could have provided its |
| // resource part (the `Roots`) and the set of |
| // conditional instructions visited during the search (`Seen`). If, after |
| // removing the root of the search from `Seen` and `Roots`, `Seen` is a subset |
| // of `Roots` and `Roots - Seen` contains one element, the resource part of |
| // that element can replace the resource part of all other elements in `Seen`. |
| void getPossibleRsrcRoots(Instruction *I, SmallPtrSetImpl<Value *> &Roots, |
| SmallPtrSetImpl<Value *> &Seen); |
| void processConditionals(); |
| |
| // If an instruction has been split into resource and offset parts, |
| // delete that instruction. If any of its uses have not themselves been split |
| // into parts (for example, an insertvalue), construct the struct value |
| // that the type remapping says the dying instruction should produce, |
| // and use that. |
| // Also, kill the temporary extractvalue operations produced by the two-stage |
| // lowering of PHIs and conditionals. |
| void killAndReplaceSplitInstructions(SmallVectorImpl<Instruction *> &Origs); |
| |
| void setAlign(CallInst *Intr, Align A, unsigned RsrcArgIdx); |
| void insertPreMemOpFence(AtomicOrdering Order, SyncScope::ID SSID); |
| void insertPostMemOpFence(AtomicOrdering Order, SyncScope::ID SSID); |
| Value *handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, Type *Ty, |
| Align Alignment, AtomicOrdering Order, |
| bool IsVolatile, SyncScope::ID SSID); |
| |
| public: |
| SplitPtrStructs(const DataLayout &DL, LLVMContext &Ctx, |
| const TargetMachine *TM) |
| : TM(TM), IRB(Ctx, InstSimplifyFolder(DL)) {} |
| |
| void processFunction(Function &F); |
| |
| PtrParts visitInstruction(Instruction &I); |
| PtrParts visitLoadInst(LoadInst &LI); |
| PtrParts visitStoreInst(StoreInst &SI); |
| PtrParts visitAtomicRMWInst(AtomicRMWInst &AI); |
| PtrParts visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI); |
| PtrParts visitGetElementPtrInst(GetElementPtrInst &GEP); |
| |
| PtrParts visitPtrToIntInst(PtrToIntInst &PI); |
| PtrParts visitIntToPtrInst(IntToPtrInst &IP); |
| PtrParts visitAddrSpaceCastInst(AddrSpaceCastInst &I); |
| PtrParts visitICmpInst(ICmpInst &Cmp); |
| PtrParts visitFreezeInst(FreezeInst &I); |
| |
| PtrParts visitExtractElementInst(ExtractElementInst &I); |
| PtrParts visitInsertElementInst(InsertElementInst &I); |
| PtrParts visitShuffleVectorInst(ShuffleVectorInst &I); |
| |
| PtrParts visitPHINode(PHINode &PHI); |
| PtrParts visitSelectInst(SelectInst &SI); |
| |
| PtrParts visitIntrinsicInst(IntrinsicInst &II); |
| }; |
| } // namespace |
| |
| void SplitPtrStructs::copyMetadata(Value *Dest, Value *Src) { |
| auto *DestI = dyn_cast<Instruction>(Dest); |
| auto *SrcI = dyn_cast<Instruction>(Src); |
| |
| if (!DestI || !SrcI) |
| return; |
| |
| DestI->copyMetadata(*SrcI); |
| } |
| |
| PtrParts SplitPtrStructs::getPtrParts(Value *V) { |
| assert(isSplitFatPtr(V->getType()) && "it's not meaningful to get the parts " |
| "of something that wasn't rewritten"); |
| auto *RsrcEntry = &RsrcParts[V]; |
| auto *OffEntry = &OffParts[V]; |
| if (*RsrcEntry && *OffEntry) |
| return {*RsrcEntry, *OffEntry}; |
| |
| if (auto *C = dyn_cast<Constant>(V)) { |
| auto [Rsrc, Off] = splitLoweredFatBufferConst(C); |
| return {*RsrcEntry = Rsrc, *OffEntry = Off}; |
| } |
| |
| IRBuilder<InstSimplifyFolder>::InsertPointGuard Guard(IRB); |
| if (auto *I = dyn_cast<Instruction>(V)) { |
| LLVM_DEBUG(dbgs() << "Recursing to split parts of " << *I << "\n"); |
| auto [Rsrc, Off] = visit(*I); |
| if (Rsrc && Off) |
| return {*RsrcEntry = Rsrc, *OffEntry = Off}; |
| // We'll be creating the new values after the relevant instruction. |
| // This instruction generates a value and so isn't a terminator. |
| IRB.SetInsertPoint(*I->getInsertionPointAfterDef()); |
| IRB.SetCurrentDebugLocation(I->getDebugLoc()); |
| } else if (auto *A = dyn_cast<Argument>(V)) { |
| IRB.SetInsertPointPastAllocas(A->getParent()); |
| IRB.SetCurrentDebugLocation(DebugLoc()); |
| } |
| Value *Rsrc = IRB.CreateExtractValue(V, 0, V->getName() + ".rsrc"); |
| Value *Off = IRB.CreateExtractValue(V, 1, V->getName() + ".off"); |
| return {*RsrcEntry = Rsrc, *OffEntry = Off}; |
| } |
| |
| /// Returns the instruction that defines the resource part of the value V. |
| /// Note that this is not getUnderlyingObject(), since that looks through |
| /// operations like ptrmask which might modify the resource part. |
| /// |
| /// We can limit ourselves to just looking through GEPs followed by looking |
| /// through addrspacecasts because only those two operations preserve the |
| /// resource part, and because operations on an `addrspace(8)` (which is the |
| /// legal input to this addrspacecast) would produce a different resource part. |
| static Value *rsrcPartRoot(Value *V) { |
| while (auto *GEP = dyn_cast<GEPOperator>(V)) |
| V = GEP->getPointerOperand(); |
| while (auto *ASC = dyn_cast<AddrSpaceCastOperator>(V)) |
| V = ASC->getPointerOperand(); |
| return V; |
| } |
| |
| void SplitPtrStructs::getPossibleRsrcRoots(Instruction *I, |
| SmallPtrSetImpl<Value *> &Roots, |
| SmallPtrSetImpl<Value *> &Seen) { |
| if (auto *PHI = dyn_cast<PHINode>(I)) { |
| if (!Seen.insert(I).second) |
| return; |
| for (Value *In : PHI->incoming_values()) { |
| In = rsrcPartRoot(In); |
| Roots.insert(In); |
| if (isa<PHINode, SelectInst>(In)) |
| getPossibleRsrcRoots(cast<Instruction>(In), Roots, Seen); |
| } |
| } else if (auto *SI = dyn_cast<SelectInst>(I)) { |
| if (!Seen.insert(SI).second) |
| return; |
| Value *TrueVal = rsrcPartRoot(SI->getTrueValue()); |
| Value *FalseVal = rsrcPartRoot(SI->getFalseValue()); |
| Roots.insert(TrueVal); |
| Roots.insert(FalseVal); |
| if (isa<PHINode, SelectInst>(TrueVal)) |
| getPossibleRsrcRoots(cast<Instruction>(TrueVal), Roots, Seen); |
| if (isa<PHINode, SelectInst>(FalseVal)) |
| getPossibleRsrcRoots(cast<Instruction>(FalseVal), Roots, Seen); |
| } else { |
| llvm_unreachable("getPossibleRsrcParts() only works on phi and select"); |
| } |
| } |
| |
| void SplitPtrStructs::processConditionals() { |
| SmallDenseMap<Value *, Value *> FoundRsrcs; |
| SmallPtrSet<Value *, 4> Roots; |
| SmallPtrSet<Value *, 4> Seen; |
| for (Instruction *I : Conditionals) { |
| // These have to exist by now because we've visited these nodes. |
| Value *Rsrc = RsrcParts[I]; |
| Value *Off = OffParts[I]; |
| assert(Rsrc && Off && "must have visited conditionals by now"); |
| |
| std::optional<Value *> MaybeRsrc; |
| auto MaybeFoundRsrc = FoundRsrcs.find(I); |
| if (MaybeFoundRsrc != FoundRsrcs.end()) { |
| MaybeRsrc = MaybeFoundRsrc->second; |
| } else { |
| IRBuilder<InstSimplifyFolder>::InsertPointGuard Guard(IRB); |
| Roots.clear(); |
| Seen.clear(); |
| getPossibleRsrcRoots(I, Roots, Seen); |
| LLVM_DEBUG(dbgs() << "Processing conditional: " << *I << "\n"); |
| #ifndef NDEBUG |
| for (Value *V : Roots) |
| LLVM_DEBUG(dbgs() << "Root: " << *V << "\n"); |
| for (Value *V : Seen) |
| LLVM_DEBUG(dbgs() << "Seen: " << *V << "\n"); |
| #endif |
| // If we are our own possible root, then we shouldn't block our |
| // replacement with a valid incoming value. |
| Roots.erase(I); |
| // We don't want to block the optimization for conditionals that don't |
| // refer to themselves but did see themselves during the traversal. |
| Seen.erase(I); |
| |
| if (set_is_subset(Seen, Roots)) { |
| auto Diff = set_difference(Roots, Seen); |
| if (Diff.size() == 1) { |
| Value *RootVal = *Diff.begin(); |
| // Handle the case where previous loops already looked through |
| // an addrspacecast. |
| if (isSplitFatPtr(RootVal->getType())) |
| MaybeRsrc = std::get<0>(getPtrParts(RootVal)); |
| else |
| MaybeRsrc = RootVal; |
| } |
| } |
| } |
| |
| if (auto *PHI = dyn_cast<PHINode>(I)) { |
| Value *NewRsrc; |
| StructType *PHITy = cast<StructType>(PHI->getType()); |
| IRB.SetInsertPoint(*PHI->getInsertionPointAfterDef()); |
| IRB.SetCurrentDebugLocation(PHI->getDebugLoc()); |
| if (MaybeRsrc) { |
| NewRsrc = *MaybeRsrc; |
| } else { |
| Type *RsrcTy = PHITy->getElementType(0); |
| auto *RsrcPHI = IRB.CreatePHI(RsrcTy, PHI->getNumIncomingValues()); |
| RsrcPHI->takeName(Rsrc); |
| for (auto [V, BB] : llvm::zip(PHI->incoming_values(), PHI->blocks())) { |
| Value *VRsrc = std::get<0>(getPtrParts(V)); |
| RsrcPHI->addIncoming(VRsrc, BB); |
| } |
| copyMetadata(RsrcPHI, PHI); |
| NewRsrc = RsrcPHI; |
| } |
| |
| Type *OffTy = PHITy->getElementType(1); |
| auto *NewOff = IRB.CreatePHI(OffTy, PHI->getNumIncomingValues()); |
| NewOff->takeName(Off); |
| for (auto [V, BB] : llvm::zip(PHI->incoming_values(), PHI->blocks())) { |
| assert(OffParts.count(V) && "An offset part had to be created by now"); |
| Value *VOff = std::get<1>(getPtrParts(V)); |
| NewOff->addIncoming(VOff, BB); |
| } |
| copyMetadata(NewOff, PHI); |
| |
| // Note: We don't eraseFromParent() the temporaries because we don't want |
| // to put the correction maps in an inconsistent state. That'll be handled |
| // during the rest of the killing. Also, `ValueToValueMapTy` guarantees |
| // that references in that map will be updated as well. |
| // Note that if the temporary instruction got `InstSimplify`'d away, it |
| // might not be an instruction at all (for example, a function argument). |
| if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) { |
| ConditionalTemps.push_back(RsrcInst); |
| RsrcInst->replaceAllUsesWith(NewRsrc); |
| } |
| if (auto *OffInst = dyn_cast<Instruction>(Off)) { |
| ConditionalTemps.push_back(OffInst); |
| OffInst->replaceAllUsesWith(NewOff); |
| } |
| |
| // Save on recomputing the cycle traversals in known-root cases. |
| if (MaybeRsrc) |
| for (Value *V : Seen) |
| FoundRsrcs[V] = NewRsrc; |
| } else if (isa<SelectInst>(I)) { |
| if (MaybeRsrc) { |
| if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) { |
| ConditionalTemps.push_back(RsrcInst); |
| RsrcInst->replaceAllUsesWith(*MaybeRsrc); |
| } |
| for (Value *V : Seen) |
| FoundRsrcs[V] = *MaybeRsrc; |
| } |
| } else { |
| llvm_unreachable("Only PHIs and selects go in the conditionals list"); |
| } |
| } |
| } |
| |
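| // Debug values that referred to a split pointer are rewritten as two |
| // fragments of the original variable: one starting at bit 0 covering the |
| // resource part and one immediately after it covering the offset. |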
| void SplitPtrStructs::killAndReplaceSplitInstructions( |
| SmallVectorImpl<Instruction *> &Origs) { |
| for (Instruction *I : ConditionalTemps) |
| I->eraseFromParent(); |
| |
| for (Instruction *I : Origs) { |
| if (!SplitUsers.contains(I)) |
| continue; |
| |
| SmallVector<DbgValueInst *> Dbgs; |
| findDbgValues(Dbgs, I); |
| for (auto *Dbg : Dbgs) { |
| IRB.SetInsertPoint(Dbg); |
| auto &DL = I->getDataLayout(); |
| assert(isSplitFatPtr(I->getType()) && |
| "We should've RAUW'd away loads, stores, etc. at this point"); |
| auto *OffDbg = cast<DbgValueInst>(Dbg->clone()); |
| copyMetadata(OffDbg, Dbg); |
| auto [Rsrc, Off] = getPtrParts(I); |
| |
| int64_t RsrcSz = DL.getTypeSizeInBits(Rsrc->getType()); |
| int64_t OffSz = DL.getTypeSizeInBits(Off->getType()); |
| |
| std::optional<DIExpression *> RsrcExpr = |
| DIExpression::createFragmentExpression(Dbg->getExpression(), 0, |
| RsrcSz); |
| std::optional<DIExpression *> OffExpr = |
| DIExpression::createFragmentExpression(Dbg->getExpression(), RsrcSz, |
| OffSz); |
| if (OffExpr) { |
| OffDbg->setExpression(*OffExpr); |
| OffDbg->replaceVariableLocationOp(I, Off); |
| IRB.Insert(OffDbg); |
| } else { |
| OffDbg->deleteValue(); |
| } |
| if (RsrcExpr) { |
| Dbg->setExpression(*RsrcExpr); |
| Dbg->replaceVariableLocationOp(I, Rsrc); |
| } else { |
| Dbg->replaceVariableLocationOp(I, PoisonValue::get(I->getType())); |
| } |
| } |
| |
| Value *Poison = PoisonValue::get(I->getType()); |
| I->replaceUsesWithIf(Poison, [&](const Use &U) -> bool { |
| if (const auto *UI = dyn_cast<Instruction>(U.getUser())) |
| return SplitUsers.contains(UI); |
| return false; |
| }); |
| |
| if (I->use_empty()) { |
| I->eraseFromParent(); |
| continue; |
| } |
| IRB.SetInsertPoint(*I->getInsertionPointAfterDef()); |
| IRB.SetCurrentDebugLocation(I->getDebugLoc()); |
| auto [Rsrc, Off] = getPtrParts(I); |
| Value *Struct = PoisonValue::get(I->getType()); |
| Struct = IRB.CreateInsertValue(Struct, Rsrc, 0); |
| Struct = IRB.CreateInsertValue(Struct, Off, 1); |
| copyMetadata(Struct, I); |
| Struct->takeName(I); |
| I->replaceAllUsesWith(Struct); |
| I->eraseFromParent(); |
| } |
| } |
| |
| void SplitPtrStructs::setAlign(CallInst *Intr, Align A, unsigned RsrcArgIdx) { |
| LLVMContext &Ctx = Intr->getContext(); |
| Intr->addParamAttr(RsrcArgIdx, Attribute::getWithAlignment(Ctx, A)); |
| } |
| |
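| // The raw buffer intrinsics don't carry an atomic ordering, so acquire and |
| // release semantics are realized by emitting explicit fences before and |
| // after the rewritten memory operation. |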
| void SplitPtrStructs::insertPreMemOpFence(AtomicOrdering Order, |
| SyncScope::ID SSID) { |
| switch (Order) { |
| case AtomicOrdering::Release: |
| case AtomicOrdering::AcquireRelease: |
| case AtomicOrdering::SequentiallyConsistent: |
| IRB.CreateFence(AtomicOrdering::Release, SSID); |
| break; |
| default: |
| break; |
| } |
| } |
| |
| void SplitPtrStructs::insertPostMemOpFence(AtomicOrdering Order, |
| SyncScope::ID SSID) { |
| switch (Order) { |
| case AtomicOrdering::Acquire: |
| case AtomicOrdering::AcquireRelease: |
| case AtomicOrdering::SequentiallyConsistent: |
| IRB.CreateFence(AtomicOrdering::Acquire, SSID); |
| break; |
| default: |
| break; |
| } |
| } |
| |
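| // Rewrite a load, store, or atomicrmw on a buffer fat pointer into the |
| // corresponding raw buffer intrinsic. For example, `load i32, ptr |
| // addrspace(7) %p` becomes, roughly, |
| // `call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) %p.rsrc, |
| // i32 %p.off, i32 0, i32 0)`. |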
| Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, |
| Type *Ty, Align Alignment, |
| AtomicOrdering Order, bool IsVolatile, |
| SyncScope::ID SSID) { |
| IRB.SetInsertPoint(I); |
| |
| auto [Rsrc, Off] = getPtrParts(Ptr); |
| SmallVector<Value *, 5> Args; |
| if (Arg) |
| Args.push_back(Arg); |
| Args.push_back(Rsrc); |
| Args.push_back(Off); |
| insertPreMemOpFence(Order, SSID); |
| // soffset is always 0 for these cases, since we always want any offset to be |
| // part of bounds checking and we don't know which parts of the GEP are |
| // uniform. |
| Args.push_back(IRB.getInt32(0)); |
| |
| uint32_t Aux = 0; |
| if (IsVolatile) |
| Aux |= AMDGPU::CPol::VOLATILE; |
| Args.push_back(IRB.getInt32(Aux)); |
| |
| Intrinsic::ID IID = Intrinsic::not_intrinsic; |
| if (isa<LoadInst>(I)) |
| IID = Order == AtomicOrdering::NotAtomic |
| ? Intrinsic::amdgcn_raw_ptr_buffer_load |
| : Intrinsic::amdgcn_raw_ptr_atomic_buffer_load; |
| else if (isa<StoreInst>(I)) |
| IID = Intrinsic::amdgcn_raw_ptr_buffer_store; |
| else if (auto *RMW = dyn_cast<AtomicRMWInst>(I)) { |
| switch (RMW->getOperation()) { |
| case AtomicRMWInst::Xchg: |
| IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap; |
| break; |
| case AtomicRMWInst::Add: |
| IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_add; |
| break; |
| case AtomicRMWInst::Sub: |
| IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub; |
| break; |
| case AtomicRMWInst::And: |
| IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_and; |
| break; |
| case AtomicRMWInst::Or: |
| IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_or; |
| break; |
| case AtomicRMWInst::Xor: |
| IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor; |
| break; |
| case AtomicRMWInst::Max: |
| IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax; |
| break; |
| case AtomicRMWInst::Min: |
| IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin; |
| break; |
| case AtomicRMWInst::UMax: |
| IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax; |
| break; |
| case AtomicRMWInst::UMin: |
| IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin; |
| break; |
| case AtomicRMWInst::FAdd: |
| IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd; |
| break; |
| case AtomicRMWInst::FMax: |
| IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax; |
| break; |
| case AtomicRMWInst::FMin: |
| IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin; |
| break; |
| case AtomicRMWInst::FSub: { |
| report_fatal_error("atomic floating point subtraction not supported for " |
| "buffer resources and should've been expanded away"); |
| break; |
| } |
| case AtomicRMWInst::Nand: |
| report_fatal_error("atomic nand not supported for buffer resources and " |
| "should've been expanded away"); |
| break; |
| case AtomicRMWInst::UIncWrap: |
| case AtomicRMWInst::UDecWrap: |
| report_fatal_error("wrapping increment/decrement not supported for " |
| "buffer resources and should've ben expanded away"); |
| break; |
| case AtomicRMWInst::BAD_BINOP: |
| llvm_unreachable("Not sure how we got a bad binop"); |
| case AtomicRMWInst::USubCond: |
| case AtomicRMWInst::USubSat: |
| break; |
| } |
| } |
| |
| auto *Call = IRB.CreateIntrinsic(IID, Ty, Args); |
| copyMetadata(Call, I); |
| setAlign(Call, Alignment, Arg ? 1 : 0); |
| Call->takeName(I); |
| |
| insertPostMemOpFence(Order, SSID); |
| // The "no moving p7 directly" rewrites ensure that this load or store won't |
| // itself need to be split into parts. |
| SplitUsers.insert(I); |
| I->replaceAllUsesWith(Call); |
| return Call; |
| } |
| |
| PtrParts SplitPtrStructs::visitInstruction(Instruction &I) { |
| return {nullptr, nullptr}; |
| } |
| |
| PtrParts SplitPtrStructs::visitLoadInst(LoadInst &LI) { |
| if (!isSplitFatPtr(LI.getPointerOperandType())) |
| return {nullptr, nullptr}; |
| handleMemoryInst(&LI, nullptr, LI.getPointerOperand(), LI.getType(), |
| LI.getAlign(), LI.getOrdering(), LI.isVolatile(), |
| LI.getSyncScopeID()); |
| return {nullptr, nullptr}; |
| } |
| |
| PtrParts SplitPtrStructs::visitStoreInst(StoreInst &SI) { |
| if (!isSplitFatPtr(SI.getPointerOperandType())) |
| return {nullptr, nullptr}; |
| Value *Arg = SI.getValueOperand(); |
| handleMemoryInst(&SI, Arg, SI.getPointerOperand(), Arg->getType(), |
| SI.getAlign(), SI.getOrdering(), SI.isVolatile(), |
| SI.getSyncScopeID()); |
| return {nullptr, nullptr}; |
| } |
| |
| PtrParts SplitPtrStructs::visitAtomicRMWInst(AtomicRMWInst &AI) { |
| if (!isSplitFatPtr(AI.getPointerOperand()->getType())) |
| return {nullptr, nullptr}; |
| Value *Arg = AI.getValOperand(); |
| handleMemoryInst(&AI, Arg, AI.getPointerOperand(), Arg->getType(), |
| AI.getAlign(), AI.getOrdering(), AI.isVolatile(), |
| AI.getSyncScopeID()); |
| return {nullptr, nullptr}; |
| } |
| |
| // Unlike load, store, and RMW, cmpxchg returns a {value, success} pair that |
| // has to be rebuilt around the intrinsic call, so it gets its own handling |
| // rather than going through handleMemoryInst(). |
| PtrParts SplitPtrStructs::visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI) { |
| Value *Ptr = AI.getPointerOperand(); |
| if (!isSplitFatPtr(Ptr->getType())) |
| return {nullptr, nullptr}; |
| IRB.SetInsertPoint(&AI); |
| |
| Type *Ty = AI.getNewValOperand()->getType(); |
| AtomicOrdering Order = AI.getMergedOrdering(); |
| SyncScope::ID SSID = AI.getSyncScopeID(); |
| bool IsNonTemporal = AI.getMetadata(LLVMContext::MD_nontemporal); |
| |
| auto [Rsrc, Off] = getPtrParts(Ptr); |
| insertPreMemOpFence(Order, SSID); |
| |
| uint32_t Aux = 0; |
| if (IsNonTemporal) |
| Aux |= AMDGPU::CPol::SLC; |
| if (AI.isVolatile()) |
| Aux |= AMDGPU::CPol::VOLATILE; |
| auto *Call = |
| IRB.CreateIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap, Ty, |
| {AI.getNewValOperand(), AI.getCompareOperand(), Rsrc, |
| Off, IRB.getInt32(0), IRB.getInt32(Aux)}); |
| copyMetadata(Call, &AI); |
| setAlign(Call, AI.getAlign(), 2); |
| Call->takeName(&AI); |
| insertPostMemOpFence(Order, SSID); |
| |
| Value *Res = PoisonValue::get(AI.getType()); |
| Res = IRB.CreateInsertValue(Res, Call, 0); |
| if (!AI.isWeak()) { |
| Value *Succeeded = IRB.CreateICmpEQ(Call, AI.getCompareOperand()); |
| Res = IRB.CreateInsertValue(Res, Succeeded, 1); |
| } |
| SplitUsers.insert(&AI); |
| AI.replaceAllUsesWith(Res); |
| return {nullptr, nullptr}; |
| } |
| |
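| // A GEP on a buffer fat pointer leaves the resource part alone and folds the |
| // computed byte offset into the 32-bit offset part. For example, |
| // `%q = getelementptr i32, ptr addrspace(7) %p, i32 %i` becomes, roughly, |
| // `%q.rsrc = %p.rsrc` and `%q.off = add i32 %p.off, (4 * %i)`. |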
| PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) { |
| using namespace llvm::PatternMatch; |
| Value *Ptr = GEP.getPointerOperand(); |
| if (!isSplitFatPtr(Ptr->getType())) |
| return {nullptr, nullptr}; |
| IRB.SetInsertPoint(&GEP); |
| |
| auto [Rsrc, Off] = getPtrParts(Ptr); |
| const DataLayout &DL = GEP.getDataLayout(); |
| bool IsNUW = GEP.hasNoUnsignedWrap(); |
| bool IsNUSW = GEP.hasNoUnsignedSignedWrap(); |
| |
| StructType *ResTy = cast<StructType>(GEP.getType()); |
| Type *ResRsrcTy = ResTy->getElementType(0); |
| VectorType *ResRsrcVecTy = dyn_cast<VectorType>(ResRsrcTy); |
| bool BroadcastsPtr = ResRsrcVecTy && !isa<VectorType>(Off->getType()); |
| |
| // In order to call emitGEPOffset() and thus not have to reimplement it, |
| // we need the GEP result to have ptr addrspace(7) type. |
| Type *FatPtrTy = |
| ResRsrcTy->getWithNewType(IRB.getPtrTy(AMDGPUAS::BUFFER_FAT_POINTER)); |
| GEP.mutateType(FatPtrTy); |
| Value *OffAccum = emitGEPOffset(&IRB, DL, &GEP); |
| GEP.mutateType(ResTy); |
| |
| if (BroadcastsPtr) { |
| Rsrc = IRB.CreateVectorSplat(ResRsrcVecTy->getElementCount(), Rsrc, |
| Rsrc->getName()); |
| Off = IRB.CreateVectorSplat(ResRsrcVecTy->getElementCount(), Off, |
| Off->getName()); |
| } |
| if (match(OffAccum, m_Zero())) { // Constant-zero offset |
| SplitUsers.insert(&GEP); |
| return {Rsrc, Off}; |
| } |
| |
| bool HasNonNegativeOff = false; |
| if (auto *CI = dyn_cast<ConstantInt>(OffAccum)) { |
| HasNonNegativeOff = !CI->isNegative(); |
| } |
| Value *NewOff; |
| if (match(Off, m_Zero())) { |
| NewOff = OffAccum; |
| } else { |
| NewOff = IRB.CreateAdd(Off, OffAccum, "", |
| /*hasNUW=*/IsNUW || (IsNUSW && HasNonNegativeOff), |
| /*hasNSW=*/false); |
| } |
| copyMetadata(NewOff, &GEP); |
| NewOff->takeName(&GEP); |
| SplitUsers.insert(&GEP); |
| return {Rsrc, NewOff}; |
| } |
| |
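| // ptrtoint reconstitutes the flat integer form of the fat pointer: roughly |
| // `(ptrtoint %p.rsrc << 32) | zext(%p.off)`, adjusted to the requested |
| // width. Results of 32 bits or fewer only see the offset part. |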
| PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) { |
| Value *Ptr = PI.getPointerOperand(); |
| if (!isSplitFatPtr(Ptr->getType())) |
| return {nullptr, nullptr}; |
| IRB.SetInsertPoint(&PI); |
| |
| Type *ResTy = PI.getType(); |
| unsigned Width = ResTy->getScalarSizeInBits(); |
| |
| auto [Rsrc, Off] = getPtrParts(Ptr); |
| const DataLayout &DL = PI.getDataLayout(); |
| unsigned FatPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER); |
| |
| Value *Res; |
| if (Width <= BufferOffsetWidth) { |
| Res = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false, |
| PI.getName() + ".off"); |
| } else { |
| Value *RsrcInt = IRB.CreatePtrToInt(Rsrc, ResTy, PI.getName() + ".rsrc"); |
| Value *Shl = IRB.CreateShl( |
| RsrcInt, |
| ConstantExpr::getIntegerValue(ResTy, APInt(Width, BufferOffsetWidth)), |
| "", Width >= FatPtrWidth, Width > FatPtrWidth); |
| Value *OffCast = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false, |
| PI.getName() + ".off"); |
| Res = IRB.CreateOr(Shl, OffCast); |
| } |
| |
| copyMetadata(Res, &PI); |
| Res->takeName(&PI); |
| SplitUsers.insert(&PI); |
| PI.replaceAllUsesWith(Res); |
| return {nullptr, nullptr}; |
| } |
| |
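| // inttoptr reverses that encoding: the offset is the low 32 bits of the |
| // source integer and the resource part is rebuilt from the bits above them. |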
| PtrParts SplitPtrStructs::visitIntToPtrInst(IntToPtrInst &IP) { |
| if (!isSplitFatPtr(IP.getType())) |
| return {nullptr, nullptr}; |
| IRB.SetInsertPoint(&IP); |
| const DataLayout &DL = IP.getDataLayout(); |
| unsigned RsrcPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_RESOURCE); |
| Value *Int = IP.getOperand(0); |
| Type *IntTy = Int->getType(); |
| Type *RsrcIntTy = IntTy->getWithNewBitWidth(RsrcPtrWidth); |
| unsigned Width = IntTy->getScalarSizeInBits(); |
| |
| auto *RetTy = cast<StructType>(IP.getType()); |
| Type *RsrcTy = RetTy->getElementType(0); |
| Type *OffTy = RetTy->getElementType(1); |
| Value *RsrcPart = IRB.CreateLShr( |
| Int, |
| ConstantExpr::getIntegerValue(IntTy, APInt(Width, BufferOffsetWidth))); |
| Value *RsrcInt = IRB.CreateIntCast(RsrcPart, RsrcIntTy, /*isSigned=*/false); |
| Value *Rsrc = IRB.CreateIntToPtr(RsrcInt, RsrcTy, IP.getName() + ".rsrc"); |
| Value *Off = |
| IRB.CreateIntCast(Int, OffTy, /*IsSigned=*/false, IP.getName() + ".off"); |
| |
| copyMetadata(Rsrc, &IP); |
| SplitUsers.insert(&IP); |
| return {Rsrc, Off}; |
| } |
| |
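| // Only two kinds of addrspacecast can produce a buffer fat pointer here: a |
| // no-op p7-to-p7 cast, which just forwards the parts, and a cast from a |
| // buffer resource (addrspace 8), which becomes that resource paired with a |
| // zero offset. |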
| PtrParts SplitPtrStructs::visitAddrSpaceCastInst(AddrSpaceCastInst &I) { |
| if (!isSplitFatPtr(I.getType())) |
| return {nullptr, nullptr}; |
| IRB.SetInsertPoint(&I); |
| Value *In = I.getPointerOperand(); |
| // No-op casts preserve parts |
| if (In->getType() == I.getType()) { |
| auto [Rsrc, Off] = getPtrParts(In); |
| SplitUsers.insert(&I); |
| return {Rsrc, Off}; |
| } |
| if (I.getSrcAddressSpace() != AMDGPUAS::BUFFER_RESOURCE) |
| report_fatal_error("Only buffer resources (addrspace 8) can be cast to " |
| "buffer fat pointers (addrspace 7)"); |
| Type *OffTy = cast<StructType>(I.getType())->getElementType(1); |
| Value *ZeroOff = Constant::getNullValue(OffTy); |
| SplitUsers.insert(&I); |
| return {In, ZeroOff}; |
| } |
| |
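| // Fat pointers are equal only if both their resource and offset parts are, |
| // so `eq` lowers to an `and` of the two comparisons and `ne` to an `or`. |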
| PtrParts SplitPtrStructs::visitICmpInst(ICmpInst &Cmp) { |
| Value *Lhs = Cmp.getOperand(0); |
| if (!isSplitFatPtr(Lhs->getType())) |
| return {nullptr, nullptr}; |
| Value *Rhs = Cmp.getOperand(1); |
| IRB.SetInsertPoint(&Cmp); |
| ICmpInst::Predicate Pred = Cmp.getPredicate(); |
| |
| assert((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) && |
| "Pointer comparison is only equal or unequal"); |
| auto [LhsRsrc, LhsOff] = getPtrParts(Lhs); |
| auto [RhsRsrc, RhsOff] = getPtrParts(Rhs); |
| Value *RsrcCmp = |
| IRB.CreateICmp(Pred, LhsRsrc, RhsRsrc, Cmp.getName() + ".rsrc"); |
| copyMetadata(RsrcCmp, &Cmp); |
| Value *OffCmp = IRB.CreateICmp(Pred, LhsOff, RhsOff, Cmp.getName() + ".off"); |
| copyMetadata(OffCmp, &Cmp); |
| |
| Value *Res = nullptr; |
| if (Pred == ICmpInst::ICMP_EQ) |
| Res = IRB.CreateAnd(RsrcCmp, OffCmp); |
| else if (Pred == ICmpInst::ICMP_NE) |
| Res = IRB.CreateOr(RsrcCmp, OffCmp); |
| copyMetadata(Res, &Cmp); |
| Res->takeName(&Cmp); |
| SplitUsers.insert(&Cmp); |
| Cmp.replaceAllUsesWith(Res); |
| return {nullptr, nullptr}; |
| } |
| |
| PtrParts SplitPtrStructs::visitFreezeInst(FreezeInst &I) { |
| if (!isSplitFatPtr(I.getType())) |
| return {nullptr, nullptr}; |
| IRB.SetInsertPoint(&I); |
| auto [Rsrc, Off] = getPtrParts(I.getOperand(0)); |
| |
| Value *RsrcRes = IRB.CreateFreeze(Rsrc, I.getName() + ".rsrc"); |
| copyMetadata(RsrcRes, &I); |
| Value *OffRes = IRB.CreateFreeze(Off, I.getName() + ".off"); |
| copyMetadata(OffRes, &I); |
| SplitUsers.insert(&I); |
| return {RsrcRes, OffRes}; |
| } |
| |
| PtrParts SplitPtrStructs::visitExtractElementInst(ExtractElementInst &I) { |
| if (!isSplitFatPtr(I.getType())) |
| return {nullptr, nullptr}; |
| IRB.SetInsertPoint(&I); |
| Value *Vec = I.getVectorOperand(); |
| Value *Idx = I.getIndexOperand(); |
| auto [Rsrc, Off] = getPtrParts(Vec); |
| |
| Value *RsrcRes = IRB.CreateExtractElement(Rsrc, Idx, I.getName() + ".rsrc"); |
| copyMetadata(RsrcRes, &I); |
| Value *OffRes = IRB.CreateExtractElement(Off, Idx, I.getName() + ".off"); |
| copyMetadata(OffRes, &I); |
| SplitUsers.insert(&I); |
| return {RsrcRes, OffRes}; |
| } |
| |
| PtrParts SplitPtrStructs::visitInsertElementInst(InsertElementInst &I) { |
| // The mutated instructions temporarily don't return vectors, and so |
| // we need the generic getType() here to avoid crashes. |
| if (!isSplitFatPtr(cast<Instruction>(I).getType())) |
| return {nullptr, nullptr}; |
| IRB.SetInsertPoint(&I); |
| Value *Vec = I.getOperand(0); |
| Value *Elem = I.getOperand(1); |
| Value *Idx = I.getOperand(2); |
| auto [VecRsrc, VecOff] = getPtrParts(Vec); |
| auto [ElemRsrc, ElemOff] = getPtrParts(Elem); |
| |
| Value *RsrcRes = |
| IRB.CreateInsertElement(VecRsrc, ElemRsrc, Idx, I.getName() + ".rsrc"); |
| copyMetadata(RsrcRes, &I); |
| Value *OffRes = |
| IRB.CreateInsertElement(VecOff, ElemOff, Idx, I.getName() + ".off"); |
| copyMetadata(OffRes, &I); |
| SplitUsers.insert(&I); |
| return {RsrcRes, OffRes}; |
| } |
| |
| PtrParts SplitPtrStructs::visitShuffleVectorInst(ShuffleVectorInst &I) { |
| // Cast is needed for the same reason as insertelement's. |
| if (!isSplitFatPtr(cast<Instruction>(I).getType())) |
| return {nullptr, nullptr}; |
| IRB.SetInsertPoint(&I); |
| |
| Value *V1 = I.getOperand(0); |
| Value *V2 = I.getOperand(1); |
| ArrayRef<int> Mask = I.getShuffleMask(); |
| auto [V1Rsrc, V1Off] = getPtrParts(V1); |
| auto [V2Rsrc, V2Off] = getPtrParts(V2); |
| |
| Value *RsrcRes = |
| IRB.CreateShuffleVector(V1Rsrc, V2Rsrc, Mask, I.getName() + ".rsrc"); |
| copyMetadata(RsrcRes, &I); |
| Value *OffRes = |
| IRB.CreateShuffleVector(V1Off, V2Off, Mask, I.getName() + ".off"); |
| copyMetadata(OffRes, &I); |
| SplitUsers.insert(&I); |
| return {RsrcRes, OffRes}; |
| } |
| |
| PtrParts SplitPtrStructs::visitPHINode(PHINode &PHI) { |
| if (!isSplitFatPtr(PHI.getType())) |
| return {nullptr, nullptr}; |
| IRB.SetInsertPoint(*PHI.getInsertionPointAfterDef()); |
| // Phi nodes will be handled in post-processing after we've visited every |
| // instruction. However, instead of just returning {nullptr, nullptr}, |
| // we explicitly create the temporary extractvalue operations that are our |
| // temporary results so that they end up at the beginning of the block with |
| // the PHIs. |
| Value *TmpRsrc = IRB.CreateExtractValue(&PHI, 0, PHI.getName() + ".rsrc"); |
| Value *TmpOff = IRB.CreateExtractValue(&PHI, 1, PHI.getName() + ".off"); |
| Conditionals.push_back(&PHI); |
| SplitUsers.insert(&PHI); |
| return {TmpRsrc, TmpOff}; |
| } |
| |
| PtrParts SplitPtrStructs::visitSelectInst(SelectInst &SI) { |
| if (!isSplitFatPtr(SI.getType())) |
| return {nullptr, nullptr}; |
| IRB.SetInsertPoint(&SI); |
| |
| Value *Cond = SI.getCondition(); |
| Value *True = SI.getTrueValue(); |
| Value *False = SI.getFalseValue(); |
| auto [TrueRsrc, TrueOff] = getPtrParts(True); |
| auto [FalseRsrc, FalseOff] = getPtrParts(False); |
| |
| Value *RsrcRes = |
| IRB.CreateSelect(Cond, TrueRsrc, FalseRsrc, SI.getName() + ".rsrc", &SI); |
| copyMetadata(RsrcRes, &SI); |
| Conditionals.push_back(&SI); |
| Value *OffRes = |
| IRB.CreateSelect(Cond, TrueOff, FalseOff, SI.getName() + ".off", &SI); |
| copyMetadata(OffRes, &SI); |
| SplitUsers.insert(&SI); |
| return {RsrcRes, OffRes}; |
| } |
| |
| /// Returns true if this intrinsic needs to be removed when it is |
| /// applied to `ptr addrspace(7)` values. Calls to these intrinsics are |
| /// rewritten into calls to versions of that intrinsic on the resource |
| /// descriptor. |
| static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) { |
| switch (IID) { |
| default: |
| return false; |
| case Intrinsic::amdgcn_make_buffer_rsrc: |
| case Intrinsic::ptrmask: |
| case Intrinsic::invariant_start: |
| case Intrinsic::invariant_end: |
| case Intrinsic::launder_invariant_group: |
| case Intrinsic::strip_invariant_group: |
| case Intrinsic::memcpy: |
| case Intrinsic::memcpy_inline: |
| case Intrinsic::memmove: |
| case Intrinsic::memset: |
| case Intrinsic::memset_inline: |
| case Intrinsic::experimental_memset_pattern: |
| return true; |
| } |
| } |
| |
| PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) { |
| Intrinsic::ID IID = I.getIntrinsicID(); |
| switch (IID) { |
| default: |
| break; |
| case Intrinsic::amdgcn_make_buffer_rsrc: { |
| if (!isSplitFatPtr(I.getType())) |
| return {nullptr, nullptr}; |
| Value *Base = I.getArgOperand(0); |
| Value *Stride = I.getArgOperand(1); |
| Value *NumRecords = I.getArgOperand(2); |
| Value *Flags = I.getArgOperand(3); |
| auto *SplitType = cast<StructType>(I.getType()); |
| Type *RsrcType = SplitType->getElementType(0); |
| Type *OffType = SplitType->getElementType(1); |
| IRB.SetInsertPoint(&I); |
| Value *Rsrc = IRB.CreateIntrinsic(IID, {RsrcType, Base->getType()}, |
| {Base, Stride, NumRecords, Flags}); |
| copyMetadata(Rsrc, &I); |
| Rsrc->takeName(&I); |
| Value *Zero = Constant::getNullValue(OffType); |
| SplitUsers.insert(&I); |
| return {Rsrc, Zero}; |
| } |
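| // The index (maskable) bits of a buffer fat pointer are its 32-bit offset, |
| // so ptrmask applies the mask to the offset part only. |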
| case Intrinsic::ptrmask: { |
| Value *Ptr = I.getArgOperand(0); |
| if (!isSplitFatPtr(Ptr->getType())) |
| return {nullptr, nullptr}; |
| Value *Mask = I.getArgOperand(1); |
| IRB.SetInsertPoint(&I); |
| auto [Rsrc, Off] = getPtrParts(Ptr); |
| if (Mask->getType() != Off->getType()) |
| report_fatal_error("offset width is not equal to index width of fat " |
| "pointer (data layout not set up correctly?)"); |
| Value *OffRes = IRB.CreateAnd(Off, Mask, I.getName() + ".off"); |
| copyMetadata(OffRes, &I); |
| SplitUsers.insert(&I); |
| return {Rsrc, OffRes}; |
| } |
| // Pointer annotation intrinsics that, given their object-wide nature, |
| // operate on the resource part. |
| case Intrinsic::invariant_start: { |
| Value *Ptr = I.getArgOperand(1); |
| if (!isSplitFatPtr(Ptr->getType())) |
| return {nullptr, nullptr}; |
| IRB.SetInsertPoint(&I); |
| auto [Rsrc, Off] = getPtrParts(Ptr); |
| Type *NewTy = PointerType::get(I.getContext(), AMDGPUAS::BUFFER_RESOURCE); |
| auto *NewRsrc = IRB.CreateIntrinsic(IID, {NewTy}, {I.getOperand(0), Rsrc}); |
| copyMetadata(NewRsrc, &I); |
| NewRsrc->takeName(&I); |
| SplitUsers.insert(&I); |
| I.replaceAllUsesWith(NewRsrc); |
| return {nullptr, nullptr}; |
| } |
| case Intrinsic::invariant_end: { |
| Value *RealPtr = I.getArgOperand(2); |
| if (!isSplitFatPtr(RealPtr->getType())) |
| return {nullptr, nullptr}; |
| IRB.SetInsertPoint(&I); |
| Value *RealRsrc = getPtrParts(RealPtr).first; |
| Value *InvPtr = I.getArgOperand(0); |
| Value *Size = I.getArgOperand(1); |
| Value *NewRsrc = IRB.CreateIntrinsic(IID, {RealRsrc->getType()}, |
| {InvPtr, Size, RealRsrc}); |
| copyMetadata(NewRsrc, &I); |
| NewRsrc->takeName(&I); |
| SplitUsers.insert(&I); |
| I.replaceAllUsesWith(NewRsrc); |
| return {nullptr, nullptr}; |
| } |
| case Intrinsic::launder_invariant_group: |
| case Intrinsic::strip_invariant_group: { |
| Value *Ptr = I.getArgOperand(0); |
| if (!isSplitFatPtr(Ptr->getType())) |
| return {nullptr, nullptr}; |
| IRB.SetInsertPoint(&I); |
| auto [Rsrc, Off] = getPtrParts(Ptr); |
| Value *NewRsrc = IRB.CreateIntrinsic(IID, {Rsrc->getType()}, {Rsrc}); |
| copyMetadata(NewRsrc, &I); |
| NewRsrc->takeName(&I); |
| SplitUsers.insert(&I); |
| return {NewRsrc, Off}; |
| } |
| } |
| return {nullptr, nullptr}; |
| } |
| |
| void SplitPtrStructs::processFunction(Function &F) { |
| ST = &TM->getSubtarget<GCNSubtarget>(F); |
| SmallVector<Instruction *, 0> Originals; |
| LLVM_DEBUG(dbgs() << "Splitting pointer structs in function: " << F.getName() |
| << "\n"); |
| for (Instruction &I : instructions(F)) |
| Originals.push_back(&I); |
| for (Instruction *I : Originals) { |
| auto [Rsrc, Off] = visit(I); |
| assert(((Rsrc && Off) || (!Rsrc && !Off)) && |
| "Can't have a resource but no offset"); |
| if (Rsrc) |
| RsrcParts[I] = Rsrc; |
| if (Off) |
| OffParts[I] = Off; |
| } |
| processConditionals(); |
| killAndReplaceSplitInstructions(Originals); |
| |
| // Clean up after ourselves to save on memory. |
| RsrcParts.clear(); |
| OffParts.clear(); |
| SplitUsers.clear(); |
| Conditionals.clear(); |
| ConditionalTemps.clear(); |
| } |
| |
| namespace { |
| class AMDGPULowerBufferFatPointers : public ModulePass { |
| public: |
| static char ID; |
| |
| AMDGPULowerBufferFatPointers() : ModulePass(ID) { |
| initializeAMDGPULowerBufferFatPointersPass( |
| *PassRegistry::getPassRegistry()); |
| } |
| |
| bool run(Module &M, const TargetMachine &TM); |
| bool runOnModule(Module &M) override; |
| |
| void getAnalysisUsage(AnalysisUsage &AU) const override; |
| }; |
| } // namespace |
| |
| /// Returns true if there are values that have a buffer fat pointer in them, |
| /// which means we'll need to perform rewrites on this function. As a side |
| /// effect, this will populate the type remapping cache. |
| static bool containsBufferFatPointers(const Function &F, |
| BufferFatPtrToStructTypeMap *TypeMap) { |
| bool HasFatPointers = false; |
| for (const BasicBlock &BB : F) |
| for (const Instruction &I : BB) |
| HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType())); |
| return HasFatPointers; |
| } |
| |
| static bool hasFatPointerInterface(const Function &F, |
| BufferFatPtrToStructTypeMap *TypeMap) { |
| Type *Ty = F.getFunctionType(); |
| return Ty != TypeMap->remapType(Ty); |
| } |
| |
| /// Move the body of `OldF` into a new function, returning it. |
| static Function *moveFunctionAdaptingType(Function *OldF, FunctionType *NewTy, |
| ValueToValueMapTy &CloneMap) { |
| bool IsIntrinsic = OldF->isIntrinsic(); |
| Function *NewF = |
| Function::Create(NewTy, OldF->getLinkage(), OldF->getAddressSpace()); |
| NewF->IsNewDbgInfoFormat = OldF->IsNewDbgInfoFormat; |
| NewF->copyAttributesFrom(OldF); |
| NewF->copyMetadata(OldF, 0); |
| NewF->takeName(OldF); |
| NewF->updateAfterNameChange(); |
| NewF->setDLLStorageClass(OldF->getDLLStorageClass()); |
| OldF->getParent()->getFunctionList().insertAfter(OldF->getIterator(), NewF); |
| |
| while (!OldF->empty()) { |
| BasicBlock *BB = &OldF->front(); |
| BB->removeFromParent(); |
| BB->insertInto(NewF); |
| CloneMap[BB] = BB; |
| for (Instruction &I : *BB) { |
| CloneMap[&I] = &I; |
| } |
| } |
| |
| SmallVector<AttributeSet> ArgAttrs; |
| AttributeList OldAttrs = OldF->getAttributes(); |
| |
| for (auto [I, OldArg, NewArg] : enumerate(OldF->args(), NewF->args())) { |
| CloneMap[&NewArg] = &OldArg; |
| NewArg.takeName(&OldArg); |
| Type *OldArgTy = OldArg.getType(), *NewArgTy = NewArg.getType(); |
| // Temporarily mutate type of `NewArg` to allow RAUW to work. |
| NewArg.mutateType(OldArgTy); |
| OldArg.replaceAllUsesWith(&NewArg); |
| NewArg.mutateType(NewArgTy); |
| |
| AttributeSet ArgAttr = OldAttrs.getParamAttrs(I); |
| // Intrinsics get their attributes fixed later. |
| if (OldArgTy != NewArgTy && !IsIntrinsic) |
| ArgAttr = ArgAttr.removeAttributes( |
| NewF->getContext(), |
| AttributeFuncs::typeIncompatible(NewArgTy, ArgAttr)); |
| ArgAttrs.push_back(ArgAttr); |
| } |
| AttributeSet RetAttrs = OldAttrs.getRetAttrs(); |
| if (OldF->getReturnType() != NewF->getReturnType() && !IsIntrinsic) |
| RetAttrs = RetAttrs.removeAttributes( |
| NewF->getContext(), |
| AttributeFuncs::typeIncompatible(NewF->getReturnType(), RetAttrs)); |
| NewF->setAttributes(AttributeList::get( |
| NewF->getContext(), OldAttrs.getFnAttrs(), RetAttrs, ArgAttrs)); |
| return NewF; |
| } |
| |
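| // For functions whose type doesn't change, map every argument, block, and |
| // instruction to itself so that the ValueMapper rewrites the body in place. |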
| static void makeCloneInPlaceMap(Function *F, ValueToValueMapTy &CloneMap) { |
| for (Argument &A : F->args()) |
| CloneMap[&A] = &A; |
| for (BasicBlock &BB : *F) { |
| CloneMap[&BB] = &BB; |
| for (Instruction &I : BB) |
| CloneMap[&I] = &I; |
| } |
| } |
| |
| bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) { |
| bool Changed = false; |
| const DataLayout &DL = M.getDataLayout(); |
| // Record the functions which need to be remapped. |
| // The second element of the pair indicates whether the function has to have |
| // its arguments or return types adjusted. |
| SmallVector<std::pair<Function *, bool>> NeedsRemap; |
| |
| BufferFatPtrToStructTypeMap StructTM(DL); |
| BufferFatPtrToIntTypeMap IntTM(DL); |
| for (const GlobalVariable &GV : M.globals()) { |
| if (GV.getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) |
| report_fatal_error("Global variables with a buffer fat pointer address " |
| "space (7) are not supported"); |
| Type *VT = GV.getValueType(); |
| if (VT != StructTM.remapType(VT)) |
| report_fatal_error("Global variables that contain buffer fat pointers " |
| "(address space 7 pointers) are unsupported. Use " |
| "buffer resource pointers (address space 8) instead."); |
| } |
| |
| { |
| // Collect all constant exprs and aggregates referenced by any function. |
| SmallVector<Constant *, 8> Worklist; |
| for (Function &F : M.functions()) |
| for (Instruction &I : instructions(F)) |
| for (Value *Op : I.operands()) |
| if (isa<ConstantExpr>(Op) || isa<ConstantAggregate>(Op)) |
| Worklist.push_back(cast<Constant>(Op)); |
| |
| // Recursively look for any referenced buffer pointer constants. |
| SmallPtrSet<Constant *, 8> Visited; |
| SetVector<Constant *> BufferFatPtrConsts; |
| while (!Worklist.empty()) { |
| Constant *C = Worklist.pop_back_val(); |
| if (!Visited.insert(C).second) |
| continue; |
| if (isBufferFatPtrOrVector(C->getType())) |
| BufferFatPtrConsts.insert(C); |
| for (Value *Op : C->operands()) |
| if (isa<ConstantExpr>(Op) || isa<ConstantAggregate>(Op)) |
| Worklist.push_back(cast<Constant>(Op)); |
| } |
| |
| // Expand all constant expressions using fat buffer pointers to |
| // instructions. |
| Changed |= convertUsersOfConstantsToInstructions( |
| BufferFatPtrConsts.getArrayRef(), /*RestrictToFunc=*/nullptr, |
| /*RemoveDeadConstants=*/false, /*IncludeSelf=*/true); |
| } |
| |
| StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, DL, |
| M.getContext(), &TM); |
| LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL, |
| M.getContext()); |
| for (Function &F : M.functions()) { |
| bool InterfaceChange = hasFatPointerInterface(F, &StructTM); |
| bool BodyChanges = containsBufferFatPointers(F, &StructTM); |
| Changed |= MemOpsRewrite.processFunction(F); |
| if (InterfaceChange || BodyChanges) { |
| NeedsRemap.push_back(std::make_pair(&F, InterfaceChange)); |
| Changed |= BufferContentsTypeRewrite.processFunction(F); |
| } |
| } |
| if (NeedsRemap.empty()) |
| return Changed; |
| |
| SmallVector<Function *> NeedsPostProcess; |
| SmallVector<Function *> Intrinsics; |
| // Keep one big map so as to memoize constants across functions. |
| ValueToValueMapTy CloneMap; |
| FatPtrConstMaterializer Materializer(&StructTM, CloneMap); |
| |
| ValueMapper LowerInFuncs(CloneMap, RF_None, &StructTM, &Materializer); |
| for (auto [F, InterfaceChange] : NeedsRemap) { |
| Function *NewF = F; |
| if (InterfaceChange) |
| NewF = moveFunctionAdaptingType( |
| F, cast<FunctionType>(StructTM.remapType(F->getFunctionType())), |
| CloneMap); |
| else |
| makeCloneInPlaceMap(F, CloneMap); |
| LowerInFuncs.remapFunction(*NewF); |
| if (NewF->isIntrinsic()) |
| Intrinsics.push_back(NewF); |
| else |
| NeedsPostProcess.push_back(NewF); |
| if (InterfaceChange) { |
| F->replaceAllUsesWith(NewF); |
| F->eraseFromParent(); |
| } |
| Changed = true; |
| } |
| StructTM.clear(); |
| IntTM.clear(); |
| CloneMap.clear(); |
| |
| SplitPtrStructs Splitter(DL, M.getContext(), &TM); |
| for (Function *F : NeedsPostProcess) |
| Splitter.processFunction(*F); |
| for (Function *F : Intrinsics) { |
| if (isRemovablePointerIntrinsic(F->getIntrinsicID())) { |
| F->eraseFromParent(); |
| } else { |
| std::optional<Function *> NewF = Intrinsic::remangleIntrinsicFunction(F); |
| if (NewF) |
| F->replaceAllUsesWith(*NewF); |
| } |
| } |
| return Changed; |
| } |
| |
| bool AMDGPULowerBufferFatPointers::runOnModule(Module &M) { |
| TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); |
| const TargetMachine &TM = TPC.getTM<TargetMachine>(); |
| return run(M, TM); |
| } |
| |
| char AMDGPULowerBufferFatPointers::ID = 0; |
| |
| char &llvm::AMDGPULowerBufferFatPointersID = AMDGPULowerBufferFatPointers::ID; |
| |
| void AMDGPULowerBufferFatPointers::getAnalysisUsage(AnalysisUsage &AU) const { |
| AU.addRequired<TargetPassConfig>(); |
| } |
| |
| #define PASS_DESC "Lower buffer fat pointer operations to buffer resources" |
| INITIALIZE_PASS_BEGIN(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC, |
| false, false) |
| INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) |
| INITIALIZE_PASS_END(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC, false, |
| false) |
| #undef PASS_DESC |
| |
| ModulePass *llvm::createAMDGPULowerBufferFatPointersPass() { |
| return new AMDGPULowerBufferFatPointers(); |
| } |
| |
| PreservedAnalyses |
| AMDGPULowerBufferFatPointersPass::run(Module &M, ModuleAnalysisManager &MA) { |
| return AMDGPULowerBufferFatPointers().run(M, TM) ? PreservedAnalyses::none() |
| : PreservedAnalyses::all(); |
| } |