| //===- SIMemoryLegalizer.cpp ----------------------------------------------===// | 
 | // | 
 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
 | // See https://llvm.org/LICENSE.txt for license information. | 
 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
 | // | 
 | //===----------------------------------------------------------------------===// | 
 | // | 
 | /// \file | 
| /// Memory legalizer - implements the memory model. More information can be | 
 | /// found here: | 
 | ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model | 
 | // | 
 | //===----------------------------------------------------------------------===// | 
 |  | 
 | #include "AMDGPU.h" | 
 | #include "AMDGPUMachineModuleInfo.h" | 
 | #include "GCNSubtarget.h" | 
 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" | 
 | #include "llvm/ADT/BitmaskEnum.h" | 
 | #include "llvm/ADT/StringExtras.h" | 
 | #include "llvm/CodeGen/MachineBasicBlock.h" | 
 | #include "llvm/CodeGen/MachineFunctionPass.h" | 
 | #include "llvm/CodeGen/MachinePassManager.h" | 
 | #include "llvm/IR/DiagnosticInfo.h" | 
 | #include "llvm/IR/MemoryModelRelaxationAnnotations.h" | 
 | #include "llvm/IR/PassManager.h" | 
 | #include "llvm/Support/AtomicOrdering.h" | 
 | #include "llvm/TargetParser/TargetParser.h" | 
 |  | 
 | using namespace llvm; | 
 | using namespace llvm::AMDGPU; | 
 |  | 
 | #define DEBUG_TYPE "si-memory-legalizer" | 
 | #define PASS_NAME "SI Memory Legalizer" | 
 |  | 
 | static cl::opt<bool> AmdgcnSkipCacheInvalidations( | 
 |     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, | 
 |     cl::desc("Use this to skip inserting cache invalidating instructions.")); | 
 |  | 
 | namespace { | 
 |  | 
 | LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); | 
 |  | 
 | /// Memory operation flags. Can be ORed together. | 
 | enum class SIMemOp { | 
 |   NONE = 0u, | 
 |   LOAD = 1u << 0, | 
 |   STORE = 1u << 1, | 
 |   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE) | 
 | }; | 
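| // For example, a release must order both prior loads and prior stores; this | 
| // is expressed below as SIMemOp::LOAD | SIMemOp::STORE (see insertRelease). | 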
 |  | 
 | /// Position to insert a new instruction relative to an existing | 
 | /// instruction. | 
 | enum class Position { | 
 |   BEFORE, | 
 |   AFTER | 
 | }; | 
 |  | 
 | /// The atomic synchronization scopes supported by the AMDGPU target. | 
 | enum class SIAtomicScope { | 
 |   NONE, | 
 |   SINGLETHREAD, | 
 |   WAVEFRONT, | 
 |   WORKGROUP, | 
 |   CLUSTER, // Promoted to AGENT on targets without workgroup clusters. | 
 |   AGENT, | 
 |   SYSTEM | 
 | }; | 
 |  | 
 | /// The distinct address spaces supported by the AMDGPU target for | 
| /// atomic memory operations. Can be ORed together. | 
 | enum class SIAtomicAddrSpace { | 
 |   NONE = 0u, | 
 |   GLOBAL = 1u << 0, | 
 |   LDS = 1u << 1, | 
 |   SCRATCH = 1u << 2, | 
 |   GDS = 1u << 3, | 
 |   OTHER = 1u << 4, | 
 |  | 
 |   /// The address spaces that can be accessed by a FLAT instruction. | 
 |   FLAT = GLOBAL | LDS | SCRATCH, | 
 |  | 
 |   /// The address spaces that support atomic instructions. | 
 |   ATOMIC = GLOBAL | LDS | SCRATCH | GDS, | 
 |  | 
 |   /// All address spaces. | 
 |   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER, | 
 |  | 
 |   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) | 
 | }; | 
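| // Membership is tested by masking, e.g. an access touches global memory | 
| // iff (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE. | 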
 |  | 
 | class SIMemOpInfo final { | 
 | private: | 
 |  | 
 |   friend class SIMemOpAccess; | 
 |  | 
 |   AtomicOrdering Ordering = AtomicOrdering::NotAtomic; | 
 |   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; | 
 |   SIAtomicScope Scope = SIAtomicScope::SYSTEM; | 
 |   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; | 
 |   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; | 
 |   bool IsCrossAddressSpaceOrdering = false; | 
 |   bool IsVolatile = false; | 
 |   bool IsNonTemporal = false; | 
 |   bool IsLastUse = false; | 
 |   bool IsCooperative = false; | 
 |  | 
 |   SIMemOpInfo( | 
 |       const GCNSubtarget &ST, | 
 |       AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, | 
 |       SIAtomicScope Scope = SIAtomicScope::SYSTEM, | 
 |       SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC, | 
 |       SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL, | 
 |       bool IsCrossAddressSpaceOrdering = true, | 
 |       AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent, | 
 |       bool IsVolatile = false, bool IsNonTemporal = false, | 
 |       bool IsLastUse = false, bool IsCooperative = false) | 
 |       : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope), | 
 |         OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace), | 
 |         IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), | 
 |         IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal), | 
 |         IsLastUse(IsLastUse), IsCooperative(IsCooperative) { | 
 |  | 
 |     if (Ordering == AtomicOrdering::NotAtomic) { | 
 |       assert(!IsCooperative && "Cannot be cooperative & non-atomic!"); | 
 |       assert(Scope == SIAtomicScope::NONE && | 
 |              OrderingAddrSpace == SIAtomicAddrSpace::NONE && | 
 |              !IsCrossAddressSpaceOrdering && | 
 |              FailureOrdering == AtomicOrdering::NotAtomic); | 
 |       return; | 
 |     } | 
 |  | 
 |     assert(Scope != SIAtomicScope::NONE && | 
 |            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != | 
 |                SIAtomicAddrSpace::NONE && | 
 |            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) != | 
 |                SIAtomicAddrSpace::NONE); | 
 |  | 
| // There is no cross address space ordering if the ordering address space | 
| // is the same as the instruction address space and only contains a single | 
| // address space (e.g. an ordering over only GLOBAL for an instruction that | 
| // only accesses GLOBAL). | 
 |     if ((OrderingAddrSpace == InstrAddrSpace) && | 
 |         isPowerOf2_32(uint32_t(InstrAddrSpace))) | 
 |       this->IsCrossAddressSpaceOrdering = false; | 
 |  | 
 |     // Limit the scope to the maximum supported by the instruction's address | 
 |     // spaces. | 
 |     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) == | 
 |         SIAtomicAddrSpace::NONE) { | 
 |       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD); | 
 |     } else if ((InstrAddrSpace & | 
 |                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) == | 
 |                SIAtomicAddrSpace::NONE) { | 
 |       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP); | 
 |     } else if ((InstrAddrSpace & | 
 |                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS | | 
 |                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) { | 
 |       this->Scope = std::min(Scope, SIAtomicScope::AGENT); | 
 |     } | 
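| // E.g. the chain above limits an instruction that only accesses LDS to | 
| // WORKGROUP scope, since LDS is only shared within a work-group. | 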
 |  | 
 |     // On targets that have no concept of a workgroup cluster, use | 
 |     // AGENT scope as a conservatively correct alternative. | 
 |     if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters()) | 
 |       this->Scope = SIAtomicScope::AGENT; | 
 |   } | 
 |  | 
 | public: | 
 |   /// \returns Atomic synchronization scope of the machine instruction used to | 
 |   /// create this SIMemOpInfo. | 
 |   SIAtomicScope getScope() const { | 
 |     return Scope; | 
 |   } | 
 |  | 
 |   /// \returns Ordering constraint of the machine instruction used to | 
 |   /// create this SIMemOpInfo. | 
 |   AtomicOrdering getOrdering() const { | 
 |     return Ordering; | 
 |   } | 
 |  | 
 |   /// \returns Failure ordering constraint of the machine instruction used to | 
 |   /// create this SIMemOpInfo. | 
 |   AtomicOrdering getFailureOrdering() const { | 
 |     return FailureOrdering; | 
 |   } | 
 |  | 
| /// \returns The address spaces accessed by the machine | 
 |   /// instruction used to create this SIMemOpInfo. | 
 |   SIAtomicAddrSpace getInstrAddrSpace() const { | 
 |     return InstrAddrSpace; | 
 |   } | 
 |  | 
 |   /// \returns The address spaces that must be ordered by the machine | 
 |   /// instruction used to create this SIMemOpInfo. | 
 |   SIAtomicAddrSpace getOrderingAddrSpace() const { | 
 |     return OrderingAddrSpace; | 
 |   } | 
 |  | 
| /// \returns True iff memory ordering of operations on | 
 |   /// different address spaces is required. | 
 |   bool getIsCrossAddressSpaceOrdering() const { | 
 |     return IsCrossAddressSpaceOrdering; | 
 |   } | 
 |  | 
 |   /// \returns True if memory access of the machine instruction used to | 
 |   /// create this SIMemOpInfo is volatile, false otherwise. | 
 |   bool isVolatile() const { | 
 |     return IsVolatile; | 
 |   } | 
 |  | 
 |   /// \returns True if memory access of the machine instruction used to | 
 |   /// create this SIMemOpInfo is nontemporal, false otherwise. | 
 |   bool isNonTemporal() const { | 
 |     return IsNonTemporal; | 
 |   } | 
 |  | 
 |   /// \returns True if memory access of the machine instruction used to | 
 |   /// create this SIMemOpInfo is last use, false otherwise. | 
 |   bool isLastUse() const { return IsLastUse; } | 
 |  | 
 |   /// \returns True if this is a cooperative load or store atomic. | 
 |   bool isCooperative() const { return IsCooperative; } | 
 |  | 
 |   /// \returns True if ordering constraint of the machine instruction used to | 
 |   /// create this SIMemOpInfo is unordered or higher, false otherwise. | 
 |   bool isAtomic() const { | 
 |     return Ordering != AtomicOrdering::NotAtomic; | 
 |   } | 
 |  | 
 | }; | 
 |  | 
 | class SIMemOpAccess final { | 
 | private: | 
 |   const AMDGPUMachineModuleInfo *MMI = nullptr; | 
 |   const GCNSubtarget &ST; | 
 |  | 
 |   /// Reports unsupported message \p Msg for \p MI to LLVM context. | 
 |   void reportUnsupported(const MachineBasicBlock::iterator &MI, | 
 |                          const char *Msg) const; | 
 |  | 
 |   /// Inspects the target synchronization scope \p SSID and determines | 
 |   /// the SI atomic scope it corresponds to, the address spaces it | 
 |   /// covers, and whether the memory ordering applies between address | 
 |   /// spaces. | 
 |   std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> | 
 |   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const; | 
 |  | 
| /// \returns A bit set of the SI atomic address spaces corresponding to \p AS. | 
 |   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const; | 
 |  | 
| /// \returns Info constructed from \p MI, which has at least one machine | 
| /// memory operand. | 
 |   std::optional<SIMemOpInfo> | 
 |   constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const; | 
 |  | 
 | public: | 
 |   /// Construct class to support accessing the machine memory operands | 
 |   /// of instructions in the machine function \p MF. | 
 |   SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST); | 
 |  | 
 |   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise. | 
 |   std::optional<SIMemOpInfo> | 
 |   getLoadInfo(const MachineBasicBlock::iterator &MI) const; | 
 |  | 
 |   /// \returns Store info if \p MI is a store operation, "std::nullopt" | 
 |   /// otherwise. | 
 |   std::optional<SIMemOpInfo> | 
 |   getStoreInfo(const MachineBasicBlock::iterator &MI) const; | 
 |  | 
 |   /// \returns Atomic fence info if \p MI is an atomic fence operation, | 
 |   /// "std::nullopt" otherwise. | 
 |   std::optional<SIMemOpInfo> | 
 |   getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const; | 
 |  | 
 |   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or | 
 |   /// rmw operation, "std::nullopt" otherwise. | 
 |   std::optional<SIMemOpInfo> | 
 |   getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const; | 
 | }; | 
 |  | 
 | class SICacheControl { | 
 | protected: | 
 |  | 
 |   /// AMDGPU subtarget info. | 
 |   const GCNSubtarget &ST; | 
 |  | 
 |   /// Instruction info. | 
 |   const SIInstrInfo *TII = nullptr; | 
 |  | 
 |   IsaVersion IV; | 
 |  | 
 |   /// Whether to insert cache invalidating instructions. | 
 |   bool InsertCacheInv; | 
 |  | 
 |   SICacheControl(const GCNSubtarget &ST); | 
 |  | 
| /// Sets named bit \p Bit to "true" if present in instruction \p MI. | 
| /// \returns True if \p MI is modified, false otherwise. | 
 |   bool enableNamedBit(const MachineBasicBlock::iterator MI, | 
 |                       AMDGPU::CPol::CPol Bit) const; | 
 |  | 
 | public: | 
 |  | 
 |   /// Create a cache control for the subtarget \p ST. | 
 |   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); | 
 |  | 
 |   /// Update \p MI memory load instruction to bypass any caches up to | 
 |   /// the \p Scope memory scope for address spaces \p | 
 |   /// AddrSpace. Return true iff the instruction was modified. | 
 |   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                                      SIAtomicScope Scope, | 
 |                                      SIAtomicAddrSpace AddrSpace) const = 0; | 
 |  | 
 |   /// Update \p MI memory store instruction to bypass any caches up to | 
 |   /// the \p Scope memory scope for address spaces \p | 
 |   /// AddrSpace. Return true iff the instruction was modified. | 
 |   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                                       SIAtomicScope Scope, | 
 |                                       SIAtomicAddrSpace AddrSpace) const = 0; | 
 |  | 
 |   /// Update \p MI memory read-modify-write instruction to bypass any caches up | 
 |   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true | 
 |   /// iff the instruction was modified. | 
 |   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                                     SIAtomicScope Scope, | 
 |                                     SIAtomicAddrSpace AddrSpace) const = 0; | 
 |  | 
 |   /// Update \p MI memory instruction of kind \p Op associated with address | 
 |   /// spaces \p AddrSpace to indicate it is volatile and/or | 
 |   /// nontemporal/last-use. Return true iff the instruction was modified. | 
 |   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, | 
 |                                               SIAtomicAddrSpace AddrSpace, | 
 |                                               SIMemOp Op, bool IsVolatile, | 
 |                                               bool IsNonTemporal, | 
 |                                               bool IsLastUse = false) const = 0; | 
 |  | 
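| /// Performs target-specific finalization of store \p MI, where \p Atomic | 
| /// indicates an atomic store. Returns true iff \p MI is modified. | 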
 |   virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const { | 
 |     return false; | 
| } | 
 |  | 
 |   /// Handle cooperative load/store atomics. | 
 |   virtual bool handleCooperativeAtomic(MachineInstr &MI) const { | 
 |     llvm_unreachable( | 
 |         "cooperative atomics are not available on this architecture"); | 
 |   } | 
 |  | 
 |   /// Inserts any necessary instructions at position \p Pos relative | 
 |   /// to instruction \p MI to ensure memory instructions before \p Pos of kind | 
 |   /// \p Op associated with address spaces \p AddrSpace have completed. Used | 
 |   /// between memory instructions to enforce the order they become visible as | 
 |   /// observed by other memory instructions executing in memory scope \p Scope. | 
 |   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between | 
 |   /// address spaces. Returns true iff any instructions inserted. | 
 |   virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                           SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                           bool IsCrossAddrSpaceOrdering, Position Pos, | 
 |                           AtomicOrdering Order) const = 0; | 
 |  | 
 |   /// Inserts any necessary instructions at position \p Pos relative to | 
 |   /// instruction \p MI to ensure any subsequent memory instructions of this | 
 |   /// thread with address spaces \p AddrSpace will observe the previous memory | 
| /// operations by any thread for memory scopes up to memory scope \p Scope. | 
 |   /// Returns true iff any instructions inserted. | 
 |   virtual bool insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                              SIAtomicScope Scope, | 
 |                              SIAtomicAddrSpace AddrSpace, | 
 |                              Position Pos) const = 0; | 
 |  | 
 |   /// Inserts any necessary instructions at position \p Pos relative to | 
 |   /// instruction \p MI to ensure previous memory instructions by this thread | 
 |   /// with address spaces \p AddrSpace have completed and can be observed by | 
 |   /// subsequent memory instructions by any thread executing in memory scope \p | 
 |   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is | 
 |   /// between address spaces. Returns true iff any instructions inserted. | 
 |   virtual bool insertRelease(MachineBasicBlock::iterator &MI, | 
 |                              SIAtomicScope Scope, | 
 |                              SIAtomicAddrSpace AddrSpace, | 
 |                              bool IsCrossAddrSpaceOrdering, | 
 |                              Position Pos) const = 0; | 
 |  | 
 |   /// Inserts any necessary instructions before the barrier start instruction | 
 |   /// \p MI in order to support pairing of barriers and fences. | 
 |   virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const { | 
 |     return false; | 
| } | 
 |  | 
 |   /// Virtual destructor to allow derivations to be deleted. | 
 |   virtual ~SICacheControl() = default; | 
 | }; | 
 |  | 
 | class SIGfx6CacheControl : public SICacheControl { | 
 | protected: | 
 |  | 
 |   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI | 
 |   /// is modified, false otherwise. | 
 |   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { | 
 |     return enableNamedBit(MI, AMDGPU::CPol::GLC); | 
 |   } | 
 |  | 
 |   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI | 
 |   /// is modified, false otherwise. | 
 |   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { | 
 |     return enableNamedBit(MI, AMDGPU::CPol::SLC); | 
 |   } | 
 |  | 
 | public: | 
 |  | 
 |   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} | 
 |  | 
 |   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                              SIAtomicScope Scope, | 
 |                              SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                               SIAtomicScope Scope, | 
 |                               SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                             SIAtomicScope Scope, | 
 |                             SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, | 
 |                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                       bool IsVolatile, bool IsNonTemporal, | 
 |                                       bool IsLastUse) const override; | 
 |  | 
 |   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                   SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                   bool IsCrossAddrSpaceOrdering, Position Pos, | 
 |                   AtomicOrdering Order) const override; | 
 |  | 
 |   bool insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                      SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, | 
 |                      Position Pos) const override; | 
 |  | 
 |   bool insertRelease(MachineBasicBlock::iterator &MI, | 
 |                      SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, | 
 |                      bool IsCrossAddrSpaceOrdering, | 
 |                      Position Pos) const override; | 
 | }; | 
 |  | 
 | class SIGfx7CacheControl : public SIGfx6CacheControl { | 
 | public: | 
 |  | 
 |   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {} | 
 |  | 
 |   bool insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                      SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, | 
 |                      Position Pos) const override; | 
 |  | 
 | }; | 
 |  | 
 | class SIGfx90ACacheControl : public SIGfx7CacheControl { | 
 | public: | 
 |  | 
 |   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} | 
 |  | 
 |   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                              SIAtomicScope Scope, | 
 |                              SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                               SIAtomicScope Scope, | 
 |                               SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                             SIAtomicScope Scope, | 
 |                             SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, | 
 |                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                       bool IsVolatile, bool IsNonTemporal, | 
 |                                       bool IsLastUse) const override; | 
 |  | 
 |   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                   SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                   bool IsCrossAddrSpaceOrdering, Position Pos, | 
 |                   AtomicOrdering Order) const override; | 
 |  | 
 |   bool insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                      SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, | 
 |                      Position Pos) const override; | 
 |  | 
 |   bool insertRelease(MachineBasicBlock::iterator &MI, | 
 |                      SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, | 
 |                      bool IsCrossAddrSpaceOrdering, | 
 |                      Position Pos) const override; | 
 | }; | 
 |  | 
 | class SIGfx940CacheControl : public SIGfx90ACacheControl { | 
 | protected: | 
 |  | 
 |   /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI | 
 |   /// is modified, false otherwise. | 
 |   bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const { | 
 |     return enableNamedBit(MI, AMDGPU::CPol::SC0); | 
 |   } | 
 |  | 
 |   /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI | 
 |   /// is modified, false otherwise. | 
 |   bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const { | 
 |     return enableNamedBit(MI, AMDGPU::CPol::SC1); | 
 |   } | 
 |  | 
 |   /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI | 
 |   /// is modified, false otherwise. | 
 |   bool enableNTBit(const MachineBasicBlock::iterator &MI) const { | 
 |     return enableNamedBit(MI, AMDGPU::CPol::NT); | 
 |   } | 
 |  | 
 | public: | 
| SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {} | 
 |  | 
 |   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                              SIAtomicScope Scope, | 
 |                              SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                               SIAtomicScope Scope, | 
 |                               SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                             SIAtomicScope Scope, | 
 |                             SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, | 
 |                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                       bool IsVolatile, bool IsNonTemporal, | 
 |                                       bool IsLastUse) const override; | 
 |  | 
 |   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, Position Pos) const override; | 
 |  | 
 |   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, | 
 |                      Position Pos) const override; | 
 | }; | 
 |  | 
 | class SIGfx10CacheControl : public SIGfx7CacheControl { | 
 | protected: | 
 |  | 
 |   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI | 
 |   /// is modified, false otherwise. | 
 |   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const { | 
 |     return enableNamedBit(MI, AMDGPU::CPol::DLC); | 
 |   } | 
 |  | 
 | public: | 
 |  | 
 |   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} | 
 |  | 
 |   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                              SIAtomicScope Scope, | 
 |                              SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, | 
 |                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                       bool IsVolatile, bool IsNonTemporal, | 
 |                                       bool IsLastUse) const override; | 
 |  | 
 |   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                   SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                   bool IsCrossAddrSpaceOrdering, Position Pos, | 
 |                   AtomicOrdering Order) const override; | 
 |  | 
 |   bool insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                      SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, | 
 |                      Position Pos) const override; | 
 |  | 
 |   bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override; | 
 | }; | 
 |  | 
 | class SIGfx11CacheControl : public SIGfx10CacheControl { | 
 | public: | 
 |   SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {} | 
 |  | 
 |   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                              SIAtomicScope Scope, | 
 |                              SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, | 
 |                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                       bool IsVolatile, bool IsNonTemporal, | 
 |                                       bool IsLastUse) const override; | 
 | }; | 
 |  | 
 | class SIGfx12CacheControl : public SIGfx11CacheControl { | 
 | protected: | 
 |   // Sets TH policy to \p Value if CPol operand is present in instruction \p MI. | 
| // \returns True if \p MI is modified, false otherwise. | 
 |   bool setTH(const MachineBasicBlock::iterator MI, | 
 |              AMDGPU::CPol::CPol Value) const; | 
 |   // Sets Scope policy to \p Value if CPol operand is present in instruction \p | 
| // MI. \returns True if \p MI is modified, false otherwise. | 
 |   bool setScope(const MachineBasicBlock::iterator MI, | 
 |                 AMDGPU::CPol::CPol Value) const; | 
 |  | 
 |   // Stores with system scope (SCOPE_SYS) need to wait for: | 
| // - loads or atomics (returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0 | 
 |   // - non-returning-atomics       - wait for STORECNT==0 | 
 |   //   TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits | 
 |   //   since it does not distinguish atomics-with-return from regular stores. | 
 |   // There is no need to wait if memory is cached (mtype != UC). | 
 |   bool | 
 |   insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const; | 
 |  | 
 |   bool setAtomicScope(const MachineBasicBlock::iterator &MI, | 
 |                       SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const; | 
 |  | 
 | public: | 
 |   SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) { | 
 |     // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases | 
 |     // the behavior is the same if assuming GFX12.0 in CU mode. | 
 |     assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled()); | 
 |   } | 
 |  | 
 |   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                   SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                   bool IsCrossAddrSpaceOrdering, Position Pos, | 
 |                   AtomicOrdering Order) const override; | 
 |  | 
 |   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, Position Pos) const override; | 
 |  | 
 |   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, | 
 |                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                       bool IsVolatile, bool IsNonTemporal, | 
 |                                       bool IsLastUse) const override; | 
 |  | 
 |   bool finalizeStore(MachineInstr &MI, bool Atomic) const override; | 
 |  | 
| bool handleCooperativeAtomic(MachineInstr &MI) const override; | 
 |  | 
 |   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, | 
 |                      Position Pos) const override; | 
 |  | 
 |   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                              SIAtomicScope Scope, | 
 |                              SIAtomicAddrSpace AddrSpace) const override { | 
 |     return setAtomicScope(MI, Scope, AddrSpace); | 
 |   } | 
 |  | 
 |   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                               SIAtomicScope Scope, | 
 |                               SIAtomicAddrSpace AddrSpace) const override { | 
 |     return setAtomicScope(MI, Scope, AddrSpace); | 
 |   } | 
 |  | 
 |   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                             SIAtomicScope Scope, | 
 |                             SIAtomicAddrSpace AddrSpace) const override { | 
 |     return setAtomicScope(MI, Scope, AddrSpace); | 
 |   } | 
 | }; | 
 |  | 
 | class SIMemoryLegalizer final { | 
 | private: | 
 |   const MachineModuleInfo &MMI; | 
 |   /// Cache Control. | 
 |   std::unique_ptr<SICacheControl> CC = nullptr; | 
 |  | 
 |   /// List of atomic pseudo instructions. | 
 |   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs; | 
 |  | 
| /// Return true iff instruction \p MI is an atomic instruction that | 
 |   /// returns a result. | 
 |   bool isAtomicRet(const MachineInstr &MI) const { | 
 |     return SIInstrInfo::isAtomicRet(MI); | 
 |   } | 
 |  | 
 |   /// Removes all processed atomic pseudo instructions from the current | 
 |   /// function. Returns true if current function is modified, false otherwise. | 
 |   bool removeAtomicPseudoMIs(); | 
 |  | 
 |   /// Expands load operation \p MI. Returns true if instructions are | 
 |   /// added/deleted or \p MI is modified, false otherwise. | 
 |   bool expandLoad(const SIMemOpInfo &MOI, | 
 |                   MachineBasicBlock::iterator &MI); | 
 |   /// Expands store operation \p MI. Returns true if instructions are | 
 |   /// added/deleted or \p MI is modified, false otherwise. | 
 |   bool expandStore(const SIMemOpInfo &MOI, | 
 |                    MachineBasicBlock::iterator &MI); | 
 |   /// Expands atomic fence operation \p MI. Returns true if | 
 |   /// instructions are added/deleted or \p MI is modified, false otherwise. | 
 |   bool expandAtomicFence(const SIMemOpInfo &MOI, | 
 |                          MachineBasicBlock::iterator &MI); | 
 |   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if | 
 |   /// instructions are added/deleted or \p MI is modified, false otherwise. | 
 |   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, | 
 |                                 MachineBasicBlock::iterator &MI); | 
 |  | 
 | public: | 
| SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {} | 
 |   bool run(MachineFunction &MF); | 
 | }; | 
 |  | 
 | class SIMemoryLegalizerLegacy final : public MachineFunctionPass { | 
 | public: | 
 |   static char ID; | 
 |  | 
 |   SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {} | 
 |  | 
 |   void getAnalysisUsage(AnalysisUsage &AU) const override { | 
 |     AU.setPreservesCFG(); | 
 |     MachineFunctionPass::getAnalysisUsage(AU); | 
 |   } | 
 |  | 
 |   StringRef getPassName() const override { | 
 |     return PASS_NAME; | 
 |   } | 
 |  | 
 |   bool runOnMachineFunction(MachineFunction &MF) override; | 
 | }; | 
 |  | 
 | static const StringMap<SIAtomicAddrSpace> ASNames = {{ | 
 |     {"global", SIAtomicAddrSpace::GLOBAL}, | 
 |     {"local", SIAtomicAddrSpace::LDS}, | 
 | }}; | 
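| // Sketch of IR these names come from (assumed syntax; see the MMRA docs): | 
| //   fence syncscope("agent") release, !mmra !0 | 
| //   !0 = !{!"amdgpu-synchronize-as", !"local"} | 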
 |  | 
 | void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) { | 
 |   const MachineFunction *MF = MI.getMF(); | 
 |   const Function &Fn = MF->getFunction(); | 
 |   SmallString<128> Str; | 
 |   raw_svector_ostream OS(Str); | 
 |   OS << "unknown address space '" << AS << "'; expected one of "; | 
 |   ListSeparator LS; | 
 |   for (const auto &[Name, Val] : ASNames) | 
 |     OS << LS << '\'' << Name << '\''; | 
 |   Fn.getContext().diagnose( | 
 |       DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning)); | 
 | } | 
 |  | 
 | /// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA. | 
| /// If this tag isn't present, or if it has no meaningful values, returns | 
| /// std::nullopt; otherwise returns the address spaces specified by the | 
| /// metadata. | 
 | static std::optional<SIAtomicAddrSpace> | 
 | getSynchronizeAddrSpaceMD(const MachineInstr &MI) { | 
 |   static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as"; | 
 |  | 
 |   auto MMRA = MMRAMetadata(MI.getMMRAMetadata()); | 
 |   if (!MMRA) | 
 |     return std::nullopt; | 
 |  | 
 |   SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE; | 
 |   for (const auto &[Prefix, Suffix] : MMRA) { | 
 |     if (Prefix != FenceASPrefix) | 
 |       continue; | 
 |  | 
 |     if (auto It = ASNames.find(Suffix); It != ASNames.end()) | 
 |       Result |= It->second; | 
 |     else | 
 |       diagnoseUnknownMMRAASName(MI, Suffix); | 
 |   } | 
 |  | 
 |   if (Result == SIAtomicAddrSpace::NONE) | 
 |     return std::nullopt; | 
 |  | 
 |   return Result; | 
 | } | 
 |  | 
 | } // end anonymous namespace | 
 |  | 
 | void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, | 
 |                                       const char *Msg) const { | 
 |   const Function &Func = MI->getParent()->getParent()->getFunction(); | 
 |   Func.getContext().diagnose( | 
 |       DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc())); | 
 | } | 
 |  | 
 | std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> | 
 | SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, | 
 |                                SIAtomicAddrSpace InstrAddrSpace) const { | 
 |   if (SSID == SyncScope::System) | 
 |     return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true); | 
 |   if (SSID == MMI->getAgentSSID()) | 
 |     return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true); | 
 |   if (SSID == MMI->getClusterSSID()) | 
 |     return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true); | 
 |   if (SSID == MMI->getWorkgroupSSID()) | 
 |     return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC, | 
 |                       true); | 
 |   if (SSID == MMI->getWavefrontSSID()) | 
 |     return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC, | 
 |                       true); | 
 |   if (SSID == SyncScope::SingleThread) | 
 |     return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC, | 
 |                       true); | 
 |   if (SSID == MMI->getSystemOneAddressSpaceSSID()) | 
 |     return std::tuple(SIAtomicScope::SYSTEM, | 
 |                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); | 
 |   if (SSID == MMI->getAgentOneAddressSpaceSSID()) | 
 |     return std::tuple(SIAtomicScope::AGENT, | 
 |                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); | 
 |   if (SSID == MMI->getClusterOneAddressSpaceSSID()) | 
 |     return std::tuple(SIAtomicScope::CLUSTER, | 
 |                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); | 
 |   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) | 
 |     return std::tuple(SIAtomicScope::WORKGROUP, | 
 |                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); | 
 |   if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) | 
 |     return std::tuple(SIAtomicScope::WAVEFRONT, | 
 |                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); | 
 |   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) | 
 |     return std::tuple(SIAtomicScope::SINGLETHREAD, | 
 |                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); | 
 |   return std::nullopt; | 
 | } | 
 |  | 
 | SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { | 
 |   if (AS == AMDGPUAS::FLAT_ADDRESS) | 
 |     return SIAtomicAddrSpace::FLAT; | 
 |   if (AS == AMDGPUAS::GLOBAL_ADDRESS) | 
 |     return SIAtomicAddrSpace::GLOBAL; | 
 |   if (AS == AMDGPUAS::LOCAL_ADDRESS) | 
 |     return SIAtomicAddrSpace::LDS; | 
 |   if (AS == AMDGPUAS::PRIVATE_ADDRESS) | 
 |     return SIAtomicAddrSpace::SCRATCH; | 
 |   if (AS == AMDGPUAS::REGION_ADDRESS) | 
 |     return SIAtomicAddrSpace::GDS; | 
 |  | 
 |   return SIAtomicAddrSpace::OTHER; | 
 | } | 
 |  | 
 | SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_, | 
 |                              const GCNSubtarget &ST) | 
 |     : MMI(&MMI_), ST(ST) {} | 
 |  | 
 | std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( | 
 |     const MachineBasicBlock::iterator &MI) const { | 
 |   assert(MI->getNumMemOperands() > 0); | 
 |  | 
 |   SyncScope::ID SSID = SyncScope::SingleThread; | 
 |   AtomicOrdering Ordering = AtomicOrdering::NotAtomic; | 
 |   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; | 
 |   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; | 
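| // IsNonTemporal is AND-combined across memory operands below, so it starts | 
| // as true; the OR-combined flags start as false. | 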
 |   bool IsNonTemporal = true; | 
 |   bool IsVolatile = false; | 
 |   bool IsLastUse = false; | 
 |   bool IsCooperative = false; | 
 |  | 
 |   // Validator should check whether or not MMOs cover the entire set of | 
 |   // locations accessed by the memory instruction. | 
 |   for (const auto &MMO : MI->memoperands()) { | 
 |     IsNonTemporal &= MMO->isNonTemporal(); | 
 |     IsVolatile |= MMO->isVolatile(); | 
 |     IsLastUse |= MMO->getFlags() & MOLastUse; | 
 |     IsCooperative |= MMO->getFlags() & MOCooperative; | 
 |     InstrAddrSpace |= | 
 |       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); | 
 |     AtomicOrdering OpOrdering = MMO->getSuccessOrdering(); | 
 |     if (OpOrdering != AtomicOrdering::NotAtomic) { | 
 |       const auto &IsSyncScopeInclusion = | 
 |           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); | 
 |       if (!IsSyncScopeInclusion) { | 
 |         reportUnsupported(MI, | 
 |           "Unsupported non-inclusive atomic synchronization scope"); | 
 |         return std::nullopt; | 
 |       } | 
 |  | 
 |       SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID(); | 
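| // Keep the strongest ordering seen so far: getMergedAtomicOrdering returns | 
| // an ordering at least as strong as both of its inputs. | 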
 |       Ordering = getMergedAtomicOrdering(Ordering, OpOrdering); | 
 |       assert(MMO->getFailureOrdering() != AtomicOrdering::Release && | 
 |              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); | 
 |       FailureOrdering = | 
 |           getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering()); | 
 |     } | 
 |   } | 
 |  | 
 |   SIAtomicScope Scope = SIAtomicScope::NONE; | 
 |   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; | 
 |   bool IsCrossAddressSpaceOrdering = false; | 
 |   if (Ordering != AtomicOrdering::NotAtomic) { | 
 |     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace); | 
 |     if (!ScopeOrNone) { | 
 |       reportUnsupported(MI, "Unsupported atomic synchronization scope"); | 
 |       return std::nullopt; | 
 |     } | 
 |     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = | 
 |         *ScopeOrNone; | 
 |     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || | 
 |         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) || | 
 |         ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) { | 
 |       reportUnsupported(MI, "Unsupported atomic address space"); | 
 |       return std::nullopt; | 
 |     } | 
 |   } | 
 |   return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, | 
 |                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, | 
 |                      IsNonTemporal, IsLastUse, IsCooperative); | 
 | } | 
 |  | 
 | std::optional<SIMemOpInfo> | 
 | SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const { | 
 |   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); | 
 |  | 
 |   if (!(MI->mayLoad() && !MI->mayStore())) | 
 |     return std::nullopt; | 
 |  | 
 |   // Be conservative if there are no memory operands. | 
 |   if (MI->getNumMemOperands() == 0) | 
 |     return SIMemOpInfo(ST); | 
 |  | 
 |   return constructFromMIWithMMO(MI); | 
 | } | 
 |  | 
 | std::optional<SIMemOpInfo> | 
 | SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const { | 
 |   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); | 
 |  | 
 |   if (!(!MI->mayLoad() && MI->mayStore())) | 
 |     return std::nullopt; | 
 |  | 
 |   // Be conservative if there are no memory operands. | 
 |   if (MI->getNumMemOperands() == 0) | 
 |     return SIMemOpInfo(ST); | 
 |  | 
 |   return constructFromMIWithMMO(MI); | 
 | } | 
 |  | 
 | std::optional<SIMemOpInfo> | 
 | SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const { | 
 |   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); | 
 |  | 
 |   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) | 
 |     return std::nullopt; | 
 |  | 
 |   AtomicOrdering Ordering = | 
 |     static_cast<AtomicOrdering>(MI->getOperand(0).getImm()); | 
 |  | 
 |   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); | 
 |   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC); | 
 |   if (!ScopeOrNone) { | 
 |     reportUnsupported(MI, "Unsupported atomic synchronization scope"); | 
 |     return std::nullopt; | 
 |   } | 
 |  | 
 |   SIAtomicScope Scope = SIAtomicScope::NONE; | 
 |   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; | 
 |   bool IsCrossAddressSpaceOrdering = false; | 
 |   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = | 
 |       *ScopeOrNone; | 
 |  | 
 |   if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) { | 
 |     // We currently expect refineOrderingAS to be the only place that | 
 |     // can refine the AS ordered by the fence. | 
 |     // If that changes, we need to review the semantics of that function | 
 |     // in case it needs to preserve certain address spaces. | 
 |     reportUnsupported(MI, "Unsupported atomic address space"); | 
 |     return std::nullopt; | 
 |   } | 
 |  | 
 |   auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI); | 
 |   if (SynchronizeAS) | 
 |     OrderingAddrSpace = *SynchronizeAS; | 
 |  | 
 |   return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, | 
 |                      SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering, | 
 |                      AtomicOrdering::NotAtomic); | 
 | } | 
 |  | 
 | std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( | 
 |     const MachineBasicBlock::iterator &MI) const { | 
 |   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); | 
 |  | 
 |   if (!(MI->mayLoad() && MI->mayStore())) | 
 |     return std::nullopt; | 
 |  | 
 |   // Be conservative if there are no memory operands. | 
 |   if (MI->getNumMemOperands() == 0) | 
 |     return SIMemOpInfo(ST); | 
 |  | 
 |   return constructFromMIWithMMO(MI); | 
 | } | 
 |  | 
 | SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { | 
 |   TII = ST.getInstrInfo(); | 
 |   IV = getIsaVersion(ST.getCPU()); | 
 |   InsertCacheInv = !AmdgcnSkipCacheInvalidations; | 
 | } | 
 |  | 
 | bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, | 
 |                                     AMDGPU::CPol::CPol Bit) const { | 
 |   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol); | 
 |   if (!CPol) | 
 |     return false; | 
 |  | 
 |   CPol->setImm(CPol->getImm() | Bit); | 
 |   return true; | 
 | } | 
 |  | 
 | /* static */ | 
 | std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { | 
 |   GCNSubtarget::Generation Generation = ST.getGeneration(); | 
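| // Check feature sets before dispatching on the generation: GFX90A and | 
| // GFX940 targets are GFX9-generation and would otherwise fall into the | 
| // pre-GFX10 bucket below. | 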
 |   if (ST.hasGFX940Insts()) | 
 |     return std::make_unique<SIGfx940CacheControl>(ST); | 
 |   if (ST.hasGFX90AInsts()) | 
 |     return std::make_unique<SIGfx90ACacheControl>(ST); | 
 |   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) | 
 |     return std::make_unique<SIGfx6CacheControl>(ST); | 
 |   if (Generation < AMDGPUSubtarget::GFX10) | 
 |     return std::make_unique<SIGfx7CacheControl>(ST); | 
 |   if (Generation < AMDGPUSubtarget::GFX11) | 
 |     return std::make_unique<SIGfx10CacheControl>(ST); | 
 |   if (Generation < AMDGPUSubtarget::GFX12) | 
 |     return std::make_unique<SIGfx11CacheControl>(ST); | 
 |   return std::make_unique<SIGfx12CacheControl>(ST); | 
 | } | 
 |  | 
 | bool SIGfx6CacheControl::enableLoadCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, | 
 |     SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(MI->mayLoad() && !MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       // Set L1 cache policy to MISS_EVICT. | 
 |       // Note: there is no L2 cache bypass policy at the ISA level. | 
 |       Changed |= enableGLCBit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to bypass. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory caches | 
 |   /// to be bypassed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx6CacheControl::enableStoreCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, | 
 |     SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(!MI->mayLoad() && MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
| /// The L1 cache is write-through, so it does not need to be bypassed. There | 
| /// is no bypass control for the L2 cache at the ISA level. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx6CacheControl::enableRMWCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, | 
 |     SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(MI->mayLoad() && MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically | 
 |   /// bypassed, and the GLC bit is instead used to indicate if they are | 
 |   /// return or no-return. | 
 |   /// Note: there is no L2 cache coherent bypass control at the ISA level. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( | 
 |     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { | 
| // Only handle load and store, not atomic read-modify-write instructions. The | 
 |   // latter use glc to indicate if the atomic returns a result and so must not | 
 |   // be used for cache control. | 
 |   assert(MI->mayLoad() ^ MI->mayStore()); | 
 |  | 
| // Only update load and store, not LLVM IR atomic read-modify-write | 
| // instructions. The latter are always marked as volatile, and handling | 
| // them here would pessimize all atomics; they also do not support the | 
| // nontemporal attribute. | 
 |   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   if (IsVolatile) { | 
 |     // Set L1 cache policy to be MISS_EVICT for load instructions | 
 |     // and MISS_LRU for store instructions. | 
 |     // Note: there is no L2 cache bypass policy at the ISA level. | 
 |     if (Op == SIMemOp::LOAD) | 
 |       Changed |= enableGLCBit(MI); | 
 |  | 
 |     // Ensure operation has completed at system scope to cause all volatile | 
 |     // operations to be visible outside the program in a global order. Do not | 
 |     // request cross address space as only the global address space can be | 
 |     // observable outside the program, so no need to cause a waitcnt for LDS | 
 |     // address space operations. | 
 |     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, | 
 |                           Position::AFTER, AtomicOrdering::Unordered); | 
 |  | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   if (IsNonTemporal) { | 
 |     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT | 
 |     // for both loads and stores, and the L2 cache policy to STREAM. | 
 |     Changed |= enableGLCBit(MI); | 
 |     Changed |= enableSLCBit(MI); | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, | 
 |                                     SIAtomicScope Scope, | 
 |                                     SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                     bool IsCrossAddrSpaceOrdering, Position Pos, | 
 |                                     AtomicOrdering Order) const { | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   bool VMCnt = false; | 
 |   bool LGKMCnt = false; | 
 |  | 
 |   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != | 
 |       SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       VMCnt |= true; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // The L1 cache keeps all memory operations in order for | 
 |       // wavefronts in the same work-group. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is | 
 |       // not needed as LDS operations for all waves are executed in a total | 
 |       // global ordering as observed by all waves. Required if also | 
 |       // synchronizing with global/GDS memory as LDS operations could be | 
 |       // reordered with respect to later global/GDS memory operations of the | 
 |       // same wave. | 
 |       LGKMCnt |= IsCrossAddrSpaceOrdering; | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // The LDS keeps all memory operations in order for | 
 |       // the same wavefront. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
| // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)" | 
 |       // is not needed as GDS operations for all waves are executed in a total | 
 |       // global ordering as observed by all waves. Required if also | 
 |       // synchronizing with global/LDS memory as GDS operations could be | 
 |       // reordered with respect to later global/LDS memory operations of the | 
 |       // same wave. | 
 |       LGKMCnt |= IsCrossAddrSpaceOrdering; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // The GDS keeps all memory operations in order for | 
 |       // the same work-group. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
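| // E.g. if only VMCnt is set, the immediate below encodes vmcnt(0) with | 
| // expcnt/lgkmcnt at their "no wait" maxima, printed as "s_waitcnt vmcnt(0)". | 
| // The _soft variant may later be relaxed or removed by SIInsertWaitcnts. | 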
 |   if (VMCnt || LGKMCnt) { | 
 |     unsigned WaitCntImmediate = | 
 |       AMDGPU::encodeWaitcnt(IV, | 
 |                             VMCnt ? 0 : getVmcntBitMask(IV), | 
 |                             getExpcntBitMask(IV), | 
 |                             LGKMCnt ? 0 : getLgkmcntBitMask(IV)); | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) | 
 |         .addImm(WaitCntImmediate); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   // On architectures that support direct loads to LDS, emit an unknown waitcnt | 
 |   // at workgroup-scoped release operations that specify the LDS address space. | 
 |   // SIInsertWaitcnts will later replace this with a vmcnt(). | 
 |   if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && | 
 |       Scope == SIAtomicScope::WORKGROUP && | 
 |       (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                                        SIAtomicScope Scope, | 
 |                                        SIAtomicAddrSpace AddrSpace, | 
 |                                        Position Pos) const { | 
 |   if (!InsertCacheInv) | 
 |     return false; | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1)); | 
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to invalidate. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory cache | 
 |   /// to be flushed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, | 
 |                                        SIAtomicScope Scope, | 
 |                                        SIAtomicAddrSpace AddrSpace, | 
 |                                        bool IsCrossAddrSpaceOrdering, | 
 |                                        Position Pos) const { | 
 |   return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, | 
 |                     IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); | 
 | } | 
 |  | 
 | bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                                        SIAtomicScope Scope, | 
 |                                        SIAtomicAddrSpace AddrSpace, | 
 |                                        Position Pos) const { | 
 |   if (!InsertCacheInv) | 
 |     return false; | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>(); | 
 |  | 
 |   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() | 
 |                                     ? AMDGPU::BUFFER_WBINVL1 | 
 |                                     : AMDGPU::BUFFER_WBINVL1_VOL; | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); | 
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to invalidate. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory cache | 
 |   /// to be flushed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx90ACacheControl::enableLoadCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, | 
 |     SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(MI->mayLoad() && !MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       // Set the L1 cache policy to MISS_LRU. | 
 |       // Note: there is no L2 cache bypass policy at the ISA level. | 
 |       Changed |= enableGLCBit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // In threadgroup split mode the waves of a work-group can be executing on | 
 |       // different CUs. Therefore need to bypass the L1 which is per CU. | 
 |       // Otherwise in non-threadgroup split mode all waves of a work-group are | 
 |       // on the same CU, and so the L1 does not need to be bypassed. | 
 |       if (ST.isTgSplitEnabled()) | 
 |         Changed |= enableGLCBit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to bypass. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory caches | 
 |   /// to be bypassed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx90ACacheControl::enableStoreCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, | 
 |     SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(!MI->mayLoad() && MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       /// Do not set glc for store atomic operations as they implicitly write | 
 |       /// through the L1 cache. | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to bypass. Store atomics implicitly write through the L1 | 
 |       // cache. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory caches | 
 |   /// to be bypassed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx90ACacheControl::enableRMWCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, | 
 |     SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(MI->mayLoad() && MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       /// Do not set glc for RMW atomic operations as they implicitly bypass | 
 |       /// the L1 cache, and the glc bit is instead used to indicate if they are | 
 |       /// return or no-return. | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to bypass. RMW atomics implicitly bypass the L1 cache. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( | 
 |     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { | 
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result, so the bit must
  // not be used for cache control.
 |   assert(MI->mayLoad() ^ MI->mayStore()); | 
 |  | 
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
 |   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   if (IsVolatile) { | 
 |     // Set L1 cache policy to be MISS_EVICT for load instructions | 
 |     // and MISS_LRU for store instructions. | 
 |     // Note: there is no L2 cache bypass policy at the ISA level. | 
 |     if (Op == SIMemOp::LOAD) | 
 |       Changed |= enableGLCBit(MI); | 
 |  | 
 |     // Ensure operation has completed at system scope to cause all volatile | 
 |     // operations to be visible outside the program in a global order. Do not | 
 |     // request cross address space as only the global address space can be | 
 |     // observable outside the program, so no need to cause a waitcnt for LDS | 
 |     // address space operations. | 
 |     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, | 
 |                           Position::AFTER, AtomicOrdering::Unordered); | 
 |  | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   if (IsNonTemporal) { | 
 |     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT | 
 |     // for both loads and stores, and the L2 cache policy to STREAM. | 
 |     Changed |= enableGLCBit(MI); | 
 |     Changed |= enableSLCBit(MI); | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI, | 
 |                                       SIAtomicScope Scope, | 
 |                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                       bool IsCrossAddrSpaceOrdering, | 
 |                                       Position Pos, | 
 |                                       AtomicOrdering Order) const { | 
 |   if (ST.isTgSplitEnabled()) { | 
 |     // In threadgroup split mode the waves of a work-group can be executing on | 
 |     // different CUs. Therefore need to wait for global or GDS memory operations | 
 |     // to complete to ensure they are visible to waves in the other CUs. | 
    // Otherwise in non-threadgroup split mode all waves of a work-group are on
    // the same CU, so there is no need to wait for global memory as all waves
    // in the work-group access the same L1, nor to wait for GDS as accesses
    // are ordered on a CU.
 |     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH | | 
 |                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) && | 
 |         (Scope == SIAtomicScope::WORKGROUP)) { | 
 |       // Same as GFX7 using agent scope. | 
 |       Scope = SIAtomicScope::AGENT; | 
 |     } | 
    // In threadgroup split mode LDS cannot be allocated, so there is no need
    // to wait for LDS memory operations.
 |     AddrSpace &= ~SIAtomicAddrSpace::LDS; | 
 |   } | 
 |   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, | 
 |                                         IsCrossAddrSpaceOrdering, Pos, Order); | 
 | } | 
 |  | 
 | bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                                          SIAtomicScope Scope, | 
 |                                          SIAtomicAddrSpace AddrSpace, | 
 |                                          Position Pos) const { | 
 |   if (!InsertCacheInv) | 
 |     return false; | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |       // Ensures that following loads will not see stale remote VMEM data or | 
 |       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and | 
 |       // CC will never be stale due to the local memory probes. | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); | 
 |       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the | 
 |       // hardware does not reorder memory operations by the same wave with | 
 |       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to | 
 |       // remove any cache lines of earlier writes by the same wave and ensures | 
 |       // later reads by the same wave will refetch the cache lines. | 
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::AGENT: | 
 |       // Same as GFX7. | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // In threadgroup split mode the waves of a work-group can be executing on | 
 |       // different CUs. Therefore need to invalidate the L1 which is per CU. | 
 |       // Otherwise in non-threadgroup split mode all waves of a work-group are | 
 |       // on the same CU, and so the L1 does not need to be invalidated. | 
 |       if (ST.isTgSplitEnabled()) { | 
 |         // Same as GFX7 using agent scope. | 
 |         Scope = SIAtomicScope::AGENT; | 
 |       } | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // Same as GFX7. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory cache | 
 |   /// to be flushed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, | 
 |                                          SIAtomicScope Scope, | 
 |                                          SIAtomicAddrSpace AddrSpace, | 
 |                                          bool IsCrossAddrSpaceOrdering, | 
 |                                          Position Pos) const { | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   const DebugLoc &DL = MI->getDebugLoc(); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the | 
 |       // hardware does not reorder memory operations by the same wave with | 
 |       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed | 
 |       // to initiate writeback of any dirty cache lines of earlier writes by the | 
 |       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the | 
 |       // writeback has completed. | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) | 
 |         // Set SC bits to indicate system scope. | 
 |         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); | 
      // This is followed by the same handling as GFX7, which will insert the
      // "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::AGENT: | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // Same as GFX7. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   Changed |= | 
 |       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, | 
 |                                         IsCrossAddrSpaceOrdering, Pos); | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx940CacheControl::enableLoadCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(MI->mayLoad() && !MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |       // Set SC bits to indicate system scope. | 
 |       Changed |= enableSC0Bit(MI); | 
 |       Changed |= enableSC1Bit(MI); | 
 |       break; | 
 |     case SIAtomicScope::AGENT: | 
 |       // Set SC bits to indicate agent scope. | 
 |       Changed |= enableSC1Bit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // In threadgroup split mode the waves of a work-group can be executing on | 
 |       // different CUs. Therefore need to bypass the L1 which is per CU. | 
 |       // Otherwise in non-threadgroup split mode all waves of a work-group are | 
 |       // on the same CU, and so the L1 does not need to be bypassed. Setting SC | 
 |       // bits to indicate work-group scope will do this automatically. | 
 |       Changed |= enableSC0Bit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // Leave SC bits unset to indicate wavefront scope. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory caches | 
 |   /// to be bypassed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx940CacheControl::enableStoreCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, | 
 |     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(!MI->mayLoad() && MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |       // Set SC bits to indicate system scope. | 
 |       Changed |= enableSC0Bit(MI); | 
 |       Changed |= enableSC1Bit(MI); | 
 |       break; | 
 |     case SIAtomicScope::AGENT: | 
 |       // Set SC bits to indicate agent scope. | 
 |       Changed |= enableSC1Bit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // Set SC bits to indicate workgroup scope. | 
 |       Changed |= enableSC0Bit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // Leave SC bits unset to indicate wavefront scope. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory caches | 
 |   /// to be bypassed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx940CacheControl::enableRMWCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(MI->mayLoad() && MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |       // Set SC1 bit to indicate system scope. | 
 |       Changed |= enableSC1Bit(MI); | 
 |       break; | 
 |     case SIAtomicScope::AGENT: | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // RMW atomic operations implicitly bypass the L1 cache and only use SC1 | 
 |       // to indicate system or agent scope. The SC0 bit is used to indicate if | 
 |       // they are return or no-return. Leave SC1 bit unset to indicate agent | 
 |       // scope. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal( | 
 |     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { | 
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result, so the bit must
  // not be used for cache control.
 |   assert(MI->mayLoad() ^ MI->mayStore()); | 
 |  | 
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
 |   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   if (IsVolatile) { | 
 |     // Set SC bits to indicate system scope. | 
 |     Changed |= enableSC0Bit(MI); | 
 |     Changed |= enableSC1Bit(MI); | 
 |  | 
 |     // Ensure operation has completed at system scope to cause all volatile | 
 |     // operations to be visible outside the program in a global order. Do not | 
 |     // request cross address space as only the global address space can be | 
 |     // observable outside the program, so no need to cause a waitcnt for LDS | 
 |     // address space operations. | 
 |     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, | 
 |                           Position::AFTER, AtomicOrdering::Unordered); | 
 |  | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   if (IsNonTemporal) { | 
 |     Changed |= enableNTBit(MI); | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                                          SIAtomicScope Scope, | 
 |                                          SIAtomicAddrSpace AddrSpace, | 
 |                                          Position Pos) const { | 
 |   if (!InsertCacheInv) | 
 |     return false; | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |       // Ensures that following loads will not see stale remote VMEM data or | 
 |       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and | 
 |       // CC will never be stale due to the local memory probes. | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) | 
 |           // Set SC bits to indicate system scope. | 
 |           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); | 
 |       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the | 
 |       // hardware does not reorder memory operations by the same wave with | 
 |       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to | 
 |       // remove any cache lines of earlier writes by the same wave and ensures | 
 |       // later reads by the same wave will refetch the cache lines. | 
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::AGENT: | 
      // Ensures that following loads will not see stale remote data or local
      // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
      // due to the memory probes.
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) | 
 |           // Set SC bits to indicate agent scope. | 
 |           .addImm(AMDGPU::CPol::SC1); | 
 |       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware | 
 |       // does not reorder memory operations with respect to preceeding buffer | 
 |       // invalidate. The invalidate is guaranteed to remove any cache lines of | 
 |       // earlier writes and ensures later writes will refetch the cache lines. | 
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // In threadgroup split mode the waves of a work-group can be executing on | 
 |       // different CUs. Therefore need to invalidate the L1 which is per CU. | 
 |       // Otherwise in non-threadgroup split mode all waves of a work-group are | 
 |       // on the same CU, and so the L1 does not need to be invalidated. | 
 |       if (ST.isTgSplitEnabled()) { | 
        // Ensures L1 is invalidated if in threadgroup split mode. In
        // non-threadgroup split mode it is a NOP, so there is no point
        // generating it when we know we are not in that mode.
 |         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) | 
 |             // Set SC bits to indicate work-group scope. | 
 |             .addImm(AMDGPU::CPol::SC0); | 
 |         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware | 
 |         // does not reorder memory operations with respect to preceeding buffer | 
 |         // invalidate. The invalidate is guaranteed to remove any cache lines of | 
 |         // earlier writes and ensures later writes will refetch the cache lines. | 
 |         Changed = true; | 
 |       } | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // Could generate "BUFFER_INV" but it would do nothing as there are no | 
 |       // caches to invalidate. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory cache | 
 |   /// to be flushed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, | 
 |                                          SIAtomicScope Scope, | 
 |                                          SIAtomicAddrSpace AddrSpace, | 
 |                                          bool IsCrossAddrSpaceOrdering, | 
 |                                          Position Pos) const { | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the | 
 |       // hardware does not reorder memory operations by the same wave with | 
 |       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed | 
 |       // to initiate writeback of any dirty cache lines of earlier writes by the | 
 |       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the | 
 |       // writeback has completed. | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) | 
 |           // Set SC bits to indicate system scope. | 
 |           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); | 
 |       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is | 
 |       // SIAtomicScope::SYSTEM, the following insertWait will generate the | 
 |       // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2". | 
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::AGENT: | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) | 
 |           // Set SC bits to indicate agent scope. | 
 |           .addImm(AMDGPU::CPol::SC1); | 
 |  | 
 |       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is | 
 |       // SIAtomicScope::AGENT, the following insertWait will generate the | 
 |       // required "S_WAITCNT vmcnt(0)". | 
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // Do not generate "BUFFER_WBL2" as there are no caches it would | 
 |       // writeback, and would require an otherwise unnecessary | 
 |       // "S_WAITCNT vmcnt(0)". | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other | 
 |   // S_WAITCNT needed. | 
 |   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, | 
 |                         IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx10CacheControl::enableLoadCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, | 
 |     SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(MI->mayLoad() && !MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       // Set the L0 and L1 cache policies to MISS_EVICT. | 
 |       // Note: there is no L2 cache coherent bypass control at the ISA level. | 
 |       Changed |= enableGLCBit(MI); | 
 |       Changed |= enableDLCBit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // In WGP mode the waves of a work-group can be executing on either CU of | 
 |       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in | 
 |       // CU mode all waves of a work-group are on the same CU, and so the L0 | 
 |       // does not need to be bypassed. | 
 |       if (!ST.isCuModeEnabled()) | 
 |         Changed |= enableGLCBit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to bypass. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory caches | 
 |   /// to be bypassed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( | 
 |     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { | 
 |  | 
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result, so the bit must
  // not be used for cache control.
 |   assert(MI->mayLoad() ^ MI->mayStore()); | 
 |  | 
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
 |   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   if (IsVolatile) { | 
 |     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions | 
 |     // and MISS_LRU for store instructions. | 
 |     // Note: there is no L2 cache coherent bypass control at the ISA level. | 
 |     if (Op == SIMemOp::LOAD) { | 
 |       Changed |= enableGLCBit(MI); | 
 |       Changed |= enableDLCBit(MI); | 
 |     } | 
 |  | 
 |     // Ensure operation has completed at system scope to cause all volatile | 
 |     // operations to be visible outside the program in a global order. Do not | 
 |     // request cross address space as only the global address space can be | 
 |     // observable outside the program, so no need to cause a waitcnt for LDS | 
 |     // address space operations. | 
 |     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, | 
 |                           Position::AFTER, AtomicOrdering::Unordered); | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   if (IsNonTemporal) { | 
 |     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT | 
 |     // and L2 cache policy to STREAM. | 
 |     // For stores setting both GLC and SLC configures L0 and L1 cache policy | 
 |     // to MISS_EVICT and the L2 cache policy to STREAM. | 
 |     if (Op == SIMemOp::STORE) | 
 |       Changed |= enableGLCBit(MI); | 
 |     Changed |= enableSLCBit(MI); | 
 |  | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, | 
 |                                      SIAtomicScope Scope, | 
 |                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                      bool IsCrossAddrSpaceOrdering, | 
 |                                      Position Pos, AtomicOrdering Order) const { | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   bool VMCnt = false; | 
 |   bool VSCnt = false; | 
 |   bool LGKMCnt = false; | 
 |  | 
 |   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != | 
 |       SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) | 
 |         VMCnt |= true; | 
 |       if ((Op & SIMemOp::STORE) != SIMemOp::NONE) | 
 |         VSCnt |= true; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU
      // which shares the same L0.
 |       if (!ST.isCuModeEnabled()) { | 
 |         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) | 
 |           VMCnt |= true; | 
 |         if ((Op & SIMemOp::STORE) != SIMemOp::NONE) | 
 |           VSCnt |= true; | 
 |       } | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // The L0 cache keeps all memory operations in order for | 
 |       // work-items in the same wavefront. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is | 
 |       // not needed as LDS operations for all waves are executed in a total | 
 |       // global ordering as observed by all waves. Required if also | 
 |       // synchronizing with global/GDS memory as LDS operations could be | 
 |       // reordered with respect to later global/GDS memory operations of the | 
 |       // same wave. | 
 |       LGKMCnt |= IsCrossAddrSpaceOrdering; | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // The LDS keeps all memory operations in order for | 
 |       // the same wavefront. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
 |       // is not needed as GDS operations for all waves are executed in a total | 
 |       // global ordering as observed by all waves. Required if also | 
 |       // synchronizing with global/LDS memory as GDS operations could be | 
 |       // reordered with respect to later global/LDS memory operations of the | 
 |       // same wave. | 
 |       LGKMCnt |= IsCrossAddrSpaceOrdering; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // The GDS keeps all memory operations in order for | 
 |       // the same work-group. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if (VMCnt || LGKMCnt) { | 
 |     unsigned WaitCntImmediate = | 
 |       AMDGPU::encodeWaitcnt(IV, | 
 |                             VMCnt ? 0 : getVmcntBitMask(IV), | 
 |                             getExpcntBitMask(IV), | 
 |                             LGKMCnt ? 0 : getLgkmcntBitMask(IV)); | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) | 
 |         .addImm(WaitCntImmediate); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   // On architectures that support direct loads to LDS, emit an unknown waitcnt | 
 |   // at workgroup-scoped release operations that specify the LDS address space. | 
 |   // SIInsertWaitcnts will later replace this with a vmcnt(). | 
 |   if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && | 
 |       Scope == SIAtomicScope::WORKGROUP && | 
 |       (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   if (VSCnt) { | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) | 
 |         .addReg(AMDGPU::SGPR_NULL, RegState::Undef) | 
 |         .addImm(0); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                                         SIAtomicScope Scope, | 
 |                                         SIAtomicAddrSpace AddrSpace, | 
 |                                         Position Pos) const { | 
 |   if (!InsertCacheInv) | 
 |     return false; | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
      // The order of invalidates matters here. We must invalidate "outer in",
      // so L1 -> L0, to avoid L0 pulling in stale data from L1 when it is
      // invalidated.
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV)); | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); | 
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU, and
      // so the L0 does not need to be invalidated.
 |       if (!ST.isCuModeEnabled()) { | 
 |         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); | 
 |         Changed = true; | 
 |       } | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to invalidate. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory cache | 
 |   /// to be flushed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx10CacheControl::insertBarrierStart( | 
 |     MachineBasicBlock::iterator &MI) const { | 
 |   // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU | 
 |   // mode. This is because a CU mode release fence does not emit any wait, which | 
 |   // is fine when only dealing with vmem, but isn't sufficient in the presence | 
 |   // of barriers which do not go through vmem. | 
 |   // GFX12.5 does not require this additional wait. | 
 |   if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) | 
 |     return false; | 
 |  | 
 |   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), | 
 |           TII->get(AMDGPU::S_WAITCNT_DEPCTR)) | 
 |       .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); | 
 |   return true; | 
 | } | 
 |  | 
 | bool SIGfx11CacheControl::enableLoadCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(MI->mayLoad() && !MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       // Set the L0 and L1 cache policies to MISS_EVICT. | 
 |       // Note: there is no L2 cache coherent bypass control at the ISA level. | 
 |       Changed |= enableGLCBit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // In WGP mode the waves of a work-group can be executing on either CU of | 
 |       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in | 
 |       // CU mode all waves of a work-group are on the same CU, and so the L0 | 
 |       // does not need to be bypassed. | 
 |       if (!ST.isCuModeEnabled()) | 
 |         Changed |= enableGLCBit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to bypass. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory caches | 
 |   /// to be bypassed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( | 
 |     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { | 
 |  | 
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result, so the bit must
  // not be used for cache control.
 |   assert(MI->mayLoad() ^ MI->mayStore()); | 
 |  | 
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
 |   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   if (IsVolatile) { | 
 |     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions | 
 |     // and MISS_LRU for store instructions. | 
 |     // Note: there is no L2 cache coherent bypass control at the ISA level. | 
 |     if (Op == SIMemOp::LOAD) | 
 |       Changed |= enableGLCBit(MI); | 
 |  | 
 |     // Set MALL NOALLOC for load and store instructions. | 
 |     Changed |= enableDLCBit(MI); | 
 |  | 
 |     // Ensure operation has completed at system scope to cause all volatile | 
 |     // operations to be visible outside the program in a global order. Do not | 
 |     // request cross address space as only the global address space can be | 
 |     // observable outside the program, so no need to cause a waitcnt for LDS | 
 |     // address space operations. | 
 |     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, | 
 |                           Position::AFTER, AtomicOrdering::Unordered); | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   if (IsNonTemporal) { | 
 |     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT | 
 |     // and L2 cache policy to STREAM. | 
 |     // For stores setting both GLC and SLC configures L0 and L1 cache policy | 
 |     // to MISS_EVICT and the L2 cache policy to STREAM. | 
 |     if (Op == SIMemOp::STORE) | 
 |       Changed |= enableGLCBit(MI); | 
 |     Changed |= enableSLCBit(MI); | 
 |  | 
 |     // Set MALL NOALLOC for load and store instructions. | 
 |     Changed |= enableDLCBit(MI); | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, | 
 |                                 AMDGPU::CPol::CPol Value) const { | 
 |   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); | 
 |   if (!CPol) | 
 |     return false; | 
 |  | 
 |   uint64_t NewTH = Value & AMDGPU::CPol::TH; | 
 |   if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) { | 
 |     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH); | 
 |     return true; | 
 |   } | 
 |  | 
 |   return false; | 
 | } | 
 |  | 
 | bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI, | 
 |                                    AMDGPU::CPol::CPol Value) const { | 
 |   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); | 
 |   if (!CPol) | 
 |     return false; | 
 |  | 
 |   uint64_t NewScope = Value & AMDGPU::CPol::SCOPE; | 
 |   if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) { | 
 |     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope); | 
 |     return true; | 
 |   } | 
 |  | 
 |   return false; | 
 | } | 
 |  | 
 | bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore( | 
 |     const MachineBasicBlock::iterator MI) const { | 
 |   // TODO: implement flag for frontend to give us a hint not to insert waits. | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   const DebugLoc &DL = MI->getDebugLoc(); | 
 |  | 
 |   BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0); | 
 |   if (ST.hasImageInsts()) { | 
 |     BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0); | 
 |     BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0); | 
 |   } | 
 |   BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0); | 
 |   BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0); | 
 |  | 
 |   return true; | 
 | } | 
 |  | 
 | bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, | 
 |                                      SIAtomicScope Scope, | 
 |                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                      bool IsCrossAddrSpaceOrdering, | 
 |                                      Position Pos, AtomicOrdering Order) const { | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   bool LOADCnt = false; | 
 |   bool DSCnt = false; | 
 |   bool STORECnt = false; | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != | 
 |       SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |     case SIAtomicScope::CLUSTER: | 
 |       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) | 
 |         LOADCnt |= true; | 
 |       if ((Op & SIMemOp::STORE) != SIMemOp::NONE) | 
 |         STORECnt |= true; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // GFX12.0: | 
 |       //   In WGP mode the waves of a work-group can be executing on either CU | 
 |       //   of the WGP. Therefore need to wait for operations to complete to | 
 |       //   ensure they are visible to waves in the other CU as the L0 is per CU. | 
      //   Otherwise in CU mode all waves of a work-group are on the same CU
      //   which shares the same L0.
 |       // | 
 |       // GFX12.5: | 
 |       //   TODO DOCS | 
 |       if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) { | 
 |         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) | 
 |           LOADCnt |= true; | 
 |         if ((Op & SIMemOp::STORE) != SIMemOp::NONE) | 
 |           STORECnt |= true; | 
 |       } | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // The L0 cache keeps all memory operations in order for | 
 |       // work-items in the same wavefront. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |     case SIAtomicScope::CLUSTER: | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is | 
 |       // not needed as LDS operations for all waves are executed in a total | 
 |       // global ordering as observed by all waves. Required if also | 
 |       // synchronizing with global/GDS memory as LDS operations could be | 
 |       // reordered with respect to later global/GDS memory operations of the | 
 |       // same wave. | 
 |       DSCnt |= IsCrossAddrSpaceOrdering; | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // The LDS keeps all memory operations in order for | 
 |       // the same wavefront. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if (LOADCnt) { | 
 |     // Acquire sequences only need to wait on the previous atomic operation. | 
 |     // e.g. a typical sequence looks like | 
 |     //    atomic load | 
 |     //    (wait) | 
 |     //    global_inv | 
 |     // | 
 |     // We do not have BVH or SAMPLE atomics, so the atomic load is always going | 
 |     // to be tracked using loadcnt. | 
 |     // | 
 |     // This also applies to fences. Fences cannot pair with an instruction | 
 |     // tracked with bvh/samplecnt as we don't have any atomics that do that. | 
 |     if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) { | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); | 
 |     } | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   if (STORECnt) { | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   if (DSCnt) { | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                                         SIAtomicScope Scope, | 
 |                                         SIAtomicAddrSpace AddrSpace, | 
 |                                         Position Pos) const { | 
 |   if (!InsertCacheInv) | 
 |     return false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   /// The scratch address space does not need the global memory cache | 
 |   /// to be flushed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) | 
 |     return false; | 
 |  | 
 |   AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; | 
 |   switch (Scope) { | 
 |   case SIAtomicScope::SYSTEM: | 
 |     ScopeImm = AMDGPU::CPol::SCOPE_SYS; | 
 |     break; | 
 |   case SIAtomicScope::AGENT: | 
 |     ScopeImm = AMDGPU::CPol::SCOPE_DEV; | 
 |     break; | 
 |   case SIAtomicScope::CLUSTER: | 
 |     ScopeImm = AMDGPU::CPol::SCOPE_SE; | 
 |     break; | 
 |   case SIAtomicScope::WORKGROUP: | 
 |     // GFX12.0: | 
 |     //  In WGP mode the waves of a work-group can be executing on either CU of | 
 |     //  the WGP. Therefore we need to invalidate the L0 which is per CU. | 
 |     //  Otherwise in CU mode all waves of a work-group are on the same CU, and | 
 |     //  so the L0 does not need to be invalidated. | 
 |     // | 
 |     // GFX12.5 | 
 |     //   TODO DOCS | 
 |     if (ST.isCuModeEnabled()) | 
 |       return false; | 
 |  | 
 |     ScopeImm = AMDGPU::CPol::SCOPE_SE; | 
 |     break; | 
 |   case SIAtomicScope::WAVEFRONT: | 
 |   case SIAtomicScope::SINGLETHREAD: | 
 |     // No cache to invalidate. | 
 |     return false; | 
 |   default: | 
 |     llvm_unreachable("Unsupported synchronization scope"); | 
 |   } | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   return true; | 
 | } | 
 |  | 
 | bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, | 
 |                                         SIAtomicScope Scope, | 
 |                                         SIAtomicAddrSpace AddrSpace, | 
 |                                         bool IsCrossAddrSpaceOrdering, | 
 |                                         Position Pos) const { | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   // The scratch address space does not need the global memory cache | 
 |   // writeback as all memory operations by the same thread are | 
 |   // sequentially consistent, and no other thread can access scratch | 
 |   // memory. | 
 |  | 
 |   // Other address spaces do not have a cache. | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) | 
 |     return false; | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
  // A "global_wb" is only necessary at system scope for GFX12.0; it is also
  // necessary at device scope for GFX12.5.
  //
  // Emitting it for lower scopes is a slow no-op, so we omit it for
  // performance.
 |   switch (Scope) { | 
 |   case SIAtomicScope::SYSTEM: | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) | 
 |         .addImm(AMDGPU::CPol::SCOPE_SYS); | 
 |     break; | 
 |   case SIAtomicScope::AGENT: | 
 |     // TODO DOCS | 
 |     if (ST.hasGFX1250Insts()) { | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) | 
 |           .addImm(AMDGPU::CPol::SCOPE_DEV); | 
 |     } | 
 |     break; | 
 |   case SIAtomicScope::CLUSTER: | 
 |   case SIAtomicScope::WORKGROUP: | 
 |     // No WB necessary, but we still have to wait. | 
 |     break; | 
 |   case SIAtomicScope::WAVEFRONT: | 
 |   case SIAtomicScope::SINGLETHREAD: | 
 |     // No WB or wait necessary here. | 
 |     return false; | 
 |   default: | 
 |     llvm_unreachable("Unsupported synchronization scope"); | 
 |   } | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   // We always have to wait for previous memory operations (load/store) to | 
 |   // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), | 
 |   // we of course need to wait for that as well. | 
 |   insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, | 
 |              IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); | 
 |  | 
 |   return true; | 
 | } | 
 |  | 
 | bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( | 
 |     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { | 
 |  | 
 |   // Only handle load and store, not atomic read-modify-write instructions. | 
 |   assert(MI->mayLoad() ^ MI->mayStore()); | 
 |  | 
  // Only update loads and stores, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so we cannot
  // sensibly handle them here without pessimizing all atomics; they also do
  // not support the nontemporal attribute.
 |   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   if (IsLastUse) { | 
 |     // Set last-use hint. | 
 |     Changed |= setTH(MI, AMDGPU::CPol::TH_LU); | 
 |   } else if (IsNonTemporal) { | 
 |     // Set non-temporal hint for all cache levels. | 
 |     Changed |= setTH(MI, AMDGPU::CPol::TH_NT); | 
 |   } | 
 |  | 
 |   if (IsVolatile) { | 
 |     Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); | 
 |  | 
    // Ensure the operation has completed at system scope so that all volatile
    // operations are visible outside the program in a global order. Do not
    // request cross-address-space ordering, as only the global address space
    // is observable outside the program; there is no need to cause a waitcnt
    // for LDS address space operations.
 |     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, | 
 |                           Position::AFTER, AtomicOrdering::Unordered); | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
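
// Illustrative sketch for enableVolatileAndOrNonTemporal (assumed GFX12
// output): a volatile global load is forced to system scope and followed by
// a wait, e.g.
//   global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
//   s_wait_loadcnt 0x0
// while a nontemporal access only receives the th:TH_NT policy bit and a
// last-use load only th:TH_LU, with no extra wait.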
 |  | 
 | bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { | 
 |   assert(MI.mayStore() && "Not a Store inst"); | 
 |   const bool IsRMW = (MI.mayLoad() && MI.mayStore()); | 
 |   bool Changed = false; | 
 |  | 
  // GFX12.5 only: an xcnt wait is needed before FLAT and global atomic
  // stores/RMWs.
 |   if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) { | 
 |     MachineBasicBlock &MBB = *MI.getParent(); | 
 |     BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   // Remaining fixes do not apply to RMWs. | 
 |   if (IsRMW) | 
 |     return Changed; | 
 |  | 
 |   MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); | 
  if (!CPol) // Some VMEM operations have no cpol operand and are unaffected.
 |     return Changed; | 
 |   const unsigned Scope = CPol->getImm() & CPol::SCOPE; | 
 |  | 
 |   // GFX12.0 only: Extra waits needed before system scope stores. | 
 |   if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS) | 
 |     Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator()); | 
 |  | 
 |   return Changed; | 
 | } | 
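
// Illustrative sketch for finalizeStore (assuming the soft wait lowers to a
// plain xcnt wait; SIInsertWaitcnts may relax or strengthen it): on a
// subtarget requiring it, an atomic flat store is preceded by
//   s_wait_xcnt 0x0
//   flat_store_b32 ...
// and on GFX12.0 a non-atomic system-scope store additionally gets the waits
// from insertWaitsBeforeSystemScopeStore.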
 |  | 
 | bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const { | 
 |   if (!ST.hasGFX1250Insts()) | 
 |     return false; | 
 |  | 
 |   // Cooperative atomics need to be SCOPE_DEV or higher. | 
 |   MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); | 
 |   assert(CPol && "No CPol operand?"); | 
 |   const unsigned Scope = CPol->getImm() & CPol::SCOPE; | 
 |   if (Scope < CPol::SCOPE_DEV) | 
 |     return setScope(MI, CPol::SCOPE_DEV); | 
 |   return false; | 
 | } | 
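
// For example (sketch): a cooperative atomic whose cpol carries SCOPE_SE
// would be widened by handleCooperativeAtomic to SCOPE_DEV, while SCOPE_SYS
// is already wide enough and is left untouched.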
 |  | 
 | bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI, | 
 |                                          SIAtomicScope Scope, | 
 |                                          SIAtomicAddrSpace AddrSpace) const { | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); | 
 |       break; | 
 |     case SIAtomicScope::AGENT: | 
 |       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV); | 
 |       break; | 
 |     case SIAtomicScope::CLUSTER: | 
 |       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE); | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
      // In WGP mode, SCOPE_SE is needed as waves can execute on different
      // CUs that access different L0s.
 |       if (!ST.isCuModeEnabled()) | 
 |         Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE); | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to bypass. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   // The scratch address space does not need the global memory caches | 
 |   // to be bypassed as all memory operations by the same thread are | 
 |   // sequentially consistent, and no other thread can access scratch | 
 |   // memory. | 
 |  | 
 |   // Other address spaces do not have a cache. | 
 |  | 
 |   return Changed; | 
 | } | 
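
// Summary of the scope mapping applied by setAtomicScope for global accesses
// (sketch; in CU mode, workgroup scope is left at the default):
//   SYSTEM    -> SCOPE_SYS
//   AGENT     -> SCOPE_DEV
//   CLUSTER   -> SCOPE_SE
//   WORKGROUP -> SCOPE_SE (WGP mode only)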
 |  | 
 | bool SIMemoryLegalizer::removeAtomicPseudoMIs() { | 
 |   if (AtomicPseudoMIs.empty()) | 
 |     return false; | 
 |  | 
 |   for (auto &MI : AtomicPseudoMIs) | 
 |     MI->eraseFromParent(); | 
 |  | 
 |   AtomicPseudoMIs.clear(); | 
 |   return true; | 
 | } | 
 |  | 
 | bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, | 
 |                                    MachineBasicBlock::iterator &MI) { | 
 |   assert(MI->mayLoad() && !MI->mayStore()); | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   if (MOI.isAtomic()) { | 
 |     const AtomicOrdering Order = MOI.getOrdering(); | 
 |     if (Order == AtomicOrdering::Monotonic || | 
 |         Order == AtomicOrdering::Acquire || | 
 |         Order == AtomicOrdering::SequentiallyConsistent) { | 
 |       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(), | 
 |                                            MOI.getOrderingAddrSpace()); | 
 |     } | 
 |  | 
    // Handle cooperative atomics after the cache-bypass step, as doing so may
    // widen the scope of the instruction.
 |     if (MOI.isCooperative()) | 
 |       Changed |= CC->handleCooperativeAtomic(*MI); | 
 |  | 
 |     if (Order == AtomicOrdering::SequentiallyConsistent) | 
 |       Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), | 
 |                                 SIMemOp::LOAD | SIMemOp::STORE, | 
 |                                 MOI.getIsCrossAddressSpaceOrdering(), | 
 |                                 Position::BEFORE, Order); | 
 |  | 
 |     if (Order == AtomicOrdering::Acquire || | 
 |         Order == AtomicOrdering::SequentiallyConsistent) { | 
 |       Changed |= CC->insertWait( | 
 |           MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD, | 
 |           MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order); | 
 |       Changed |= CC->insertAcquire(MI, MOI.getScope(), | 
 |                                    MOI.getOrderingAddrSpace(), | 
 |                                    Position::AFTER); | 
 |     } | 
 |  | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   // Atomic instructions already bypass caches to the scope specified by the | 
 |   // SyncScope operand. Only non-atomic volatile and nontemporal/last-use | 
 |   // instructions need additional treatment. | 
 |   Changed |= CC->enableVolatileAndOrNonTemporal( | 
 |       MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(), | 
 |       MOI.isNonTemporal(), MOI.isLastUse()); | 
 |  | 
 |   return Changed; | 
 | } | 
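
// Illustrative end-to-end sketch for expandLoad (assumed GFX12 output, waits
// abbreviated): the IR
//   %v = load atomic i32, ptr addrspace(1) %p syncscope("agent") acquire, align 4
// becomes roughly
//   global_load_b32 v0, v[0:1], off scope:SCOPE_DEV
//   s_wait_loadcnt 0x0
//   global_inv scope:SCOPE_DEV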
 |  | 
 | bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, | 
 |                                     MachineBasicBlock::iterator &MI) { | 
 |   assert(!MI->mayLoad() && MI->mayStore()); | 
 |  | 
 |   bool Changed = false; | 
 |   // FIXME: Necessary hack because iterator can lose track of the store. | 
 |   MachineInstr &StoreMI = *MI; | 
 |  | 
 |   if (MOI.isAtomic()) { | 
 |     if (MOI.getOrdering() == AtomicOrdering::Monotonic || | 
 |         MOI.getOrdering() == AtomicOrdering::Release || | 
 |         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { | 
 |       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(), | 
 |                                             MOI.getOrderingAddrSpace()); | 
 |     } | 
 |  | 
    // Handle cooperative atomics after the cache-bypass step, as doing so may
    // widen the scope of the instruction.
 |     if (MOI.isCooperative()) | 
 |       Changed |= CC->handleCooperativeAtomic(*MI); | 
 |  | 
 |     if (MOI.getOrdering() == AtomicOrdering::Release || | 
 |         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) | 
 |       Changed |= CC->insertRelease(MI, MOI.getScope(), | 
 |                                    MOI.getOrderingAddrSpace(), | 
 |                                    MOI.getIsCrossAddressSpaceOrdering(), | 
 |                                    Position::BEFORE); | 
 |  | 
 |     Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true); | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   // Atomic instructions already bypass caches to the scope specified by the | 
 |   // SyncScope operand. Only non-atomic volatile and nontemporal instructions | 
 |   // need additional treatment. | 
 |   Changed |= CC->enableVolatileAndOrNonTemporal( | 
 |       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(), | 
 |       MOI.isNonTemporal()); | 
 |  | 
  // GFX12 specific: the scope (the desired coherence domain in the cache
  // hierarchy) is an instruction field; do not confuse it with the atomic
  // scope.
 |   Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false); | 
 |   return Changed; | 
 | } | 
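
// Illustrative end-to-end sketch for expandStore (assumed GFX12.0 output,
// waits abbreviated): the system-scope IR
//   store atomic i32 %v, ptr addrspace(1) %p release, align 4
// becomes roughly
//   global_wb scope:SCOPE_SYS
//   s_wait_loadcnt 0x0
//   s_wait_storecnt 0x0
//   global_store_b32 v[0:1], v2, off scope:SCOPE_SYS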
 |  | 
 | bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, | 
 |                                           MachineBasicBlock::iterator &MI) { | 
 |   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE); | 
 |  | 
 |   AtomicPseudoMIs.push_back(MI); | 
 |   bool Changed = false; | 
 |  | 
 |   const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace(); | 
 |  | 
 |   if (MOI.isAtomic()) { | 
 |     const AtomicOrdering Order = MOI.getOrdering(); | 
 |     if (Order == AtomicOrdering::Acquire) { | 
 |       Changed |= CC->insertWait( | 
 |           MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE, | 
 |           MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order); | 
 |     } | 
 |  | 
 |     if (Order == AtomicOrdering::Release || | 
 |         Order == AtomicOrdering::AcquireRelease || | 
 |         Order == AtomicOrdering::SequentiallyConsistent) | 
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Support for barriers could be added to this
      /// file; SIInsertWaitcnts.cpp could then stop unconditionally
      /// adding an S_WAITCNT before an S_BARRIER.
 |       Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace, | 
 |                                    MOI.getIsCrossAddressSpaceOrdering(), | 
 |                                    Position::BEFORE); | 
 |  | 
    // TODO: If both a release and an invalidate are happening, they could be
    // combined into a single "BUFFER_WBINV*" instruction. This could be done
    // by reorganizing this code, or as part of optimizing the SIInsertWaitcnts
    // pass to track cache invalidate and write back instructions.
 |  | 
 |     if (Order == AtomicOrdering::Acquire || | 
 |         Order == AtomicOrdering::AcquireRelease || | 
 |         Order == AtomicOrdering::SequentiallyConsistent) | 
 |       Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace, | 
 |                                    Position::BEFORE); | 
 |  | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
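
// Illustrative sketch for expandAtomicFence (assumed GFX12 output): a
// `fence syncscope("agent") seq_cst` is replaced by release waits followed
// by an invalidate, e.g.
//   s_wait_loadcnt 0x0
//   s_wait_storecnt 0x0
//   global_inv scope:SCOPE_DEV
// and the ATOMIC_FENCE pseudo itself is erased by removeAtomicPseudoMIs().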
 |  | 
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(
    const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI) {
 |   assert(MI->mayLoad() && MI->mayStore()); | 
 |  | 
 |   bool Changed = false; | 
 |   MachineInstr &RMWMI = *MI; | 
 |  | 
 |   if (MOI.isAtomic()) { | 
 |     const AtomicOrdering Order = MOI.getOrdering(); | 
 |     if (Order == AtomicOrdering::Monotonic || | 
 |         Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release || | 
 |         Order == AtomicOrdering::AcquireRelease || | 
 |         Order == AtomicOrdering::SequentiallyConsistent) { | 
 |       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(), | 
 |                                           MOI.getInstrAddrSpace()); | 
 |     } | 
 |  | 
 |     if (Order == AtomicOrdering::Release || | 
 |         Order == AtomicOrdering::AcquireRelease || | 
 |         Order == AtomicOrdering::SequentiallyConsistent || | 
 |         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) | 
 |       Changed |= CC->insertRelease(MI, MOI.getScope(), | 
 |                                    MOI.getOrderingAddrSpace(), | 
 |                                    MOI.getIsCrossAddressSpaceOrdering(), | 
 |                                    Position::BEFORE); | 
 |  | 
 |     if (Order == AtomicOrdering::Acquire || | 
 |         Order == AtomicOrdering::AcquireRelease || | 
 |         Order == AtomicOrdering::SequentiallyConsistent || | 
 |         MOI.getFailureOrdering() == AtomicOrdering::Acquire || | 
 |         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { | 
 |       Changed |= CC->insertWait( | 
 |           MI, MOI.getScope(), MOI.getInstrAddrSpace(), | 
 |           isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE, | 
 |           MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order); | 
 |       Changed |= CC->insertAcquire(MI, MOI.getScope(), | 
 |                                    MOI.getOrderingAddrSpace(), | 
 |                                    Position::AFTER); | 
 |     } | 
 |  | 
 |     Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true); | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
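
// Illustrative sketch for expandAtomicCmpxchgOrRmw (assumed GFX12 output for
// a returning RMW, waits abbreviated): `atomicrmw add ... syncscope("agent")
// seq_cst` becomes roughly
//   <release waits, as in the expandStore sketch>
//   global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
//   s_wait_loadcnt 0x0
//   global_inv scope:SCOPE_DEV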
 |  | 
 | bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) { | 
 |   const MachineModuleInfo &MMI = | 
 |       getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); | 
 |   return SIMemoryLegalizer(MMI).run(MF); | 
 | } | 
 |  | 
 | PreservedAnalyses | 
 | SIMemoryLegalizerPass::run(MachineFunction &MF, | 
 |                            MachineFunctionAnalysisManager &MFAM) { | 
 |   auto *MMI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF) | 
 |                   .getCachedResult<MachineModuleAnalysis>( | 
 |                       *MF.getFunction().getParent()); | 
 |   assert(MMI && "MachineModuleAnalysis must be available"); | 
 |   if (!SIMemoryLegalizer(MMI->getMMI()).run(MF)) | 
 |     return PreservedAnalyses::all(); | 
 |   return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>(); | 
 | } | 
 |  | 
 | bool SIMemoryLegalizer::run(MachineFunction &MF) { | 
 |   bool Changed = false; | 
 |  | 
 |   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | 
 |   SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST); | 
 |   CC = SICacheControl::create(ST); | 
 |  | 
 |   for (auto &MBB : MF) { | 
 |     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { | 
 |  | 
 |       // Unbundle instructions after the post-RA scheduler. | 
 |       if (MI->isBundle() && MI->mayLoadOrStore()) { | 
 |         MachineBasicBlock::instr_iterator II(MI->getIterator()); | 
 |         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end(); | 
 |              I != E && I->isBundledWithPred(); ++I) { | 
 |           I->unbundleFromPred(); | 
 |           for (MachineOperand &MO : I->operands()) | 
 |             if (MO.isReg()) | 
 |               MO.setIsInternalRead(false); | 
 |         } | 
 |  | 
 |         MI->eraseFromParent(); | 
 |         MI = II->getIterator(); | 
 |       } | 
 |  | 
 |       if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) { | 
 |         Changed |= CC->insertBarrierStart(MI); | 
 |         continue; | 
 |       } | 
 |  | 
 |       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) | 
 |         continue; | 
 |  | 
      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(*MOI, MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(*MOI, MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(*MOI, MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
 |     } | 
 |   } | 
 |  | 
 |   Changed |= removeAtomicPseudoMIs(); | 
 |   return Changed; | 
 | } | 
 |  | 
 | INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false) | 
 |  | 
 | char SIMemoryLegalizerLegacy::ID = 0; | 
 | char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID; | 
 |  | 
 | FunctionPass *llvm::createSIMemoryLegalizerPass() { | 
 |   return new SIMemoryLegalizerLegacy(); | 
 | } |