| //===- SIMemoryLegalizer.cpp ----------------------------------------------===// | 
 | // | 
 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
 | // See https://llvm.org/LICENSE.txt for license information. | 
 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
 | // | 
 | //===----------------------------------------------------------------------===// | 
 | // | 
 | /// \file | 
| /// Memory legalizer - implements the memory model. More information can be | 
 | /// found here: | 
 | ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model | 
 | // | 
 | //===----------------------------------------------------------------------===// | 
 |  | 
 | #include "AMDGPU.h" | 
 | #include "AMDGPUMachineModuleInfo.h" | 
 | #include "GCNSubtarget.h" | 
 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" | 
 | #include "llvm/ADT/BitmaskEnum.h" | 
 | #include "llvm/ADT/StringExtras.h" | 
 | #include "llvm/CodeGen/MachineBasicBlock.h" | 
 | #include "llvm/CodeGen/MachineFunctionPass.h" | 
 | #include "llvm/CodeGen/MachinePassManager.h" | 
 | #include "llvm/IR/DiagnosticInfo.h" | 
 | #include "llvm/IR/MemoryModelRelaxationAnnotations.h" | 
 | #include "llvm/IR/PassManager.h" | 
 | #include "llvm/Support/AtomicOrdering.h" | 
 | #include "llvm/TargetParser/TargetParser.h" | 
 |  | 
 | using namespace llvm; | 
 | using namespace llvm::AMDGPU; | 
 |  | 
 | #define DEBUG_TYPE "si-memory-legalizer" | 
 | #define PASS_NAME "SI Memory Legalizer" | 
 |  | 
 | static cl::opt<bool> AmdgcnSkipCacheInvalidations( | 
 |     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, | 
 |     cl::desc("Use this to skip inserting cache invalidating instructions.")); | 
 |  | 
 | namespace { | 
 |  | 
 | LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); | 
 |  | 
 | /// Memory operation flags. Can be ORed together. | 
 | enum class SIMemOp { | 
 |   NONE = 0u, | 
 |   LOAD = 1u << 0, | 
 |   STORE = 1u << 1, | 
 |   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE) | 
 | }; | 
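| // For example, a release must order both prior loads and prior stores; this | 
| // is expressed below as SIMemOp::LOAD | SIMemOp::STORE (see insertRelease). | 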
 |  | 
 | /// Position to insert a new instruction relative to an existing | 
 | /// instruction. | 
 | enum class Position { | 
 |   BEFORE, | 
 |   AFTER | 
 | }; | 
 |  | 
 | /// The atomic synchronization scopes supported by the AMDGPU target. | 
 | enum class SIAtomicScope { | 
 |   NONE, | 
 |   SINGLETHREAD, | 
 |   WAVEFRONT, | 
 |   WORKGROUP, | 
 |   CLUSTER, // Promoted to AGENT on targets without workgroup clusters. | 
 |   AGENT, | 
 |   SYSTEM | 
 | }; | 
 |  | 
 | /// The distinct address spaces supported by the AMDGPU target for | 
| /// atomic memory operations. Can be ORed together. | 
 | enum class SIAtomicAddrSpace { | 
 |   NONE = 0u, | 
 |   GLOBAL = 1u << 0, | 
 |   LDS = 1u << 1, | 
 |   SCRATCH = 1u << 2, | 
 |   GDS = 1u << 3, | 
 |   OTHER = 1u << 4, | 
 |  | 
 |   /// The address spaces that can be accessed by a FLAT instruction. | 
 |   FLAT = GLOBAL | LDS | SCRATCH, | 
 |  | 
 |   /// The address spaces that support atomic instructions. | 
 |   ATOMIC = GLOBAL | LDS | SCRATCH | GDS, | 
 |  | 
 |   /// All address spaces. | 
 |   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER, | 
 |  | 
 |   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) | 
 | }; | 
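| // Membership is tested by masking, e.g. an access touches global memory | 
| // iff (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE. | 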
 |  | 
 | class SIMemOpInfo final { | 
 | private: | 
 |  | 
 |   friend class SIMemOpAccess; | 
 |  | 
 |   AtomicOrdering Ordering = AtomicOrdering::NotAtomic; | 
 |   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; | 
 |   SIAtomicScope Scope = SIAtomicScope::SYSTEM; | 
 |   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; | 
 |   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; | 
 |   bool IsCrossAddressSpaceOrdering = false; | 
 |   bool IsVolatile = false; | 
 |   bool IsNonTemporal = false; | 
 |   bool IsLastUse = false; | 
 |   bool IsCooperative = false; | 
 |  | 
 |   SIMemOpInfo( | 
 |       const GCNSubtarget &ST, | 
 |       AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, | 
 |       SIAtomicScope Scope = SIAtomicScope::SYSTEM, | 
 |       SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC, | 
 |       SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL, | 
 |       bool IsCrossAddressSpaceOrdering = true, | 
 |       AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent, | 
 |       bool IsVolatile = false, bool IsNonTemporal = false, | 
 |       bool IsLastUse = false, bool IsCooperative = false) | 
 |       : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope), | 
 |         OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace), | 
 |         IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), | 
 |         IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal), | 
 |         IsLastUse(IsLastUse), IsCooperative(IsCooperative) { | 
 |  | 
 |     if (Ordering == AtomicOrdering::NotAtomic) { | 
 |       assert(!IsCooperative && "Cannot be cooperative & non-atomic!"); | 
 |       assert(Scope == SIAtomicScope::NONE && | 
 |              OrderingAddrSpace == SIAtomicAddrSpace::NONE && | 
 |              !IsCrossAddressSpaceOrdering && | 
 |              FailureOrdering == AtomicOrdering::NotAtomic); | 
 |       return; | 
 |     } | 
 |  | 
 |     assert(Scope != SIAtomicScope::NONE && | 
 |            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != | 
 |                SIAtomicAddrSpace::NONE && | 
 |            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) != | 
 |                SIAtomicAddrSpace::NONE); | 
 |  | 
| // There is no cross address space ordering if the ordering address space | 
| // is the same as the instruction address space and only contains a single | 
| // address space (e.g. an ordering over only GLOBAL for an instruction that | 
| // only accesses GLOBAL). | 
 |     if ((OrderingAddrSpace == InstrAddrSpace) && | 
 |         isPowerOf2_32(uint32_t(InstrAddrSpace))) | 
 |       this->IsCrossAddressSpaceOrdering = false; | 
 |  | 
 |     // Limit the scope to the maximum supported by the instruction's address | 
 |     // spaces. | 
 |     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) == | 
 |         SIAtomicAddrSpace::NONE) { | 
 |       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD); | 
 |     } else if ((InstrAddrSpace & | 
 |                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) == | 
 |                SIAtomicAddrSpace::NONE) { | 
 |       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP); | 
 |     } else if ((InstrAddrSpace & | 
 |                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS | | 
 |                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) { | 
 |       this->Scope = std::min(Scope, SIAtomicScope::AGENT); | 
 |     } | 
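| // E.g. the chain above limits an instruction that only accesses LDS to | 
| // WORKGROUP scope, since LDS is only shared within a work-group. | 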
 |  | 
 |     // On targets that have no concept of a workgroup cluster, use | 
 |     // AGENT scope as a conservatively correct alternative. | 
 |     if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters()) | 
 |       this->Scope = SIAtomicScope::AGENT; | 
 |   } | 
 |  | 
 | public: | 
 |   /// \returns Atomic synchronization scope of the machine instruction used to | 
 |   /// create this SIMemOpInfo. | 
 |   SIAtomicScope getScope() const { | 
 |     return Scope; | 
 |   } | 
 |  | 
 |   /// \returns Ordering constraint of the machine instruction used to | 
 |   /// create this SIMemOpInfo. | 
 |   AtomicOrdering getOrdering() const { | 
 |     return Ordering; | 
 |   } | 
 |  | 
 |   /// \returns Failure ordering constraint of the machine instruction used to | 
 |   /// create this SIMemOpInfo. | 
 |   AtomicOrdering getFailureOrdering() const { | 
 |     return FailureOrdering; | 
 |   } | 
 |  | 
| /// \returns The address spaces accessed by the machine | 
 |   /// instruction used to create this SIMemOpInfo. | 
 |   SIAtomicAddrSpace getInstrAddrSpace() const { | 
 |     return InstrAddrSpace; | 
 |   } | 
 |  | 
 |   /// \returns The address spaces that must be ordered by the machine | 
 |   /// instruction used to create this SIMemOpInfo. | 
 |   SIAtomicAddrSpace getOrderingAddrSpace() const { | 
 |     return OrderingAddrSpace; | 
 |   } | 
 |  | 
| /// \returns True iff memory ordering of operations on | 
 |   /// different address spaces is required. | 
 |   bool getIsCrossAddressSpaceOrdering() const { | 
 |     return IsCrossAddressSpaceOrdering; | 
 |   } | 
 |  | 
 |   /// \returns True if memory access of the machine instruction used to | 
 |   /// create this SIMemOpInfo is volatile, false otherwise. | 
 |   bool isVolatile() const { | 
 |     return IsVolatile; | 
 |   } | 
 |  | 
 |   /// \returns True if memory access of the machine instruction used to | 
 |   /// create this SIMemOpInfo is nontemporal, false otherwise. | 
 |   bool isNonTemporal() const { | 
 |     return IsNonTemporal; | 
 |   } | 
 |  | 
 |   /// \returns True if memory access of the machine instruction used to | 
 |   /// create this SIMemOpInfo is last use, false otherwise. | 
 |   bool isLastUse() const { return IsLastUse; } | 
 |  | 
 |   /// \returns True if this is a cooperative load or store atomic. | 
 |   bool isCooperative() const { return IsCooperative; } | 
 |  | 
 |   /// \returns True if ordering constraint of the machine instruction used to | 
 |   /// create this SIMemOpInfo is unordered or higher, false otherwise. | 
 |   bool isAtomic() const { | 
 |     return Ordering != AtomicOrdering::NotAtomic; | 
 |   } | 
 |  | 
 | }; | 
 |  | 
 | class SIMemOpAccess final { | 
 | private: | 
 |   const AMDGPUMachineModuleInfo *MMI = nullptr; | 
 |   const GCNSubtarget &ST; | 
 |  | 
 |   /// Reports unsupported message \p Msg for \p MI to LLVM context. | 
 |   void reportUnsupported(const MachineBasicBlock::iterator &MI, | 
 |                          const char *Msg) const; | 
 |  | 
 |   /// Inspects the target synchronization scope \p SSID and determines | 
 |   /// the SI atomic scope it corresponds to, the address spaces it | 
 |   /// covers, and whether the memory ordering applies between address | 
 |   /// spaces. | 
 |   std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> | 
 |   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const; | 
 |  | 
| /// \returns A bit set of the SI atomic address spaces corresponding to \p AS. | 
 |   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const; | 
 |  | 
| /// \returns Info constructed from \p MI, which has at least one machine | 
| /// memory operand. | 
 |   std::optional<SIMemOpInfo> | 
 |   constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const; | 
 |  | 
 | public: | 
 |   /// Construct class to support accessing the machine memory operands | 
 |   /// of instructions in the machine function \p MF. | 
 |   SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST); | 
 |  | 
 |   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise. | 
 |   std::optional<SIMemOpInfo> | 
 |   getLoadInfo(const MachineBasicBlock::iterator &MI) const; | 
 |  | 
 |   /// \returns Store info if \p MI is a store operation, "std::nullopt" | 
 |   /// otherwise. | 
 |   std::optional<SIMemOpInfo> | 
 |   getStoreInfo(const MachineBasicBlock::iterator &MI) const; | 
 |  | 
 |   /// \returns Atomic fence info if \p MI is an atomic fence operation, | 
 |   /// "std::nullopt" otherwise. | 
 |   std::optional<SIMemOpInfo> | 
 |   getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const; | 
 |  | 
 |   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or | 
 |   /// rmw operation, "std::nullopt" otherwise. | 
 |   std::optional<SIMemOpInfo> | 
 |   getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const; | 
 | }; | 
 |  | 
 | class SICacheControl { | 
 | protected: | 
 |  | 
 |   /// AMDGPU subtarget info. | 
 |   const GCNSubtarget &ST; | 
 |  | 
 |   /// Instruction info. | 
 |   const SIInstrInfo *TII = nullptr; | 
 |  | 
 |   IsaVersion IV; | 
 |  | 
 |   /// Whether to insert cache invalidating instructions. | 
 |   bool InsertCacheInv; | 
 |  | 
 |   SICacheControl(const GCNSubtarget &ST); | 
 |  | 
| /// Sets named bit \p Bit to "true" if present in instruction \p MI. | 
| /// \returns True if \p MI is modified, false otherwise. | 
 |   bool enableNamedBit(const MachineBasicBlock::iterator MI, | 
 |                       AMDGPU::CPol::CPol Bit) const; | 
 |  | 
 | public: | 
 |  | 
 |   /// Create a cache control for the subtarget \p ST. | 
 |   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); | 
 |  | 
 |   /// Update \p MI memory load instruction to bypass any caches up to | 
 |   /// the \p Scope memory scope for address spaces \p | 
 |   /// AddrSpace. Return true iff the instruction was modified. | 
 |   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                                      SIAtomicScope Scope, | 
 |                                      SIAtomicAddrSpace AddrSpace) const = 0; | 
 |  | 
 |   /// Update \p MI memory store instruction to bypass any caches up to | 
 |   /// the \p Scope memory scope for address spaces \p | 
 |   /// AddrSpace. Return true iff the instruction was modified. | 
 |   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                                       SIAtomicScope Scope, | 
 |                                       SIAtomicAddrSpace AddrSpace) const = 0; | 
 |  | 
 |   /// Update \p MI memory read-modify-write instruction to bypass any caches up | 
 |   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true | 
 |   /// iff the instruction was modified. | 
 |   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                                     SIAtomicScope Scope, | 
 |                                     SIAtomicAddrSpace AddrSpace) const = 0; | 
 |  | 
 |   /// Update \p MI memory instruction of kind \p Op associated with address | 
 |   /// spaces \p AddrSpace to indicate it is volatile and/or | 
 |   /// nontemporal/last-use. Return true iff the instruction was modified. | 
 |   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, | 
 |                                               SIAtomicAddrSpace AddrSpace, | 
 |                                               SIMemOp Op, bool IsVolatile, | 
 |                                               bool IsNonTemporal, | 
 |                                               bool IsLastUse = false) const = 0; | 
 |  | 
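| /// Performs target-specific finalization of store \p MI, where \p Atomic | 
| /// indicates an atomic store. Returns true iff \p MI is modified. | 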
 |   virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const { | 
 |     return false; | 
| } | 
 |  | 
 |   /// Handle cooperative load/store atomics. | 
 |   virtual bool handleCooperativeAtomic(MachineInstr &MI) const { | 
 |     llvm_unreachable( | 
 |         "cooperative atomics are not available on this architecture"); | 
 |   } | 
 |  | 
 |   /// Inserts any necessary instructions at position \p Pos relative | 
 |   /// to instruction \p MI to ensure memory instructions before \p Pos of kind | 
 |   /// \p Op associated with address spaces \p AddrSpace have completed. Used | 
 |   /// between memory instructions to enforce the order they become visible as | 
 |   /// observed by other memory instructions executing in memory scope \p Scope. | 
 |   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between | 
 |   /// address spaces. Returns true iff any instructions inserted. | 
 |   virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                           SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                           bool IsCrossAddrSpaceOrdering, Position Pos, | 
 |                           AtomicOrdering Order) const = 0; | 
 |  | 
 |   /// Inserts any necessary instructions at position \p Pos relative to | 
 |   /// instruction \p MI to ensure any subsequent memory instructions of this | 
 |   /// thread with address spaces \p AddrSpace will observe the previous memory | 
| /// operations by any thread for memory scopes up to memory scope \p Scope. | 
 |   /// Returns true iff any instructions inserted. | 
 |   virtual bool insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                              SIAtomicScope Scope, | 
 |                              SIAtomicAddrSpace AddrSpace, | 
 |                              Position Pos) const = 0; | 
 |  | 
 |   /// Inserts any necessary instructions at position \p Pos relative to | 
 |   /// instruction \p MI to ensure previous memory instructions by this thread | 
 |   /// with address spaces \p AddrSpace have completed and can be observed by | 
 |   /// subsequent memory instructions by any thread executing in memory scope \p | 
 |   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is | 
 |   /// between address spaces. Returns true iff any instructions inserted. | 
 |   virtual bool insertRelease(MachineBasicBlock::iterator &MI, | 
 |                              SIAtomicScope Scope, | 
 |                              SIAtomicAddrSpace AddrSpace, | 
 |                              bool IsCrossAddrSpaceOrdering, | 
 |                              Position Pos) const = 0; | 
 |  | 
 |   /// Inserts any necessary instructions before the barrier start instruction | 
 |   /// \p MI in order to support pairing of barriers and fences. | 
 |   virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const { | 
 |     return false; | 
| } | 
 |  | 
 |   /// Virtual destructor to allow derivations to be deleted. | 
 |   virtual ~SICacheControl() = default; | 
 | }; | 
 |  | 
 | class SIGfx6CacheControl : public SICacheControl { | 
 | protected: | 
 |  | 
 |   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI | 
 |   /// is modified, false otherwise. | 
 |   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { | 
 |     return enableNamedBit(MI, AMDGPU::CPol::GLC); | 
 |   } | 
 |  | 
 |   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI | 
 |   /// is modified, false otherwise. | 
 |   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { | 
 |     return enableNamedBit(MI, AMDGPU::CPol::SLC); | 
 |   } | 
 |  | 
 | public: | 
 |  | 
 |   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} | 
 |  | 
 |   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                              SIAtomicScope Scope, | 
 |                              SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                               SIAtomicScope Scope, | 
 |                               SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                             SIAtomicScope Scope, | 
 |                             SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, | 
 |                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                       bool IsVolatile, bool IsNonTemporal, | 
 |                                       bool IsLastUse) const override; | 
 |  | 
 |   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                   SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                   bool IsCrossAddrSpaceOrdering, Position Pos, | 
 |                   AtomicOrdering Order) const override; | 
 |  | 
 |   bool insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                      SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, | 
 |                      Position Pos) const override; | 
 |  | 
 |   bool insertRelease(MachineBasicBlock::iterator &MI, | 
 |                      SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, | 
 |                      bool IsCrossAddrSpaceOrdering, | 
 |                      Position Pos) const override; | 
 | }; | 
 |  | 
 | class SIGfx7CacheControl : public SIGfx6CacheControl { | 
 | public: | 
 |  | 
 |   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {} | 
 |  | 
 |   bool insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                      SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, | 
 |                      Position Pos) const override; | 
 |  | 
 | }; | 
 |  | 
 | class SIGfx90ACacheControl : public SIGfx7CacheControl { | 
 | public: | 
 |  | 
 |   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} | 
 |  | 
 |   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                              SIAtomicScope Scope, | 
 |                              SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                               SIAtomicScope Scope, | 
 |                               SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                             SIAtomicScope Scope, | 
 |                             SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, | 
 |                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                       bool IsVolatile, bool IsNonTemporal, | 
 |                                       bool IsLastUse) const override; | 
 |  | 
 |   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                   SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                   bool IsCrossAddrSpaceOrdering, Position Pos, | 
 |                   AtomicOrdering Order) const override; | 
 |  | 
 |   bool insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                      SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, | 
 |                      Position Pos) const override; | 
 |  | 
 |   bool insertRelease(MachineBasicBlock::iterator &MI, | 
 |                      SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, | 
 |                      bool IsCrossAddrSpaceOrdering, | 
 |                      Position Pos) const override; | 
 | }; | 
 |  | 
 | class SIGfx940CacheControl : public SIGfx90ACacheControl { | 
 | protected: | 
 |  | 
 |   /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI | 
 |   /// is modified, false otherwise. | 
 |   bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const { | 
 |     return enableNamedBit(MI, AMDGPU::CPol::SC0); | 
 |   } | 
 |  | 
 |   /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI | 
 |   /// is modified, false otherwise. | 
 |   bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const { | 
 |     return enableNamedBit(MI, AMDGPU::CPol::SC1); | 
 |   } | 
 |  | 
 |   /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI | 
 |   /// is modified, false otherwise. | 
 |   bool enableNTBit(const MachineBasicBlock::iterator &MI) const { | 
 |     return enableNamedBit(MI, AMDGPU::CPol::NT); | 
 |   } | 
 |  | 
 | public: | 
| SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {} | 
 |  | 
 |   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                              SIAtomicScope Scope, | 
 |                              SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                               SIAtomicScope Scope, | 
 |                               SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                             SIAtomicScope Scope, | 
 |                             SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, | 
 |                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                       bool IsVolatile, bool IsNonTemporal, | 
 |                                       bool IsLastUse) const override; | 
 |  | 
 |   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, Position Pos) const override; | 
 |  | 
 |   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, | 
 |                      Position Pos) const override; | 
 | }; | 
 |  | 
 | class SIGfx10CacheControl : public SIGfx7CacheControl { | 
 | protected: | 
 |  | 
 |   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI | 
 |   /// is modified, false otherwise. | 
 |   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const { | 
 |     return enableNamedBit(MI, AMDGPU::CPol::DLC); | 
 |   } | 
 |  | 
 | public: | 
 |  | 
 |   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} | 
 |  | 
 |   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                              SIAtomicScope Scope, | 
 |                              SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, | 
 |                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                       bool IsVolatile, bool IsNonTemporal, | 
 |                                       bool IsLastUse) const override; | 
 |  | 
 |   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                   SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                   bool IsCrossAddrSpaceOrdering, Position Pos, | 
 |                   AtomicOrdering Order) const override; | 
 |  | 
 |   bool insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                      SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, | 
 |                      Position Pos) const override; | 
 |  | 
 |   bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override; | 
 | }; | 
 |  | 
 | class SIGfx11CacheControl : public SIGfx10CacheControl { | 
 | public: | 
 |   SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {} | 
 |  | 
 |   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                              SIAtomicScope Scope, | 
 |                              SIAtomicAddrSpace AddrSpace) const override; | 
 |  | 
 |   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, | 
 |                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                       bool IsVolatile, bool IsNonTemporal, | 
 |                                       bool IsLastUse) const override; | 
 | }; | 
 |  | 
 | class SIGfx12CacheControl : public SIGfx11CacheControl { | 
 | protected: | 
 |   // Sets TH policy to \p Value if CPol operand is present in instruction \p MI. | 
| // \returns True if \p MI is modified, false otherwise. | 
 |   bool setTH(const MachineBasicBlock::iterator MI, | 
 |              AMDGPU::CPol::CPol Value) const; | 
 |   // Sets Scope policy to \p Value if CPol operand is present in instruction \p | 
| // MI. \returns True if \p MI is modified, false otherwise. | 
 |   bool setScope(const MachineBasicBlock::iterator MI, | 
 |                 AMDGPU::CPol::CPol Value) const; | 
 |  | 
 |   // Stores with system scope (SCOPE_SYS) need to wait for: | 
| // - loads or atomics (returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0 | 
 |   // - non-returning-atomics       - wait for STORECNT==0 | 
 |   //   TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits | 
 |   //   since it does not distinguish atomics-with-return from regular stores. | 
 |   // There is no need to wait if memory is cached (mtype != UC). | 
 |   bool | 
 |   insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const; | 
 |  | 
 |   bool setAtomicScope(const MachineBasicBlock::iterator &MI, | 
 |                       SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const; | 
 |  | 
 | public: | 
 |   SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) { | 
 |     // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases | 
 |     // the behavior is the same if assuming GFX12.0 in CU mode. | 
 |     assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled()); | 
 |   } | 
 |  | 
 |   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                   SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                   bool IsCrossAddrSpaceOrdering, Position Pos, | 
 |                   AtomicOrdering Order) const override; | 
 |  | 
 |   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, Position Pos) const override; | 
 |  | 
 |   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, | 
 |                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                       bool IsVolatile, bool IsNonTemporal, | 
 |                                       bool IsLastUse) const override; | 
 |  | 
 |   bool finalizeStore(MachineInstr &MI, bool Atomic) const override; | 
 |  | 
| bool handleCooperativeAtomic(MachineInstr &MI) const override; | 
 |  | 
 |   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, | 
 |                      Position Pos) const override; | 
 |  | 
 |   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                              SIAtomicScope Scope, | 
 |                              SIAtomicAddrSpace AddrSpace) const override { | 
 |     return setAtomicScope(MI, Scope, AddrSpace); | 
 |   } | 
 |  | 
 |   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                               SIAtomicScope Scope, | 
 |                               SIAtomicAddrSpace AddrSpace) const override { | 
 |     return setAtomicScope(MI, Scope, AddrSpace); | 
 |   } | 
 |  | 
 |   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, | 
 |                             SIAtomicScope Scope, | 
 |                             SIAtomicAddrSpace AddrSpace) const override { | 
 |     return setAtomicScope(MI, Scope, AddrSpace); | 
 |   } | 
 | }; | 
 |  | 
 | class SIMemoryLegalizer final { | 
 | private: | 
 |   const MachineModuleInfo &MMI; | 
 |   /// Cache Control. | 
 |   std::unique_ptr<SICacheControl> CC = nullptr; | 
 |  | 
 |   /// List of atomic pseudo instructions. | 
 |   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs; | 
 |  | 
| /// Return true iff instruction \p MI is an atomic instruction that | 
 |   /// returns a result. | 
 |   bool isAtomicRet(const MachineInstr &MI) const { | 
 |     return SIInstrInfo::isAtomicRet(MI); | 
 |   } | 
 |  | 
 |   /// Removes all processed atomic pseudo instructions from the current | 
 |   /// function. Returns true if current function is modified, false otherwise. | 
 |   bool removeAtomicPseudoMIs(); | 
 |  | 
 |   /// Expands load operation \p MI. Returns true if instructions are | 
 |   /// added/deleted or \p MI is modified, false otherwise. | 
 |   bool expandLoad(const SIMemOpInfo &MOI, | 
 |                   MachineBasicBlock::iterator &MI); | 
 |   /// Expands store operation \p MI. Returns true if instructions are | 
 |   /// added/deleted or \p MI is modified, false otherwise. | 
 |   bool expandStore(const SIMemOpInfo &MOI, | 
 |                    MachineBasicBlock::iterator &MI); | 
 |   /// Expands atomic fence operation \p MI. Returns true if | 
 |   /// instructions are added/deleted or \p MI is modified, false otherwise. | 
 |   bool expandAtomicFence(const SIMemOpInfo &MOI, | 
 |                          MachineBasicBlock::iterator &MI); | 
 |   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if | 
 |   /// instructions are added/deleted or \p MI is modified, false otherwise. | 
 |   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, | 
 |                                 MachineBasicBlock::iterator &MI); | 
 |  | 
 | public: | 
| SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {} | 
 |   bool run(MachineFunction &MF); | 
 | }; | 
 |  | 
 | class SIMemoryLegalizerLegacy final : public MachineFunctionPass { | 
 | public: | 
 |   static char ID; | 
 |  | 
 |   SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {} | 
 |  | 
 |   void getAnalysisUsage(AnalysisUsage &AU) const override { | 
 |     AU.setPreservesCFG(); | 
 |     MachineFunctionPass::getAnalysisUsage(AU); | 
 |   } | 
 |  | 
 |   StringRef getPassName() const override { | 
 |     return PASS_NAME; | 
 |   } | 
 |  | 
 |   bool runOnMachineFunction(MachineFunction &MF) override; | 
 | }; | 
 |  | 
 | static const StringMap<SIAtomicAddrSpace> ASNames = {{ | 
 |     {"global", SIAtomicAddrSpace::GLOBAL}, | 
 |     {"local", SIAtomicAddrSpace::LDS}, | 
 | }}; | 
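| // Sketch of IR these names come from (assumed syntax; see the MMRA docs): | 
| //   fence syncscope("agent") release, !mmra !0 | 
| //   !0 = !{!"amdgpu-synchronize-as", !"local"} | 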
 |  | 
 | void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) { | 
 |   const MachineFunction *MF = MI.getMF(); | 
 |   const Function &Fn = MF->getFunction(); | 
 |   SmallString<128> Str; | 
 |   raw_svector_ostream OS(Str); | 
 |   OS << "unknown address space '" << AS << "'; expected one of "; | 
 |   ListSeparator LS; | 
 |   for (const auto &[Name, Val] : ASNames) | 
 |     OS << LS << '\'' << Name << '\''; | 
 |   Fn.getContext().diagnose( | 
 |       DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning)); | 
 | } | 
 |  | 
 | /// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA. | 
| /// If this tag isn't present, or if it has no meaningful values, returns | 
| /// std::nullopt; otherwise returns the address spaces specified by the | 
| /// metadata. | 
 | static std::optional<SIAtomicAddrSpace> | 
 | getSynchronizeAddrSpaceMD(const MachineInstr &MI) { | 
 |   static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as"; | 
 |  | 
 |   auto MMRA = MMRAMetadata(MI.getMMRAMetadata()); | 
 |   if (!MMRA) | 
 |     return std::nullopt; | 
 |  | 
 |   SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE; | 
 |   for (const auto &[Prefix, Suffix] : MMRA) { | 
 |     if (Prefix != FenceASPrefix) | 
 |       continue; | 
 |  | 
 |     if (auto It = ASNames.find(Suffix); It != ASNames.end()) | 
 |       Result |= It->second; | 
 |     else | 
 |       diagnoseUnknownMMRAASName(MI, Suffix); | 
 |   } | 
 |  | 
 |   if (Result == SIAtomicAddrSpace::NONE) | 
 |     return std::nullopt; | 
 |  | 
 |   return Result; | 
 | } | 
 |  | 
 | } // end anonymous namespace | 
 |  | 
 | void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, | 
 |                                       const char *Msg) const { | 
 |   const Function &Func = MI->getParent()->getParent()->getFunction(); | 
 |   Func.getContext().diagnose( | 
 |       DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc())); | 
 | } | 
 |  | 
 | std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> | 
 | SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, | 
 |                                SIAtomicAddrSpace InstrAddrSpace) const { | 
 |   if (SSID == SyncScope::System) | 
 |     return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true); | 
 |   if (SSID == MMI->getAgentSSID()) | 
 |     return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true); | 
 |   if (SSID == MMI->getClusterSSID()) | 
 |     return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true); | 
 |   if (SSID == MMI->getWorkgroupSSID()) | 
 |     return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC, | 
 |                       true); | 
 |   if (SSID == MMI->getWavefrontSSID()) | 
 |     return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC, | 
 |                       true); | 
 |   if (SSID == SyncScope::SingleThread) | 
 |     return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC, | 
 |                       true); | 
 |   if (SSID == MMI->getSystemOneAddressSpaceSSID()) | 
 |     return std::tuple(SIAtomicScope::SYSTEM, | 
 |                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); | 
 |   if (SSID == MMI->getAgentOneAddressSpaceSSID()) | 
 |     return std::tuple(SIAtomicScope::AGENT, | 
 |                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); | 
 |   if (SSID == MMI->getClusterOneAddressSpaceSSID()) | 
 |     return std::tuple(SIAtomicScope::CLUSTER, | 
 |                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); | 
 |   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) | 
 |     return std::tuple(SIAtomicScope::WORKGROUP, | 
 |                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); | 
 |   if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) | 
 |     return std::tuple(SIAtomicScope::WAVEFRONT, | 
 |                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); | 
 |   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) | 
 |     return std::tuple(SIAtomicScope::SINGLETHREAD, | 
 |                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); | 
 |   return std::nullopt; | 
 | } | 
 |  | 
 | SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { | 
 |   if (AS == AMDGPUAS::FLAT_ADDRESS) | 
 |     return SIAtomicAddrSpace::FLAT; | 
 |   if (AS == AMDGPUAS::GLOBAL_ADDRESS) | 
 |     return SIAtomicAddrSpace::GLOBAL; | 
 |   if (AS == AMDGPUAS::LOCAL_ADDRESS) | 
 |     return SIAtomicAddrSpace::LDS; | 
 |   if (AS == AMDGPUAS::PRIVATE_ADDRESS) | 
 |     return SIAtomicAddrSpace::SCRATCH; | 
 |   if (AS == AMDGPUAS::REGION_ADDRESS) | 
 |     return SIAtomicAddrSpace::GDS; | 
 |  | 
 |   return SIAtomicAddrSpace::OTHER; | 
 | } | 
 |  | 
 | SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_, | 
 |                              const GCNSubtarget &ST) | 
 |     : MMI(&MMI_), ST(ST) {} | 
 |  | 
 | std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( | 
 |     const MachineBasicBlock::iterator &MI) const { | 
 |   assert(MI->getNumMemOperands() > 0); | 
 |  | 
 |   SyncScope::ID SSID = SyncScope::SingleThread; | 
 |   AtomicOrdering Ordering = AtomicOrdering::NotAtomic; | 
 |   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; | 
 |   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; | 
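| // IsNonTemporal is AND-combined across memory operands below, so it starts | 
| // as true; the OR-combined flags start as false. | 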
 |   bool IsNonTemporal = true; | 
 |   bool IsVolatile = false; | 
 |   bool IsLastUse = false; | 
 |   bool IsCooperative = false; | 
 |  | 
 |   // Validator should check whether or not MMOs cover the entire set of | 
 |   // locations accessed by the memory instruction. | 
 |   for (const auto &MMO : MI->memoperands()) { | 
 |     IsNonTemporal &= MMO->isNonTemporal(); | 
 |     IsVolatile |= MMO->isVolatile(); | 
 |     IsLastUse |= MMO->getFlags() & MOLastUse; | 
 |     IsCooperative |= MMO->getFlags() & MOCooperative; | 
 |     InstrAddrSpace |= | 
 |       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); | 
 |     AtomicOrdering OpOrdering = MMO->getSuccessOrdering(); | 
 |     if (OpOrdering != AtomicOrdering::NotAtomic) { | 
 |       const auto &IsSyncScopeInclusion = | 
 |           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); | 
 |       if (!IsSyncScopeInclusion) { | 
 |         reportUnsupported(MI, | 
 |           "Unsupported non-inclusive atomic synchronization scope"); | 
 |         return std::nullopt; | 
 |       } | 
 |  | 
 |       SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID(); | 
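| // Keep the strongest ordering seen so far: getMergedAtomicOrdering returns | 
| // an ordering at least as strong as both of its inputs. | 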
 |       Ordering = getMergedAtomicOrdering(Ordering, OpOrdering); | 
 |       assert(MMO->getFailureOrdering() != AtomicOrdering::Release && | 
 |              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); | 
 |       FailureOrdering = | 
 |           getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering()); | 
 |     } | 
 |   } | 
 |  | 
 |   SIAtomicScope Scope = SIAtomicScope::NONE; | 
 |   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; | 
 |   bool IsCrossAddressSpaceOrdering = false; | 
 |   if (Ordering != AtomicOrdering::NotAtomic) { | 
 |     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace); | 
 |     if (!ScopeOrNone) { | 
 |       reportUnsupported(MI, "Unsupported atomic synchronization scope"); | 
 |       return std::nullopt; | 
 |     } | 
 |     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = | 
 |         *ScopeOrNone; | 
 |     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || | 
 |         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) || | 
 |         ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) { | 
 |       reportUnsupported(MI, "Unsupported atomic address space"); | 
 |       return std::nullopt; | 
 |     } | 
 |   } | 
 |   return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, | 
 |                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, | 
 |                      IsNonTemporal, IsLastUse, IsCooperative); | 
 | } | 
 |  | 
 | std::optional<SIMemOpInfo> | 
 | SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const { | 
 |   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); | 
 |  | 
 |   if (!(MI->mayLoad() && !MI->mayStore())) | 
 |     return std::nullopt; | 
 |  | 
 |   // Be conservative if there are no memory operands. | 
 |   if (MI->getNumMemOperands() == 0) | 
 |     return SIMemOpInfo(ST); | 
 |  | 
 |   return constructFromMIWithMMO(MI); | 
 | } | 
 |  | 
 | std::optional<SIMemOpInfo> | 
 | SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const { | 
 |   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); | 
 |  | 
 |   if (!(!MI->mayLoad() && MI->mayStore())) | 
 |     return std::nullopt; | 
 |  | 
 |   // Be conservative if there are no memory operands. | 
 |   if (MI->getNumMemOperands() == 0) | 
 |     return SIMemOpInfo(ST); | 
 |  | 
 |   return constructFromMIWithMMO(MI); | 
 | } | 
 |  | 
 | std::optional<SIMemOpInfo> | 
 | SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const { | 
 |   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); | 
 |  | 
 |   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) | 
 |     return std::nullopt; | 
 |  | 
 |   AtomicOrdering Ordering = | 
 |     static_cast<AtomicOrdering>(MI->getOperand(0).getImm()); | 
 |  | 
 |   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); | 
 |   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC); | 
 |   if (!ScopeOrNone) { | 
 |     reportUnsupported(MI, "Unsupported atomic synchronization scope"); | 
 |     return std::nullopt; | 
 |   } | 
 |  | 
 |   SIAtomicScope Scope = SIAtomicScope::NONE; | 
 |   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; | 
 |   bool IsCrossAddressSpaceOrdering = false; | 
 |   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = | 
 |       *ScopeOrNone; | 
 |  | 
 |   if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) { | 
 |     // We currently expect refineOrderingAS to be the only place that | 
 |     // can refine the AS ordered by the fence. | 
 |     // If that changes, we need to review the semantics of that function | 
 |     // in case it needs to preserve certain address spaces. | 
 |     reportUnsupported(MI, "Unsupported atomic address space"); | 
 |     return std::nullopt; | 
 |   } | 
 |  | 
 |   auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI); | 
 |   if (SynchronizeAS) | 
 |     OrderingAddrSpace = *SynchronizeAS; | 
 |  | 
 |   return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, | 
 |                      SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering, | 
 |                      AtomicOrdering::NotAtomic); | 
 | } | 
 |  | 
 | std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( | 
 |     const MachineBasicBlock::iterator &MI) const { | 
 |   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); | 
 |  | 
 |   if (!(MI->mayLoad() && MI->mayStore())) | 
 |     return std::nullopt; | 
 |  | 
 |   // Be conservative if there are no memory operands. | 
 |   if (MI->getNumMemOperands() == 0) | 
 |     return SIMemOpInfo(ST); | 
 |  | 
 |   return constructFromMIWithMMO(MI); | 
 | } | 
 |  | 
 | SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { | 
 |   TII = ST.getInstrInfo(); | 
 |   IV = getIsaVersion(ST.getCPU()); | 
 |   InsertCacheInv = !AmdgcnSkipCacheInvalidations; | 
 | } | 
 |  | 
 | bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, | 
 |                                     AMDGPU::CPol::CPol Bit) const { | 
 |   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol); | 
 |   if (!CPol) | 
 |     return false; | 
 |  | 
 |   CPol->setImm(CPol->getImm() | Bit); | 
 |   return true; | 
 | } | 
 |  | 
 | /* static */ | 
 | std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { | 
 |   GCNSubtarget::Generation Generation = ST.getGeneration(); | 
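| // Check feature sets before dispatching on the generation: GFX90A and | 
| // GFX940 targets are GFX9-generation and would otherwise fall into the | 
| // pre-GFX10 bucket below. | 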
 |   if (ST.hasGFX940Insts()) | 
 |     return std::make_unique<SIGfx940CacheControl>(ST); | 
 |   if (ST.hasGFX90AInsts()) | 
 |     return std::make_unique<SIGfx90ACacheControl>(ST); | 
 |   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) | 
 |     return std::make_unique<SIGfx6CacheControl>(ST); | 
 |   if (Generation < AMDGPUSubtarget::GFX10) | 
 |     return std::make_unique<SIGfx7CacheControl>(ST); | 
 |   if (Generation < AMDGPUSubtarget::GFX11) | 
 |     return std::make_unique<SIGfx10CacheControl>(ST); | 
 |   if (Generation < AMDGPUSubtarget::GFX12) | 
 |     return std::make_unique<SIGfx11CacheControl>(ST); | 
 |   return std::make_unique<SIGfx12CacheControl>(ST); | 
 | } | 
 |  | 
 | bool SIGfx6CacheControl::enableLoadCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, | 
 |     SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(MI->mayLoad() && !MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       // Set L1 cache policy to MISS_EVICT. | 
 |       // Note: there is no L2 cache bypass policy at the ISA level. | 
 |       Changed |= enableGLCBit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to bypass. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory caches | 
 |   /// to be bypassed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx6CacheControl::enableStoreCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, | 
 |     SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(!MI->mayLoad() && MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
| /// The L1 cache is write-through, so it does not need to be bypassed. There | 
| /// is no bypass control for the L2 cache at the ISA level. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx6CacheControl::enableRMWCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, | 
 |     SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(MI->mayLoad() && MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically | 
 |   /// bypassed, and the GLC bit is instead used to indicate if they are | 
 |   /// return or no-return. | 
 |   /// Note: there is no L2 cache coherent bypass control at the ISA level. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( | 
 |     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { | 
| // Only handle load and store, not atomic read-modify-write instructions. The | 
 |   // latter use glc to indicate if the atomic returns a result and so must not | 
 |   // be used for cache control. | 
 |   assert(MI->mayLoad() ^ MI->mayStore()); | 
 |  | 
| // Only update load and store, not LLVM IR atomic read-modify-write | 
| // instructions. The latter are always marked as volatile, and handling | 
| // them here would pessimize all atomics; they also do not support the | 
| // nontemporal attribute. | 
 |   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   if (IsVolatile) { | 
 |     // Set L1 cache policy to be MISS_EVICT for load instructions | 
 |     // and MISS_LRU for store instructions. | 
 |     // Note: there is no L2 cache bypass policy at the ISA level. | 
 |     if (Op == SIMemOp::LOAD) | 
 |       Changed |= enableGLCBit(MI); | 
 |  | 
 |     // Ensure operation has completed at system scope to cause all volatile | 
 |     // operations to be visible outside the program in a global order. Do not | 
 |     // request cross address space as only the global address space can be | 
 |     // observable outside the program, so no need to cause a waitcnt for LDS | 
 |     // address space operations. | 
 |     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, | 
 |                           Position::AFTER, AtomicOrdering::Unordered); | 
 |  | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   if (IsNonTemporal) { | 
 |     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT | 
 |     // for both loads and stores, and the L2 cache policy to STREAM. | 
 |     Changed |= enableGLCBit(MI); | 
 |     Changed |= enableSLCBit(MI); | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, | 
 |                                     SIAtomicScope Scope, | 
 |                                     SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                     bool IsCrossAddrSpaceOrdering, Position Pos, | 
 |                                     AtomicOrdering Order) const { | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   bool VMCnt = false; | 
 |   bool LGKMCnt = false; | 
 |  | 
 |   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != | 
 |       SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       VMCnt |= true; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // The L1 cache keeps all memory operations in order for | 
 |       // wavefronts in the same work-group. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is | 
 |       // not needed as LDS operations for all waves are executed in a total | 
 |       // global ordering as observed by all waves. Required if also | 
 |       // synchronizing with global/GDS memory as LDS operations could be | 
 |       // reordered with respect to later global/GDS memory operations of the | 
 |       // same wave. | 
 |       LGKMCnt |= IsCrossAddrSpaceOrdering; | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // The LDS keeps all memory operations in order for | 
 |       // the same wavefront. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
| // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)" | 
 |       // is not needed as GDS operations for all waves are executed in a total | 
 |       // global ordering as observed by all waves. Required if also | 
 |       // synchronizing with global/LDS memory as GDS operations could be | 
 |       // reordered with respect to later global/LDS memory operations of the | 
 |       // same wave. | 
 |       LGKMCnt |= IsCrossAddrSpaceOrdering; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // The GDS keeps all memory operations in order for | 
 |       // the same work-group. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
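| // E.g. if only VMCnt is set, the immediate below encodes vmcnt(0) with | 
| // expcnt/lgkmcnt at their "no wait" maxima, printed as "s_waitcnt vmcnt(0)". | 
| // The _soft variant may later be relaxed or removed by SIInsertWaitcnts. | 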
 |   if (VMCnt || LGKMCnt) { | 
 |     unsigned WaitCntImmediate = | 
 |       AMDGPU::encodeWaitcnt(IV, | 
 |                             VMCnt ? 0 : getVmcntBitMask(IV), | 
 |                             getExpcntBitMask(IV), | 
 |                             LGKMCnt ? 0 : getLgkmcntBitMask(IV)); | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) | 
 |         .addImm(WaitCntImmediate); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   // On architectures that support direct loads to LDS, emit an unknown waitcnt | 
 |   // at workgroup-scoped release operations that specify the LDS address space. | 
 |   // SIInsertWaitcnts will later replace this with a vmcnt(). | 
 |   if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && | 
 |       Scope == SIAtomicScope::WORKGROUP && | 
 |       (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                                        SIAtomicScope Scope, | 
 |                                        SIAtomicAddrSpace AddrSpace, | 
 |                                        Position Pos) const { | 
 |   if (!InsertCacheInv) | 
 |     return false; | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1)); | 
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to invalidate. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory cache | 
 |   /// to be flushed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, | 
 |                                        SIAtomicScope Scope, | 
 |                                        SIAtomicAddrSpace AddrSpace, | 
 |                                        bool IsCrossAddrSpaceOrdering, | 
 |                                        Position Pos) const { | 
 |   return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, | 
 |                     IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); | 
 | } | 
 |  | 
 | bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                                        SIAtomicScope Scope, | 
 |                                        SIAtomicAddrSpace AddrSpace, | 
 |                                        Position Pos) const { | 
 |   if (!InsertCacheInv) | 
 |     return false; | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>(); | 
 |  | 
 |   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() | 
 |                                     ? AMDGPU::BUFFER_WBINVL1 | 
 |                                     : AMDGPU::BUFFER_WBINVL1_VOL; | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); | 
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to invalidate. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory cache | 
 |   /// to be flushed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx90ACacheControl::enableLoadCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, | 
 |     SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(MI->mayLoad() && !MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       // Set the L1 cache policy to MISS_LRU. | 
 |       // Note: there is no L2 cache bypass policy at the ISA level. | 
 |       Changed |= enableGLCBit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // In threadgroup split mode the waves of a work-group can be executing on | 
 |       // different CUs. Therefore need to bypass the L1 which is per CU. | 
 |       // Otherwise in non-threadgroup split mode all waves of a work-group are | 
 |       // on the same CU, and so the L1 does not need to be bypassed. | 
 |       if (ST.isTgSplitEnabled()) | 
 |         Changed |= enableGLCBit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to bypass. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory caches | 
 |   /// to be bypassed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx90ACacheControl::enableStoreCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, | 
 |     SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(!MI->mayLoad() && MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       /// Do not set glc for store atomic operations as they implicitly write | 
 |       /// through the L1 cache. | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to bypass. Store atomics implicitly write through the L1 | 
 |       // cache. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory caches | 
 |   /// to be bypassed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx90ACacheControl::enableRMWCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, | 
 |     SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(MI->mayLoad() && MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       /// Do not set glc for RMW atomic operations as they implicitly bypass | 
 |       /// the L1 cache, and the glc bit is instead used to indicate if they are | 
 |       /// return or no-return. | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to bypass. RMW atomics implicitly bypass the L1 cache. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( | 
 |     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { | 
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result, so the bit must
  // not be used for cache control.
 |   assert(MI->mayLoad() ^ MI->mayStore()); | 
 |  | 
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
 |   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   if (IsVolatile) { | 
 |     // Set L1 cache policy to be MISS_EVICT for load instructions | 
 |     // and MISS_LRU for store instructions. | 
 |     // Note: there is no L2 cache bypass policy at the ISA level. | 
 |     if (Op == SIMemOp::LOAD) | 
 |       Changed |= enableGLCBit(MI); | 
 |  | 
 |     // Ensure operation has completed at system scope to cause all volatile | 
 |     // operations to be visible outside the program in a global order. Do not | 
 |     // request cross address space as only the global address space can be | 
 |     // observable outside the program, so no need to cause a waitcnt for LDS | 
 |     // address space operations. | 
 |     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, | 
 |                           Position::AFTER, AtomicOrdering::Unordered); | 
 |  | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   if (IsNonTemporal) { | 
 |     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT | 
 |     // for both loads and stores, and the L2 cache policy to STREAM. | 
 |     Changed |= enableGLCBit(MI); | 
 |     Changed |= enableSLCBit(MI); | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI, | 
 |                                       SIAtomicScope Scope, | 
 |                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                       bool IsCrossAddrSpaceOrdering, | 
 |                                       Position Pos, | 
 |                                       AtomicOrdering Order) const { | 
 |   if (ST.isTgSplitEnabled()) { | 
 |     // In threadgroup split mode the waves of a work-group can be executing on | 
 |     // different CUs. Therefore need to wait for global or GDS memory operations | 
 |     // to complete to ensure they are visible to waves in the other CUs. | 
    // Otherwise in non-threadgroup split mode all waves of a work-group are on
    // the same CU, so there is no need to wait for global memory as all waves
    // in the work-group access the same L1, nor to wait for GDS as accesses
    // are ordered on a CU.
 |     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH | | 
 |                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) && | 
 |         (Scope == SIAtomicScope::WORKGROUP)) { | 
 |       // Same as GFX7 using agent scope. | 
 |       Scope = SIAtomicScope::AGENT; | 
 |     } | 
    // In threadgroup split mode LDS cannot be allocated, so there is no need
    // to wait for LDS memory operations.
 |     AddrSpace &= ~SIAtomicAddrSpace::LDS; | 
 |   } | 
 |   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, | 
 |                                         IsCrossAddrSpaceOrdering, Pos, Order); | 
 | } | 
 |  | 
 | bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                                          SIAtomicScope Scope, | 
 |                                          SIAtomicAddrSpace AddrSpace, | 
 |                                          Position Pos) const { | 
 |   if (!InsertCacheInv) | 
 |     return false; | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |       // Ensures that following loads will not see stale remote VMEM data or | 
 |       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and | 
 |       // CC will never be stale due to the local memory probes. | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); | 
 |       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the | 
 |       // hardware does not reorder memory operations by the same wave with | 
 |       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to | 
 |       // remove any cache lines of earlier writes by the same wave and ensures | 
 |       // later reads by the same wave will refetch the cache lines. | 
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::AGENT: | 
 |       // Same as GFX7. | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // In threadgroup split mode the waves of a work-group can be executing on | 
 |       // different CUs. Therefore need to invalidate the L1 which is per CU. | 
 |       // Otherwise in non-threadgroup split mode all waves of a work-group are | 
 |       // on the same CU, and so the L1 does not need to be invalidated. | 
 |       if (ST.isTgSplitEnabled()) { | 
 |         // Same as GFX7 using agent scope. | 
 |         Scope = SIAtomicScope::AGENT; | 
 |       } | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // Same as GFX7. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory cache | 
 |   /// to be flushed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, | 
 |                                          SIAtomicScope Scope, | 
 |                                          SIAtomicAddrSpace AddrSpace, | 
 |                                          bool IsCrossAddrSpaceOrdering, | 
 |                                          Position Pos) const { | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   const DebugLoc &DL = MI->getDebugLoc(); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the | 
 |       // hardware does not reorder memory operations by the same wave with | 
 |       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed | 
 |       // to initiate writeback of any dirty cache lines of earlier writes by the | 
 |       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the | 
 |       // writeback has completed. | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) | 
 |         // Set SC bits to indicate system scope. | 
 |         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); | 
      // This is followed by the same handling as GFX7, which will insert the
      // "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::AGENT: | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // Same as GFX7. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   Changed |= | 
 |       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, | 
 |                                         IsCrossAddrSpaceOrdering, Pos); | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx940CacheControl::enableLoadCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(MI->mayLoad() && !MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |       // Set SC bits to indicate system scope. | 
 |       Changed |= enableSC0Bit(MI); | 
 |       Changed |= enableSC1Bit(MI); | 
 |       break; | 
 |     case SIAtomicScope::AGENT: | 
 |       // Set SC bits to indicate agent scope. | 
 |       Changed |= enableSC1Bit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // In threadgroup split mode the waves of a work-group can be executing on | 
 |       // different CUs. Therefore need to bypass the L1 which is per CU. | 
 |       // Otherwise in non-threadgroup split mode all waves of a work-group are | 
 |       // on the same CU, and so the L1 does not need to be bypassed. Setting SC | 
 |       // bits to indicate work-group scope will do this automatically. | 
 |       Changed |= enableSC0Bit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // Leave SC bits unset to indicate wavefront scope. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory caches | 
 |   /// to be bypassed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx940CacheControl::enableStoreCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, | 
 |     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(!MI->mayLoad() && MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |       // Set SC bits to indicate system scope. | 
 |       Changed |= enableSC0Bit(MI); | 
 |       Changed |= enableSC1Bit(MI); | 
 |       break; | 
 |     case SIAtomicScope::AGENT: | 
 |       // Set SC bits to indicate agent scope. | 
 |       Changed |= enableSC1Bit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // Set SC bits to indicate workgroup scope. | 
 |       Changed |= enableSC0Bit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // Leave SC bits unset to indicate wavefront scope. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory caches | 
 |   /// to be bypassed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx940CacheControl::enableRMWCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(MI->mayLoad() && MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |       // Set SC1 bit to indicate system scope. | 
 |       Changed |= enableSC1Bit(MI); | 
 |       break; | 
 |     case SIAtomicScope::AGENT: | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // RMW atomic operations implicitly bypass the L1 cache and only use SC1 | 
 |       // to indicate system or agent scope. The SC0 bit is used to indicate if | 
 |       // they are return or no-return. Leave SC1 bit unset to indicate agent | 
 |       // scope. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal( | 
 |     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { | 
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result, so the bit must
  // not be used for cache control.
 |   assert(MI->mayLoad() ^ MI->mayStore()); | 
 |  | 
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
 |   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   if (IsVolatile) { | 
 |     // Set SC bits to indicate system scope. | 
 |     Changed |= enableSC0Bit(MI); | 
 |     Changed |= enableSC1Bit(MI); | 
 |  | 
 |     // Ensure operation has completed at system scope to cause all volatile | 
 |     // operations to be visible outside the program in a global order. Do not | 
 |     // request cross address space as only the global address space can be | 
 |     // observable outside the program, so no need to cause a waitcnt for LDS | 
 |     // address space operations. | 
 |     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, | 
 |                           Position::AFTER, AtomicOrdering::Unordered); | 
 |  | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   if (IsNonTemporal) { | 
 |     Changed |= enableNTBit(MI); | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                                          SIAtomicScope Scope, | 
 |                                          SIAtomicAddrSpace AddrSpace, | 
 |                                          Position Pos) const { | 
 |   if (!InsertCacheInv) | 
 |     return false; | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |       // Ensures that following loads will not see stale remote VMEM data or | 
 |       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and | 
 |       // CC will never be stale due to the local memory probes. | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) | 
 |           // Set SC bits to indicate system scope. | 
 |           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); | 
 |       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the | 
 |       // hardware does not reorder memory operations by the same wave with | 
 |       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to | 
 |       // remove any cache lines of earlier writes by the same wave and ensures | 
 |       // later reads by the same wave will refetch the cache lines. | 
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::AGENT: | 
      // Ensures that following loads will not see stale remote data or local
      // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
      // due to the memory probes.
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) | 
 |           // Set SC bits to indicate agent scope. | 
 |           .addImm(AMDGPU::CPol::SC1); | 
 |       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware | 
 |       // does not reorder memory operations with respect to preceeding buffer | 
 |       // invalidate. The invalidate is guaranteed to remove any cache lines of | 
 |       // earlier writes and ensures later writes will refetch the cache lines. | 
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // In threadgroup split mode the waves of a work-group can be executing on | 
 |       // different CUs. Therefore need to invalidate the L1 which is per CU. | 
 |       // Otherwise in non-threadgroup split mode all waves of a work-group are | 
 |       // on the same CU, and so the L1 does not need to be invalidated. | 
 |       if (ST.isTgSplitEnabled()) { | 
        // Ensures L1 is invalidated if in threadgroup split mode. In
        // non-threadgroup split mode it is a NOP, so there is no point
        // generating it when we know we are not in that mode.
 |         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) | 
 |             // Set SC bits to indicate work-group scope. | 
 |             .addImm(AMDGPU::CPol::SC0); | 
 |         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware | 
 |         // does not reorder memory operations with respect to preceeding buffer | 
 |         // invalidate. The invalidate is guaranteed to remove any cache lines of | 
 |         // earlier writes and ensures later writes will refetch the cache lines. | 
 |         Changed = true; | 
 |       } | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // Could generate "BUFFER_INV" but it would do nothing as there are no | 
 |       // caches to invalidate. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory cache | 
 |   /// to be flushed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, | 
 |                                          SIAtomicScope Scope, | 
 |                                          SIAtomicAddrSpace AddrSpace, | 
 |                                          bool IsCrossAddrSpaceOrdering, | 
 |                                          Position Pos) const { | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the | 
 |       // hardware does not reorder memory operations by the same wave with | 
 |       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed | 
 |       // to initiate writeback of any dirty cache lines of earlier writes by the | 
 |       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the | 
 |       // writeback has completed. | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) | 
 |           // Set SC bits to indicate system scope. | 
 |           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); | 
 |       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is | 
 |       // SIAtomicScope::SYSTEM, the following insertWait will generate the | 
 |       // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2". | 
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::AGENT: | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) | 
 |           // Set SC bits to indicate agent scope. | 
 |           .addImm(AMDGPU::CPol::SC1); | 
 |  | 
 |       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is | 
 |       // SIAtomicScope::AGENT, the following insertWait will generate the | 
 |       // required "S_WAITCNT vmcnt(0)". | 
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // Do not generate "BUFFER_WBL2" as there are no caches it would | 
 |       // writeback, and would require an otherwise unnecessary | 
 |       // "S_WAITCNT vmcnt(0)". | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other | 
 |   // S_WAITCNT needed. | 
 |   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, | 
 |                         IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx10CacheControl::enableLoadCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, | 
 |     SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(MI->mayLoad() && !MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       // Set the L0 and L1 cache policies to MISS_EVICT. | 
 |       // Note: there is no L2 cache coherent bypass control at the ISA level. | 
 |       Changed |= enableGLCBit(MI); | 
 |       Changed |= enableDLCBit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // In WGP mode the waves of a work-group can be executing on either CU of | 
 |       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in | 
 |       // CU mode all waves of a work-group are on the same CU, and so the L0 | 
 |       // does not need to be bypassed. | 
 |       if (!ST.isCuModeEnabled()) | 
 |         Changed |= enableGLCBit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to bypass. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory caches | 
 |   /// to be bypassed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( | 
 |     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { | 
 |  | 
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result, so the bit must
  // not be used for cache control.
 |   assert(MI->mayLoad() ^ MI->mayStore()); | 
 |  | 
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
 |   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   if (IsVolatile) { | 
 |     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions | 
 |     // and MISS_LRU for store instructions. | 
 |     // Note: there is no L2 cache coherent bypass control at the ISA level. | 
 |     if (Op == SIMemOp::LOAD) { | 
 |       Changed |= enableGLCBit(MI); | 
 |       Changed |= enableDLCBit(MI); | 
 |     } | 
 |  | 
 |     // Ensure operation has completed at system scope to cause all volatile | 
 |     // operations to be visible outside the program in a global order. Do not | 
 |     // request cross address space as only the global address space can be | 
 |     // observable outside the program, so no need to cause a waitcnt for LDS | 
 |     // address space operations. | 
 |     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, | 
 |                           Position::AFTER, AtomicOrdering::Unordered); | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   if (IsNonTemporal) { | 
 |     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT | 
 |     // and L2 cache policy to STREAM. | 
 |     // For stores setting both GLC and SLC configures L0 and L1 cache policy | 
 |     // to MISS_EVICT and the L2 cache policy to STREAM. | 
 |     if (Op == SIMemOp::STORE) | 
 |       Changed |= enableGLCBit(MI); | 
 |     Changed |= enableSLCBit(MI); | 
 |  | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, | 
 |                                      SIAtomicScope Scope, | 
 |                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                      bool IsCrossAddrSpaceOrdering, | 
 |                                      Position Pos, AtomicOrdering Order) const { | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   bool VMCnt = false; | 
 |   bool VSCnt = false; | 
 |   bool LGKMCnt = false; | 
 |  | 
 |   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != | 
 |       SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) | 
 |         VMCnt |= true; | 
 |       if ((Op & SIMemOp::STORE) != SIMemOp::NONE) | 
 |         VSCnt |= true; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU
      // which shares the same L0.
 |       if (!ST.isCuModeEnabled()) { | 
 |         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) | 
 |           VMCnt |= true; | 
 |         if ((Op & SIMemOp::STORE) != SIMemOp::NONE) | 
 |           VSCnt |= true; | 
 |       } | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // The L0 cache keeps all memory operations in order for | 
 |       // work-items in the same wavefront. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is | 
 |       // not needed as LDS operations for all waves are executed in a total | 
 |       // global ordering as observed by all waves. Required if also | 
 |       // synchronizing with global/GDS memory as LDS operations could be | 
 |       // reordered with respect to later global/GDS memory operations of the | 
 |       // same wave. | 
 |       LGKMCnt |= IsCrossAddrSpaceOrdering; | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // The LDS keeps all memory operations in order for | 
 |       // the same wavefront. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
 |       // is not needed as GDS operations for all waves are executed in a total | 
 |       // global ordering as observed by all waves. Required if also | 
 |       // synchronizing with global/LDS memory as GDS operations could be | 
 |       // reordered with respect to later global/LDS memory operations of the | 
 |       // same wave. | 
 |       LGKMCnt |= IsCrossAddrSpaceOrdering; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // The GDS keeps all memory operations in order for | 
 |       // the same work-group. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if (VMCnt || LGKMCnt) { | 
 |     unsigned WaitCntImmediate = | 
 |       AMDGPU::encodeWaitcnt(IV, | 
 |                             VMCnt ? 0 : getVmcntBitMask(IV), | 
 |                             getExpcntBitMask(IV), | 
 |                             LGKMCnt ? 0 : getLgkmcntBitMask(IV)); | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) | 
 |         .addImm(WaitCntImmediate); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   // On architectures that support direct loads to LDS, emit an unknown waitcnt | 
 |   // at workgroup-scoped release operations that specify the LDS address space. | 
 |   // SIInsertWaitcnts will later replace this with a vmcnt(). | 
 |   if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && | 
 |       Scope == SIAtomicScope::WORKGROUP && | 
 |       (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   if (VSCnt) { | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) | 
 |         .addReg(AMDGPU::SGPR_NULL, RegState::Undef) | 
 |         .addImm(0); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                                         SIAtomicScope Scope, | 
 |                                         SIAtomicAddrSpace AddrSpace, | 
 |                                         Position Pos) const { | 
 |   if (!InsertCacheInv) | 
 |     return false; | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
      // The order of invalidates matters here. We must invalidate "outer in",
      // so L1 -> L0, to avoid L0 pulling in stale data from L1 when it is
      // invalidated.
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV)); | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); | 
 |       Changed = true; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU, and
      // so the L0 does not need to be invalidated.
 |       if (!ST.isCuModeEnabled()) { | 
 |         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); | 
 |         Changed = true; | 
 |       } | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to invalidate. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory cache | 
 |   /// to be flushed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx10CacheControl::insertBarrierStart( | 
 |     MachineBasicBlock::iterator &MI) const { | 
 |   // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU | 
 |   // mode. This is because a CU mode release fence does not emit any wait, which | 
 |   // is fine when only dealing with vmem, but isn't sufficient in the presence | 
 |   // of barriers which do not go through vmem. | 
 |   // GFX12.5 does not require this additional wait. | 
 |   if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) | 
 |     return false; | 
 |  | 
 |   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), | 
 |           TII->get(AMDGPU::S_WAITCNT_DEPCTR)) | 
 |       .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); | 
 |   return true; | 
 | } | 
 |  | 
 | bool SIGfx11CacheControl::enableLoadCacheBypass( | 
 |     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, | 
 |     SIAtomicAddrSpace AddrSpace) const { | 
 |   assert(MI->mayLoad() && !MI->mayStore()); | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |       // Set the L0 and L1 cache policies to MISS_EVICT. | 
 |       // Note: there is no L2 cache coherent bypass control at the ISA level. | 
 |       Changed |= enableGLCBit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // In WGP mode the waves of a work-group can be executing on either CU of | 
 |       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in | 
 |       // CU mode all waves of a work-group are on the same CU, and so the L0 | 
 |       // does not need to be bypassed. | 
 |       if (!ST.isCuModeEnabled()) | 
 |         Changed |= enableGLCBit(MI); | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to bypass. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   /// The scratch address space does not need the global memory caches | 
 |   /// to be bypassed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( | 
 |     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { | 
 |  | 
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result, so the bit must
  // not be used for cache control.
 |   assert(MI->mayLoad() ^ MI->mayStore()); | 
 |  | 
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
 |   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   if (IsVolatile) { | 
 |     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions | 
 |     // and MISS_LRU for store instructions. | 
 |     // Note: there is no L2 cache coherent bypass control at the ISA level. | 
 |     if (Op == SIMemOp::LOAD) | 
 |       Changed |= enableGLCBit(MI); | 
 |  | 
 |     // Set MALL NOALLOC for load and store instructions. | 
 |     Changed |= enableDLCBit(MI); | 
 |  | 
 |     // Ensure operation has completed at system scope to cause all volatile | 
 |     // operations to be visible outside the program in a global order. Do not | 
 |     // request cross address space as only the global address space can be | 
 |     // observable outside the program, so no need to cause a waitcnt for LDS | 
 |     // address space operations. | 
 |     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, | 
 |                           Position::AFTER, AtomicOrdering::Unordered); | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   if (IsNonTemporal) { | 
 |     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT | 
 |     // and L2 cache policy to STREAM. | 
 |     // For stores setting both GLC and SLC configures L0 and L1 cache policy | 
 |     // to MISS_EVICT and the L2 cache policy to STREAM. | 
 |     if (Op == SIMemOp::STORE) | 
 |       Changed |= enableGLCBit(MI); | 
 |     Changed |= enableSLCBit(MI); | 
 |  | 
 |     // Set MALL NOALLOC for load and store instructions. | 
 |     Changed |= enableDLCBit(MI); | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, | 
 |                                 AMDGPU::CPol::CPol Value) const { | 
 |   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); | 
 |   if (!CPol) | 
 |     return false; | 
 |  | 
 |   uint64_t NewTH = Value & AMDGPU::CPol::TH; | 
 |   if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) { | 
 |     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH); | 
 |     return true; | 
 |   } | 
 |  | 
 |   return false; | 
 | } | 
 |  | 
 | bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI, | 
 |                                    AMDGPU::CPol::CPol Value) const { | 
 |   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); | 
 |   if (!CPol) | 
 |     return false; | 
 |  | 
 |   uint64_t NewScope = Value & AMDGPU::CPol::SCOPE; | 
 |   if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) { | 
 |     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope); | 
 |     return true; | 
 |   } | 
 |  | 
 |   return false; | 
 | } | 
 |  | 
 | bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore( | 
 |     const MachineBasicBlock::iterator MI) const { | 
 |   // TODO: implement flag for frontend to give us a hint not to insert waits. | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   const DebugLoc &DL = MI->getDebugLoc(); | 
 |  | 
 |   BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0); | 
 |   if (ST.hasImageInsts()) { | 
 |     BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0); | 
 |     BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0); | 
 |   } | 
 |   BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0); | 
 |   BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0); | 
 |  | 
 |   return true; | 
 | } | 
 |  | 
 | bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, | 
 |                                      SIAtomicScope Scope, | 
 |                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |                                      bool IsCrossAddrSpaceOrdering, | 
 |                                      Position Pos, AtomicOrdering Order) const { | 
 |   bool Changed = false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   bool LOADCnt = false; | 
 |   bool DSCnt = false; | 
 |   bool STORECnt = false; | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != | 
 |       SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |     case SIAtomicScope::CLUSTER: | 
 |       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) | 
 |         LOADCnt |= true; | 
 |       if ((Op & SIMemOp::STORE) != SIMemOp::NONE) | 
 |         STORECnt |= true; | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // GFX12.0: | 
 |       //   In WGP mode the waves of a work-group can be executing on either CU | 
 |       //   of the WGP. Therefore need to wait for operations to complete to | 
 |       //   ensure they are visible to waves in the other CU as the L0 is per CU. | 
      //   Otherwise in CU mode all waves of a work-group are on the same CU
      //   which shares the same L0.
 |       // | 
 |       // GFX12.5: | 
 |       //   TODO DOCS | 
 |       if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) { | 
 |         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) | 
 |           LOADCnt |= true; | 
 |         if ((Op & SIMemOp::STORE) != SIMemOp::NONE) | 
 |           STORECnt |= true; | 
 |       } | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // The L0 cache keeps all memory operations in order for | 
 |       // work-items in the same wavefront. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |     case SIAtomicScope::AGENT: | 
 |     case SIAtomicScope::CLUSTER: | 
 |     case SIAtomicScope::WORKGROUP: | 
 |       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is | 
 |       // not needed as LDS operations for all waves are executed in a total | 
 |       // global ordering as observed by all waves. Required if also | 
 |       // synchronizing with global/GDS memory as LDS operations could be | 
 |       // reordered with respect to later global/GDS memory operations of the | 
 |       // same wave. | 
 |       DSCnt |= IsCrossAddrSpaceOrdering; | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // The LDS keeps all memory operations in order for | 
 |       // the same wavefront. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   if (LOADCnt) { | 
 |     // Acquire sequences only need to wait on the previous atomic operation. | 
 |     // e.g. a typical sequence looks like | 
 |     //    atomic load | 
 |     //    (wait) | 
 |     //    global_inv | 
 |     // | 
 |     // We do not have BVH or SAMPLE atomics, so the atomic load is always going | 
 |     // to be tracked using loadcnt. | 
 |     // | 
 |     // This also applies to fences. Fences cannot pair with an instruction | 
 |     // tracked with bvh/samplecnt as we don't have any atomics that do that. | 
 |     if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) { | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); | 
 |     } | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   if (STORECnt) { | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   if (DSCnt) { | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   return Changed; | 
 | } | 
 |  | 
 | bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, | 
 |                                         SIAtomicScope Scope, | 
 |                                         SIAtomicAddrSpace AddrSpace, | 
 |                                         Position Pos) const { | 
 |   if (!InsertCacheInv) | 
 |     return false; | 
 |  | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   /// The scratch address space does not need the global memory cache | 
 |   /// to be flushed as all memory operations by the same thread are | 
 |   /// sequentially consistent, and no other thread can access scratch | 
 |   /// memory. | 
 |  | 
 |   /// Other address spaces do not have a cache. | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) | 
 |     return false; | 
 |  | 
 |   AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; | 
 |   switch (Scope) { | 
 |   case SIAtomicScope::SYSTEM: | 
 |     ScopeImm = AMDGPU::CPol::SCOPE_SYS; | 
 |     break; | 
 |   case SIAtomicScope::AGENT: | 
 |     ScopeImm = AMDGPU::CPol::SCOPE_DEV; | 
 |     break; | 
 |   case SIAtomicScope::CLUSTER: | 
 |     ScopeImm = AMDGPU::CPol::SCOPE_SE; | 
 |     break; | 
 |   case SIAtomicScope::WORKGROUP: | 
 |     // GFX12.0: | 
 |     //  In WGP mode the waves of a work-group can be executing on either CU of | 
 |     //  the WGP. Therefore we need to invalidate the L0 which is per CU. | 
 |     //  Otherwise in CU mode all waves of a work-group are on the same CU, and | 
 |     //  so the L0 does not need to be invalidated. | 
 |     // | 
 |     // GFX12.5 | 
 |     //   TODO DOCS | 
 |     if (ST.isCuModeEnabled()) | 
 |       return false; | 
 |  | 
 |     ScopeImm = AMDGPU::CPol::SCOPE_SE; | 
 |     break; | 
 |   case SIAtomicScope::WAVEFRONT: | 
 |   case SIAtomicScope::SINGLETHREAD: | 
 |     // No cache to invalidate. | 
 |     return false; | 
 |   default: | 
 |     llvm_unreachable("Unsupported synchronization scope"); | 
 |   } | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
 |   BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm); | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   return true; | 
 | } | 
 |  | 
 | bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, | 
 |                                         SIAtomicScope Scope, | 
 |                                         SIAtomicAddrSpace AddrSpace, | 
 |                                         bool IsCrossAddrSpaceOrdering, | 
 |                                         Position Pos) const { | 
 |   MachineBasicBlock &MBB = *MI->getParent(); | 
 |   DebugLoc DL = MI->getDebugLoc(); | 
 |  | 
 |   // The scratch address space does not need the global memory cache | 
 |   // writeback as all memory operations by the same thread are | 
 |   // sequentially consistent, and no other thread can access scratch | 
 |   // memory. | 
 |  | 
 |   // Other address spaces do not have a cache. | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) | 
 |     return false; | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     ++MI; | 
 |  | 
  // A "global_wb" is only necessary at system scope for GFX12.0; it is also
  // necessary at device scope for GFX12.5.
  //
  // Emitting it for lower scopes is a slow no-op, so we omit it for
  // performance.
 |   switch (Scope) { | 
 |   case SIAtomicScope::SYSTEM: | 
 |     BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) | 
 |         .addImm(AMDGPU::CPol::SCOPE_SYS); | 
 |     break; | 
 |   case SIAtomicScope::AGENT: | 
 |     // TODO DOCS | 
 |     if (ST.hasGFX1250Insts()) { | 
 |       BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) | 
 |           .addImm(AMDGPU::CPol::SCOPE_DEV); | 
 |     } | 
 |     break; | 
 |   case SIAtomicScope::CLUSTER: | 
 |   case SIAtomicScope::WORKGROUP: | 
 |     // No WB necessary, but we still have to wait. | 
 |     break; | 
 |   case SIAtomicScope::WAVEFRONT: | 
 |   case SIAtomicScope::SINGLETHREAD: | 
 |     // No WB or wait necessary here. | 
 |     return false; | 
 |   default: | 
 |     llvm_unreachable("Unsupported synchronization scope"); | 
 |   } | 
 |  | 
 |   if (Pos == Position::AFTER) | 
 |     --MI; | 
 |  | 
 |   // We always have to wait for previous memory operations (load/store) to | 
 |   // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), | 
 |   // we of course need to wait for that as well. | 
 |   insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, | 
 |              IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); | 
 |  | 
 |   return true; | 
 | } | 
 |  | 
 | bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( | 
 |     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, | 
 |     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { | 
 |  | 
 |   // Only handle load and store, not atomic read-modify-write instructions. | 
 |   assert(MI->mayLoad() ^ MI->mayStore()); | 
 |  | 
  // Only update loads and stores, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so we cannot
  // sensibly handle them here without pessimizing all atomics; they also do
  // not support the nontemporal attribute.
 |   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   if (IsLastUse) { | 
 |     // Set last-use hint. | 
 |     Changed |= setTH(MI, AMDGPU::CPol::TH_LU); | 
 |   } else if (IsNonTemporal) { | 
 |     // Set non-temporal hint for all cache levels. | 
 |     Changed |= setTH(MI, AMDGPU::CPol::TH_NT); | 
 |   } | 
 |  | 
 |   if (IsVolatile) { | 
 |     Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); | 
 |  | 
    // Ensure the operation has completed at system scope so that all volatile
    // operations are visible outside the program in a global order. Do not
    // request cross-address-space ordering, as only the global address space
    // is observable outside the program; there is no need to cause a waitcnt
    // for LDS address space operations.
 |     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, | 
 |                           Position::AFTER, AtomicOrdering::Unordered); | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
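
// Illustrative sketch for enableVolatileAndOrNonTemporal (assumed GFX12
// output): a volatile global load is forced to system scope and followed by
// a wait, e.g.
//   global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
//   s_wait_loadcnt 0x0
// while a nontemporal access only receives the th:TH_NT policy bit and a
// last-use load only th:TH_LU, with no extra wait.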
 |  | 
 | bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { | 
 |   assert(MI.mayStore() && "Not a Store inst"); | 
 |   const bool IsRMW = (MI.mayLoad() && MI.mayStore()); | 
 |   bool Changed = false; | 
 |  | 
  // GFX12.5 only: an xcnt wait is needed before FLAT and global atomic
  // stores/RMWs.
 |   if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) { | 
 |     MachineBasicBlock &MBB = *MI.getParent(); | 
 |     BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0); | 
 |     Changed = true; | 
 |   } | 
 |  | 
 |   // Remaining fixes do not apply to RMWs. | 
 |   if (IsRMW) | 
 |     return Changed; | 
 |  | 
 |   MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); | 
  if (!CPol) // Some VMEM operations have no cpol operand and are unaffected.
 |     return Changed; | 
 |   const unsigned Scope = CPol->getImm() & CPol::SCOPE; | 
 |  | 
 |   // GFX12.0 only: Extra waits needed before system scope stores. | 
 |   if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS) | 
 |     Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator()); | 
 |  | 
 |   return Changed; | 
 | } | 
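
// Illustrative sketch for finalizeStore (assuming the soft wait lowers to a
// plain xcnt wait; SIInsertWaitcnts may relax or strengthen it): on a
// subtarget requiring it, an atomic flat store is preceded by
//   s_wait_xcnt 0x0
//   flat_store_b32 ...
// and on GFX12.0 a non-atomic system-scope store additionally gets the waits
// from insertWaitsBeforeSystemScopeStore.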
 |  | 
 | bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const { | 
 |   if (!ST.hasGFX1250Insts()) | 
 |     return false; | 
 |  | 
 |   // Cooperative atomics need to be SCOPE_DEV or higher. | 
 |   MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); | 
 |   assert(CPol && "No CPol operand?"); | 
 |   const unsigned Scope = CPol->getImm() & CPol::SCOPE; | 
 |   if (Scope < CPol::SCOPE_DEV) | 
 |     return setScope(MI, CPol::SCOPE_DEV); | 
 |   return false; | 
 | } | 
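
// For example (sketch): a cooperative atomic whose cpol carries SCOPE_SE
// would be widened by handleCooperativeAtomic to SCOPE_DEV, while SCOPE_SYS
// is already wide enough and is left untouched.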
 |  | 
 | bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI, | 
 |                                          SIAtomicScope Scope, | 
 |                                          SIAtomicAddrSpace AddrSpace) const { | 
 |   bool Changed = false; | 
 |  | 
 |   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { | 
 |     switch (Scope) { | 
 |     case SIAtomicScope::SYSTEM: | 
 |       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); | 
 |       break; | 
 |     case SIAtomicScope::AGENT: | 
 |       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV); | 
 |       break; | 
 |     case SIAtomicScope::CLUSTER: | 
 |       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE); | 
 |       break; | 
 |     case SIAtomicScope::WORKGROUP: | 
      // In WGP mode, SCOPE_SE is needed as waves can execute on different
      // CUs that access different L0s.
 |       if (!ST.isCuModeEnabled()) | 
 |         Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE); | 
 |       break; | 
 |     case SIAtomicScope::WAVEFRONT: | 
 |     case SIAtomicScope::SINGLETHREAD: | 
 |       // No cache to bypass. | 
 |       break; | 
 |     default: | 
 |       llvm_unreachable("Unsupported synchronization scope"); | 
 |     } | 
 |   } | 
 |  | 
 |   // The scratch address space does not need the global memory caches | 
 |   // to be bypassed as all memory operations by the same thread are | 
 |   // sequentially consistent, and no other thread can access scratch | 
 |   // memory. | 
 |  | 
 |   // Other address spaces do not have a cache. | 
 |  | 
 |   return Changed; | 
 | } | 
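
// Summary of the scope mapping applied by setAtomicScope for global accesses
// (sketch; in CU mode, workgroup scope is left at the default):
//   SYSTEM    -> SCOPE_SYS
//   AGENT     -> SCOPE_DEV
//   CLUSTER   -> SCOPE_SE
//   WORKGROUP -> SCOPE_SE (WGP mode only)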
 |  | 
 | bool SIMemoryLegalizer::removeAtomicPseudoMIs() { | 
 |   if (AtomicPseudoMIs.empty()) | 
 |     return false; | 
 |  | 
 |   for (auto &MI : AtomicPseudoMIs) | 
 |     MI->eraseFromParent(); | 
 |  | 
 |   AtomicPseudoMIs.clear(); | 
 |   return true; | 
 | } | 
 |  | 
 | bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, | 
 |                                    MachineBasicBlock::iterator &MI) { | 
 |   assert(MI->mayLoad() && !MI->mayStore()); | 
 |  | 
 |   bool Changed = false; | 
 |  | 
 |   if (MOI.isAtomic()) { | 
 |     const AtomicOrdering Order = MOI.getOrdering(); | 
 |     if (Order == AtomicOrdering::Monotonic || | 
 |         Order == AtomicOrdering::Acquire || | 
 |         Order == AtomicOrdering::SequentiallyConsistent) { | 
 |       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(), | 
 |                                            MOI.getOrderingAddrSpace()); | 
 |     } | 
 |  | 
    // Handle cooperative atomics after the cache-bypass step, as doing so may
    // widen the scope of the instruction.
 |     if (MOI.isCooperative()) | 
 |       Changed |= CC->handleCooperativeAtomic(*MI); | 
 |  | 
 |     if (Order == AtomicOrdering::SequentiallyConsistent) | 
 |       Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), | 
 |                                 SIMemOp::LOAD | SIMemOp::STORE, | 
 |                                 MOI.getIsCrossAddressSpaceOrdering(), | 
 |                                 Position::BEFORE, Order); | 
 |  | 
 |     if (Order == AtomicOrdering::Acquire || | 
 |         Order == AtomicOrdering::SequentiallyConsistent) { | 
 |       Changed |= CC->insertWait( | 
 |           MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD, | 
 |           MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order); | 
 |       Changed |= CC->insertAcquire(MI, MOI.getScope(), | 
 |                                    MOI.getOrderingAddrSpace(), | 
 |                                    Position::AFTER); | 
 |     } | 
 |  | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   // Atomic instructions already bypass caches to the scope specified by the | 
 |   // SyncScope operand. Only non-atomic volatile and nontemporal/last-use | 
 |   // instructions need additional treatment. | 
 |   Changed |= CC->enableVolatileAndOrNonTemporal( | 
 |       MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(), | 
 |       MOI.isNonTemporal(), MOI.isLastUse()); | 
 |  | 
 |   return Changed; | 
 | } | 
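
// Illustrative end-to-end sketch for expandLoad (assumed GFX12 output, waits
// abbreviated): the IR
//   %v = load atomic i32, ptr addrspace(1) %p syncscope("agent") acquire, align 4
// becomes roughly
//   global_load_b32 v0, v[0:1], off scope:SCOPE_DEV
//   s_wait_loadcnt 0x0
//   global_inv scope:SCOPE_DEV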
 |  | 
 | bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, | 
 |                                     MachineBasicBlock::iterator &MI) { | 
 |   assert(!MI->mayLoad() && MI->mayStore()); | 
 |  | 
 |   bool Changed = false; | 
 |   // FIXME: Necessary hack because iterator can lose track of the store. | 
 |   MachineInstr &StoreMI = *MI; | 
 |  | 
 |   if (MOI.isAtomic()) { | 
 |     if (MOI.getOrdering() == AtomicOrdering::Monotonic || | 
 |         MOI.getOrdering() == AtomicOrdering::Release || | 
 |         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { | 
 |       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(), | 
 |                                             MOI.getOrderingAddrSpace()); | 
 |     } | 
 |  | 
    // Handle cooperative atomics after the cache-bypass step, as doing so may
    // widen the scope of the instruction.
 |     if (MOI.isCooperative()) | 
 |       Changed |= CC->handleCooperativeAtomic(*MI); | 
 |  | 
 |     if (MOI.getOrdering() == AtomicOrdering::Release || | 
 |         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) | 
 |       Changed |= CC->insertRelease(MI, MOI.getScope(), | 
 |                                    MOI.getOrderingAddrSpace(), | 
 |                                    MOI.getIsCrossAddressSpaceOrdering(), | 
 |                                    Position::BEFORE); | 
 |  | 
 |     Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true); | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   // Atomic instructions already bypass caches to the scope specified by the | 
 |   // SyncScope operand. Only non-atomic volatile and nontemporal instructions | 
 |   // need additional treatment. | 
 |   Changed |= CC->enableVolatileAndOrNonTemporal( | 
 |       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(), | 
 |       MOI.isNonTemporal()); | 
 |  | 
  // GFX12 specific: the scope (the desired coherence domain in the cache
  // hierarchy) is an instruction field; do not confuse it with the atomic
  // scope.
 |   Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false); | 
 |   return Changed; | 
 | } | 
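
// Illustrative end-to-end sketch for expandStore (assumed GFX12.0 output,
// waits abbreviated): the system-scope IR
//   store atomic i32 %v, ptr addrspace(1) %p release, align 4
// becomes roughly
//   global_wb scope:SCOPE_SYS
//   s_wait_loadcnt 0x0
//   s_wait_storecnt 0x0
//   global_store_b32 v[0:1], v2, off scope:SCOPE_SYS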
 |  | 
 | bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, | 
 |                                           MachineBasicBlock::iterator &MI) { | 
 |   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE); | 
 |  | 
 |   AtomicPseudoMIs.push_back(MI); | 
 |   bool Changed = false; | 
 |  | 
 |   const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace(); | 
 |  | 
 |   if (MOI.isAtomic()) { | 
 |     const AtomicOrdering Order = MOI.getOrdering(); | 
 |     if (Order == AtomicOrdering::Acquire) { | 
 |       Changed |= CC->insertWait( | 
 |           MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE, | 
 |           MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order); | 
 |     } | 
 |  | 
 |     if (Order == AtomicOrdering::Release || | 
 |         Order == AtomicOrdering::AcquireRelease || | 
 |         Order == AtomicOrdering::SequentiallyConsistent) | 
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Support for barriers could be added to this
      /// file; SIInsertWaitcnts.cpp could then stop unconditionally
      /// adding an S_WAITCNT before an S_BARRIER.
 |       Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace, | 
 |                                    MOI.getIsCrossAddressSpaceOrdering(), | 
 |                                    Position::BEFORE); | 
 |  | 
    // TODO: If both a release and an invalidate are happening, they could be
    // combined into a single "BUFFER_WBINV*" instruction. This could be done
    // by reorganizing this code, or as part of optimizing the SIInsertWaitcnts
    // pass to track cache invalidate and write back instructions.
 |  | 
 |     if (Order == AtomicOrdering::Acquire || | 
 |         Order == AtomicOrdering::AcquireRelease || | 
 |         Order == AtomicOrdering::SequentiallyConsistent) | 
 |       Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace, | 
 |                                    Position::BEFORE); | 
 |  | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
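
// Illustrative sketch for expandAtomicFence (assumed GFX12 output): a
// `fence syncscope("agent") seq_cst` is replaced by release waits followed
// by an invalidate, e.g.
//   s_wait_loadcnt 0x0
//   s_wait_storecnt 0x0
//   global_inv scope:SCOPE_DEV
// and the ATOMIC_FENCE pseudo itself is erased by removeAtomicPseudoMIs().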
 |  | 
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(
    const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI) {
 |   assert(MI->mayLoad() && MI->mayStore()); | 
 |  | 
 |   bool Changed = false; | 
 |   MachineInstr &RMWMI = *MI; | 
 |  | 
 |   if (MOI.isAtomic()) { | 
 |     const AtomicOrdering Order = MOI.getOrdering(); | 
 |     if (Order == AtomicOrdering::Monotonic || | 
 |         Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release || | 
 |         Order == AtomicOrdering::AcquireRelease || | 
 |         Order == AtomicOrdering::SequentiallyConsistent) { | 
 |       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(), | 
 |                                           MOI.getInstrAddrSpace()); | 
 |     } | 
 |  | 
 |     if (Order == AtomicOrdering::Release || | 
 |         Order == AtomicOrdering::AcquireRelease || | 
 |         Order == AtomicOrdering::SequentiallyConsistent || | 
 |         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) | 
 |       Changed |= CC->insertRelease(MI, MOI.getScope(), | 
 |                                    MOI.getOrderingAddrSpace(), | 
 |                                    MOI.getIsCrossAddressSpaceOrdering(), | 
 |                                    Position::BEFORE); | 
 |  | 
 |     if (Order == AtomicOrdering::Acquire || | 
 |         Order == AtomicOrdering::AcquireRelease || | 
 |         Order == AtomicOrdering::SequentiallyConsistent || | 
 |         MOI.getFailureOrdering() == AtomicOrdering::Acquire || | 
 |         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { | 
 |       Changed |= CC->insertWait( | 
 |           MI, MOI.getScope(), MOI.getInstrAddrSpace(), | 
 |           isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE, | 
 |           MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order); | 
 |       Changed |= CC->insertAcquire(MI, MOI.getScope(), | 
 |                                    MOI.getOrderingAddrSpace(), | 
 |                                    Position::AFTER); | 
 |     } | 
 |  | 
 |     Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true); | 
 |     return Changed; | 
 |   } | 
 |  | 
 |   return Changed; | 
 | } | 
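
// Illustrative sketch for expandAtomicCmpxchgOrRmw (assumed GFX12 output for
// a returning RMW, waits abbreviated): `atomicrmw add ... syncscope("agent")
// seq_cst` becomes roughly
//   <release waits, as in the expandStore sketch>
//   global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
//   s_wait_loadcnt 0x0
//   global_inv scope:SCOPE_DEV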
 |  | 
 | bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) { | 
 |   const MachineModuleInfo &MMI = | 
 |       getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); | 
 |   return SIMemoryLegalizer(MMI).run(MF); | 
 | } | 
 |  | 
 | PreservedAnalyses | 
 | SIMemoryLegalizerPass::run(MachineFunction &MF, | 
 |                            MachineFunctionAnalysisManager &MFAM) { | 
 |   auto *MMI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF) | 
 |                   .getCachedResult<MachineModuleAnalysis>( | 
 |                       *MF.getFunction().getParent()); | 
 |   assert(MMI && "MachineModuleAnalysis must be available"); | 
 |   if (!SIMemoryLegalizer(MMI->getMMI()).run(MF)) | 
 |     return PreservedAnalyses::all(); | 
 |   return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>(); | 
 | } | 
 |  | 
 | bool SIMemoryLegalizer::run(MachineFunction &MF) { | 
 |   bool Changed = false; | 
 |  | 
 |   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | 
 |   SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST); | 
 |   CC = SICacheControl::create(ST); | 
 |  | 
 |   for (auto &MBB : MF) { | 
 |     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { | 
 |  | 
 |       // Unbundle instructions after the post-RA scheduler. | 
 |       if (MI->isBundle() && MI->mayLoadOrStore()) { | 
 |         MachineBasicBlock::instr_iterator II(MI->getIterator()); | 
 |         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end(); | 
 |              I != E && I->isBundledWithPred(); ++I) { | 
 |           I->unbundleFromPred(); | 
 |           for (MachineOperand &MO : I->operands()) | 
 |             if (MO.isReg()) | 
 |               MO.setIsInternalRead(false); | 
 |         } | 
 |  | 
 |         MI->eraseFromParent(); | 
 |         MI = II->getIterator(); | 
 |       } | 
 |  | 
 |       if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) { | 
 |         Changed |= CC->insertBarrierStart(MI); | 
 |         continue; | 
 |       } | 
 |  | 
 |       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) | 
 |         continue; | 
 |  | 
      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(*MOI, MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(*MOI, MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(*MOI, MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
 |     } | 
 |   } | 
 |  | 
 |   Changed |= removeAtomicPseudoMIs(); | 
 |   return Changed; | 
 | } | 
 |  | 
 | INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false) | 
 |  | 
 | char SIMemoryLegalizerLegacy::ID = 0; | 
 | char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID; | 
 |  | 
 | FunctionPass *llvm::createSIMemoryLegalizerPass() { | 
 |   return new SIMemoryLegalizerLegacy(); | 
 | } |