[AMDGPU] Add two TSFlags: IsAtomicNoRtn and IsAtomicRtn

We are using AtomicNoRet map in multiple places to determine
if an instruction atomic, rtn or nortn atomic. This method
does not work always since we have some instructions which
only has rtn or nortn version.

One such instruction is ds_wrxchg_rtn_b32 which does not have
nortn version. This has caused changes in memory legalizer
tests.

Differential Revision: https://reviews.llvm.org/D96639

GitOrigin-RevId: 5cf9292ce341d2002f5d6e0189d54e29f9e71afe
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 5dc5481..4a67630 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -696,6 +696,7 @@
   let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
   let glc_value = 0;
   let dlc_value = 0;
+  let IsAtomicNoRet = 1;
   let AsmMatchConverter = "cvtMubufAtomic";
 }
 
@@ -714,6 +715,7 @@
   let PseudoInstr = opName # "_rtn_" # getAddrName<addrKindCopy>.ret;
   let glc_value = 1;
   let dlc_value = 0;
+  let IsAtomicRet = 1;
   let Constraints = "$vdata = $vdata_in";
   let DisableEncoding = "$vdata_in";
   let AsmMatchConverter = "cvtMubufAtomicReturn";
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index 328c810..9f71017 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -102,6 +102,7 @@
 
   let has_data1 = 0;
   let has_vdst = 0;
+  let IsAtomicNoRet = 1;
 }
 
 multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
@@ -121,6 +122,7 @@
   " $addr, $data0, $data1$offset$gds"> {
 
   let has_vdst = 0;
+  let IsAtomicNoRet = 1;
 }
 
 multiclass DS_1A2D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
@@ -161,6 +163,7 @@
 
   let hasPostISelHook = 1;
   let has_data1 = 0;
+  let IsAtomicRet = 1;
 }
 
 multiclass DS_1A1D_RET_mc <string opName, RegisterClass rc = VGPR_32,
@@ -184,6 +187,7 @@
   " $vdst, $addr, $data0, $data1$offset$gds"> {
 
   let hasPostISelHook = 1;
+  let IsAtomicRet = 1;
 }
 
 multiclass DS_1A2D_RET_mc<string opName,
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 57a355a..7a9c36e 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -345,6 +345,7 @@
     let dlcValue = 0;
     let has_vdst = 0;
     let maybeAtomic = 1;
+    let IsAtomicNoRet = 1;
 }
 
 class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins,
@@ -354,6 +355,8 @@
   let has_vdst = 1;
   let glcValue = 1;
   let dlcValue = 0;
+  let IsAtomicNoRet = 0;
+  let IsAtomicRet = 1;
   let PseudoInstr = NAME # "_RTN";
 }
 
diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td
index ce8ecd1..6769b10 100644
--- a/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/lib/Target/AMDGPU/MIMGInstructions.td
@@ -39,6 +39,7 @@
   bit Coordinates = 1;
   bit LodOrClampOrMip = 0;
   bit HasD16 = 0;
+  bit IsAtomicRet = 0;
 }
 
 def MIMGBaseOpcode : GenericEnum {
@@ -556,20 +557,22 @@
 }
 
 multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0> { // 64-bit atomics
-  def "" : MIMGBaseOpcode {
-    let Atomic = 1;
-    let AtomicX2 = isCmpSwap;
-  }
+  let IsAtomicRet = 1 in {
+    def "" : MIMGBaseOpcode {
+      let Atomic = 1;
+      let AtomicX2 = isCmpSwap;
+    }
 
-  let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
-    // _V* variants have different dst size, but the size is encoded implicitly,
-    // using dmask and tfe. Only 32-bit variant is registered with disassembler.
-    // Other variants are reconstructed by disassembler using dmask and tfe.
-    let VDataDwords = !if(isCmpSwap, 2, 1) in
-    defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1, isFP>;
-    let VDataDwords = !if(isCmpSwap, 4, 2) in
-    defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64), 0, isFP>;
-  }
+    let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
+      // _V* variants have different dst size, but the size is encoded implicitly,
+      // using dmask and tfe. Only 32-bit variant is registered with disassembler.
+      // Other variants are reconstructed by disassembler using dmask and tfe.
+      let VDataDwords = !if(isCmpSwap, 2, 1) in
+      defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1, isFP>;
+      let VDataDwords = !if(isCmpSwap, 4, 2) in
+      defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64), 0, isFP>;
+    }
+  } // End IsAtomicRet = 1
 }
 
 class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc,
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index c83802b..9b4e104 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -106,7 +106,13 @@
   IsDOT = UINT64_C(1) << 55,
 
   // FLAT instruction accesses FLAT_SCRATCH segment.
-  IsFlatScratch = UINT64_C(1) << 56
+  IsFlatScratch = UINT64_C(1) << 56,
+
+  // Atomic without return.
+  IsAtomicNoRet = UINT64_C(1) << 57,
+
+  // Atomic with return.
+  IsAtomicRet = UINT64_C(1) << 58
 };
 
 // v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
diff --git a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index 65e833e..b0ec5ac 100644
--- a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -112,8 +112,7 @@
     return false;
   if (!MI.mayLoad() || MI.mayStore())
     return false;
-  if (AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1 ||
-      AMDGPU::getAtomicRetOp(MI.getOpcode()) != -1)
+  if (SIInstrInfo::isAtomic(MI))
     return false;
   if (IsVMEMClause && !isVMEMClauseInst(MI))
     return false;
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index c127455..c5110f8 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -538,7 +538,7 @@
                                                  AMDGPU::OpName::data1),
                       CurrScore);
         }
-      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
+      } else if (SIInstrInfo::isAtomicRet(Inst) &&
                  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
@@ -560,7 +560,7 @@
             &Inst, TII, TRI, MRI,
             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
             CurrScore);
-      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+      } else if (SIInstrInfo::isAtomicRet(Inst)) {
         setExpScore(
             &Inst, TII, TRI, MRI,
             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
@@ -569,7 +569,7 @@
     } else if (TII->isMIMG(Inst)) {
       if (Inst.mayStore()) {
         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
-      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+      } else if (SIInstrInfo::isAtomicRet(Inst)) {
         setExpScore(
             &Inst, TII, TRI, MRI,
             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
@@ -582,7 +582,7 @@
     } else if (TII->isMUBUF(Inst)) {
       if (Inst.mayStore()) {
         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
-      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+      } else if (SIInstrInfo::isAtomicRet(Inst)) {
         setExpScore(
             &Inst, TII, TRI, MRI,
             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
@@ -1246,8 +1246,7 @@
       ++FlatASCount;
       if (!ST->hasVscnt())
         ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
-      else if (Inst.mayLoad() &&
-               AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1)
+      else if (Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst))
         ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
       else
         ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
@@ -1275,8 +1274,7 @@
              Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
     if (!ST->hasVscnt())
       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
-    else if ((Inst.mayLoad() &&
-              AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1) ||
+    else if ((Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst)) ||
              /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
              (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
@@ -1284,7 +1282,7 @@
       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
 
     if (ST->vmemWriteNeedsExpWaitcnt() &&
-        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
+        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
     }
   } else if (TII->isSMRD(Inst)) {
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index 7ce042b..92d148c 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -135,6 +135,12 @@
   // Must be 0 for non-FLAT instructions.
   field bit IsFlatScratch = 0;
 
+  // Atomic without a return.
+  field bit IsAtomicNoRet = 0;
+
+  // Atomic with return.
+  field bit IsAtomicRet = 0;
+
   // These need to be kept in sync with the enum in SIInstrFlags.
   let TSFlags{0} = SALU;
   let TSFlags{1} = VALU;
@@ -205,6 +211,10 @@
 
   let TSFlags{56} = IsFlatScratch;
 
+  let TSFlags{57} = IsAtomicNoRet;
+
+  let TSFlags{58} = IsAtomicRet;
+
   let SchedRW = [Write32Bit];
 
   let AsmVariantName = AMDGPUAsmVariants.Default;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index ce59fe8..9bcda2e 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -538,6 +538,32 @@
     return get(Opcode).TSFlags & SIInstrFlags::EXP;
   }
 
+  static bool isAtomicNoRet(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::IsAtomicNoRet;
+  }
+
+  bool isAtomicNoRet(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::IsAtomicNoRet;
+  }
+
+  static bool isAtomicRet(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::IsAtomicRet;
+  }
+
+  bool isAtomicRet(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::IsAtomicRet;
+  }
+
+  static bool isAtomic(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & (SIInstrFlags::IsAtomicRet |
+                                   SIInstrFlags::IsAtomicNoRet);
+  }
+
+  bool isAtomic(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & (SIInstrFlags::IsAtomicRet |
+                                  SIInstrFlags::IsAtomicNoRet);
+  }
+
   static bool isWQM(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::WQM;
   }
@@ -1166,9 +1192,6 @@
   int getMUBUFNoLdsInst(uint16_t Opcode);
 
   LLVM_READONLY
-  int getAtomicRetOp(uint16_t Opcode);
-
-  LLVM_READONLY
   int getAtomicNoRetOp(uint16_t Opcode);
 
   LLVM_READONLY
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 5adc9e8..8b557cc 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2408,15 +2408,6 @@
   let ValueCols = [["0"]];
 }
 
-// Maps an atomic opcode to its version with a return value.
-def getAtomicRetOp : InstrMapping {
-  let FilterClass = "AtomicNoRet";
-  let RowFields = ["NoRetOp"];
-  let ColFields = ["IsRet"];
-  let KeyCol = ["0"];
-  let ValueCols = [["1"]];
-}
-
 // Maps an atomic opcode to its returnless version.
 def getAtomicNoRetOp : InstrMapping {
   let FilterClass = "AtomicNoRet";
diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 40173bc..a144fc6 100644
--- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -455,7 +455,7 @@
   /// Return true iff instruction \p MI is a atomic instruction that
   /// returns a result.
   bool isAtomicRet(const MachineInstr &MI) const {
-    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
+    return SIInstrInfo::isAtomicRet(MI);
   }
 
   /// Removes all processed atomic pseudo instructions from the current
diff --git a/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index e928b4f..37f2b66 100644
--- a/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -595,8 +595,7 @@
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
@@ -722,8 +721,7 @@
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
@@ -790,8 +788,7 @@
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
@@ -856,8 +853,7 @@
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
@@ -928,8 +924,7 @@
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
@@ -1002,8 +997,7 @@
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
diff --git a/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index 5a9bb11..86e73d9e 100644
--- a/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -595,8 +595,7 @@
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
@@ -722,8 +721,7 @@
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
@@ -790,8 +788,7 @@
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
@@ -856,8 +853,7 @@
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
@@ -928,8 +924,7 @@
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
@@ -1002,8 +997,7 @@
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
diff --git a/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index deca37d..af87460 100644
--- a/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -595,8 +595,7 @@
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
@@ -722,8 +721,7 @@
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
@@ -790,8 +788,7 @@
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
@@ -856,8 +853,7 @@
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
@@ -928,8 +924,7 @@
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
@@ -1002,8 +997,7 @@
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm