[AMDGPU] Fix DPP combiner

Differential revision: https://reviews.llvm.org/D55444

dpp move with uses and old reg initializer should be in the same BB.
bound_ctrl:0 is only considered when bank_mask and row_mask are fully enabled (0xF). Otherwise the old register value is checked for identity.
Added add, subrev, and, or instructions to the old folding function.
Kill flag is cleared for the src0 (DPP register) as it may be copied into more than one user.

The pass is still disabled by default.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@353513 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/include/llvm/CodeGen/TargetInstrInfo.h b/include/llvm/CodeGen/TargetInstrInfo.h
index e6418fb..b732be6 100644
--- a/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/include/llvm/CodeGen/TargetInstrInfo.h
@@ -428,6 +428,13 @@
 
     RegSubRegPair(unsigned Reg = 0, unsigned SubReg = 0)
         : Reg(Reg), SubReg(SubReg) {}
+
+    bool operator==(const RegSubRegPair& P) const {
+      return Reg == P.Reg && SubReg == P.SubReg;
+    }
+    bool operator!=(const RegSubRegPair& P) const {
+      return !(*this == P);
+    }
   };
 
   /// A pair composed of a pair of a register and a sub-register index,
diff --git a/lib/Target/AMDGPU/GCNDPPCombine.cpp b/lib/Target/AMDGPU/GCNDPPCombine.cpp
index a77bb8a..430cf48 100644
--- a/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -6,31 +6,35 @@
 //
 //===----------------------------------------------------------------------===//
 // The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
-// operand.If any of the use instruction cannot be combined with the mov the
+// operand. If any of the use instructions cannot be combined with the mov, the
 // whole sequence is reverted.
 //
 // $old = ...
 // $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
-//                            dpp_controls..., $bound_ctrl
-// $res = VALU $dpp_value, ...
+//                            dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
+// $res = VALU $dpp_value [, src1]
 //
 // to
 //
-// $res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ...,
-//                 dpp_controls..., $folded_bound_ctrl
+// $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
+//                 dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
 //
 // Combining rules :
 //
-// $bound_ctrl is DPP_BOUND_ZERO, $old is any
-// $bound_ctrl is DPP_BOUND_OFF, $old is 0
+// if $row_mask and $bank_mask are fully enabled (0xF) and
+//    $bound_ctrl==DPP_BOUND_ZERO or $old==0
+// -> $combined_old = undef,
+//    $combined_bound_ctrl = DPP_BOUND_ZERO
 //
-// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_ZERO
-// $bound_ctrl is DPP_BOUND_OFF, $old is undef
+// if the VALU op is binary and
+//    $bound_ctrl==DPP_BOUND_OFF and
+//    $old==identity value (immediate) for the VALU op
+// -> $combined_old = src1,
+//    $combined_bound_ctrl = DPP_BOUND_OFF
 //
-// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_OFF
-// $bound_ctrl is DPP_BOUND_OFF, $old is foldable
+// Otherwise cancel.
 //
-// ->$folded_old = folded value, $folded_bound_ctrl = DPP_BOUND_OFF
+// The mov_dpp instruction should reside in the same BB as all its uses
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
@@ -66,20 +70,16 @@
 
   MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;
 
-  RegSubRegPair foldOldOpnd(MachineInstr &OrigMI,
-                            RegSubRegPair OldOpndVGPR,
-                            MachineOperand &OldOpndValue) const;
-
   MachineInstr *createDPPInst(MachineInstr &OrigMI,
                               MachineInstr &MovMI,
-                              RegSubRegPair OldOpndVGPR,
+                              RegSubRegPair CombOldVGPR,
                               MachineOperand *OldOpnd,
-                              bool BoundCtrlZero) const;
+                              bool CombBCZ) const;
 
   MachineInstr *createDPPInst(MachineInstr &OrigMI,
                               MachineInstr &MovMI,
-                              RegSubRegPair OldOpndVGPR,
-                              bool BoundCtrlZero) const;
+                              RegSubRegPair CombOldVGPR,
+                              bool CombBCZ) const;
 
   bool hasNoImmOrEqual(MachineInstr &MI,
                        unsigned OpndName,
@@ -152,8 +152,8 @@
 
 MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                            MachineInstr &MovMI,
-                                           RegSubRegPair OldOpndVGPR,
-                                           bool BoundCtrlZero) const {
+                                           RegSubRegPair CombOldVGPR,
+                                           bool CombBCZ) const {
   assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
   assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
          TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());
@@ -177,9 +177,15 @@
     const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
     if (OldIdx != -1) {
       assert(OldIdx == NumOperands);
-      assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI));
-      DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg);
+      assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI));
+      DPPInst.addReg(CombOldVGPR.Reg, 0, CombOldVGPR.SubReg);
       ++NumOperands;
+    } else {
+      // TODO: this discards MAC/FMA instructions for now, let's add it later
+      LLVM_DEBUG(dbgs() << "  failed: no old operand in DPP instruction,"
+                           " TBD\n");
+      Fail = true;
+      break;
     }
 
     if (auto *Mod0 = TII->getNamedOperand(OrigMI,
@@ -198,6 +204,7 @@
       break;
     }
     DPPInst.add(*Src0);
+    DPPInst->getOperand(NumOperands).setIsKill(false);
     ++NumOperands;
 
     if (auto *Mod1 = TII->getNamedOperand(OrigMI,
@@ -230,7 +237,7 @@
     DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
     DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
     DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
-    DPPInst.addImm(BoundCtrlZero ? 1 : 0);
+    DPPInst.addImm(CombBCZ ? 1 : 0);
   } while (false);
 
   if (Fail) {
@@ -241,64 +248,68 @@
   return DPPInst.getInstr();
 }
 
-GCNDPPCombine::RegSubRegPair
-GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI,
-                           RegSubRegPair OldOpndVGPR,
-                           MachineOperand &OldOpndValue) const {
-  assert(OldOpndValue.isImm());
-  switch (OrigMI.getOpcode()) {
+static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
+  assert(OldOpnd->isImm());
+  switch (OrigMIOp) {
   default: break;
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_I32_e32:
+  case AMDGPU::V_OR_B32_e32:
+  case AMDGPU::V_SUBREV_U32_e32:
+  case AMDGPU::V_SUBREV_I32_e32:
   case AMDGPU::V_MAX_U32_e32:
-    if (OldOpndValue.getImm() == std::numeric_limits<uint32_t>::max())
-      return OldOpndVGPR;
+  case AMDGPU::V_XOR_B32_e32:
+    if (OldOpnd->getImm() == 0)
+      return true;
     break;
-  case AMDGPU::V_MAX_I32_e32:
-    if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::max())
-      return OldOpndVGPR;
+  case AMDGPU::V_AND_B32_e32:
+  case AMDGPU::V_MIN_U32_e32:
+    if (static_cast<uint32_t>(OldOpnd->getImm()) ==
+        std::numeric_limits<uint32_t>::max())
+      return true;
     break;
   case AMDGPU::V_MIN_I32_e32:
-    if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::min())
-      return OldOpndVGPR;
+    if (static_cast<int32_t>(OldOpnd->getImm()) ==
+        std::numeric_limits<int32_t>::max())
+      return true;
     break;
-
+  case AMDGPU::V_MAX_I32_e32:
+    if (static_cast<int32_t>(OldOpnd->getImm()) ==
+        std::numeric_limits<int32_t>::min())
+      return true;
+    break;
   case AMDGPU::V_MUL_I32_I24_e32:
   case AMDGPU::V_MUL_U32_U24_e32:
-    if (OldOpndValue.getImm() == 1) {
-      auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
-      assert(Src1 && Src1->isReg());
-      return getRegSubRegPair(*Src1);
-    }
+    if (OldOpnd->getImm() == 1)
+      return true;
     break;
   }
-  return RegSubRegPair();
+  return false;
 }
 
-// Cases to combine:
-//  $bound_ctrl is DPP_BOUND_ZERO, $old is any
-//  $bound_ctrl is DPP_BOUND_OFF, $old is 0
-//  -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO
-
-//  $bound_ctrl is DPP_BOUND_OFF, $old is undef
-//  -> $old = undef, $bound_ctrl = DPP_BOUND_OFF
-
-//  $bound_ctrl is DPP_BOUND_OFF, $old is foldable
-//  -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF
-
 MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                            MachineInstr &MovMI,
-                                           RegSubRegPair OldOpndVGPR,
+                                           RegSubRegPair CombOldVGPR,
                                            MachineOperand *OldOpndValue,
-                                           bool BoundCtrlZero) const {
-  assert(OldOpndVGPR.Reg);
-  if (!BoundCtrlZero && OldOpndValue) {
-    assert(OldOpndValue->isImm());
-    OldOpndVGPR = foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue);
-    if (!OldOpndVGPR.Reg) {
-      LLVM_DEBUG(dbgs() << "  failed: old immediate cannot be folded\n");
+                                           bool CombBCZ) const {
+  assert(CombOldVGPR.Reg);
+  if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
+    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
+    if (!Src1 || !Src1->isReg()) {
+      LLVM_DEBUG(dbgs() << "  failed: no src1 or it isn't a register\n");
+      return nullptr;
+    }
+    if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
+      LLVM_DEBUG(dbgs() << "  failed: old immediate ins't an identity\n");
+      return nullptr;
+    }
+    CombOldVGPR = getRegSubRegPair(*Src1);
+    if (!isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)) {
+      LLVM_DEBUG(dbgs() << "  failed: src1 isn't a VGPR32 register\n");
       return nullptr;
     }
   }
-  return createDPPInst(OrigMI, MovMI, OldOpndVGPR, BoundCtrlZero);
+  return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ);
 }
 
 // returns true if MI doesn't have OpndName immediate operand or the
@@ -315,31 +326,64 @@
 
 bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
   assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
+
+  auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
+  assert(DstOpnd && DstOpnd->isReg());
+  auto DPPMovReg = DstOpnd->getReg();
+  if (!isEXECMaskConstantBetweenDefAndUses(DPPMovReg, *MRI)) {
+    LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
+                         " for all uses\n");
+    return false;
+  }
+
+  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
+  assert(RowMaskOpnd && RowMaskOpnd->isImm());
+  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
+  assert(BankMaskOpnd && BankMaskOpnd->isImm());
+  const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF &&
+                            BankMaskOpnd->getImm() == 0xF;
+
   auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
   assert(BCZOpnd && BCZOpnd->isImm());
-  bool BoundCtrlZero = 0 != BCZOpnd->getImm();
-
-  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
+  bool BoundCtrlZero = BCZOpnd->getImm();
 
   auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
   assert(OldOpnd && OldOpnd->isReg());
-  auto OldOpndVGPR = getRegSubRegPair(*OldOpnd);
-  auto *OldOpndValue = getOldOpndValue(*OldOpnd);
+
+  auto * const OldOpndValue = getOldOpndValue(*OldOpnd);
+  // OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else
+  // We could use: assert(!OldOpndValue || OldOpndValue->isImm())
+  // but the third option is used to distinguish undef from non-immediate
+  // to reuse IMPLICIT_DEF instruction later
   assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);
-  if (OldOpndValue) {
-    if (BoundCtrlZero) {
-      OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef, ignore old opnd
-      OldOpndValue = nullptr;
-    } else {
-      if (!OldOpndValue->isImm()) {
-        LLVM_DEBUG(dbgs() << "  failed: old operand isn't an imm or undef\n");
-        return false;
+
+  bool CombBCZ = false;
+
+  if (MaskAllLanes && BoundCtrlZero) { // [1]
+    CombBCZ = true;
+  } else {
+    if (!OldOpndValue || !OldOpndValue->isImm()) {
+      LLVM_DEBUG(dbgs() << "  failed: the DPP mov isn't combinable\n");
+      return false;
+    }
+
+    if (OldOpndValue->getParent()->getParent() != MovMI.getParent()) {
+      LLVM_DEBUG(dbgs() <<
+        "  failed: old reg def and mov should be in the same BB\n");
+      return false;
+    }
+
+    if (OldOpndValue->getImm() == 0) {
+      if (MaskAllLanes) {
+        assert(!BoundCtrlZero); // by check [1]
+        CombBCZ = true;
       }
-      if (OldOpndValue->getImm() == 0) {
-        OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef
-        OldOpndValue = nullptr;
-        BoundCtrlZero = true;
-      }
+    } else if (BoundCtrlZero) {
+      assert(!MaskAllLanes); // by check [1]
+      LLVM_DEBUG(dbgs() <<
+        "  failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
+      return false;
     }
   }
 
@@ -347,25 +391,28 @@
     if (!OldOpndValue)
       dbgs() << "undef";
     else
-      dbgs() << OldOpndValue->getImm();
-    dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n');
+      dbgs() << *OldOpndValue;
+    dbgs() << ", bound_ctrl=" << CombBCZ << '\n');
 
-  std::vector<MachineInstr*> OrigMIs, DPPMIs;
-  if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef
-    OldOpndVGPR = RegSubRegPair(
+  SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs;
+  auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
+  // try to reuse previous old reg if it's undefined (IMPLICIT_DEF)
+  if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
+    CombOldVGPR = RegSubRegPair(
       MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
     auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
-                             TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg);
+                             TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
     DPPMIs.push_back(UndefInst.getInstr());
   }
 
   OrigMIs.push_back(&MovMI);
   bool Rollback = true;
-  for (auto &Use : MRI->use_nodbg_operands(
-       TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg())) {
+  for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
     Rollback = true;
 
     auto &OrigMI = *Use.getParent();
+    LLVM_DEBUG(dbgs() << "  try: " << OrigMI);
+
     auto OrigOp = OrigMI.getOpcode();
     if (TII->isVOP3(OrigOp)) {
       if (!TII->hasVALU32BitEncoding(OrigOp)) {
@@ -388,8 +435,8 @@
 
     LLVM_DEBUG(dbgs() << "  combining: " << OrigMI);
     if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
-      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR,
-                                        OldOpndValue, BoundCtrlZero)) {
+      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
+                                        OldOpndValue, CombBCZ)) {
         DPPMIs.push_back(DPPInst);
         Rollback = false;
       }
@@ -400,8 +447,8 @@
       BB->insert(OrigMI, NewMI);
       if (TII->commuteInstruction(*NewMI)) {
         LLVM_DEBUG(dbgs() << "  commuted:  " << *NewMI);
-        if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR,
-                                          OldOpndValue, BoundCtrlZero)) {
+        if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR,
+                                          OldOpndValue, CombBCZ)) {
           DPPMIs.push_back(DPPInst);
           Rollback = false;
         }
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 06c080c..c6cd7a1 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5646,3 +5646,29 @@
   }
   return nullptr;
 }
+
+bool llvm::isEXECMaskConstantBetweenDefAndUses(unsigned VReg,
+                                               MachineRegisterInfo &MRI) {
+  assert(MRI.isSSA() && "Must be run on SSA");
+  auto *TRI = MRI.getTargetRegisterInfo();
+
+  auto *DefI = MRI.getVRegDef(VReg);
+  auto *BB = DefI->getParent();
+
+  DenseSet<MachineInstr*> Uses;
+  for (auto &Use : MRI.use_nodbg_operands(VReg)) {
+    auto *I = Use.getParent();
+    if (I->getParent() != BB)
+      return false;
+    Uses.insert(I);
+  }
+
+  auto E = BB->end();
+  for (auto I = std::next(DefI->getIterator()); I != E; ++I) {
+    Uses.erase(&*I);
+    // the last use itself may modify EXEC, so stop before checking it
+    if (Uses.empty() || I->modifiesRegister(AMDGPU::EXEC, TRI))
+      break;
+  }
+  return Uses.empty();
+}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 2aa9a33..b96c40d 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -953,6 +953,12 @@
 MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
                                MachineRegisterInfo &MRI);
 
+/// \brief Return true if EXEC mask isn't changed between the def and
+/// all uses of VReg. Currently if def and uses are in different BBs -
+/// simply return false. Should be run on SSA.
+bool isEXECMaskConstantBetweenDefAndUses(unsigned VReg,
+                                         MachineRegisterInfo &MRI);
+
 namespace AMDGPU {
 
   LLVM_READONLY
diff --git a/test/CodeGen/AMDGPU/dpp_combine.ll b/test/CodeGen/AMDGPU/dpp_combine.ll
deleted file mode 100644
index 36356a7..0000000
--- a/test/CodeGen/AMDGPU/dpp_combine.ll
+++ /dev/null
@@ -1,185 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-dpp-combine -verify-machineinstrs < %s | FileCheck %s
-
-; VOP2 with literal cannot be combined
-; CHECK-LABEL: {{^}}dpp_combine_i32_literal:
-; CHECK: v_mov_b32_dpp [[OLD:v[0-9]+]], {{v[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x1 bound_ctrl:0
-; CHECK: v_add_u32_e32 {{v[0-9]+}}, vcc, 42, [[OLD]]
-define amdgpu_kernel void @dpp_combine_i32_literal(i32 addrspace(1)* %out, i32 %in) {
-  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 2, i32 1, i1 1) #0
-  %res = add nsw i32 %dpp, 42
-  store i32 %res, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: {{^}}dpp_combine_i32_bz:
-; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-define amdgpu_kernel void @dpp_combine_i32_bz(i32 addrspace(1)* %out, i32 %in) {
-  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0
-  %res = add nsw i32 %dpp, %x
-  store i32 %res, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: {{^}}dpp_combine_i32_boff_undef:
-; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-define amdgpu_kernel void @dpp_combine_i32_boff_undef(i32 addrspace(1)* %out, i32 %in) {
-  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
-  %res = add nsw i32 %dpp, %x
-  store i32 %res, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: {{^}}dpp_combine_i32_boff_0:
-; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-define amdgpu_kernel void @dpp_combine_i32_boff_0(i32 addrspace(1)* %out, i32 %in) {
-  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
-  %res = add nsw i32 %dpp, %x
-  store i32 %res, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: {{^}}dpp_combine_i32_boff_max:
-; CHECK: v_bfrev_b32_e32 [[OLD:v[0-9]+]], -2
-; CHECK: v_max_i32_dpp [[OLD]], {{v[0-9]+}}, v0  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-define amdgpu_kernel void @dpp_combine_i32_boff_max(i32 addrspace(1)* %out, i32 %in) {
-  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 2147483647, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
-  %cmp = icmp sge i32 %dpp, %x
-  %res = select i1 %cmp, i32 %dpp, i32 %x
-  store i32 %res, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: {{^}}dpp_combine_i32_boff_min:
-; CHECK: v_bfrev_b32_e32 [[OLD:v[0-9]+]], 1
-; CHECK: v_min_i32_dpp [[OLD]], {{v[0-9]+}}, v0  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-define amdgpu_kernel void @dpp_combine_i32_boff_min(i32 addrspace(1)* %out, i32 %in) {
-  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
-  %cmp = icmp sle i32 %dpp, %x
-  %res = select i1 %cmp, i32 %dpp, i32 %x
-  store i32 %res, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: {{^}}dpp_combine_i32_boff_mul:
-; CHECK: v_mul_i32_i24_dpp v0, v3, v0  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-define amdgpu_kernel void @dpp_combine_i32_boff_mul(i32 addrspace(1)* %out, i32 %in) {
-  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 1, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
-
-  %dpp.shl = shl i32 %dpp, 8
-  %dpp.24 = ashr i32 %dpp.shl, 8
-  %x.shl = shl i32 %x, 8
-  %x.24 = ashr i32 %x.shl, 8
-  %res = mul i32 %dpp.24, %x.24
-  store i32 %res, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: {{^}}dpp_combine_i32_commute:
-; CHECK: v_subrev_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0  quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-define amdgpu_kernel void @dpp_combine_i32_commute(i32 addrspace(1)* %out, i32 %in) {
-  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 2, i32 1, i32 1, i1 1) #0
-  %res = sub nsw i32 %x, %dpp
-  store i32 %res, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: {{^}}dpp_combine_f32:
-; CHECK: v_add_f32_dpp {{v[0-9]+}}, {{v[0-9]+}}, v0  quad_perm:[3,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-define amdgpu_kernel void @dpp_combine_f32(i32 addrspace(1)* %out, i32 %in) {
-  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
-
-  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 3, i32 1, i32 1, i1 1) #0
-  %dpp.f32 = bitcast i32 %dpp to float
-  %x.f32 = bitcast i32 %x to float
-  %res.f32 = fadd float %x.f32, %dpp.f32
-  %res = bitcast float %res.f32 to i32
-  store i32 %res, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: {{^}}dpp_combine_test_f32_mods:
-; CHECK: v_mul_f32_dpp {{v[0-9]+}}, |{{v[0-9]+}}|, -v0  quad_perm:[0,1,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-define amdgpu_kernel void @dpp_combine_test_f32_mods(i32 addrspace(1)* %out, i32 %in) {
-  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
-
-  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 4, i32 1, i32 1, i1 1) #0
-
-  %x.f32 = bitcast i32 %x to float
-  %x.f32.neg = fsub float -0.000000e+00, %x.f32
-
-  %dpp.f32 = bitcast i32 %dpp to float
-  %dpp.f32.cmp = fcmp fast olt float %dpp.f32, 0.000000e+00
-  %dpp.f32.sign = select i1 %dpp.f32.cmp, float -1.000000e+00, float 1.000000e+00
-  %dpp.f32.abs = fmul fast float %dpp.f32, %dpp.f32.sign
-
-  %res.f32 = fmul float %x.f32.neg, %dpp.f32.abs
-  %res = bitcast float %res.f32 to i32
-  store i32 %res, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: {{^}}dpp_combine_mac:
-; CHECK: v_mac_f32_dpp v0, {{v[0-9]+}}, v1  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-define amdgpu_kernel void @dpp_combine_mac(float addrspace(1)* %out, i32 %in) {
-  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0
-  %dpp.f32 = bitcast i32 %dpp to float
-  %x.f32 = bitcast i32 %x to float
-  %y.f32 = bitcast i32 %y to float
-
-  %mult = fmul float %dpp.f32, %y.f32
-  %res = fadd float %mult, %x.f32
-  store float %res, float addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: {{^}}dpp_combine_sequence:
-define amdgpu_kernel void @dpp_combine_sequence(i32 addrspace(1)* %out, i32 %in, i1 %cmp) {
-  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0
-  br i1 %cmp, label %bb1, label %bb2
-bb1:
-; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-  %resadd = add nsw i32 %dpp, %x
-  br label %bb3
-bb2:
-; CHECK: v_subrev_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-  %ressub = sub nsw i32 %x, %dpp
-  br label %bb3
-bb3:
-  %res = phi i32 [%resadd, %bb1], [%ressub, %bb2]
-  store i32 %res, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: {{^}}dpp_combine_sequence_negative:
-; CHECK: v_mov_b32_dpp v1, v1  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-define amdgpu_kernel void @dpp_combine_sequence_negative(i32 addrspace(1)* %out, i32 %in, i1 %cmp) {
-  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0
-  br i1 %cmp, label %bb1, label %bb2
-bb1:
-  %resadd = add nsw i32 %dpp, %x
-  br label %bb3
-bb2:
-  %ressub = sub nsw i32 2, %dpp ; break seq
-  br label %bb3
-bb3:
-  %res = phi i32 [%resadd, %bb1], [%ressub, %bb2]
-  store i32 %res, i32 addrspace(1)* %out
-  ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x()
-declare i32 @llvm.amdgcn.workitem.id.y()
-declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
-
-attributes #0 = { nounwind readnone convergent }
diff --git a/test/CodeGen/AMDGPU/dpp_combine.mir b/test/CodeGen/AMDGPU/dpp_combine.mir
new file mode 100644
index 0000000..fbeadcf
--- /dev/null
+++ b/test/CodeGen/AMDGPU/dpp_combine.mir
@@ -0,0 +1,472 @@
+# RUN: llc -march=amdgcn -mcpu=tonga  -run-pass=gcn-dpp-combine  -o - %s | FileCheck %s
+
+---
+# old is undefined: only combine when masks are fully enabled and
+# bound_ctrl:0 is set, otherwise the result of DPP VALU op can be undefined.
+# CHECK-LABEL: name: old_is_undef
+# CHECK: %2:vgpr_32 = IMPLICIT_DEF
+# VOP2:
+# CHECK: %4:vgpr_32 = V_ADD_U32_dpp %2, %0, %1, 1, 15, 15, 1, implicit $exec
+# CHECK: %6:vgpr_32 = V_ADD_U32_e32 %5, %1, implicit $exec
+# CHECK: %8:vgpr_32 = V_ADD_U32_e32 %7, %1, implicit $exec
+# CHECK: %10:vgpr_32 = V_ADD_U32_e32 %9, %1, implicit $exec
+# VOP1:
+# CHECK: %12:vgpr_32 = V_NOT_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
+# CHECK: %14:vgpr_32 = V_NOT_B32_e32 %13, implicit $exec
+# CHECK: %16:vgpr_32 = V_NOT_B32_e32 %15, implicit $exec
+# CHECK: %18:vgpr_32 = V_NOT_B32_e32 %17, implicit $exec
+name: old_is_undef
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = IMPLICIT_DEF
+
+    ; VOP2
+    %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
+    %4:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec
+
+    %5:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 0, implicit $exec
+    %6:vgpr_32 = V_ADD_U32_e32 %5, %1, implicit $exec
+
+    %7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 1, implicit $exec
+    %8:vgpr_32 = V_ADD_U32_e32 %7, %1, implicit $exec
+
+    %9:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
+    %10:vgpr_32 = V_ADD_U32_e32 %9, %1, implicit $exec
+
+    ; VOP1
+    %11:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
+    %12:vgpr_32 = V_NOT_B32_e32 %11, implicit $exec
+
+    %13:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 0, implicit $exec
+    %14:vgpr_32 = V_NOT_B32_e32 %13, implicit $exec
+
+    %15:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 1, implicit $exec
+    %16:vgpr_32 = V_NOT_B32_e32 %15, implicit $exec
+
+    %17:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
+    %18:vgpr_32 = V_NOT_B32_e32 %17, implicit $exec
+...
+
+# old is zero cases:
+
+# CHECK-LABEL: name: old_is_0
+
+# VOP2:
+# case 1: old is zero, masks are fully enabled, bound_ctrl:0 is on:
+# the DPP mov result would be either zero ({src lane disabled}|{src lane is
+# out of range}) or active src lane result - can combine with old = undef.
+# undef is preferred as it makes life easier for the regalloc.
+# CHECK: [[U1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+# CHECK: %4:vgpr_32 = V_ADD_U32_dpp [[U1]], %0, %1, 1, 15, 15, 1, implicit $exec
+
+# case 2: old is zero, masks are fully enabled, bound_ctrl:0 is off:
+# as the DPP mov old is zero this case is no different from case 1 - combine it
+# setting bound_ctrl:0 on for the combined DPP VALU op to make old undefined
+# CHECK: [[U2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+# CHECK: %6:vgpr_32 = V_ADD_U32_dpp [[U2]], %0, %1, 1, 15, 15, 1, implicit $exec
+
+# case 3: masks are partially disabled, bound_ctrl:0 is on:
+# the DPP mov result would be either zero ({src lane disabled}|{src lane is
+# out of range} or {the DPP mov's dest VGPR write is disabled by masks}) or
+# active src lane result - can combine with old = src1 of the VALU op.
+# The VALU op should have the same masks as DPP mov as they select lanes
+# with identity value.
+# Special case: the bound_ctrl for the combined DPP VALU op isn't important
+# here but let's make it off to keep the combiner's logic simpler.
+# CHECK: %8:vgpr_32 = V_ADD_U32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
+
+# case 4: masks are partially disabled, bound_ctrl:0 is off:
+# the DPP mov result would be either zero ({src lane disabled}|{src lane is
+# out of range} or {the DPP mov's dest VGPR write is disabled by masks}) or
+# active src lane result - can combine with old = src1 of the VALU op.
+# The VALU op should have the same masks as DPP mov as they select
+# lanes with identity value
+# CHECK: %10:vgpr_32 = V_ADD_U32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
+
+# VOP1:
+# see case 1
+# CHECK: [[U3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+# CHECK: %12:vgpr_32 = V_NOT_B32_dpp [[U3]], %0, 1, 15, 15, 1, implicit $exec
+# see case 2
+# CHECK: [[U4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+# CHECK: %14:vgpr_32 = V_NOT_B32_dpp [[U4]], %0, 1, 15, 15, 1, implicit $exec
+# cases 3 and 4 are not applicable as there is no way to specify an unchanged
+# result for the unary VALU op
+# CHECK: %16:vgpr_32 = V_NOT_B32_e32 %15, implicit $exec
+# CHECK: %18:vgpr_32 = V_NOT_B32_e32 %17, implicit $exec
+
+name: old_is_0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+    ; VOP2
+    %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
+    %4:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec
+
+    %5:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 0, implicit $exec
+    %6:vgpr_32 = V_ADD_U32_e32 %5, %1, implicit $exec
+
+    %7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 1, implicit $exec
+    %8:vgpr_32 = V_ADD_U32_e32 %7, %1, implicit $exec
+
+    %9:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
+    %10:vgpr_32 = V_ADD_U32_e32 %9, %1, implicit $exec
+
+    ; VOP1
+    %11:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
+    %12:vgpr_32 = V_NOT_B32_e32 %11, implicit $exec
+
+    %13:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 0, implicit $exec
+    %14:vgpr_32 = V_NOT_B32_e32 %13, implicit $exec
+
+    %15:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 1, implicit $exec
+    %16:vgpr_32 = V_NOT_B32_e32 %15, implicit $exec
+
+    %17:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
+    %18:vgpr_32 = V_NOT_B32_e32 %17, implicit $exec
+...
+
+# old is nonzero identity cases:
+
+# old is nonzero identity, masks are fully enabled, bound_ctrl:0 is off:
+# the DPP mov result would be either identity ({src lane disabled}|{out of
+# range}) or src lane result - can combine with old = src1 of the VALU op
+# The DPP VALU op should have the same masks (and bctrl) as DPP mov as they
+# select lanes with identity value
+
+# CHECK-LABEL: name: nonzero_old_is_identity_masks_enabled_bctl_off
+# CHECK: %4:vgpr_32 = V_MUL_U32_U24_dpp %1, %0, %1, 1, 15, 15, 0, implicit $exec
+# CHECK: %7:vgpr_32 = V_AND_B32_dpp %1, %0, %1, 1, 15, 15, 0, implicit $exec
+# CHECK: %10:vgpr_32 = V_MAX_I32_dpp %1, %0, %1, 1, 15, 15, 0, implicit $exec
+# CHECK: %13:vgpr_32 = V_MIN_I32_dpp %1, %0, %1, 1, 15, 15, 0, implicit $exec
+
+name: nonzero_old_is_identity_masks_enabled_bctl_off
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+
+    %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 0, implicit $exec
+    %4:vgpr_32 = V_MUL_U32_U24_e32 %3, %1, implicit $exec
+
+    %5:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
+    %6:vgpr_32 = V_MOV_B32_dpp %5, %0, 1, 15, 15, 0, implicit $exec
+    %7:vgpr_32 = V_AND_B32_e32 %6, %1, implicit $exec
+
+    %8:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
+    %9:vgpr_32 = V_MOV_B32_dpp %8, %0, 1, 15, 15, 0, implicit $exec
+    %10:vgpr_32 = V_MAX_I32_e32 %9, %1, implicit $exec
+
+    %11:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
+    %12:vgpr_32 = V_MOV_B32_dpp %11, %0, 1, 15, 15, 0, implicit $exec
+    %13:vgpr_32 = V_MIN_I32_e32 %12, %1, implicit $exec
+...
+
+# old is nonzero identity, masks are partially enabled, bound_ctrl:0 is off:
+# the DPP mov result would be either identity ({src lane disabled}|{src lane is
+# out of range} or {the DPP mov's dest VGPR write is disabled by masks}) or
+# active src lane result - can combine with old = src1 of the VALU op.
+# The DPP VALU op should have the same masks (and bctrl) as DPP mov as they
+# select lanes with identity value
+
+# CHECK-LABEL: name: nonzero_old_is_identity_masks_partially_disabled_bctl_off
+# CHECK: %4:vgpr_32 = V_MUL_U32_U24_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
+# CHECK: %7:vgpr_32 = V_AND_B32_dpp %1, %0, %1, 1, 15, 14, 0, implicit $exec
+# CHECK: %10:vgpr_32 = V_MAX_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
+# CHECK: %13:vgpr_32 = V_MIN_I32_dpp %1, %0, %1, 1, 15, 14, 0, implicit $exec
+
+name: nonzero_old_is_identity_masks_partially_disabled_bctl_off
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+
+    %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
+    %4:vgpr_32 = V_MUL_U32_U24_e32 %3, %1, implicit $exec
+
+    %5:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
+    %6:vgpr_32 = V_MOV_B32_dpp %5, %0, 1, 15, 14, 0, implicit $exec
+    %7:vgpr_32 = V_AND_B32_e32 %6, %1, implicit $exec
+
+    %8:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
+    %9:vgpr_32 = V_MOV_B32_dpp %8, %0, 1, 14, 15, 0, implicit $exec
+    %10:vgpr_32 = V_MAX_I32_e32 %9, %1, implicit $exec
+
+    %11:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
+    %12:vgpr_32 = V_MOV_B32_dpp %11, %0, 1, 15, 14, 0, implicit $exec
+    %13:vgpr_32 = V_MIN_I32_e32 %12, %1, implicit $exec
+...
+
+# old is nonzero identity, masks are partially enabled, bound_ctrl:0 is on:
+# the DPP mov result may have 3 different values:
+#   1. the active src lane result
+#   2. 0 if the src lane is disabled|out of range
+#   3. DPP mov's old value if the mov's dest VGPR write is disabled by masks
+# can't combine
+
+# CHECK-LABEL: name: nonzero_old_is_identity_masks_partially_disabled_bctl0
+# CHECK: %4:vgpr_32 = V_MUL_U32_U24_e32 %3, %1, implicit $exec
+# CHECK: %7:vgpr_32 = V_AND_B32_e32 %6, %1, implicit $exec
+# CHECK: %10:vgpr_32 = V_MAX_I32_e32 %9, %1, implicit $exec
+# CHECK: %13:vgpr_32 = V_MIN_I32_e32 %12, %1, implicit $exec
+
+name: nonzero_old_is_identity_masks_partially_disabled_bctl0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+
+    %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 1, implicit $exec
+    %4:vgpr_32 = V_MUL_U32_U24_e32 %3, %1, implicit $exec
+
+    %5:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
+    %6:vgpr_32 = V_MOV_B32_dpp %5, %0, 1, 15, 14, 1, implicit $exec
+    %7:vgpr_32 = V_AND_B32_e32 %6, %1, implicit $exec
+
+    %8:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
+    %9:vgpr_32 = V_MOV_B32_dpp %8, %0, 1, 14, 15, 1, implicit $exec
+    %10:vgpr_32 = V_MAX_I32_e32 %9, %1, implicit $exec
+
+    %11:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
+    %12:vgpr_32 = V_MOV_B32_dpp %11, %0, 1, 15, 14, 1, implicit $exec
+    %13:vgpr_32 = V_MIN_I32_e32 %12, %1, implicit $exec
+...
+
+# when the DPP source isn't a src0 operand the operation should be commuted if possible
+# CHECK-LABEL: name: dpp_commute
+# CHECK: %4:vgpr_32 = V_MUL_U32_U24_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
+# CHECK: %7:vgpr_32 = V_AND_B32_dpp %1, %0, %1, 1, 15, 14, 0, implicit $exec
+# CHECK: %10:vgpr_32 = V_MAX_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
+# CHECK: %13:vgpr_32 = V_MIN_I32_dpp %1, %0, %1, 1, 15, 14, 0, implicit $exec
+# CHECK: %16:vgpr_32 = V_SUBREV_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit-def $vcc, implicit $exec
+# CHECK: %19:vgpr_32 = V_ADD_I32_e32 5, %18, implicit-def $vcc, implicit $exec
+name: dpp_commute
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+
+    %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
+    %4:vgpr_32 = V_MUL_U32_U24_e32 %1, %3, implicit $exec
+
+    %5:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
+    %6:vgpr_32 = V_MOV_B32_dpp %5, %0, 1, 15, 14, 0, implicit $exec
+    %7:vgpr_32 = V_AND_B32_e32 %1, %6, implicit $exec
+
+    %8:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
+    %9:vgpr_32 = V_MOV_B32_dpp %8, %0, 1, 14, 15, 0, implicit $exec
+    %10:vgpr_32 = V_MAX_I32_e32 %1, %9, implicit $exec
+
+    %11:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
+    %12:vgpr_32 = V_MOV_B32_dpp %11, %0, 1, 15, 14, 0, implicit $exec
+    %13:vgpr_32 = V_MIN_I32_e32 %1, %12, implicit $exec
+
+    %14:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %15:vgpr_32 = V_MOV_B32_dpp %14, %0, 1, 14, 15, 0, implicit $exec
+    %16:vgpr_32 = V_SUB_I32_e32 %1, %15, implicit-def $vcc, implicit $exec
+
+    ; this cannot be combined because immediate as src0 isn't commutable
+    %17:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %18:vgpr_32 = V_MOV_B32_dpp %17, %0, 1, 14, 15, 0, implicit $exec
+    %19:vgpr_32 = V_ADD_I32_e32 5, %18, implicit-def $vcc, implicit $exec
+...
+
+# check for floating point modifiers
+# CHECK-LABEL: name: add_f32_e64
+# CHECK: %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
+# CHECK: %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $exec
+# CHECK: %6:vgpr_32 = V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 15, 15, 1, implicit $exec
+# CHECK: %8:vgpr_32 = V_ADD_F32_dpp %2, 1, %1, 2, %0, 1, 15, 15, 1, implicit $exec
+# CHECK: %10:vgpr_32 = V_ADD_F32_e64 4, %9, 8, %0, 0, 0, implicit $exec
+
+name: add_f32_e64
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = IMPLICIT_DEF
+
+    ; this shouldn't be combined as omod is set
+    %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
+    %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $exec
+
+    ; this should be combined as all modifiers are default
+    %5:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
+    %6:vgpr_32 = V_ADD_F32_e64 0, %5, 0, %0, 0, 0, implicit $exec
+
+    ; this should be combined as modifiers other than abs|neg are default
+    %7:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
+    %8:vgpr_32 = V_ADD_F32_e64 1, %7, 2, %0, 0, 0, implicit $exec
+
+    ; this shouldn't be combined as modifiers aren't abs|neg
+    %9:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
+    %10:vgpr_32 = V_ADD_F32_e64 4, %9, 8, %0, 0, 0, implicit $exec
+...
+
+# tests on sequences of dpp consumers
+# CHECK-LABEL: name: dpp_seq
+# CHECK: %4:vgpr_32 = V_ADD_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit-def $vcc, implicit $exec
+# CHECK: %5:vgpr_32 = V_SUBREV_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit-def $vcc, implicit $exec
+# CHECK: %6:vgpr_32 = V_OR_B32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
+# broken sequence:
+# CHECK: %7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
+
+name: dpp_seq
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+    %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
+    %4:vgpr_32 = V_ADD_I32_e32 %3, %1, implicit-def $vcc, implicit $exec
+    %5:vgpr_32 = V_SUB_I32_e32 %1, %3, implicit-def $vcc, implicit $exec
+    %6:vgpr_32 = V_OR_B32_e32 %3, %1, implicit $exec
+
+    %7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
+    %8:vgpr_32 = V_ADD_I32_e32 %7, %1, implicit-def $vcc, implicit $exec
+    ; this breaks the sequence
+    %9:vgpr_32 = V_SUB_I32_e32 5, %7, implicit-def $vcc, implicit $exec
+...
+
+# old reg def is in diff BB - cannot combine
+# CHECK-LABEL: name: old_in_diff_bb
+# CHECK: %3:vgpr_32 = V_MOV_B32_dpp %2, %1, 1, 1, 1, 0, implicit $exec
+
+name: old_in_diff_bb
+tracksRegLiveness: true
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $vgpr0, $vgpr1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    %3:vgpr_32 = V_MOV_B32_dpp %2, %1, 1, 1, 1, 0, implicit $exec
+    %4:vgpr_32 = V_ADD_U32_e32 %3, %0, implicit $exec
+...
+
+# old reg def is in diff BB but bound_ctrl:0 - can combine
+# CHECK-LABEL: name: old_in_diff_bb_bctrl_zero
+# CHECK: %4:vgpr_32 = V_ADD_U32_dpp {{%[0-9]}}, %0, %1, 1, 15, 15, 1, implicit $exec
+
+name: old_in_diff_bb_bctrl_zero
+tracksRegLiveness: true
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $vgpr0, $vgpr1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
+    %4:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec
+...
+
+# EXEC mask changed between def and use - cannot combine
+# CHECK-LABEL: name: exec_changed
+# CHECK: %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
+
+name: exec_changed
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
+    %4:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec
+    %5:sreg_64 = COPY $exec, implicit-def $exec
+    %6:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec
+...
+
+# test if $old definition is correctly tracked through subreg manipulation pseudos
+
+# CHECK-LABEL: name: mul_old_subreg
+# CHECK: %7:vgpr_32 = V_MUL_I32_I24_dpp %0.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec
+
+name:            mul_old_subreg
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    %0:vreg_64 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    %3:vgpr_32 = V_MOV_B32_e32 42, implicit $exec
+    %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
+    %5:vreg_64 = INSERT_SUBREG %4, %1, %subreg.sub1 ; %5.sub0 is taken from %4
+    %6:vgpr_32 = V_MOV_B32_dpp %5.sub0, %1, 1, 1, 1, 0, implicit $exec
+    %7:vgpr_32 = V_MUL_I32_I24_e32 %6, %0.sub1, implicit $exec
+...
+
+# CHECK-LABEL: name: add_old_subreg
+# CHECK: %5:vgpr_32 = V_ADD_U32_dpp %0.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec
+
+name:            add_old_subreg
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    %0:vreg_64 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vreg_64 = INSERT_SUBREG %0, %2, %subreg.sub1 ; %3.sub1 is inserted
+    %4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 1, 1, 0, implicit $exec
+    %5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec
+...
+
+# CHECK-LABEL: name: add_old_subreg_undef
+# CHECK: %5:vgpr_32 = V_ADD_U32_dpp %3.sub1, %1, %0.sub1, 1, 15, 15, 1, implicit $exec
+
+name:            add_old_subreg_undef
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    %0:vreg_64 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vreg_64 = REG_SEQUENCE %2, %subreg.sub0 ; %3.sub1 is undef
+    %4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 15, 15, 1, implicit $exec
+    %5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec
diff --git a/test/CodeGen/AMDGPU/dpp_combine_subregs.mir b/test/CodeGen/AMDGPU/dpp_combine_subregs.mir
deleted file mode 100644
index 83f992f..0000000
--- a/test/CodeGen/AMDGPU/dpp_combine_subregs.mir
+++ /dev/null
@@ -1,143 +0,0 @@
-# RUN: llc -march=amdgcn -mcpu=tonga  -run-pass=gcn-dpp-combine  -o - %s | FileCheck %s
-
-# test if $old definition is correctly tracked through subreg manipulation pseudos
-
----
-# CHECK-LABEL: name: mul_old_subreg
-# CHECK: %7:vgpr_32 = V_MUL_I32_I24_dpp %0.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec
-
-name:            mul_old_subreg
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vreg_64 }
-  - { id: 1, class: vgpr_32 }
-  - { id: 2, class: vgpr_32 }
-  - { id: 3, class: vgpr_32 }
-  - { id: 4, class: vreg_64 }
-  - { id: 5, class: vreg_64 }
-  - { id: 6, class: vgpr_32 }
-  - { id: 7, class: vgpr_32 }
-
-liveins:
-  - { reg: '$vgpr0', virtual-reg: '%0' }
-  - { reg: '$vgpr1', virtual-reg: '%1' }
-body:             |
-  bb.0:
-    liveins: $vgpr0, $vgpr1
-
-    %0:vreg_64 = COPY $vgpr0
-    %1:vgpr_32 = COPY $vgpr1
-    %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    %3:vgpr_32 = V_MOV_B32_e32 42, implicit $exec
-    %4 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
-    %5 = INSERT_SUBREG %4, %1, %subreg.sub1 ; %5.sub0 is taken from %4
-    %6:vgpr_32 = V_MOV_B32_dpp %5.sub0, %1, 1, 1, 1, 0, implicit $exec
-    %7:vgpr_32 = V_MUL_I32_I24_e32 %6, %0.sub1, implicit $exec
-...
-
-# CHECK-LABEL: name: add_old_subreg
-# CHECK: [[OLD:\%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-# CHECK: %5:vgpr_32 = V_ADD_U32_dpp [[OLD]], %1, %0.sub1, 1, 1, 1, 1, implicit $exec
-
-name:            add_old_subreg
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vreg_64 }
-  - { id: 1, class: vgpr_32 }
-  - { id: 2, class: vgpr_32 }
-  - { id: 3, class: vreg_64 }
-  - { id: 4, class: vgpr_32 }
-  - { id: 5, class: vgpr_32 }
-
-liveins:
-  - { reg: '$vgpr0', virtual-reg: '%0' }
-  - { reg: '$vgpr1', virtual-reg: '%1' }
-body:             |
-  bb.0:
-    liveins: $vgpr0, $vgpr1
-
-    %0:vreg_64 = COPY $vgpr0
-    %1:vgpr_32 = COPY $vgpr1
-    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %3:vreg_64 = INSERT_SUBREG %0, %2, %subreg.sub1 ; %3.sub1 is inserted
-    %4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 1, 1, 0, implicit $exec
-    %5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec
-...
-
-# CHECK-LABEL: name: add_old_subreg_undef
-# CHECK: %5:vgpr_32 = V_ADD_U32_dpp %3.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec
-
-name:            add_old_subreg_undef
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vreg_64 }
-  - { id: 1, class: vgpr_32 }
-  - { id: 2, class: vgpr_32 }
-  - { id: 3, class: vreg_64 }
-  - { id: 4, class: vgpr_32 }
-  - { id: 5, class: vgpr_32 }
-
-liveins:
-  - { reg: '$vgpr0', virtual-reg: '%0' }
-  - { reg: '$vgpr1', virtual-reg: '%1' }
-body:             |
-  bb.0:
-    liveins: $vgpr0, $vgpr1
-
-    %0:vreg_64 = COPY $vgpr0
-    %1:vgpr_32 = COPY $vgpr1
-    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %3:vreg_64 = REG_SEQUENCE %2, %subreg.sub0 ; %3.sub1 is undef
-    %4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 1, 1, 0, implicit $exec
-    %5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec
-...
-
-# CHECK-LABEL: name: add_f32_e64
-# CHECK: %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec
-# CHECK: %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $exec
-# CHECK: %6:vgpr_32 = V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 1, 1, 1, implicit $exec
-# CHECK: %7:vgpr_32 = V_ADD_F32_dpp %2, 1, %1, 2, %0, 1, 1, 1, 1, implicit $exec
-# CHECK: %9:vgpr_32 = V_ADD_F32_e64 4, %8, 8, %0, 0, 0, implicit $exec
-
-name:            add_f32_e64
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_32 }
-  - { id: 1, class: vgpr_32 }
-  - { id: 2, class: vgpr_32 }
-  - { id: 3, class: vgpr_32 }
-  - { id: 4, class: vgpr_32 }
-  - { id: 5, class: vgpr_32 }
-  - { id: 6, class: vgpr_32 }
-  - { id: 7, class: vgpr_32 }
-  - { id: 8, class: vgpr_32 }
-  - { id: 9, class: vgpr_32 }
-
-liveins:
-  - { reg: '$vgpr0', virtual-reg: '%0' }
-  - { reg: '$vgpr1', virtual-reg: '%1' }
-body:             |
-  bb.0:
-    liveins: $vgpr0, $vgpr1
-
-    %0:vgpr_32 = COPY $vgpr0
-    %1:vgpr_32 = COPY $vgpr1
-    %2:vgpr_32 = IMPLICIT_DEF
-    %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec
-
-    ; this shouldn't be combined as omod is set
-    %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $exec
-
-    %5:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec
-
-    ; this should be combined as all modifiers are default
-    %6:vgpr_32 = V_ADD_F32_e64 0, %5, 0, %0, 0, 0, implicit $exec
-
-    ; this should be combined as modifiers other than abs|neg are default
-    %7:vgpr_32 = V_ADD_F32_e64 1, %5, 2, %0, 0, 0, implicit $exec
-
-    %8:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec
-
-    ; this shouldn't be combined as modifiers aren't abs|neg
-    %9:vgpr_32 = V_ADD_F32_e64 4, %8, 8, %0, 0, 0, implicit $exec
-...