[AMDGPU] VOPD: AllowSameVGPR on GFX12 (#196198)

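The hardware allows this on GFX12, so gate AllowSameVGPR on
hasGFX12Insts() rather than hasGFX1250Insts(). The affected GFX12 test
checks are updated accordingly.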
diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
index 213eec9..e3d31cb 100644
--- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -187,7 +187,7 @@
     return std::nullopt;
 
   const GCNSubtarget &ST = TII.getSubtarget();
-  bool AllowSameVGPR = ST.hasGFX1250Insts();
+  bool AllowSameVGPR = ST.hasGFX12Insts();
 
   if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y) {
     if (checkVOPDRegConstraints(TII, FirstMI, SecondMI, IsVOPD3, AllowSameVGPR))
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index 6737e2c..4e3d794 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -16089,26 +16089,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16141,25 +16141,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16539,26 +16539,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16591,25 +16591,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16994,26 +16994,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -17046,25 +17046,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -18800,26 +18800,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
@@ -18853,25 +18853,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index 61dd7be..40f58f5 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -16089,26 +16089,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16141,25 +16141,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16539,26 +16539,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16591,25 +16591,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16994,26 +16994,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -17046,25 +17046,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -18800,26 +18800,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
@@ -18853,25 +18853,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index 1924751..d2bc7b3 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -15508,26 +15508,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -15560,25 +15560,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -15958,26 +15958,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16010,25 +16010,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16413,26 +16413,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16465,25 +16465,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -18219,26 +18219,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
@@ -18272,25 +18272,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
index fa10543..1c4c598 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
@@ -2103,16 +2103,14 @@
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v2
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v2, v0 :: v_dual_lshlrev_b32 v2, 16, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
@@ -2769,39 +2767,38 @@
 ; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v3 :: v_dual_lshlrev_b32 v7, 16, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v6
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v3, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
 ; GFX12-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
index e3f3bc7..6e124f6 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
@@ -2105,16 +2105,14 @@
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v2
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v2, v0 :: v_dual_lshlrev_b32 v2, 16, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
@@ -2774,39 +2772,38 @@
 ; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v3 :: v_dual_lshlrev_b32 v7, 16, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v6
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v3, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
 ; GFX12-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index 5b41f57..9344391 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -15835,26 +15835,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -15887,25 +15887,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16342,26 +16342,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16394,25 +16394,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16851,26 +16851,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16903,25 +16903,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -18840,26 +18840,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
@@ -18893,25 +18893,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index 7b529d1..b8217cf 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -15835,26 +15835,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -15887,25 +15887,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16342,26 +16342,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16394,25 +16394,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16851,26 +16851,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16903,25 +16903,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -18840,26 +18840,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
@@ -18893,25 +18893,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index 2031374..4ab68e8 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -16193,26 +16193,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16245,25 +16245,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16700,26 +16700,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -16752,25 +16752,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -17209,26 +17209,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -17261,25 +17261,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -19198,26 +19198,26 @@
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT:    v_sub_f32_e32 v5, v5, v4
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
@@ -19251,25 +19251,25 @@
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v3, v3, v4
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT:    v_sub_f32_e32 v5, v5, v2
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
index b382e53..a19853a 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
@@ -203,16 +203,14 @@
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v3, 16, v1
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v2, 16, v0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
@@ -722,36 +720,35 @@
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v6, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
@@ -1477,25 +1474,24 @@
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
@@ -2461,64 +2457,62 @@
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v12, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v9, v8 :: v_dual_lshlrev_b32 v13, 16, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v13, 16, v1
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v13, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v13, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s1
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v11, v10
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6
@@ -2920,33 +2914,30 @@
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v9, v8
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v12, v11
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v14, v13, s0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v13
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v12, v11
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v14, v13, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v6, 0xffff0000, v3
@@ -3831,29 +3822,32 @@
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v1
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_and_b32 v11, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_lshlrev_b32 v12, 16, v6
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v15, v14 :: v_dual_lshlrev_b32 v13, 16, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v15, v14, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v13
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
@@ -5091,37 +5085,37 @@
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_lshlrev_b32 v14, 16, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v12, v10 :: v_dual_lshlrev_b32 v12, 16, v9
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v15
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v12
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v13
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v14 :: v_dual_and_b32 v12, 0xffff0000, v5
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -5131,38 +5125,40 @@
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v14, v9, s0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v17, v17
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v13
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v15, v13, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v15, 16, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v15, v13, s0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v14
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v16, v17
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v12, v9, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v18, v19
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v14, v13, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v9
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
@@ -5182,26 +5178,28 @@
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v1
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
@@ -5209,17 +5207,16 @@
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v1 :: v_dual_lshlrev_b32 v16, 16, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v17, 16, v5
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v11, 16, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v17
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
@@ -12192,7 +12189,6 @@
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v97, 0xffff0000, v23
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v96, v96, v87, s6
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v98, 0xffff0000, v6
@@ -12203,13 +12199,11 @@
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s7, v98, v98
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v28
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v53, v53
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s15, v82, v82
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v100, v100, v99, s7
@@ -12218,61 +12212,65 @@
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v51, v51, v52, s1
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v29, v29, v13, s15
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s15, v98, v98
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s11, v133, v133
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v49, v49
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v52
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v28, v28, v12, s15
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v51
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s17, v49, v133
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s17, v49, v133
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v28
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v39, v39, v48, s0
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v51, v51, v52, s17
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v10
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v65, v65
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s15, v82, v82
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v80, v80, v71, s4
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v66, v66
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v80, v80, v71, s4
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v36
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v55, v55, v64, s2
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v68, v68, v67, s3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v69, v69
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v55, v55, v64, s2
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v81, v81
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s12, v145, v145
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v35
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v39
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v29, v29, v13, s15
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v67, v67, v68, s3
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s15, v98, v98
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v35
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v39
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v64
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v68
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v70, v71, v80, s4
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s5, v85, v85
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s10, v129, v129
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v129, v135, v130, s12
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v28, v28, v12, s15
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v55
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v67
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v54, v98
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s16, v37, v132
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v80
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v81, v83, v84, s5
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s6, v97, v97
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v144, 16, v70
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s18, v53, v134
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v35, v35, v36, s15
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v39, v39, v48, s16
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
@@ -12282,12 +12280,12 @@
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v16
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v84
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v85, v87, v96, s6
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s7, v101, v101
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v81
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v55, v55, v64, s18
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v65, v67, v68, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v69, v144
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
@@ -12296,144 +12294,142 @@
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s9, v114, v114
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v112, v112, v103, s8
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v96
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v87, v99, v100, s7
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s8, v113, v113
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s13, v38, v38
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v85
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v67, v70, v80, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v71, v145
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v116, v116, v115, s9
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v99, v103, v112, s8
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s9, v117, v117
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v38, v147, v34, s13
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v147, 16, v87
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v69, v81, v84, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v83, v146
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v112
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v102, v115, v116, s9
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v99
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v116
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v70, v85, v96, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v86, v147
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v102
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v113, v119, v118, s10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v102
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v118
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v117, v131, v128, s11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v71, v87, v100, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v97, v54
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v113
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v128
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v117
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v130
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v54, v99, v112, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v101, v98
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v130
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s14, v66, v66
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v129
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v34
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v38
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v81, v102, v116, s15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v118
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v103, v37
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v30, v30, v14, s14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v38
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v14
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v29
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v103, v37
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v30
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v36
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v55
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v48
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v37, v113, v118, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v115, v132
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v65
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v52
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v67
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v64
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v30
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v29
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v36
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v55
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v83, v117, v128, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v119, v49
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s18, 0, v85
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v69
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v68
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s19, 0, v86
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v48
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v65
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v52
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v67
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v49, v129, v130, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v131, v133
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v70
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v80
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s20, 0, v87
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v71
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v64
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s18, 0, v85
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v69
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v68
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v38, v38, v34, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v66, v53
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v35
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v39
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v84
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s21, 0, v97
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s19, 0, v86
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v70
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v30, v30, v14, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v82, v134
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v51
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s16, 0, v66
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v96
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s22, 0, v98
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v80
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s20, 0, v87
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v29, v29, v13, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s15, 0, v53
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s17, 0, v82
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v100
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s23, 0, v99
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v71
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v84
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s21, 0, v97
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s15, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v54
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v96
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s16, s0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s8, 0, v112
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s22, 0, v98
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v39, v48, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s17, s1
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s24, 0, v101
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v100
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v51, v52, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s18, s2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v81
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s23, 0, v99
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v55, v64, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s19, s3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v37
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v65, v68, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v51, v65, v68 :: v_dual_lshlrev_b32 v50, 16, v15
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s20, s4
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s9, 0, v116
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v67, v80, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v52, v67, v80 :: v_dual_lshlrev_b32 v101, 16, v54
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s21, s5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s25, 0, v102
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s8, 0, v112
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v69, v84, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s22, s6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s10, 0, v118
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s24, 0, v101
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v70, v96, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s23, s7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s26, 0, v103
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v81
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v71, v100, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v37
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s9, 0, v116
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s25, 0, v102
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s10, 0, v118
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v113, 16, v83
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s26, 0, v103
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v49
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v117, 16, v38
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s11, 0, v128
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s12, 0, v130
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s27, 0, v113
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s12, 0, v130
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s28, 0, v115
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s13, 0, v34
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s29, 0, v117
@@ -12967,16 +12963,14 @@
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v3, 16, v1
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v2, 16, v0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
@@ -13333,36 +13327,35 @@
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v6, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
@@ -13831,25 +13824,24 @@
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
@@ -14469,64 +14461,62 @@
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v12, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v9, v8 :: v_dual_lshlrev_b32 v13, 16, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v13, 16, v1
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v13, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v13, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s1
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v11, v10
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
index bbb026c1..ba0617d 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
@@ -205,16 +205,14 @@
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v3, 16, v1
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v2, 16, v0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
@@ -731,36 +729,35 @@
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v6, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
@@ -1492,25 +1489,24 @@
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
@@ -2481,64 +2477,62 @@
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v12, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v9, v8 :: v_dual_lshlrev_b32 v13, 16, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v13, 16, v1
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v13, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v13, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s1
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v11, v10
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6
@@ -2943,33 +2937,30 @@
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v9, v8
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v12, v11
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v14, v13, s0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v13
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v12, v11
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v14, v13, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v6, 0xffff0000, v3
@@ -3857,29 +3848,32 @@
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v1
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_and_b32 v11, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_lshlrev_b32 v12, 16, v6
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v15, v14 :: v_dual_lshlrev_b32 v13, 16, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v15, v14, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v13
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
@@ -5119,37 +5113,37 @@
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_lshlrev_b32 v14, 16, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v12, v10 :: v_dual_lshlrev_b32 v12, 16, v9
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v15
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v12
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v13
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v14 :: v_dual_and_b32 v12, 0xffff0000, v5
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -5159,38 +5153,40 @@
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v14, v9, s0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v17, v17
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v13
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v15, v13, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v15, 16, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v15, v13, s0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v14
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v16, v17
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v12, v9, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v18, v19
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v14, v13, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v9
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
@@ -5210,26 +5206,28 @@
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v1
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
@@ -5237,17 +5235,16 @@
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v1 :: v_dual_lshlrev_b32 v16, 16, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v17, 16, v5
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v11, 16, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v17
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
@@ -12225,7 +12222,6 @@
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v97, 0xffff0000, v23
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v96, v96, v87, s6
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v98, 0xffff0000, v6
@@ -12236,13 +12232,11 @@
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s7, v98, v98
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v28
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v53, v53
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s15, v82, v82
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v100, v100, v99, s7
@@ -12251,61 +12245,65 @@
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v51, v51, v52, s1
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v29, v29, v13, s15
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s15, v98, v98
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s11, v133, v133
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v49, v49
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v52
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v28, v28, v12, s15
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v51
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s17, v49, v133
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s17, v49, v133
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v28
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v39, v39, v48, s0
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v51, v51, v52, s17
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v10
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v65, v65
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s15, v82, v82
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v80, v80, v71, s4
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v66, v66
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v80, v80, v71, s4
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v36
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v55, v55, v64, s2
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v68, v68, v67, s3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v69, v69
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v55, v55, v64, s2
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v81, v81
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s12, v145, v145
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v35
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v39
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v29, v29, v13, s15
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v67, v67, v68, s3
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s15, v98, v98
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v35
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v39
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v64
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v68
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v70, v71, v80, s4
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s5, v85, v85
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s10, v129, v129
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v129, v135, v130, s12
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v28, v28, v12, s15
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v55
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v67
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v54, v98
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s16, v37, v132
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v80
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v81, v83, v84, s5
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s6, v97, v97
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v144, 16, v70
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s18, v53, v134
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v35, v35, v36, s15
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v39, v39, v48, s16
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
@@ -12315,12 +12313,12 @@
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v16
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v84
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v85, v87, v96, s6
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s7, v101, v101
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v81
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v55, v55, v64, s18
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v65, v67, v68, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v69, v144
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
@@ -12329,144 +12327,142 @@
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s9, v114, v114
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v112, v112, v103, s8
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v96
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v87, v99, v100, s7
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s8, v113, v113
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s13, v38, v38
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v85
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v67, v70, v80, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v71, v145
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v116, v116, v115, s9
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v99, v103, v112, s8
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s9, v117, v117
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v38, v147, v34, s13
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v147, 16, v87
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v69, v81, v84, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v83, v146
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v112
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v102, v115, v116, s9
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v99
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v116
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v70, v85, v96, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v86, v147
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v102
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v113, v119, v118, s10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v102
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v118
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v117, v131, v128, s11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v71, v87, v100, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v97, v54
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v113
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v128
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v117
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v130
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v54, v99, v112, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v101, v98
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v130
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s14, v66, v66
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v129
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v34
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v38
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v81, v102, v116, s15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v118
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v103, v37
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v30, v30, v14, s14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v38
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v14
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v29
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v103, v37
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v30
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v36
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v55
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v48
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v37, v113, v118, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v115, v132
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v65
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v52
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v67
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v64
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v30
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v29
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v36
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v55
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v83, v117, v128, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v119, v49
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s18, 0, v85
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v69
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v68
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s19, 0, v86
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v48
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v65
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v52
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v67
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v49, v129, v130, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v131, v133
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v70
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v80
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s20, 0, v87
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v71
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v64
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s18, 0, v85
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v69
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v68
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v38, v38, v34, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v66, v53
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v35
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v39
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v84
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s21, 0, v97
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s19, 0, v86
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v70
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v30, v30, v14, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v82, v134
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v51
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s16, 0, v66
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v96
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s22, 0, v98
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v80
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s20, 0, v87
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v29, v29, v13, s15
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s15, 0, v53
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s17, 0, v82
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v100
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s23, 0, v99
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v71
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v84
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s21, 0, v97
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s15, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v54
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v96
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s16, s0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v112
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s22, 0, v98
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v39, v48, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s17, s1
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s24, 0, v101
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v100
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v51, v52, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s18, s2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v81
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s23, 0, v99
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v55, v64, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s19, s3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v37
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v65, v68, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v51, v65, v68 :: v_dual_lshlrev_b32 v50, 16, v15
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s20, s4
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v116
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v67, v80, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v52, v67, v80 :: v_dual_lshlrev_b32 v101, 16, v54
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s21, s5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s25, 0, v102
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v112
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v69, v84, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s22, s6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v118
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s24, 0, v101
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v70, v96, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s23, s7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s26, 0, v103
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v81
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v71, v100, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v37
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v116
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s25, 0, v102
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v118
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v113, 16, v83
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s26, 0, v103
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v49
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v117, 16, v38
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s11, 0x8000, v128
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s12, 0x8000, v130
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s27, 0, v113
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s12, 0x8000, v130
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s28, 0, v115
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s13, 0x8000, v34
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s29, 0, v117
@@ -13002,16 +12998,14 @@
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v3, 16, v1
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v2, 16, v0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
@@ -13371,36 +13365,35 @@
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v6, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
@@ -13872,25 +13865,24 @@
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
@@ -14513,64 +14505,62 @@
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v12, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v9, v8 :: v_dual_lshlrev_b32 v13, 16, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v13, 16, v1
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v13, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v13, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s1
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v11, v10
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6
diff --git a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir
index f42806c..c18e53d 100644
--- a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir
+++ b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir
@@ -1,9 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck -check-prefix=SCHED %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck -check-prefixes=SCHED,SCHED-GFX11 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefixes=PAIR,PAIR-GFX1100 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck -check-prefix=SCHED %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck -check-prefixes=SCHED,SCHED-GFX11 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -verify-machineinstrs -run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefixes=PAIR,PAIR-GFX1170 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck -check-prefix=SCHED %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck -check-prefixes=SCHED,SCHED-GFX12 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefixes=PAIR,PAIR-GFX12 %s
 
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -passes=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefixes=PAIR,PAIR-GFX12 %s
@@ -37,13 +37,13 @@
 body:             |
   bb.0:
 
-    ; SCHED-LABEL: name: vopd_schedule
-    ; SCHED: $vgpr0 = IMPLICIT_DEF
-    ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF
-    ; SCHED-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-LABEL: name: vopd_schedule
+    ; SCHED-GFX11: $vgpr0 = IMPLICIT_DEF
+    ; SCHED-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
+    ; SCHED-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
     ;
     ; PAIR-GFX1100-LABEL: name: vopd_schedule
     ; PAIR-GFX1100: $vgpr0 = IMPLICIT_DEF
@@ -59,12 +59,20 @@
     ; PAIR-GFX1170-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
     ; PAIR-GFX1170-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
     ;
+    ; SCHED-GFX12-LABEL: name: vopd_schedule
+    ; SCHED-GFX12: $vgpr0 = IMPLICIT_DEF
+    ; SCHED-GFX12-NEXT: $vgpr1 = IMPLICIT_DEF
+    ; SCHED-GFX12-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    ;
     ; PAIR-GFX12-LABEL: name: vopd_schedule
     ; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF
     ; PAIR-GFX12-NEXT: $vgpr1 = IMPLICIT_DEF
-    ; PAIR-GFX12-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; PAIR-GFX12-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+    ; PAIR-GFX12-NEXT: $vgpr2, $vgpr3 = V_DUAL_ADD_F32_e32_X_SUB_F32_e32_gfx12 $vgpr1, $vgpr1, $vgpr1, $vgpr1, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
     ; PAIR-GFX12-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; PAIR-GFX12-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
     $vgpr0 = IMPLICIT_DEF
     $vgpr1 = IMPLICIT_DEF
     $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
@@ -209,8 +217,7 @@
     ; PAIR-GFX12-NEXT: $sgpr20 = IMPLICIT_DEF
     ; PAIR-GFX12-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec
     ; PAIR-GFX12-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx12 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; PAIR-GFX12-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; PAIR-GFX12-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec
+    ; PAIR-GFX12-NEXT: $vgpr7, $vgpr6 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx12 killed $vgpr0, $vgpr3, $sgpr20, $vgpr3, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
     ; PAIR-GFX12-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo
     $vgpr0 = IMPLICIT_DEF
     $vgpr1 = IMPLICIT_DEF
@@ -495,26 +502,26 @@
   bb.0:
     liveins: $vcc_lo
 
-    ; SCHED-LABEL: name: vopd_schedule_unconstrained
-    ; SCHED: liveins: $vcc_lo
-    ; SCHED-NEXT: {{  $}}
-    ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF
-    ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF
-    ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF
-    ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF
-    ; SCHED-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr12 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; SCHED-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; SCHED-NEXT: $vgpr17 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr10 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; SCHED-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
-    ; SCHED-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-LABEL: name: vopd_schedule_unconstrained
+    ; SCHED-GFX11: liveins: $vcc_lo
+    ; SCHED-GFX11-NEXT: {{  $}}
+    ; SCHED-GFX11-NEXT: $vgpr2 = IMPLICIT_DEF
+    ; SCHED-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
+    ; SCHED-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
+    ; SCHED-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
+    ; SCHED-GFX11-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr12 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX11-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX11-NEXT: $vgpr17 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr10 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
+    ; SCHED-GFX11-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
     ;
     ; PAIR-GFX1100-LABEL: name: vopd_schedule_unconstrained
     ; PAIR-GFX1100: liveins: $vcc_lo
@@ -552,23 +559,42 @@
     ; PAIR-GFX1170-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
     ; PAIR-GFX1170-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
     ;
+    ; SCHED-GFX12-LABEL: name: vopd_schedule_unconstrained
+    ; SCHED-GFX12: liveins: $vcc_lo
+    ; SCHED-GFX12-NEXT: {{  $}}
+    ; SCHED-GFX12-NEXT: $vgpr2 = IMPLICIT_DEF
+    ; SCHED-GFX12-NEXT: $vgpr1 = IMPLICIT_DEF
+    ; SCHED-GFX12-NEXT: $vgpr3 = IMPLICIT_DEF
+    ; SCHED-GFX12-NEXT: $vgpr0 = IMPLICIT_DEF
+    ; SCHED-GFX12-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr17 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr12 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr14 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX12-NEXT: $vgpr10 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX12-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX12-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr15 = V_CNDMASK_B32_e32 killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
+    ;
     ; PAIR-GFX12-LABEL: name: vopd_schedule_unconstrained
     ; PAIR-GFX12: liveins: $vcc_lo
     ; PAIR-GFX12-NEXT: {{  $}}
     ; PAIR-GFX12-NEXT: $vgpr2 = IMPLICIT_DEF
+    ; PAIR-GFX12-NEXT: $vgpr1 = IMPLICIT_DEF
     ; PAIR-GFX12-NEXT: $vgpr3 = IMPLICIT_DEF
     ; PAIR-GFX12-NEXT: $vgpr0 = IMPLICIT_DEF
-    ; PAIR-GFX12-NEXT: $vgpr1 = IMPLICIT_DEF
-    ; PAIR-GFX12-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; PAIR-GFX12-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
-    ; PAIR-GFX12-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
+    ; PAIR-GFX12-NEXT: $vgpr4, $vgpr17 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+    ; PAIR-GFX12-NEXT: $vgpr2, $vgpr3 = V_DUAL_FMAC_F32_e32_X_SUB_F32_e32_gfx12 10, $vgpr1, killed $vgpr2, $vgpr1, $vgpr1, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
     ; PAIR-GFX12-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; PAIR-GFX12-NEXT: $vgpr12, $vgpr11 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; PAIR-GFX12-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; PAIR-GFX12-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx12 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; PAIR-GFX12-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
+    ; PAIR-GFX12-NEXT: $vgpr12 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
     ; PAIR-GFX12-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; PAIR-GFX12-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; PAIR-GFX12-NEXT: $vgpr14, $vgpr11 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; PAIR-GFX12-NEXT: $vgpr10, $vgpr19 = V_DUAL_CNDMASK_B32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr2, $vgpr0, killed $vgpr3, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; PAIR-GFX12-NEXT: $vgpr6, $vgpr15 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx12 killed $vgpr0, $vgpr0, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit killed $vcc_lo
     $vgpr0 = IMPLICIT_DEF
     $vgpr1 = IMPLICIT_DEF
     $vgpr2 = IMPLICIT_DEF
@@ -599,40 +625,40 @@
   bb.0:
     liveins: $vcc_lo
 
-    ; SCHED-LABEL: name: vopd_schedule_unconstrained_2
-    ; SCHED: liveins: $vcc_lo
-    ; SCHED-NEXT: {{  $}}
-    ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF
-    ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF
-    ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF
-    ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF
-    ; SCHED-NEXT: $vgpr20 = IMPLICIT_DEF
-    ; SCHED-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr35 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr29 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; SCHED-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; SCHED-NEXT: $vgpr20 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr20, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; SCHED-NEXT: $vgpr10 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; SCHED-NEXT: $vgpr17 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; SCHED-NEXT: $vgpr12 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr37 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; SCHED-NEXT: $vgpr14 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr21 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr24 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; SCHED-NEXT: $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
-    ; SCHED-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; SCHED-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-LABEL: name: vopd_schedule_unconstrained_2
+    ; SCHED-GFX11: liveins: $vcc_lo
+    ; SCHED-GFX11-NEXT: {{  $}}
+    ; SCHED-GFX11-NEXT: $vgpr2 = IMPLICIT_DEF
+    ; SCHED-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
+    ; SCHED-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
+    ; SCHED-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
+    ; SCHED-GFX11-NEXT: $vgpr20 = IMPLICIT_DEF
+    ; SCHED-GFX11-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr35 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr29 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX11-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX11-NEXT: $vgpr20 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr20, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX11-NEXT: $vgpr10 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX11-NEXT: $vgpr17 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX11-NEXT: $vgpr12 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr37 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX11-NEXT: $vgpr14 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr21 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr24 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX11-NEXT: $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
+    ; SCHED-GFX11-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX11-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
     ;
     ; PAIR-GFX1100-LABEL: name: vopd_schedule_unconstrained_2
     ; PAIR-GFX1100: liveins: $vcc_lo
@@ -688,30 +714,62 @@
     ; PAIR-GFX1170-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
     ; PAIR-GFX1170-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
     ;
+    ; SCHED-GFX12-LABEL: name: vopd_schedule_unconstrained_2
+    ; SCHED-GFX12: liveins: $vcc_lo
+    ; SCHED-GFX12-NEXT: {{  $}}
+    ; SCHED-GFX12-NEXT: $vgpr2 = IMPLICIT_DEF
+    ; SCHED-GFX12-NEXT: $vgpr1 = IMPLICIT_DEF
+    ; SCHED-GFX12-NEXT: $vgpr3 = IMPLICIT_DEF
+    ; SCHED-GFX12-NEXT: $vgpr0 = IMPLICIT_DEF
+    ; SCHED-GFX12-NEXT: $vgpr20 = IMPLICIT_DEF
+    ; SCHED-GFX12-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr17 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr20 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr20, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr12 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr21 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX12-NEXT: $vgpr10 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX12-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX12-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX12-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr29 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX12-NEXT: $vgpr14 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr37 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX12-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr35 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; SCHED-GFX12-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
+    ; SCHED-GFX12-NEXT: $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr24 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; SCHED-GFX12-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ;
     ; PAIR-GFX12-LABEL: name: vopd_schedule_unconstrained_2
     ; PAIR-GFX12: liveins: $vcc_lo
     ; PAIR-GFX12-NEXT: {{  $}}
     ; PAIR-GFX12-NEXT: $vgpr2 = IMPLICIT_DEF
+    ; PAIR-GFX12-NEXT: $vgpr1 = IMPLICIT_DEF
     ; PAIR-GFX12-NEXT: $vgpr3 = IMPLICIT_DEF
     ; PAIR-GFX12-NEXT: $vgpr0 = IMPLICIT_DEF
-    ; PAIR-GFX12-NEXT: $vgpr1 = IMPLICIT_DEF
     ; PAIR-GFX12-NEXT: $vgpr20 = IMPLICIT_DEF
-    ; PAIR-GFX12-NEXT: $vgpr16, $vgpr35 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
-    ; PAIR-GFX12-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
-    ; PAIR-GFX12-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
+    ; PAIR-GFX12-NEXT: $vgpr4, $vgpr17 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+    ; PAIR-GFX12-NEXT: $vgpr2, $vgpr3 = V_DUAL_FMAC_F32_e32_X_SUB_F32_e32_gfx12 10, $vgpr1, killed $vgpr2, $vgpr1, $vgpr1, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
     ; PAIR-GFX12-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; PAIR-GFX12-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; PAIR-GFX12-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx12 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
-    ; PAIR-GFX12-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; PAIR-GFX12-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
-    ; PAIR-GFX12-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx12 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
-    ; PAIR-GFX12-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx12 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
-    ; PAIR-GFX12-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; PAIR-GFX12-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
-    ; PAIR-GFX12-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
-    ; PAIR-GFX12-NEXT: $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; PAIR-GFX12-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
-    ; PAIR-GFX12-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
+    ; PAIR-GFX12-NEXT: $vgpr20, $vgpr31 = V_DUAL_FMAC_F32_e32_X_ADD_F32_e32_gfx12 10, $vgpr1, killed $vgpr20, $vgpr1, $vgpr1, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+    ; PAIR-GFX12-NEXT: $vgpr12, $vgpr21 = V_DUAL_ADD_F32_e32_X_SUB_F32_e32_gfx12 $vgpr1, $vgpr1, $vgpr1, $vgpr1, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+    ; PAIR-GFX12-NEXT: $vgpr11, $vgpr10 = V_DUAL_CNDMASK_B32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr0, $vgpr3, $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; PAIR-GFX12-NEXT: $vgpr19, $vgpr6 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx12 $vgpr0, $vgpr3, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+    ; PAIR-GFX12-NEXT: $vgpr15, $vgpr16 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx12 $vgpr1, $vgpr2, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+    ; PAIR-GFX12-NEXT: $vgpr29, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx12 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+    ; PAIR-GFX12-NEXT: $vgpr37, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx12 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+    ; PAIR-GFX12-NEXT: $vgpr35, $vgpr28 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+    ; PAIR-GFX12-NEXT: $vgpr33, $vgpr22 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx12 $vgpr1, killed $vgpr2, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit killed $vcc_lo, implicit $mode, implicit $exec
+    ; PAIR-GFX12-NEXT: $vgpr24 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
     ; PAIR-GFX12-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
     ; PAIR-GFX12-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
     $vgpr0 = IMPLICIT_DEF