# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s


--- |

  ; A VALU add feeding the very next VALU add: the consumer is expected to be
  ; preceded by s_delay_alu instid0(VALU_DEP_1).
  define void @valu_dep_1() {
  ; CHECK-LABEL: valu_dep_1:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
    ret void
  }

  ; One unrelated VALU sits between the producer of v0 and its consumer, so
  ; the expected delay is VALU_DEP_2.
  define void @valu_dep_2() {
  ; CHECK-LABEL: valu_dep_2:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:    v_add_nc_u32_e32 v1, v1, v1
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
    ret void
  }

  ; Two unrelated VALUs sit between the producer of v0 and its consumer, so
  ; the expected delay is VALU_DEP_3.
  define void @valu_dep_3() {
  ; CHECK-LABEL: valu_dep_3:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:    v_add_nc_u32_e32 v1, v1, v1
  ; CHECK-NEXT:    v_add_nc_u32_e32 v2, v2, v2
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_3)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
    ret void
  }

  ; Three unrelated VALUs sit between the producer of v0 and its consumer, so
  ; the expected delay is VALU_DEP_4.
  define void @valu_dep_4() {
  ; CHECK-LABEL: valu_dep_4:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:    v_add_nc_u32_e32 v1, v1, v1
  ; CHECK-NEXT:    v_add_nc_u32_e32 v2, v2, v2
  ; CHECK-NEXT:    v_add_nc_u32_e32 v3, v3, v3
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_4)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
    ret void
  }

  ; Four unrelated VALUs sit between the producer of v0 and its consumer; no
  ; s_delay_alu is expected in the output.
  define void @valu_dep_5() {
  ; CHECK-LABEL: valu_dep_5:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:    v_add_nc_u32_e32 v1, v1, v1
  ; CHECK-NEXT:    v_add_nc_u32_e32 v2, v2, v2
  ; CHECK-NEXT:    v_add_nc_u32_e32 v3, v3, v3
  ; CHECK-NEXT:    v_add_nc_u32_e32 v4, v4, v4
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
    ret void
  }

  ; v_exp_f32 result consumed by the very next instruction: the expected
  ; delay is TRANS32_DEP_1.
  define void @trans32_dep_1() {
  ; CHECK-LABEL: trans32_dep_1:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_exp_f32_e32 v0, v0
  ; CHECK-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
    ret void
  }

  ; One unrelated v_exp_f32 sits between the v_exp_f32 producer of v0 and its
  ; consumer, so the expected delay is TRANS32_DEP_2.
  define void @trans32_dep_2() {
  ; CHECK-LABEL: trans32_dep_2:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_exp_f32_e32 v0, v0
  ; CHECK-NEXT:    v_exp_f32_e32 v1, v1
  ; CHECK-NEXT:    s_delay_alu instid0(TRANS32_DEP_2)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
    ret void
  }

  ; Two unrelated v_exp_f32 sit between the v_exp_f32 producer of v0 and its
  ; consumer, so the expected delay is TRANS32_DEP_3.
  define void @trans32_dep_3() {
  ; CHECK-LABEL: trans32_dep_3:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_exp_f32_e32 v0, v0
  ; CHECK-NEXT:    v_exp_f32_e32 v1, v1
  ; CHECK-NEXT:    v_exp_f32_e32 v2, v2
  ; CHECK-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
    ret void
  }

  ; Three unrelated v_exp_f32 sit between the v_exp_f32 producer of v0 and its
  ; consumer; no s_delay_alu is expected in the output.
  define void @trans32_dep_4() {
  ; CHECK-LABEL: trans32_dep_4:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_exp_f32_e32 v0, v0
  ; CHECK-NEXT:    v_exp_f32_e32 v1, v1
  ; CHECK-NEXT:    v_exp_f32_e32 v2, v2
  ; CHECK-NEXT:    v_exp_f32_e32 v3, v3
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
    ret void
  }

  ; s_mov_b32 result read by the very next VALU: the expected delay is
  ; SALU_CYCLE_1.
  define void @salu_cycle_1() {
  ; CHECK-LABEL: salu_cycle_1:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    s_mov_b32 s0, 0
  ; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, s0, v0
    ret void
  }

  ; One unrelated VALU sits between the s_mov_b32 and the VALU that reads s0;
  ; no s_delay_alu is expected in the output.
  define void @salu_cycle_2() {
  ; CHECK-LABEL: salu_cycle_2:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    s_mov_b32 s0, 0
  ; CHECK-NEXT:    v_add_nc_u32_e32 v1, v1, v1
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, s0, v0
    ret void
  }

  ; The final add reads both the v_exp_f32 result (v0) and the preceding add
  ; result (v1); a single s_delay_alu is expected to carry both dependencies.
  define void @valu_dep_1_same_trans32_dep_1() {
  ; CHECK-LABEL: valu_dep_1_same_trans32_dep_1:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_exp_f32_e32 v0, v0
  ; CHECK-NEXT:    v_add_nc_u32_e32 v1, v1, v1
  ; CHECK-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v1
    ret void
  }

  ; The final add reads an earlier VALU result (v0) and the directly preceding
  ; v_exp_f32 result (v1); only TRANS32_DEP_1 is expected.
  define void @trans32_dep_1_only() {
  ; CHECK-LABEL: trans32_dep_1_only:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:    v_exp_f32_e32 v1, v1
  ; CHECK-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v1
    ret void
  }

  ; The final add reads both the earlier VALU result (v0) and the s_mov_b32
  ; result (s0); a single s_delay_alu is expected to carry both dependencies.
  define void @valu_dep_1_same_salu_cycle_1() {
  ; CHECK-LABEL: valu_dep_1_same_salu_cycle_1:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:    s_mov_b32 s0, 0
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, s0, v0
    ret void
  }

  ; Three adds chained through v0: one s_delay_alu is expected to describe the
  ; delay for both consumers, using instskip(NEXT) for the second.
  define void @valu_dep_1_next_valu_dep_1() {
  ; CHECK-LABEL: valu_dep_1_next_valu_dep_1:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
    ret void
  }

  ; Two interleaved chains (v0 and v1): one s_delay_alu is expected to carry
  ; VALU_DEP_2 for each of the two consumers, joined by instskip(NEXT).
  define void @valu_dep_2_next_valu_dep_2() {
  ; CHECK-LABEL: valu_dep_2_next_valu_dep_2:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:    v_add_nc_u32_e32 v1, v1, v1
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:    v_add_nc_u32_e32 v1, v1, v1
    ret void
  }

  ; Both muls read the add result, but only the first mul is expected to get
  ; a delay; no instskip(NEXT) entry is expected for the second mul.
  define void @valu_dep_1_no_next_1() {
  ; CHECK-LABEL: valu_dep_1_no_next_1:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_add_f32_e32 v0, v0, v0
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
  ; CHECK-NEXT:    v_mul_f32_e32 v1, v0, v0
  ; CHECK-NEXT:    v_mul_f32_e32 v2, v0, v0
    ret void
  }

  ; Two interleaved chains (v0 and v1). The VALU_DEP_1 delay in front of the
  ; second mul already guarantees that the older first add has completed, so
  ; the final add must not get its own instskip(NEXT)/VALU_DEP_3 entry.
  define void @valu_dep_1_no_next_2() {
  ; CHECK-LABEL: valu_dep_1_no_next_2:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_add_f32_e32 v0, v0, v0
  ; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v1
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
  ; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v1
  ; CHECK-NEXT:    v_add_f32_e32 v0, v0, v0
    ret void
  }

  ; Compare writes vcc and the cndmask right after it reads vcc; no
  ; s_delay_alu is expected between them.
  define void @implicit_cmp_cndmask() {
  ; CHECK-LABEL: implicit_cmp_cndmask:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_cmp_eq_i32_e32 vcc, v0, v1
  ; CHECK-NEXT:    v_cndmask_b32_e64 v2, v3, v4, vcc
    ret void
  }

  ; Compare writes s[0:1] and the cndmask right after it reads s[0:1]. The
  ; output currently contains a VALU_DEP_1 delay; the MIR input for this test
  ; carries a TODO saying it should not.
  define void @explicit_cmp_cndmask() {
  ; CHECK-LABEL: explicit_cmp_cndmask:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_cmp_eq_i32_e64 s[0:1], v0, v1
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
  ; CHECK-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
    ret void
  }

  ; Two add-with-carry instructions chained through vcc; no s_delay_alu is
  ; expected between them.
  define void @implicit_addc_addc() {
  ; CHECK-LABEL: implicit_addc_addc:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_add_co_ci_u32_e32 v0, vcc, v0, v0, vcc
  ; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
    ret void
  }

  ; v_add_co_u32 writes the carry to vcc and v_add_co_ci_u32 consumes it; no
  ; s_delay_alu is expected between them.
  define void @explicit_addc_addc() {
  ; CHECK-LABEL: explicit_addc_addc:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_add_co_u32 v0, vcc, v0, v0
  ; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
    ret void
  }

  ; Same expected output as valu_dep_3; the MIR input for this test wraps the
  ; two unrelated adds in a BUNDLE.
  define void @valu_dep_3_bundle() {
  ; CHECK-LABEL: valu_dep_3_bundle:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:    v_add_nc_u32_e32 v1, v1, v1
  ; CHECK-NEXT:    v_add_nc_u32_e32 v2, v2, v2
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_3)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
    ret void
  }

  ; The producer of v0 sits in a conditionally skipped block; the consumer in
  ; the join block is still expected to get VALU_DEP_1.
  define void @if() {
  ; CHECK-LABEL: if:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    s_cbranch_vccz .LBB23_2
  ; CHECK-NEXT:  ; %bb.1:
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:  .LBB23_2:
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
    ret void
  }

  ; Only one of the two paths into the join block writes v0; the consumer in
  ; the join block is expected to get VALU_DEP_1.
  define void @else() {
  ; CHECK-LABEL: else:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    s_cbranch_vccz .LBB24_2
  ; CHECK-NEXT:  ; %bb.1:
  ; CHECK-NEXT:    s_branch .LBB24_3
  ; CHECK-NEXT:  .LBB24_2:
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:  .LBB24_3:
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
    ret void
  }

  ; Both paths into the join block write v0; the consumer in the join block is
  ; expected to get VALU_DEP_1. No delay is expected inside the second path,
  ; where the second add does not read v0.
  define void @if_else() {
  ; CHECK-LABEL: if_else:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    s_cbranch_vccz .LBB25_2
  ; CHECK-NEXT:  ; %bb.1:
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:    s_branch .LBB25_3
  ; CHECK-NEXT:  .LBB25_2:
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v1, v1
  ; CHECK-NEXT:  .LBB25_3:
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
    ret void
  }

  ; v0 is written before the loop and read in the loop header; VALU_DEP_1 is
  ; expected at the top of the loop body.
  define void @loop_1() {
  ; CHECK-LABEL: loop_1:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:  .LBB26_1: ; =>This Inner Loop Header: Depth=1
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v1, v0, v0
  ; CHECK-NEXT:    s_cbranch_vccz .LBB26_1
  ; CHECK-NEXT:  ; %bb.2:
    ret void
  }

  ; The add in the loop reads the v0 it wrote on the previous iteration;
  ; VALU_DEP_1 is expected at the top of the loop body.
  define void @loop_2() {
  ; CHECK-LABEL: loop_2:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:  .LBB27_1: ; =>This Inner Loop Header: Depth=1
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
  ; CHECK-NEXT:    s_cbranch_vccz .LBB27_1
  ; CHECK-NEXT:  ; %bb.2:
    ret void
  }

  ; No VALU delay is expected for the final v_add even though it reads the v0
  ; written by the v_mov; the only delay expected is SALU_CYCLE_1 before the
  ; s_add_u32 that reads s0.
  define void @sendmsg_rtn() {
  ; CHECK-LABEL: sendmsg_rtn:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
  ; CHECK-NEXT:    s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
  ; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
  ; CHECK-NEXT:    s_add_u32 s0, s0, s0
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
    ret void
  }

  ; No s_delay_alu is expected anywhere: neither before the flat_load that
  ; reads v[0:1] nor before the v_add that reads v2 after it.
  define void @flat_load() {
  ; CHECK-LABEL: flat_load:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
  ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
  ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
  ; CHECK-NEXT:    flat_load_b32 v0, v[0:1]
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v2, v2
    ret void
  }

  ; No s_delay_alu is expected for the v_add that reads v0 after the
  ; s_waitcnt_depctr 0xfff.
  define void @waitcnt_depctr() {
  ; CHECK-LABEL: waitcnt_depctr:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
  ; CHECK-NEXT:    s_waitcnt_depctr 0xfff
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
    ret void
  }

  ; Four consecutive v_writelane_b32 into v0; no s_delay_alu is expected
  ; between them.
  define void @writelane1() {
  ; CHECK-LABEL: writelane1:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_writelane_b32 v0, s0, 0
  ; CHECK-NEXT:    v_writelane_b32 v0, s0, 1
  ; CHECK-NEXT:    v_writelane_b32 v0, s0, 2
  ; CHECK-NEXT:    v_writelane_b32 v0, s0, 3
    ret void
  }

  ; A VALU add reading v0 right after a v_writelane_b32 into v0 is expected to
  ; get VALU_DEP_1.
  define void @writelane2() {
  ; CHECK-LABEL: writelane2:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_writelane_b32 v0, s0, 3
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
  ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v0
    ret void
  }

  ; The second v_mul reads the v0 written by the first one; VALU_DEP_1 is
  ; still expected even though an s_or_b32 reading the v_cmp result sits in
  ; between.
  define void @delay_alu() {
  ; CHECK-LABEL: delay_alu:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, s1
  ; CHECK-NEXT:    v_mul_f32_e64 v0, v0, v0
  ; CHECK-NEXT:    s_or_b32 s0, s0, s1
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
  ; CHECK-NEXT:    v_mul_f32_e64 v0, v0, v0
    ret void
  }

  ; Like delay_alu, but a second v_cmp follows the first v_mul before the
  ; s_or_b32; no s_delay_alu is expected before the final v_mul.
  define void @redundant_delay_alu() {
  ; CHECK-LABEL: redundant_delay_alu:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, s5
  ; CHECK-NEXT:    v_mul_f32_e64 v0, v0, v0
  ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[6:7], s6, s7
  ; CHECK-NEXT:    s_or_b32 s0, s0, s1
  ; CHECK-NEXT:    v_mul_f32_e64 v0, v0, v0
    ret void
  }

  ; The first v_mul comes before the v_cmp whose result the s_or_b32 reads; no
  ; s_delay_alu is expected before the final v_mul.
  define void @redundant_delay_alu_2() {
  ; CHECK-LABEL: redundant_delay_alu_2:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_mul_f32_e64 v0, v0, v0
  ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, s1
  ; CHECK-NEXT:    s_or_b32 s0, s0, s1
  ; CHECK-NEXT:    v_mul_f32_e64 v0, v0, v0
    ret void
  }

  ; v_rcp_f64 result consumed by the very next instruction: the expected
  ; delay is VALU_DEP_1 rather than a TRANS32 dependency.
  define void @trans64() {
  ; CHECK-LABEL: trans64:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    v_rcp_f64_e32 v[0:1], v[0:1]
  ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
  ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
    ret void
  }
...

# The second add reads the $vgpr0 written by the add directly before it.
---
name: valu_dep_1
body: |
  bb.0:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

# One unrelated add ($vgpr1) separates the producer of $vgpr0 from its
# consumer.
---
name: valu_dep_2
body: |
  bb.0:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

# Two unrelated adds ($vgpr1, $vgpr2) separate the producer of $vgpr0 from its
# consumer.
---
name: valu_dep_3
body: |
  bb.0:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
    $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

# Three unrelated adds ($vgpr1, $vgpr2, $vgpr3) separate the producer of
# $vgpr0 from its consumer.
---
name: valu_dep_4
body: |
  bb.0:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
    $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
    $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

# Four unrelated adds ($vgpr1..$vgpr4) separate the producer of $vgpr0 from
# its consumer.
# There's no encoding for VALU_DEP_5. A normal VALU instruction will have
# completed already.
---
name: valu_dep_5
body: |
  bb.0:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
    $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
    $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
    $vgpr4 = V_ADD_U32_e32 $vgpr4, $vgpr4, implicit $exec
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

# V_EXP_F32 writes $vgpr0 and the add directly after it reads $vgpr0.
---
name: trans32_dep_1
body: |
  bb.0:
    $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

# One unrelated V_EXP_F32 ($vgpr1) separates the V_EXP_F32 that writes $vgpr0
# from the add that reads it.
---
name: trans32_dep_2
body: |
  bb.0:
    $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
    $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

# Two unrelated V_EXP_F32 ($vgpr1, $vgpr2) separate the V_EXP_F32 that writes
# $vgpr0 from the add that reads it.
---
name: trans32_dep_3
body: |
  bb.0:
    $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
    $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
    $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

# Three unrelated V_EXP_F32 ($vgpr1..$vgpr3) separate the V_EXP_F32 that
# writes $vgpr0 from the add that reads it.
# There's no encoding for TRANS32_DEP_4. A normal TRANS instruction will have
# completed already.
---
name: trans32_dep_4
body: |
  bb.0:
    $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
    $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
    $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
    $vgpr3 = V_EXP_F32_e32 $vgpr3, implicit $exec, implicit $mode
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

# S_MOV_B32 writes $sgpr0 and the add directly after it reads $sgpr0.
---
name: salu_cycle_1
body: |
  bb.0:
    $sgpr0 = S_MOV_B32 0
    $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
...

# One unrelated add ($vgpr1) separates the S_MOV_B32 that writes $sgpr0 from
# the add that reads it.
# There's no need for SALU_CYCLE_2 here because the s_mov will have completed
# already.
---
name: salu_cycle_2
body: |
  bb.0:
    $sgpr0 = S_MOV_B32 0
    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
    $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
...

# The final add reads $vgpr0 from the V_EXP_F32 and $vgpr1 from the add in
# between, so it depends on a TRANS and a VALU result at the same time.
---
name: valu_dep_1_same_trans32_dep_1
body: |
  bb.0:
    $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
...

# The final add reads $vgpr0 from the first add and $vgpr1 from the V_EXP_F32
# directly before it.
# There's no need to encode the VALU dependency because it will complete before
# the TRANS.
---
name: trans32_dep_1_only
body: |
  bb.0:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
...

# The final add reads $vgpr0 from the first add and $sgpr0 from the S_MOV_B32
# in between, so it depends on a VALU and an SALU result at the same time.
---
name: valu_dep_1_same_salu_cycle_1
body: |
  bb.0:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    $sgpr0 = S_MOV_B32 0
    $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
...

# Three adds chained through $vgpr0: each one reads the result of the one
# directly before it.
---
name: valu_dep_1_next_valu_dep_1
body: |
  bb.0:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

# Two chains ($vgpr0 and $vgpr1) are interleaved, so each consumer comes two
# VALU instructions after its producer.
---
name: valu_dep_2_next_valu_dep_2
body: |
  bb.0:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
...

# Both muls read the $vgpr0 written by the add.
# There's no need to encode a dependency for the second mul, because the
# dependency for the first mul has already guaranteed that the add has
# completed.
---
name: valu_dep_1_no_next_1
body: |
  bb.0:
    $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
    $vgpr1 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
    $vgpr2 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
...

# Input order: add $vgpr0, mul $vgpr1, mul $vgpr1, add $vgpr0 -- the $vgpr1
# chain is nested inside the $vgpr0 chain.
# There's no need to encode a dependency for the second add, because the
# dependency for the second mul has already guaranteed that a later VALU has
# completed.
---
name: valu_dep_1_no_next_2
body: |
  bb.0:
    $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
    $vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode
    $vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode
    $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
...

# There are no wait states between an add/sub/cmp generating carry and an
# add/sub/cndmask that consumes it, so no need to encode a dependency.

# The e32 compare defines $vcc as an implicit operand; the cndmask directly
# after it reads $vcc.
---
name: implicit_cmp_cndmask
body: |
  bb.0:
    implicit $vcc = V_CMP_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
    $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $vcc, implicit $exec
...

# The e64 compare defines $sgpr0_sgpr1 as an explicit operand; the cndmask
# directly after it reads $sgpr0_sgpr1.
# TODO: There should be no s_delay_alu here.
---
name: explicit_cmp_cndmask
body: |
  bb.0:
    $sgpr0_sgpr1 = V_CMP_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec
    $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $sgpr0_sgpr1, implicit $exec
...

# Both V_ADDC_U32_e32 read and write the carry through implicit $vcc operands.
---
name: implicit_addc_addc
body: |
  bb.0:
    $vgpr0 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
    $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
...

# V_ADD_CO_U32_e64 defines $vcc as an explicit operand; the V_ADDC_U32_e32
# directly after it reads $vcc as an implicit operand.
---
name: explicit_addc_addc
body: |
  bb.0:
    $vgpr0,$vcc = V_ADD_CO_U32_e64 $vgpr0, $vgpr0, 0, implicit $exec
    $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
...

# Same instruction sequence as valu_dep_3, but the two unrelated adds
# ($vgpr1, $vgpr2) are wrapped in a BUNDLE.
---
name: valu_dep_3_bundle
body: |
  bb.0:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    BUNDLE {
      $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
      $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
    }
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

# bb.0 may branch straight to bb.2, skipping bb.1. bb.1 writes $vgpr0 and
# bb.2 reads it.
---
name: if
body: |
  bb.0:
    S_CBRANCH_VCCZ %bb.2, implicit $vcc
  bb.1:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
  bb.2:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

# bb.1 only branches to bb.3. bb.2 writes $vgpr0 and is followed by bb.3,
# which reads $vgpr0.
---
name: else
body: |
  bb.0:
    S_CBRANCH_VCCZ %bb.2, implicit $vcc
  bb.1:
    S_BRANCH %bb.3
  bb.2:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
  bb.3:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

# Both bb.1 and bb.2 write $vgpr0 before control reaches bb.3, which reads
# $vgpr0. In bb.2 the last write to $vgpr0 is the second add, which itself
# reads only $vgpr1.
---
name: if_else
body: |
  bb.0:
    S_CBRANCH_VCCZ %bb.2, implicit $vcc
  bb.1:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    S_BRANCH %bb.3
  bb.2:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    $vgpr0 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
  bb.3:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

# Dependency from outside the loop: bb.0 writes $vgpr0, and the loop body
# (bb.1, which branches back to itself) reads it without rewriting it.
---
name: loop_1
body: |
  bb.0:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
  bb.1:
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    S_CBRANCH_VCCZ %bb.1, implicit $vcc
  bb.2:
...

# Dependency from inside the loop: the add in bb.1 both reads and writes
# $vgpr0, and bb.1 branches back to itself. bb.0 is empty.
---
name: loop_2
body: |
  bb.0:
  bb.1:
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    S_CBRANCH_VCCZ %bb.1, implicit $vcc
  bb.2:
...

# No VALU delay across s_sendmsg_rtn because it waits for all outstanding VALU
# to complete.
# The V_MOV_B32 writes $vgpr0 before the S_SENDMSG_RTN_B32 and the final add
# reads it afterwards; the S_ADD_U32 reads the $sgpr0 returned by the sendmsg.
---
name: sendmsg_rtn
body: |
  bb.0:
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
    $sgpr0 = S_SENDMSG_RTN_B32 128
    $sgpr0 = S_ADD_U32 $sgpr0, $sgpr0, implicit-def $scc
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

# No VALU delay before or across FLAT because it waits for all outstanding VALU
# to complete.
# The FLAT_LOAD_DWORD reads $vgpr0_vgpr1 written by the first two moves; the
# final add reads $vgpr2 written by the third move.
---
name: flat_load
body: |
  bb.0:
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
    $vgpr2 = V_MOV_B32_e32 0, implicit $exec
    $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
    $vgpr0 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
...

# No VALU delay across an s_waitcnt_depctr that waits for all outstanding VALU
# to complete.
# The immediate 4095 is printed as 0xfff in the expected output.
---
name: waitcnt_depctr
body: |
  bb.0:
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
    S_WAITCNT_DEPCTR 4095
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

# Check that no delays are emitted for writelane instructions.
# Each V_WRITELANE_B32 writes $vgpr0 and also takes $vgpr0 as its last
# operand, so consecutive writelanes form a chain through $vgpr0.
---
name: writelane1
body: |
  bb.0:
    $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0
    $vgpr0 = V_WRITELANE_B32 $sgpr0, 1, $vgpr0
    $vgpr0 = V_WRITELANE_B32 $sgpr0, 2, $vgpr0
    $vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
...

# Check that a VALU delay is added after writelane: the add reads the $vgpr0
# written by the V_WRITELANE_B32 directly before it.
---
name: writelane2
body: |
  bb.0:
    $vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
# Check that s_delay_alu is still added before the second V_MUL_F32, which
# reads the $vgpr0 written by the first one, even though an S_OR_B32 reading
# the V_CMP result ($sgpr0, $sgpr1) sits in between.
---
name: delay_alu
body: |
  bb.0:
    $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr0, $sgpr1, implicit $exec
    $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
    $sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
    $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
...
# Check that the redundant s_delay_alu is removed. Unlike delay_alu, a second
# V_CMP (writing $sgpr6_sgpr7) follows the first V_MUL_F32 before the S_OR_B32
# reads $sgpr0 and $sgpr1.
---
name: redundant_delay_alu
body: |
  bb.0:
    $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr4, $sgpr5, implicit $exec
    $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
    $sgpr6_sgpr7 = V_CMP_EQ_U32_e64 $sgpr6, $sgpr7, implicit $exec
    $sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
    $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
...
# Check that the redundant s_delay_alu is removed. Here the first V_MUL_F32
# comes before the V_CMP whose result ($sgpr0, $sgpr1) the S_OR_B32 reads.
---
name: redundant_delay_alu_2
body: |
  bb.0:
    $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
    $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr0, $sgpr1, implicit $exec
    $sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
    $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
...

# Check that F64 TRANS instructions are treated as VALU.
# V_RCP_F64 writes $vgpr0_vgpr1 and the V_ADD_F64 directly after it reads
# $vgpr0_vgpr1.
---
name: trans64
body: |
  bb.0:
    $vgpr0_vgpr1 = V_RCP_F64_e32 $vgpr0_vgpr1, implicit $exec, implicit $mode
    $vgpr0_vgpr1 = V_ADD_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode
...
