| # NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s |
| |
| |
| --- | |
| |
| ; Stub for MIR function valu_dep_1. Expected: VALU_DEP_1 between the two back-to-back adds of v0. |
| define void @valu_dep_1() { |
| ; CHECK-LABEL: valu_dep_1: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function valu_dep_2. Expected: VALU_DEP_2 before the second add of v0 (one unrelated VALU in between). |
| define void @valu_dep_2() { |
| ; CHECK-LABEL: valu_dep_2: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function valu_dep_3. Expected: VALU_DEP_3 before the second add of v0 (two unrelated VALUs in between). |
| define void @valu_dep_3() { |
| ; CHECK-LABEL: valu_dep_3: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function valu_dep_4. Expected: VALU_DEP_4 before the second add of v0 (three unrelated VALUs in between). |
| define void @valu_dep_4() { |
| ; CHECK-LABEL: valu_dep_4: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function valu_dep_5. Expected: no s_delay_alu when four unrelated VALUs separate the two adds of v0. |
| define void @valu_dep_5() { |
| ; CHECK-LABEL: valu_dep_5: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v4, v4, v4 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function trans32_dep_1. Expected: TRANS32_DEP_1 between v_exp_f32 and the add that reads v0. |
| define void @trans32_dep_1() { |
| ; CHECK-LABEL: trans32_dep_1: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_exp_f32_e32 v0, v0 |
| ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function trans32_dep_2. Expected: TRANS32_DEP_2 before the add (one unrelated v_exp_f32 in between). |
| define void @trans32_dep_2() { |
| ; CHECK-LABEL: trans32_dep_2: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_exp_f32_e32 v0, v0 |
| ; CHECK-NEXT: v_exp_f32_e32 v1, v1 |
| ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function trans32_dep_3. Expected: TRANS32_DEP_3 before the add (two unrelated v_exp_f32 in between). |
| define void @trans32_dep_3() { |
| ; CHECK-LABEL: trans32_dep_3: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_exp_f32_e32 v0, v0 |
| ; CHECK-NEXT: v_exp_f32_e32 v1, v1 |
| ; CHECK-NEXT: v_exp_f32_e32 v2, v2 |
| ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_3) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function trans32_dep_4. Expected: no s_delay_alu when three unrelated v_exp_f32 separate writer and reader. |
| define void @trans32_dep_4() { |
| ; CHECK-LABEL: trans32_dep_4: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_exp_f32_e32 v0, v0 |
| ; CHECK-NEXT: v_exp_f32_e32 v1, v1 |
| ; CHECK-NEXT: v_exp_f32_e32 v2, v2 |
| ; CHECK-NEXT: v_exp_f32_e32 v3, v3 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function salu_cycle_1. Expected: SALU_CYCLE_1 between the s_mov of s0 and the add that reads s0. |
| define void @salu_cycle_1() { |
| ; CHECK-LABEL: salu_cycle_1: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: s_mov_b32 s0, 0 |
| ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function salu_cycle_2. Expected: no s_delay_alu when one VALU separates the s_mov of s0 from its reader. |
| define void @salu_cycle_2() { |
| ; CHECK-LABEL: salu_cycle_2: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: s_mov_b32 s0, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function valu_dep_1_same_trans32_dep_1. Expected: one s_delay_alu carrying TRANS32_DEP_1 (instid0) and VALU_DEP_1 (instid1). |
| define void @valu_dep_1_same_trans32_dep_1() { |
| ; CHECK-LABEL: valu_dep_1_same_trans32_dep_1: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_exp_f32_e32 v0, v0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 |
| ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1 |
| ret void |
| } |
| |
| ; Stub for MIR function trans32_dep_1_only. Expected: only TRANS32_DEP_1, although the final add also reads v0 from the earlier VALU add. |
| define void @trans32_dep_1_only() { |
| ; CHECK-LABEL: trans32_dep_1_only: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: v_exp_f32_e32 v1, v1 |
| ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1 |
| ret void |
| } |
| |
| ; Stub for MIR function valu_dep_1_same_salu_cycle_1. Expected: one s_delay_alu carrying VALU_DEP_1 (instid0) and SALU_CYCLE_1 (instid1). |
| define void @valu_dep_1_same_salu_cycle_1() { |
| ; CHECK-LABEL: valu_dep_1_same_salu_cycle_1: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: s_mov_b32 s0, 0 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function valu_dep_1_next_valu_dep_1. Expected: a single s_delay_alu whose instskip(NEXT) form covers both dependent adds. |
| define void @valu_dep_1_next_valu_dep_1() { |
| ; CHECK-LABEL: valu_dep_1_next_valu_dep_1: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function valu_dep_2_next_valu_dep_2. Expected: a single s_delay_alu with VALU_DEP_2 for each of the next two instructions. |
| define void @valu_dep_2_next_valu_dep_2() { |
| ; CHECK-LABEL: valu_dep_2_next_valu_dep_2: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 |
| ret void |
| } |
| |
| ; Stub for MIR function valu_dep_1_no_next_1. Expected: VALU_DEP_1 for the first mul only; nothing is encoded for the second mul. |
| define void @valu_dep_1_no_next_1() { |
| ; CHECK-LABEL: valu_dep_1_no_next_1: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; CHECK-NEXT: v_mul_f32_e32 v1, v0, v0 |
| ; CHECK-NEXT: v_mul_f32_e32 v2, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function valu_dep_1_no_next_2. Expected: VALU_DEP_1 for the second mul only. |
| ; No entry is encoded for the second add: waiting on the first mul, which was issued after |
| ; the first add, already implies that the first add has completed. |
| define void @valu_dep_1_no_next_2() { |
| ; CHECK-LABEL: valu_dep_1_no_next_2: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 |
| ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1 |
| ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function implicit_cmp_cndmask. Expected: no s_delay_alu between the v_cmp writing vcc and the v_cndmask reading it. |
| define void @implicit_cmp_cndmask() { |
| ; CHECK-LABEL: implicit_cmp_cndmask: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, v0, v1 |
| ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, vcc |
| ret void |
| } |
| |
| ; Stub for MIR function explicit_cmp_cndmask. Expected (current behaviour): VALU_DEP_1 between the v_cmp writing s[0:1] and the v_cndmask reading it. |
| define void @explicit_cmp_cndmask() { |
| ; CHECK-LABEL: explicit_cmp_cndmask: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_cmp_eq_i32_e64 s[0:1], v0, v1 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] |
| ret void |
| } |
| |
| ; Stub for MIR function implicit_addc_addc. Expected: no s_delay_alu between two add-with-carry instructions chained through vcc. |
| define void @implicit_addc_addc() { |
| ; CHECK-LABEL: implicit_addc_addc: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v0, vcc, v0, v0, vcc |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc |
| ret void |
| } |
| |
| ; Stub for MIR function explicit_addc_addc. Expected: no s_delay_alu between v_add_co_u32 producing vcc and the add-with-carry consuming it. |
| define void @explicit_addc_addc() { |
| ; CHECK-LABEL: explicit_addc_addc: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_add_co_u32 v0, vcc, v0, v0 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc |
| ret void |
| } |
| |
| ; Stub for MIR function valu_dep_3_bundle. Expected: VALU_DEP_3 before the second add of v0, with both bundled adds counted. |
| define void @valu_dep_3_bundle() { |
| ; CHECK-LABEL: valu_dep_3_bundle: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function if. Expected: VALU_DEP_1 at the top of the join block .LBB23_2, covering the fall-through path from %bb.1. |
| define void @if() { |
| ; CHECK-LABEL: if: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: s_cbranch_vccz .LBB23_2 |
| ; CHECK-NEXT: ; %bb.1: |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: .LBB23_2: |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function else. Expected: VALU_DEP_1 at the top of the join block .LBB24_3, covering the fall-through path from .LBB24_2. |
| define void @else() { |
| ; CHECK-LABEL: else: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: s_cbranch_vccz .LBB24_2 |
| ; CHECK-NEXT: ; %bb.1: |
| ; CHECK-NEXT: s_branch .LBB24_3 |
| ; CHECK-NEXT: .LBB24_2: |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: .LBB24_3: |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function if_else. Expected: VALU_DEP_1 at the top of the join block .LBB25_3; both predecessors write v0 in their last VALU. |
| define void @if_else() { |
| ; CHECK-LABEL: if_else: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: s_cbranch_vccz .LBB25_2 |
| ; CHECK-NEXT: ; %bb.1: |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: s_branch .LBB25_3 |
| ; CHECK-NEXT: .LBB25_2: |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v1 |
| ; CHECK-NEXT: .LBB25_3: |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function loop_1. Expected: VALU_DEP_1 at the loop header, for v0 written just before the loop is entered. |
| define void @loop_1() { |
| ; CHECK-LABEL: loop_1: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: .LBB26_1: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, v0, v0 |
| ; CHECK-NEXT: s_cbranch_vccz .LBB26_1 |
| ; CHECK-NEXT: ; %bb.2: |
| ret void |
| } |
| |
| ; Stub for MIR function loop_2. Expected: VALU_DEP_1 at the loop header, for v0 written by the same add on the previous iteration. |
| define void @loop_2() { |
| ; CHECK-LABEL: loop_2: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ; CHECK-NEXT: s_cbranch_vccz .LBB27_1 |
| ; CHECK-NEXT: ; %bb.2: |
| ret void |
| } |
| |
| ; Stub for MIR function sendmsg_rtn. Expected: SALU_CYCLE_1 before the s_add_u32 that reads s0, and no delay before the final v_add. |
| define void @sendmsg_rtn() { |
| ; CHECK-LABEL: sendmsg_rtn: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0 |
| ; CHECK-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL) |
| ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; CHECK-NEXT: s_add_u32 s0, s0, s0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function flat_load. Expected: no s_delay_alu anywhere around the flat_load_b32. |
| define void @flat_load() { |
| ; CHECK-LABEL: flat_load: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0 |
| ; CHECK-NEXT: v_mov_b32_e32 v1, 0 |
| ; CHECK-NEXT: v_mov_b32_e32 v2, 0 |
| ; CHECK-NEXT: flat_load_b32 v0, v[0:1] |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v2, v2 |
| ret void |
| } |
| |
| ; Stub for MIR function waitcnt_depctr. Expected: no s_delay_alu after s_waitcnt_depctr 0xfff. |
| define void @waitcnt_depctr() { |
| ; CHECK-LABEL: waitcnt_depctr: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0 |
| ; CHECK-NEXT: s_waitcnt_depctr 0xfff |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function writelane1. Expected: no s_delay_alu between four consecutive v_writelane_b32 to v0. |
| define void @writelane1() { |
| ; CHECK-LABEL: writelane1: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_writelane_b32 v0, s0, 0 |
| ; CHECK-NEXT: v_writelane_b32 v0, s0, 1 |
| ; CHECK-NEXT: v_writelane_b32 v0, s0, 2 |
| ; CHECK-NEXT: v_writelane_b32 v0, s0, 3 |
| ret void |
| } |
| |
| ; Stub for MIR function writelane2. Expected: VALU_DEP_1 between v_writelane_b32 and the add that reads v0. |
| define void @writelane2() { |
| ; CHECK-LABEL: writelane2: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_writelane_b32 v0, s0, 3 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function delay_alu. Expected: VALU_DEP_1 before the second v_mul, placed after the intervening s_or_b32. |
| define void @delay_alu() { |
| ; CHECK-LABEL: delay_alu: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, s1 |
| ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0 |
| ; CHECK-NEXT: s_or_b32 s0, s0, s1 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function redundant_delay_alu. Expected: no s_delay_alu before the second v_mul. |
| define void @redundant_delay_alu() { |
| ; CHECK-LABEL: redundant_delay_alu: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, s5 |
| ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0 |
| ; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], s6, s7 |
| ; CHECK-NEXT: s_or_b32 s0, s0, s1 |
| ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0 |
| ret void |
| } |
| |
| ; Stub for MIR function redundant_delay_alu_2. Expected: no s_delay_alu before the second v_mul. |
| define void @redundant_delay_alu_2() { |
| ; CHECK-LABEL: redundant_delay_alu_2: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0 |
| ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, s1 |
| ; CHECK-NEXT: s_or_b32 s0, s0, s1 |
| ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0 |
| ret void |
| } |
| ... |
| |
| # The second add reads $vgpr0, which the VALU instruction immediately before it writes. |
| --- |
| name: valu_dep_1 |
| body: | |
| bb.0: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # One independent VALU (on $vgpr1) sits between the write of $vgpr0 and its next use. |
| --- |
| name: valu_dep_2 |
| body: | |
| bb.0: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # Two independent VALUs (on $vgpr1 and $vgpr2) sit between the write of $vgpr0 and its next use. |
| --- |
| name: valu_dep_3 |
| body: | |
| bb.0: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec |
| $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # Three independent VALUs (on $vgpr1..$vgpr3) sit between the write of $vgpr0 and its next use. |
| --- |
| name: valu_dep_4 |
| body: | |
| bb.0: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec |
| $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec |
| $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # There's no encoding for VALU_DEP_5. A normal VALU instruction will have |
| # completed already. |
| # Here four independent VALUs (on $vgpr1..$vgpr4) sit between the write of |
| # $vgpr0 and its next use. |
| --- |
| name: valu_dep_5 |
| body: | |
| bb.0: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec |
| $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec |
| $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec |
| $vgpr4 = V_ADD_U32_e32 $vgpr4, $vgpr4, implicit $exec |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # The add reads $vgpr0, which the V_EXP_F32 immediately before it writes. |
| --- |
| name: trans32_dep_1 |
| body: | |
| bb.0: |
| $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # One independent V_EXP_F32 (on $vgpr1) sits between the V_EXP_F32 writing $vgpr0 and the add reading it. |
| --- |
| name: trans32_dep_2 |
| body: | |
| bb.0: |
| $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode |
| $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # Two independent V_EXP_F32 (on $vgpr1 and $vgpr2) sit between the V_EXP_F32 writing $vgpr0 and the add reading it. |
| --- |
| name: trans32_dep_3 |
| body: | |
| bb.0: |
| $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode |
| $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode |
| $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # There's no encoding for TRANS32_DEP_4. A normal TRANS instruction will have |
| # completed already. |
| # Here three independent V_EXP_F32 (on $vgpr1..$vgpr3) sit between the |
| # V_EXP_F32 writing $vgpr0 and the add reading it. |
| --- |
| name: trans32_dep_4 |
| body: | |
| bb.0: |
| $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode |
| $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode |
| $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode |
| $vgpr3 = V_EXP_F32_e32 $vgpr3, implicit $exec, implicit $mode |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # The VALU add reads $sgpr0, which the S_MOV_B32 immediately before it writes. |
| --- |
| name: salu_cycle_1 |
| body: | |
| bb.0: |
| $sgpr0 = S_MOV_B32 0 |
| $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # There's no need for SALU_CYCLE_2 here because the s_mov will have completed |
| # already. |
| # One independent VALU (on $vgpr1) sits between the S_MOV_B32 writing $sgpr0 |
| # and the add reading it. |
| --- |
| name: salu_cycle_2 |
| body: | |
| bb.0: |
| $sgpr0 = S_MOV_B32 0 |
| $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec |
| $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # The final add reads $vgpr0 from the V_EXP_F32 two instructions back and |
| # $vgpr1 from the VALU add immediately before it. |
| --- |
| name: valu_dep_1_same_trans32_dep_1 |
| body: | |
| bb.0: |
| $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode |
| $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec |
| ... |
| |
| # There's no need to encode the VALU dependency because it will complete before |
| # the TRANS. |
| # The final add reads $vgpr0 from the VALU add two instructions back and |
| # $vgpr1 from the V_EXP_F32 immediately before it. |
| --- |
| name: trans32_dep_1_only |
| body: | |
| bb.0: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec |
| ... |
| |
| # The final add reads $vgpr0 from the VALU add two instructions back and |
| # $sgpr0 from the S_MOV_B32 immediately before it. |
| --- |
| name: valu_dep_1_same_salu_cycle_1 |
| body: | |
| bb.0: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| $sgpr0 = S_MOV_B32 0 |
| $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # Three adds chained through $vgpr0: each one reads the result of the add before it. |
| --- |
| name: valu_dep_1_next_valu_dep_1 |
| body: | |
| bb.0: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # Two interleaved chains: $vgpr0 and $vgpr1 are each read again two VALU |
| # instructions after they were written. |
| --- |
| name: valu_dep_2_next_valu_dep_2 |
| body: | |
| bb.0: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec |
| ... |
| |
| # There's no need to encode a dependency for the second mul, because the |
| # dependency for the first mul has already guaranteed that the add has |
| # completed. |
| # Both muls read $vgpr0, which the add at the top of the block writes. |
| --- |
| name: valu_dep_1_no_next_1 |
| body: | |
| bb.0: |
| $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode |
| $vgpr1 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode |
| $vgpr2 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode |
| ... |
| |
| # There's no need to encode a dependency for the second add, because the |
| # dependency for the second mul has already guaranteed that a later VALU has |
| # completed. |
| # The second mul reads $vgpr1 from the first mul (one VALU back); the second |
| # add reads $vgpr0 from the first add (three VALUs back). |
| --- |
| name: valu_dep_1_no_next_2 |
| body: | |
| bb.0: |
| $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode |
| $vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode |
| $vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode |
| $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode |
| ... |
| |
| # There are no wait states between an add/sub/cmp generating carry and an |
| # add/sub/cndmask that consumes it, so no need to encode a dependency. |
| |
| # V_CMP_EQ_I32_e32 produces $vcc through an implicit operand; V_CNDMASK_B32_e64 then reads $vcc. |
| --- |
| name: implicit_cmp_cndmask |
| body: | |
| bb.0: |
| implicit $vcc = V_CMP_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec |
| $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $vcc, implicit $exec |
| ... |
| |
| # TODO: There should be no s_delay_alu here. |
| # V_CMP_EQ_I32_e64 writes $sgpr0_sgpr1 as an explicit def, and |
| # V_CNDMASK_B32_e64 reads it as an explicit operand. |
| --- |
| name: explicit_cmp_cndmask |
| body: | |
| bb.0: |
| $sgpr0_sgpr1 = V_CMP_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec |
| $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $sgpr0_sgpr1, implicit $exec |
| ... |
| |
| # Both V_ADDC_U32_e32 read and write $vcc through implicit operands only. |
| --- |
| name: implicit_addc_addc |
| body: | |
| bb.0: |
| $vgpr0 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec |
| $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec |
| ... |
| |
| # V_ADD_CO_U32_e64 writes $vcc as an explicit def; V_ADDC_U32_e32 then reads $vcc implicitly. |
| --- |
| name: explicit_addc_addc |
| body: | |
| bb.0: |
| $vgpr0,$vcc = V_ADD_CO_U32_e64 $vgpr0, $vgpr0, 0, implicit $exec |
| $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec |
| ... |
| |
| # Same instruction sequence as valu_dep_3, but the two independent adds are |
| # wrapped in a BUNDLE. |
| --- |
| name: valu_dep_3_bundle |
| body: | |
| bb.0: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| BUNDLE { |
| $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec |
| $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec |
| } |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # bb.0 conditionally branches to bb.2. bb.1 writes $vgpr0 and is laid out |
| # directly before bb.2, which reads $vgpr0 in its first instruction. |
| --- |
| name: if |
| body: | |
| bb.0: |
| S_CBRANCH_VCCZ %bb.2, implicit $vcc |
| bb.1: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| bb.2: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # bb.1 contains no VALU and branches to bb.3. bb.2 writes $vgpr0 and is laid |
| # out directly before bb.3, which reads $vgpr0 in its first instruction. |
| --- |
| name: else |
| body: | |
| bb.0: |
| S_CBRANCH_VCCZ %bb.2, implicit $vcc |
| bb.1: |
| S_BRANCH %bb.3 |
| bb.2: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| bb.3: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # Both bb.1 and bb.2 end with a VALU write of $vgpr0 before control reaches |
| # bb.3, which reads $vgpr0 in its first instruction. |
| --- |
| name: if_else |
| body: | |
| bb.0: |
| S_CBRANCH_VCCZ %bb.2, implicit $vcc |
| bb.1: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| S_BRANCH %bb.3 |
| bb.2: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| $vgpr0 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec |
| bb.3: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # Dependency from outside the loop. |
| # bb.0 writes $vgpr0; the loop body bb.1 only reads it (its result goes to $vgpr1). |
| --- |
| name: loop_1 |
| body: | |
| bb.0: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| bb.1: |
| $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| S_CBRANCH_VCCZ %bb.1, implicit $vcc |
| bb.2: |
| ... |
| |
| # Dependency from inside the loop. |
| # bb.0 is empty; the add in bb.1 reads the $vgpr0 it wrote on the previous |
| # trip around the back edge. |
| --- |
| name: loop_2 |
| body: | |
| bb.0: |
| bb.1: |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| S_CBRANCH_VCCZ %bb.1, implicit $vcc |
| bb.2: |
| ... |
| |
| # No VALU delay across s_sendmsg_rtn because it waits for all outstanding VALU |
| # to complete. |
| # The S_ADD_U32 reads the $sgpr0 returned by S_SENDMSG_RTN_B32; the final add |
| # reads $vgpr0, which was written before the sendmsg. |
| --- |
| name: sendmsg_rtn |
| body: | |
| bb.0: |
| $vgpr0 = V_MOV_B32_e32 0, implicit $exec |
| $sgpr0 = S_SENDMSG_RTN_B32 128 |
| $sgpr0 = S_ADD_U32 $sgpr0, $sgpr0, implicit-def $scc |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # No VALU delay before or across FLAT because it waits for all outstanding VALU |
| # to complete. |
| # The load's address pair $vgpr0_vgpr1 and the final add's source $vgpr2 are |
| # all written by the V_MOVs directly before the load. |
| --- |
| name: flat_load |
| body: | |
| bb.0: |
| $vgpr0 = V_MOV_B32_e32 0, implicit $exec |
| $vgpr1 = V_MOV_B32_e32 0, implicit $exec |
| $vgpr2 = V_MOV_B32_e32 0, implicit $exec |
| $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr |
| $vgpr0 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec |
| ... |
| |
| # No VALU delay across an s_waitcnt_depctr that waits for all outstanding VALU |
| # to complete. |
| # The add reads $vgpr0, which the V_MOV ahead of the S_WAITCNT_DEPCTR writes. |
| --- |
| name: waitcnt_depctr |
| body: | |
| bb.0: |
| $vgpr0 = V_MOV_B32_e32 0, implicit $exec |
| S_WAITCNT_DEPCTR 4095 |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| ... |
| |
| # Check that no delays are emitted for writelane instructions. |
| # Four V_WRITELANE_B32 in a row, each taking $vgpr0 as both an input operand |
| # and the destination, with lane indices 0 to 3. |
| --- |
| name: writelane1 |
| body: | |
| bb.0: |
| $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0 |
| $vgpr0 = V_WRITELANE_B32 $sgpr0, 1, $vgpr0 |
| $vgpr0 = V_WRITELANE_B32 $sgpr0, 2, $vgpr0 |
| $vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0 |
| ... |
| |
| # Check that a VALU delay is added after writelane. |
| # The add reads $vgpr0, which the V_WRITELANE_B32 immediately before it writes. |
| --- |
| name: writelane2 |
| body: | |
| bb.0: |
| $vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0 |
| $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec |
| ... |
| # Check that s_delay_alu is added before the second V_MUL_F32. |
| # The S_OR_B32 between the two dependent V_MUL_F32 reads SGPRs written by a |
| # V_CMP that comes before the first V_MUL_F32. |
| --- |
| name: delay_alu |
| body: | |
| bb.0: |
| $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr0, $sgpr1, implicit $exec |
| $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec |
| $sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc |
| $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec |
| ... |
| # Check that a redundant s_delay_alu is not emitted before the second V_MUL_F32. |
| # Unlike delay_alu, a second V_CMP (writing $sgpr6_sgpr7) sits between the |
| # first V_MUL_F32 and the S_OR_B32. |
| --- |
| name: redundant_delay_alu |
| body: | |
| bb.0: |
| $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr4, $sgpr5, implicit $exec |
| $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec |
| $sgpr6_sgpr7 = V_CMP_EQ_U32_e64 $sgpr6, $sgpr7, implicit $exec |
| $sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc |
| $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec |
| ... |
| # Check that a redundant s_delay_alu is not emitted before the second V_MUL_F32. |
| # Here the V_CMP whose result the S_OR_B32 reads comes after the first |
| # V_MUL_F32. |
| --- |
| name: redundant_delay_alu_2 |
| body: | |
| bb.0: |
| $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec |
| $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr0, $sgpr1, implicit $exec |
| $sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc |
| $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec |
| ... |
| |