| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s |
| |
| ; Test that tree-structured min/max reductions form min3/max3 efficiently. |
| ; The key pattern is op(op(a,b), op(c,d)) which should become |
| ; op(op3(a,b,c), d) to enable further combining at higher tree levels. |
| |
| ; Basic 4-value tree: maxnum f32 |
| ; Input is maxnum(maxnum(a, b), maxnum(c, d)). The checks below expect one |
| ; three-operand max (v_max3_f32 on gfx900, v_max3_num_f32 on gfx1250), a |
| ; self-max of v3 (presumably canonicalizing %d -- confirm), and one final |
| ; two-operand max. |
| define float @v_max3_maxnum_tree4_f32(float %a, float %b, float %c, float %d) { |
| ; GFX9-LABEL: v_max3_maxnum_tree4_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2 |
| ; GFX9-NEXT: v_max_f32_e32 v1, v3, v3 |
| ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_max3_maxnum_tree4_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: v_max_num_f32_e32 v1, v3, v3 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v1 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %max.ab = call float @llvm.maxnum.f32(float %a, float %b) |
| %max.cd = call float @llvm.maxnum.f32(float %c, float %d) |
| %result = call float @llvm.maxnum.f32(float %max.ab, float %max.cd) |
| ret float %result |
| } |
| |
| ; 8-value tree: maxnum f32 |
| ; Seven maxnum calls arranged as a balanced binary tree over eight inputs. |
| ; The checks below expect three three-operand max instructions plus one |
| ; self-max of v3 and one two-operand max on both targets. |
| define float @v_max3_maxnum_tree8_f32(float %a, float %b, float %c, float %d, |
| ; GFX9-LABEL: v_max3_maxnum_tree8_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2 |
| ; GFX9-NEXT: v_max_f32_e32 v1, v3, v3 |
| ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 |
| ; GFX9-NEXT: v_max3_f32 v1, v4, v5, v6 |
| ; GFX9-NEXT: v_max3_f32 v0, v0, v1, v7 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_max3_maxnum_tree8_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: v_max_num_f32_e32 v1, v3, v3 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v1 |
| ; GFX1250-NEXT: v_max3_num_f32 v1, v4, v5, v6 |
| ; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v7 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| float %e, float %f, float %g, float %h) { |
| %ab = call float @llvm.maxnum.f32(float %a, float %b) |
| %cd = call float @llvm.maxnum.f32(float %c, float %d) |
| %ef = call float @llvm.maxnum.f32(float %e, float %f) |
| %gh = call float @llvm.maxnum.f32(float %g, float %h) |
| %abcd = call float @llvm.maxnum.f32(float %ab, float %cd) |
| %efgh = call float @llvm.maxnum.f32(float %ef, float %gh) |
| %result = call float @llvm.maxnum.f32(float %abcd, float %efgh) |
| ret float %result |
| } |
| |
| ; Basic 4-value tree: maximum f32 (IEEE 2019) |
| ; Input is maximum(maximum(a, b), maximum(c, d)). |
| ; On gfx900 the checks contain no three-operand instruction: each maximum is |
| ; a v_max_f32 / v_cmp_o_f32 / v_cndmask_b32 triple that selects the constant |
| ; 0x7fc00000 (f32 quiet-NaN bit pattern) when the ordered compare fails. |
| ; On gfx1250 the checks expect one v_maximum3_f32 followed by one |
| ; v_maximum_f32. |
| define float @v_maximum3_tree4_f32(float %a, float %b, float %c, float %d) { |
| ; GFX9-LABEL: v_maximum3_tree4_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max_f32_e32 v4, v0, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_maximum3_tree4_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_maximum_f32 v0, v0, v3 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %max.ab = call float @llvm.maximum.f32(float %a, float %b) |
| %max.cd = call float @llvm.maximum.f32(float %c, float %d) |
| %result = call float @llvm.maximum.f32(float %max.ab, float %max.cd) |
| ret float %result |
| } |
| |
| ; 8-value tree: maximum f32 (IEEE 2019) |
| ; Seven maximum calls arranged as a balanced binary tree over eight inputs. |
| ; On gfx900 the checks expect seven v_max_f32 / v_cmp_o_f32 / v_cndmask_b32 |
| ; triples sharing one 0x7fc00000 constant, with no three-operand instruction. |
| ; On gfx1250 the checks expect three v_maximum3_f32 and one v_maximum_f32. |
| define float @v_maximum3_tree8_f32(float %a, float %b, float %c, float %d, |
| ; GFX9-LABEL: v_maximum3_tree8_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max_f32_e32 v8, v0, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v4, v5 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v3, v6, v7 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v4, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v4, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_maximum3_tree8_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: v_maximum3_f32 v1, v4, v5, v6 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1250-NEXT: v_maximum_f32 v0, v0, v3 |
| ; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v7 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| float %e, float %f, float %g, float %h) { |
| %ab = call float @llvm.maximum.f32(float %a, float %b) |
| %cd = call float @llvm.maximum.f32(float %c, float %d) |
| %ef = call float @llvm.maximum.f32(float %e, float %f) |
| %gh = call float @llvm.maximum.f32(float %g, float %h) |
| %abcd = call float @llvm.maximum.f32(float %ab, float %cd) |
| %efgh = call float @llvm.maximum.f32(float %ef, float %gh) |
| %result = call float @llvm.maximum.f32(float %abcd, float %efgh) |
| ret float %result |
| } |
| |
| ; Basic 4-value tree: minimum f32 (IEEE 2019) |
| ; Input is minimum(minimum(a, b), minimum(c, d)). |
| ; On gfx900 the checks contain no three-operand instruction: each minimum is |
| ; a v_min_f32 / v_cmp_o_f32 / v_cndmask_b32 triple that selects the constant |
| ; 0x7fc00000 (f32 quiet-NaN bit pattern) when the ordered compare fails. |
| ; On gfx1250 the checks expect one v_minimum3_f32 followed by one |
| ; v_minimum_f32. |
| define float @v_minimum3_tree4_f32(float %a, float %b, float %c, float %d) { |
| ; GFX9-LABEL: v_minimum3_tree4_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_min_f32_e32 v4, v0, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc |
| ; GFX9-NEXT: v_min_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc |
| ; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_minimum3_tree4_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_minimum3_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_minimum_f32 v0, v0, v3 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %min.ab = call float @llvm.minimum.f32(float %a, float %b) |
| %min.cd = call float @llvm.minimum.f32(float %c, float %d) |
| %result = call float @llvm.minimum.f32(float %min.ab, float %min.cd) |
| ret float %result |
| } |
| |
| ; Basic 4-value tree: minnum f32 |
| ; Input is minnum(minnum(a, b), minnum(c, d)). The checks below expect one |
| ; three-operand min (v_min3_f32 on gfx900, v_min3_num_f32 on gfx1250) and one |
| ; final two-operand min. Note the self-operation on v3 is still a *max* |
| ; (v_max_f32 v1, v3, v3), presumably a canonicalize of %d -- confirm. |
| define float @v_min3_minnum_tree4_f32(float %a, float %b, float %c, float %d) { |
| ; GFX9-LABEL: v_min3_minnum_tree4_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_min3_f32 v0, v0, v1, v2 |
| ; GFX9-NEXT: v_max_f32_e32 v1, v3, v3 |
| ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_min3_minnum_tree4_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_min3_num_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: v_max_num_f32_e32 v1, v3, v3 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_min_num_f32_e32 v0, v0, v1 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %min.ab = call float @llvm.minnum.f32(float %a, float %b) |
| %min.cd = call float @llvm.minnum.f32(float %c, float %d) |
| %result = call float @llvm.minnum.f32(float %min.ab, float %min.cd) |
| ret float %result |
| } |
| |
| ; 16-value tree: maximum f32, tests 3 levels of deferral |
| ; Fifteen maximum calls arranged as a balanced binary tree over sixteen |
| ; inputs. On gfx900 the checks expect fifteen v_max_f32 / v_cmp_o_f32 / |
| ; v_cndmask_b32 triples with no three-operand instruction. On gfx1250 the |
| ; checks expect six v_maximum3_f32 and three v_maximum_f32 (nine VALU ops |
| ; for the fifteen calls). |
| define float @v_maximum3_tree16_f32(float %a, float %b, float %c, float %d, |
| ; GFX9-LABEL: v_maximum3_tree16_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max_f32_e32 v16, v0, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v17, 0x7fc00000 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v4, v5 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v3, v6, v7 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v4, v8, v9 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v9 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v4, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v5, v10, v11 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v11 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v6, v12, v13 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v13 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v7, v14, v15 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v15 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v8, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v8, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v4, v5 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v3, v6, v7 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v4, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v4, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v2, vcc |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_maximum3_tree16_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: v_maximum3_f32 v1, v8, v9, v10 |
| ; GFX1250-NEXT: v_maximum3_f32 v2, v4, v5, v6 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX1250-NEXT: v_maximum_f32 v0, v0, v3 |
| ; GFX1250-NEXT: v_maximum_f32 v1, v1, v11 |
| ; GFX1250-NEXT: v_maximum3_f32 v3, v12, v13, v14 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1250-NEXT: v_maximum3_f32 v0, v0, v2, v7 |
| ; GFX1250-NEXT: v_maximum3_f32 v1, v1, v3, v15 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_maximum_f32 v0, v0, v1 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| float %e, float %f, float %g, float %h, |
| float %i, float %j, float %k, float %l, |
| float %m, float %n, float %o, float %p) { |
| %ab = call float @llvm.maximum.f32(float %a, float %b) |
| %cd = call float @llvm.maximum.f32(float %c, float %d) |
| %ef = call float @llvm.maximum.f32(float %e, float %f) |
| %gh = call float @llvm.maximum.f32(float %g, float %h) |
| %ij = call float @llvm.maximum.f32(float %i, float %j) |
| %kl = call float @llvm.maximum.f32(float %k, float %l) |
| %mn = call float @llvm.maximum.f32(float %m, float %n) |
| %op = call float @llvm.maximum.f32(float %o, float %p) |
| %abcd = call float @llvm.maximum.f32(float %ab, float %cd) |
| %efgh = call float @llvm.maximum.f32(float %ef, float %gh) |
| %ijkl = call float @llvm.maximum.f32(float %ij, float %kl) |
| %mnop = call float @llvm.maximum.f32(float %mn, float %op) |
| %abcdefgh = call float @llvm.maximum.f32(float %abcd, float %efgh) |
| %ijklmnop = call float @llvm.maximum.f32(float %ijkl, float %mnop) |
| %result = call float @llvm.maximum.f32(float %abcdefgh, float %ijklmnop) |
| ret float %result |
| } |
| |
| ; Unbalanced tree: left side is tree, right side is leaf |
| ; Input is maximum(maximum(maximum(a, b), maximum(c, d)), e): a 4-value tree |
| ; whose result is combined with a fifth plain argument. |
| ; On gfx900 the checks expect four v_max_f32 / v_cmp_o_f32 / v_cndmask_b32 |
| ; triples. On gfx1250 the checks expect exactly two v_maximum3_f32 and no |
| ; two-operand v_maximum_f32. |
| define float @v_maximum3_tree_unbalanced_f32(float %a, float %b, float %c, float %d, float %e) { |
| ; GFX9-LABEL: v_maximum3_tree_unbalanced_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max_f32_e32 v5, v0, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7fc00000 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v0, v4 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_maximum3_tree_unbalanced_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_maximum3_f32 v0, v0, v3, v4 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %ab = call float @llvm.maximum.f32(float %a, float %b) |
| %cd = call float @llvm.maximum.f32(float %c, float %d) |
| %abcd = call float @llvm.maximum.f32(float %ab, float %cd) |
| %result = call float @llvm.maximum.f32(float %abcd, float %e) |
| ret float %result |
| } |
| |
| ; Multi-use: one side has multiple uses, should NOT trigger tree combine |
| ; %max.cd feeds both the root maxnum and a store to %out. The checks below |
| ; expect it to remain a standalone two-operand max whose result (v2) is both |
| ; stored and consumed as the third operand of a single three-operand max. |
| define float @v_max3_maxnum_tree4_multi_use(float %a, float %b, float %c, float %d, ptr addrspace(1) %out) { |
| ; GFX9-LABEL: v_max3_maxnum_tree4_multi_use: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 |
| ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 |
| ; GFX9-NEXT: v_max_f32_e32 v2, v2, v3 |
| ; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2 |
| ; GFX9-NEXT: global_store_dword v[4:5], v2, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_max3_maxnum_tree4_multi_use: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v2, v2, v2 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1250-NEXT: v_max_num_f32_e32 v2, v2, v3 |
| ; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: global_store_b32 v[4:5], v2, off |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %max.ab = call float @llvm.maxnum.f32(float %a, float %b) |
| %max.cd = call float @llvm.maxnum.f32(float %c, float %d) |
| %result = call float @llvm.maxnum.f32(float %max.ab, float %max.cd) |
| store float %max.cd, ptr addrspace(1) %out |
| ret float %result |
| } |
| |
| ; 8-value tree: left subtree single-use, right subtree multi-use. |
| ; Left subtree should be tree-combined. Right subtree can't (multi-use), |
| ; so existing combine absorbs it. Tests asymmetric deferral behavior. |
| ; %efgh feeds both the root maximum and a store to %out. In the gfx1250 |
| ; checks the stored value (v1) is produced by a v_maximum3_f32 followed by a |
| ; v_maximum_f32, and is then used as the third operand of the final |
| ; v_maximum3_f32. The gfx900 checks expect seven v_max_f32 / v_cmp_o_f32 / |
| ; v_cndmask_b32 triples with no three-operand instruction. |
| define float @v_maximum3_tree8_asymmetric_use(float %a, float %b, float %c, float %d, |
| ; GFX9-LABEL: v_maximum3_tree8_asymmetric_use: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max_f32_e32 v10, v0, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v11, 0x7fc00000 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v11, v10, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v4, v5 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v3, v6, v7 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v4, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v11, v4, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v11, v2, vcc |
| ; GFX9-NEXT: global_store_dword v[8:9], v1, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_maximum3_tree8_asymmetric_use: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_maximum3_f32 v4, v4, v5, v6 |
| ; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1250-NEXT: v_maximum_f32 v1, v4, v7 |
| ; GFX1250-NEXT: v_maximum3_f32 v0, v0, v3, v1 |
| ; GFX1250-NEXT: global_store_b32 v[8:9], v1, off |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| float %e, float %f, float %g, float %h, |
| ptr addrspace(1) %out) { |
| %ab = call float @llvm.maximum.f32(float %a, float %b) |
| %cd = call float @llvm.maximum.f32(float %c, float %d) |
| %ef = call float @llvm.maximum.f32(float %e, float %f) |
| %gh = call float @llvm.maximum.f32(float %g, float %h) |
| %abcd = call float @llvm.maximum.f32(float %ab, float %cd) |
| %efgh = call float @llvm.maximum.f32(float %ef, float %gh) |
| %result = call float @llvm.maximum.f32(float %abcd, float %efgh) |
| store float %efgh, ptr addrspace(1) %out |
| ret float %result |
| } |
| |
| ; Basic 4-value tree: maxnum f16 |
| ; Half-precision counterpart of the f32 4-value maxnum tree. The checks below |
| ; expect one three-operand f16 max, a self-max of the fourth input register, |
| ; and one final two-operand max. The gfx1250 output is checked separately |
| ; for -real-true16 (full VGPR names) and +real-true16 (16-bit halves such as |
| ; v0.l / v0.h) because the register operands differ. |
| define half @v_max3_maxnum_tree4_f16(half %a, half %b, half %c, half %d) { |
| ; GFX9-LABEL: v_max3_maxnum_tree4_f16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max3_f16 v0, v0, v1, v2 |
| ; GFX9-NEXT: v_max_f16_e32 v1, v3, v3 |
| ; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-FAKE16-LABEL: v_max3_maxnum_tree4_f16: |
| ; GFX1250-FAKE16: ; %bb.0: |
| ; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v0, v1, v2 |
| ; GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v1, v3, v3 |
| ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v1 |
| ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] |
| ; |
| ; GFX1250-REAL16-LABEL: v_max3_maxnum_tree4_f16: |
| ; GFX1250-REAL16: ; %bb.0: |
| ; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-REAL16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l |
| ; GFX1250-REAL16-NEXT: v_max_num_f16_e32 v0.h, v3.l, v3.l |
| ; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-REAL16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.h |
| ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] |
| %max.ab = call half @llvm.maxnum.f16(half %a, half %b) |
| %max.cd = call half @llvm.maxnum.f16(half %c, half %d) |
| %result = call half @llvm.maxnum.f16(half %max.ab, half %max.cd) |
| ret half %result |
| } |
| |
| ; Negative test: f64 has no max3/min3 on any target yet, tree combine must not fire |
| ; The checks below expect only two-operand f64 max instructions on both |
| ; targets: four self-max operations (one per input register pair) followed |
| ; by three maxes that mirror the IR tree shape. |
| define double @v_no_max3_maxnum_tree4_f64(double %a, double %b, double %c, double %d) { |
| ; GFX9-LABEL: v_no_max3_maxnum_tree4_f64: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] |
| ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] |
| ; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] |
| ; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] |
| ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] |
| ; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[6:7] |
| ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_no_max3_maxnum_tree4_f64: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] |
| ; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] |
| ; GFX1250-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[6:7] |
| ; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5] |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] |
| ; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[6:7] |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %max.ab = call double @llvm.maxnum.f64(double %a, double %b) |
| %max.cd = call double @llvm.maxnum.f64(double %c, double %d) |
| %result = call double @llvm.maxnum.f64(double %max.ab, double %max.cd) |
| ret double %result |
| } |
| |
| ; Negative test: bf16 is promoted to f32 with conversions, tree combine cannot apply |
| ; In the checks below every maxnum is a two-operand f32 max on inputs shifted |
| ; or masked into the upper 16 bits, and each intermediate result is converted |
| ; back before the next max: via a v_bfe_u32 / v_add3_u32 / v_or_b32 / |
| ; v_cmp_u_f32 / v_cndmask_b32 sequence on gfx900, and via v_cvt_pk_bf16_f32 |
| ; on gfx1250. No three-operand max is expected on either target. |
| define bfloat @v_no_max3_maxnum_tree4_bf16(bfloat %a, bfloat %b, bfloat %c, bfloat %d) { |
| ; GFX9-LABEL: v_no_max3_maxnum_tree4_bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 |
| ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v1 |
| ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc |
| ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 |
| ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 |
| ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 |
| ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_no_max3_maxnum_tree4_bf16: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v3, 16, v3 |
| ; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_lshlrev_b32 v0, 16, v0 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v3 :: v_dual_max_num_f32 v0, v0, v1 |
| ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 |
| ; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v1 |
| ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %max.ab = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b) |
| %max.cd = call bfloat @llvm.maxnum.bf16(bfloat %c, bfloat %d) |
| %result = call bfloat @llvm.maxnum.bf16(bfloat %max.ab, bfloat %max.cd) |
| ret bfloat %result |
| } |
| |
| declare float @llvm.maxnum.f32(float, float) |
| declare float @llvm.minnum.f32(float, float) |
| declare float @llvm.maximum.f32(float, float) |
| declare float @llvm.minimum.f32(float, float) |
| declare half @llvm.maxnum.f16(half, half) |
| declare double @llvm.maxnum.f64(double, double) |
| declare bfloat @llvm.maxnum.bf16(bfloat, bfloat) |