| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s |
| |
| ; Test that tree-structured min/max reductions form min3/max3 efficiently. |
| ; The key pattern is op(op(a,b), op(c,d)) which should become |
| ; op(op3(a,b,c), d) to enable further combining at higher tree levels. |
| |
| ; Basic 4-value tree: maxnum(maxnum(a,b), maxnum(c,d)) on f32; checks expect max3(a,b,c) followed by a 2-operand max with d. |
| define float @v_max3_maxnum_tree4_f32(float %a, float %b, float %c, float %d) { |
| ; GFX9-LABEL: v_max3_maxnum_tree4_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2 |
| ; GFX9-NEXT: v_max_f32_e32 v1, v3, v3 |
| ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_max3_maxnum_tree4_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: v_max_num_f32_e32 v1, v3, v3 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v1 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %max.ab = call float @llvm.maxnum.f32(float %a, float %b) |
| %max.cd = call float @llvm.maxnum.f32(float %c, float %d) |
| %result = call float @llvm.maxnum.f32(float %max.ab, float %max.cd) |
| ret float %result |
| } |
| |
| ; 8-value tree: maxnum f32; checks expect max3(a,b,c), max3(e,f,g), a max3 joining those two with d, then a final 2-operand max with h. |
| define float @v_max3_maxnum_tree8_f32(float %a, float %b, float %c, float %d, |
| ; GFX9-LABEL: v_max3_maxnum_tree8_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2 |
| ; GFX9-NEXT: v_max3_f32 v1, v4, v5, v6 |
| ; GFX9-NEXT: v_max3_f32 v0, v0, v3, v1 |
| ; GFX9-NEXT: v_max_f32_e32 v1, v7, v7 |
| ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_max3_maxnum_tree8_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: v_max3_num_f32 v1, v4, v5, v6 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX1250-NEXT: v_max3_num_f32 v0, v0, v3, v1 |
| ; GFX1250-NEXT: v_max_num_f32_e32 v1, v7, v7 |
| ; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v1 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| float %e, float %f, float %g, float %h) { |
| %ab = call float @llvm.maxnum.f32(float %a, float %b) |
| %cd = call float @llvm.maxnum.f32(float %c, float %d) |
| %ef = call float @llvm.maxnum.f32(float %e, float %f) |
| %gh = call float @llvm.maxnum.f32(float %g, float %h) |
| %abcd = call float @llvm.maxnum.f32(float %ab, float %cd) |
| %efgh = call float @llvm.maxnum.f32(float %ef, float %gh) |
| %result = call float @llvm.maxnum.f32(float %abcd, float %efgh) |
| ret float %result |
| } |
| |
| ; Basic 4-value tree: maximum f32 (IEEE 2019); gfx900 checks show a max/cmp/cndmask expansion with no 3-operand op, gfx1250 checks show maximum3 + maximum. |
| define float @v_maximum3_tree4_f32(float %a, float %b, float %c, float %d) { |
| ; GFX9-LABEL: v_maximum3_tree4_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max_f32_e32 v4, v0, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_maximum3_tree4_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_maximum_f32 v0, v0, v3 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %max.ab = call float @llvm.maximum.f32(float %a, float %b) |
| %max.cd = call float @llvm.maximum.f32(float %c, float %d) |
| %result = call float @llvm.maximum.f32(float %max.ab, float %max.cd) |
| ret float %result |
| } |
| |
| ; 8-value tree: maximum f32 (IEEE 2019); gfx1250 checks expect three maximum3 ops and one final 2-operand maximum with h. |
| define float @v_maximum3_tree8_f32(float %a, float %b, float %c, float %d, |
| ; GFX9-LABEL: v_maximum3_tree8_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max_f32_e32 v8, v0, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v4, v5 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v3, v6, v7 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v4, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v4, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_maximum3_tree8_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: v_maximum3_f32 v1, v4, v5, v6 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1250-NEXT: v_maximum3_f32 v0, v0, v3, v1 |
| ; GFX1250-NEXT: v_maximum_f32 v0, v0, v7 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| float %e, float %f, float %g, float %h) { |
| %ab = call float @llvm.maximum.f32(float %a, float %b) |
| %cd = call float @llvm.maximum.f32(float %c, float %d) |
| %ef = call float @llvm.maximum.f32(float %e, float %f) |
| %gh = call float @llvm.maximum.f32(float %g, float %h) |
| %abcd = call float @llvm.maximum.f32(float %ab, float %cd) |
| %efgh = call float @llvm.maximum.f32(float %ef, float %gh) |
| %result = call float @llvm.maximum.f32(float %abcd, float %efgh) |
| ret float %result |
| } |
| |
| ; Basic 4-value tree: minimum f32 (IEEE 2019); min counterpart of the maximum tree4 test (gfx1250 checks show minimum3 + minimum). |
| define float @v_minimum3_tree4_f32(float %a, float %b, float %c, float %d) { |
| ; GFX9-LABEL: v_minimum3_tree4_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_min_f32_e32 v4, v0, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc |
| ; GFX9-NEXT: v_min_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc |
| ; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_minimum3_tree4_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_minimum3_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_minimum_f32 v0, v0, v3 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %min.ab = call float @llvm.minimum.f32(float %a, float %b) |
| %min.cd = call float @llvm.minimum.f32(float %c, float %d) |
| %result = call float @llvm.minimum.f32(float %min.ab, float %min.cd) |
| ret float %result |
| } |
| |
| ; Basic 4-value tree: minnum f32; checks expect min3(a,b,c) followed by a 2-operand min with d. |
| define float @v_min3_minnum_tree4_f32(float %a, float %b, float %c, float %d) { |
| ; GFX9-LABEL: v_min3_minnum_tree4_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_min3_f32 v0, v0, v1, v2 |
| ; GFX9-NEXT: v_max_f32_e32 v1, v3, v3 |
| ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_min3_minnum_tree4_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_min3_num_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: v_max_num_f32_e32 v1, v3, v3 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_min_num_f32_e32 v0, v0, v1 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %min.ab = call float @llvm.minnum.f32(float %a, float %b) |
| %min.cd = call float @llvm.minnum.f32(float %c, float %d) |
| %result = call float @llvm.minnum.f32(float %min.ab, float %min.cd) |
| ret float %result |
| } |
| |
| ; 16-value tree: maximum f32, tests 3 levels of deferral; gfx1250 checks expect seven maximum3 ops and one final 2-operand maximum with p. |
| define float @v_maximum3_tree16_f32(float %a, float %b, float %c, float %d, |
| ; GFX9-LABEL: v_maximum3_tree16_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max_f32_e32 v16, v0, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v17, 0x7fc00000 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v4, v5 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v3, v6, v7 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v4, v8, v9 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v9 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v4, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v5, v10, v11 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v11 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v6, v12, v13 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v13 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v7, v14, v15 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v15 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v8, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v8, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v4, v5 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v3, v6, v7 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v4, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v4, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v2, vcc |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_maximum3_tree16_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: v_maximum3_f32 v1, v4, v5, v6 |
| ; GFX1250-NEXT: v_maximum3_f32 v2, v8, v9, v10 |
| ; GFX1250-NEXT: v_maximum3_f32 v4, v12, v13, v14 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1250-NEXT: v_maximum3_f32 v0, v0, v3, v1 |
| ; GFX1250-NEXT: v_maximum3_f32 v1, v2, v11, v4 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1250-NEXT: v_maximum3_f32 v0, v0, v7, v1 |
| ; GFX1250-NEXT: v_maximum_f32 v0, v0, v15 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| float %e, float %f, float %g, float %h, |
| float %i, float %j, float %k, float %l, |
| float %m, float %n, float %o, float %p) { |
| %ab = call float @llvm.maximum.f32(float %a, float %b) |
| %cd = call float @llvm.maximum.f32(float %c, float %d) |
| %ef = call float @llvm.maximum.f32(float %e, float %f) |
| %gh = call float @llvm.maximum.f32(float %g, float %h) |
| %ij = call float @llvm.maximum.f32(float %i, float %j) |
| %kl = call float @llvm.maximum.f32(float %k, float %l) |
| %mn = call float @llvm.maximum.f32(float %m, float %n) |
| %op = call float @llvm.maximum.f32(float %o, float %p) |
| %abcd = call float @llvm.maximum.f32(float %ab, float %cd) |
| %efgh = call float @llvm.maximum.f32(float %ef, float %gh) |
| %ijkl = call float @llvm.maximum.f32(float %ij, float %kl) |
| %mnop = call float @llvm.maximum.f32(float %mn, float %op) |
| %abcdefgh = call float @llvm.maximum.f32(float %abcd, float %efgh) |
| %ijklmnop = call float @llvm.maximum.f32(float %ijkl, float %mnop) |
| %result = call float @llvm.maximum.f32(float %abcdefgh, float %ijklmnop) |
| ret float %result |
| } |
| |
| ; Unbalanced tree: left side is tree, right side is leaf; gfx1250 checks keep maximum(a,b) and maximum(c,d) and join them with e in one maximum3. |
| define float @v_maximum3_tree_unbalanced_f32(float %a, float %b, float %c, float %d, float %e) { |
| ; GFX9-LABEL: v_maximum3_tree_unbalanced_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max_f32_e32 v5, v0, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7fc00000 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v0, v4 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_maximum3_tree_unbalanced_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_maximum_f32 v0, v0, v1 |
| ; GFX1250-NEXT: v_maximum_f32 v1, v2, v3 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v4 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %ab = call float @llvm.maximum.f32(float %a, float %b) |
| %cd = call float @llvm.maximum.f32(float %c, float %d) |
| %abcd = call float @llvm.maximum.f32(float %ab, float %cd) |
| %result = call float @llvm.maximum.f32(float %abcd, float %e) |
| ret float %result |
| } |
| |
| ; Multi-use: one side has multiple uses, should NOT trigger tree combine; %max.cd is also stored, so checks compute it once and use it as the third max3 operand. |
| define float @v_max3_maxnum_tree4_multi_use(float %a, float %b, float %c, float %d, ptr addrspace(1) %out) { |
| ; GFX9-LABEL: v_max3_maxnum_tree4_multi_use: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 |
| ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 |
| ; GFX9-NEXT: v_max_f32_e32 v2, v2, v3 |
| ; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2 |
| ; GFX9-NEXT: global_store_dword v[4:5], v2, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_max3_maxnum_tree4_multi_use: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v2, v2, v2 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1250-NEXT: v_max_num_f32_e32 v2, v2, v3 |
| ; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: global_store_b32 v[4:5], v2, off |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %max.ab = call float @llvm.maxnum.f32(float %a, float %b) |
| %max.cd = call float @llvm.maxnum.f32(float %c, float %d) |
| %result = call float @llvm.maxnum.f32(float %max.ab, float %max.cd) |
| store float %max.cd, ptr addrspace(1) %out |
| ret float %result |
| } |
| |
| ; 8-value tree where the right subtree %efgh has a second use (it is also stored to %out). |
| ; The gfx1250 checks form %efgh as maximum(maximum3(e,f,g), h) and reuse that value for the store; |
| ; the root then joins maximum(a,b), maximum(c,d) and %efgh in one maximum3. Tests asymmetric deferral behavior. |
| define float @v_maximum3_tree8_asymmetric_use(float %a, float %b, float %c, float %d, |
| ; GFX9-LABEL: v_maximum3_tree8_asymmetric_use: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max_f32_e32 v10, v0, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v11, 0x7fc00000 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v11, v10, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v4, v5 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v3, v6, v7 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v4, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v11, v4, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v3 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc |
| ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 |
| ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v11, v2, vcc |
| ; GFX9-NEXT: global_store_dword v[8:9], v1, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_maximum3_tree8_asymmetric_use: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_maximum3_f32 v4, v4, v5, v6 |
| ; GFX1250-NEXT: v_maximum_f32 v0, v0, v1 |
| ; GFX1250-NEXT: v_maximum_f32 v1, v2, v3 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1250-NEXT: v_maximum_f32 v2, v4, v7 |
| ; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: global_store_b32 v[8:9], v2, off |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| float %e, float %f, float %g, float %h, |
| ptr addrspace(1) %out) { |
| %ab = call float @llvm.maximum.f32(float %a, float %b) |
| %cd = call float @llvm.maximum.f32(float %c, float %d) |
| %ef = call float @llvm.maximum.f32(float %e, float %f) |
| %gh = call float @llvm.maximum.f32(float %g, float %h) |
| %abcd = call float @llvm.maximum.f32(float %ab, float %cd) |
| %efgh = call float @llvm.maximum.f32(float %ef, float %gh) |
| %result = call float @llvm.maximum.f32(float %abcd, float %efgh) |
| store float %efgh, ptr addrspace(1) %out |
| ret float %result |
| } |
| |
| ; Basic 4-value tree: maxnum f16; the gfx1250 fake16 and real-true16 runs are checked separately (whole registers vs. v0.l/v0.h halves). |
| define half @v_max3_maxnum_tree4_f16(half %a, half %b, half %c, half %d) { |
| ; GFX9-LABEL: v_max3_maxnum_tree4_f16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max3_f16 v0, v0, v1, v2 |
| ; GFX9-NEXT: v_max_f16_e32 v1, v3, v3 |
| ; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-FAKE16-LABEL: v_max3_maxnum_tree4_f16: |
| ; GFX1250-FAKE16: ; %bb.0: |
| ; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v0, v1, v2 |
| ; GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v1, v3, v3 |
| ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v1 |
| ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] |
| ; |
| ; GFX1250-REAL16-LABEL: v_max3_maxnum_tree4_f16: |
| ; GFX1250-REAL16: ; %bb.0: |
| ; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-REAL16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l |
| ; GFX1250-REAL16-NEXT: v_max_num_f16_e32 v0.h, v3.l, v3.l |
| ; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-REAL16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.h |
| ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] |
| %max.ab = call half @llvm.maxnum.f16(half %a, half %b) |
| %max.cd = call half @llvm.maxnum.f16(half %c, half %d) |
| %result = call half @llvm.maxnum.f16(half %max.ab, half %max.cd) |
| ret half %result |
| } |
| |
| ; Negative test: f64 has no max3/min3 on any target yet, tree combine must not fire; checks expect only 2-operand f64 max ops. |
| define double @v_no_max3_maxnum_tree4_f64(double %a, double %b, double %c, double %d) { |
| ; GFX9-LABEL: v_no_max3_maxnum_tree4_f64: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] |
| ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] |
| ; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] |
| ; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] |
| ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] |
| ; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[6:7] |
| ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_no_max3_maxnum_tree4_f64: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] |
| ; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] |
| ; GFX1250-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[6:7] |
| ; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5] |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] |
| ; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[6:7] |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %max.ab = call double @llvm.maxnum.f64(double %a, double %b) |
| %max.cd = call double @llvm.maxnum.f64(double %c, double %d) |
| %result = call double @llvm.maxnum.f64(double %max.ab, double %max.cd) |
| ret double %result |
| } |
| |
| ; Negative test: bf16 is promoted to f32 with conversions, tree combine cannot apply; checks show each maxnum done as an f32 max and converted back to bf16. |
| define bfloat @v_no_max3_maxnum_tree4_bf16(bfloat %a, bfloat %b, bfloat %c, bfloat %d) { |
| ; GFX9-LABEL: v_no_max3_maxnum_tree4_bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 |
| ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX9-NEXT: v_max_f32_e32 v1, v2, v1 |
| ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc |
| ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 |
| ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 |
| ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 |
| ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_no_max3_maxnum_tree4_bf16: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v3, 16, v3 |
| ; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_lshlrev_b32 v0, 16, v0 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v3 :: v_dual_max_num_f32 v0, v0, v1 |
| ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 |
| ; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v1 |
| ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %max.ab = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b) |
| %max.cd = call bfloat @llvm.maxnum.bf16(bfloat %c, bfloat %d) |
| %result = call bfloat @llvm.maxnum.bf16(bfloat %max.ab, bfloat %max.cd) |
| ret bfloat %result |
| } |
| |
| ; Two-level ternary tree: three 3-input maxnum chains (%A, %B, %C) joined by a fourth; checks expect exactly four max3 ops. |
| define float @v_max3_maxnum_ternary_2level_f32( |
| ; GFX9-LABEL: v_max3_maxnum_ternary_2level_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2 |
| ; GFX9-NEXT: v_max3_f32 v1, v3, v4, v5 |
| ; GFX9-NEXT: v_max3_f32 v2, v6, v7, v8 |
| ; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_max3_maxnum_ternary_2level_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: v_max3_num_f32 v1, v3, v4, v5 |
| ; GFX1250-NEXT: v_max3_num_f32 v2, v6, v7, v8 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| float %a, float %b, float %c, |
| float %d, float %e, float %f, |
| float %g, float %h, float %i) { |
| %ab = call float @llvm.maxnum.f32(float %a, float %b) |
| %A = call float @llvm.maxnum.f32(float %ab, float %c) |
| %de = call float @llvm.maxnum.f32(float %d, float %e) |
| %B = call float @llvm.maxnum.f32(float %de, float %f) |
| %gh = call float @llvm.maxnum.f32(float %g, float %h) |
| %C = call float @llvm.maxnum.f32(float %gh, float %i) |
| %AB = call float @llvm.maxnum.f32(float %A, float %B) |
| %R = call float @llvm.maxnum.f32(float %AB, float %C) |
| ret float %R |
| } |
| |
| ; Mixed ternary + binary: one operand (%A) is a 3-leaf ternary tree, the other (%B) is |
| ; a 4-leaf binary tree. Checks expect max3(a,b,c), max3(d,e,f), then one max3 joining both with g. |
| define float @v_max3_maxnum_mixed_ternary_binary_f32( |
| ; GFX9-LABEL: v_max3_maxnum_mixed_ternary_binary_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2 |
| ; GFX9-NEXT: v_max3_f32 v1, v3, v4, v5 |
| ; GFX9-NEXT: v_max3_f32 v0, v0, v1, v6 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_max3_maxnum_mixed_ternary_binary_f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2 |
| ; GFX1250-NEXT: v_max3_num_f32 v1, v3, v4, v5 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v6 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| float %a, float %b, float %c, |
| float %d, float %e, float %f, float %g) { |
| %ab = call float @llvm.maxnum.f32(float %a, float %b) |
| %A = call float @llvm.maxnum.f32(float %ab, float %c) |
| %de = call float @llvm.maxnum.f32(float %d, float %e) |
| %fg = call float @llvm.maxnum.f32(float %f, float %g) |
| %B = call float @llvm.maxnum.f32(float %de, float %fg) |
| %R = call float @llvm.maxnum.f32(float %A, float %B) |
| ret float %R |
| } |
| |
| ; 4-value balanced binary tree on <2 x float>; checks show each lane handled with scalar f32 ops (one max3 plus one 2-operand max per lane). |
| define <2 x float> @v_max3_maxnum_tree4_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) { |
| ; GFX9-LABEL: v_max3_maxnum_tree4_v2f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max3_f32 v0, v0, v2, v4 |
| ; GFX9-NEXT: v_max_f32_e32 v2, v6, v6 |
| ; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 |
| ; GFX9-NEXT: v_max3_f32 v1, v1, v3, v5 |
| ; GFX9-NEXT: v_max_f32_e32 v2, v7, v7 |
| ; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_max3_maxnum_tree4_v2f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_max3_num_f32 v0, v0, v2, v4 |
| ; GFX1250-NEXT: v_max_num_f32_e32 v2, v6, v6 |
| ; GFX1250-NEXT: v_max3_num_f32 v1, v1, v3, v5 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1250-NEXT: v_dual_max_num_f32 v3, v7, v7 :: v_dual_max_num_f32 v0, v0, v2 |
| ; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v3 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %max.ab = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) |
| %max.cd = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %c, <2 x float> %d) |
| %result = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %max.ab, <2 x float> %max.cd) |
| ret <2 x float> %result |
| } |
| |
| ; 2-level ternary tree on <2 x float>; checks expect four scalar f32 max3 ops per lane (eight in total). |
| define <2 x float> @v_max3_maxnum_ternary_2level_v2f32( |
| ; GFX9-LABEL: v_max3_maxnum_ternary_2level_v2f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_max3_f32 v1, v1, v3, v5 |
| ; GFX9-NEXT: v_max3_f32 v0, v0, v2, v4 |
| ; GFX9-NEXT: v_max3_f32 v2, v7, v9, v11 |
| ; GFX9-NEXT: v_max3_f32 v3, v6, v8, v10 |
| ; GFX9-NEXT: v_max3_f32 v4, v13, v15, v17 |
| ; GFX9-NEXT: v_max3_f32 v5, v12, v14, v16 |
| ; GFX9-NEXT: v_max3_f32 v0, v0, v3, v5 |
| ; GFX9-NEXT: v_max3_f32 v1, v1, v2, v4 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_max3_maxnum_ternary_2level_v2f32: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_max3_num_f32 v1, v1, v3, v5 |
| ; GFX1250-NEXT: v_max3_num_f32 v0, v0, v2, v4 |
| ; GFX1250-NEXT: v_max3_num_f32 v2, v7, v9, v11 |
| ; GFX1250-NEXT: v_max3_num_f32 v3, v6, v8, v10 |
| ; GFX1250-NEXT: v_max3_num_f32 v4, v12, v14, v16 |
| ; GFX1250-NEXT: v_max3_num_f32 v5, v13, v15, v17 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1250-NEXT: v_max3_num_f32 v0, v0, v3, v4 |
| ; GFX1250-NEXT: v_max3_num_f32 v1, v1, v2, v5 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| <2 x float> %a, <2 x float> %b, <2 x float> %c, |
| <2 x float> %d, <2 x float> %e, <2 x float> %f, |
| <2 x float> %g, <2 x float> %h, <2 x float> %i) { |
| %ab = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) |
| %A = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %ab, <2 x float> %c) |
| %de = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %d, <2 x float> %e) |
| %B = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %de, <2 x float> %f) |
| %gh = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %g, <2 x float> %h) |
| %C = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %gh, <2 x float> %i) |
| %AB = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %A, <2 x float> %B) |
| %R = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %AB, <2 x float> %C) |
| ret <2 x float> %R |
| } |
| |
| ; 4-value balanced binary tree on <2 x half>; gfx900 checks use only v_pk_max_f16, gfx1250 checks use v_pk_max3_num_f16 plus v_pk_max_num_f16. |
| define <2 x half> @v_max3_maxnum_tree4_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) { |
| ; GFX9-LABEL: v_max3_maxnum_tree4_v2f16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 |
| ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 |
| ; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 |
| ; GFX9-NEXT: v_pk_max_f16 v1, v3, v3 |
| ; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 |
| ; GFX9-NEXT: v_pk_max_f16 v1, v2, v1 |
| ; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_max3_maxnum_tree4_v2f16: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_pk_max3_num_f16 v0, v0, v1, v2 |
| ; GFX1250-NEXT: v_pk_max_num_f16 v1, v3, v3 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_pk_max_num_f16 v0, v0, v1 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| %max.ab = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) |
| %max.cd = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %d) |
| %result = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %max.ab, <2 x half> %max.cd) |
| ret <2 x half> %result |
| } |
| |
| ; 2-level ternary tree on <2 x half>; gfx900 checks use only v_pk_max_f16, gfx1250 checks expect four v_pk_max3_num_f16 ops. |
| define <2 x half> @v_max3_maxnum_ternary_2level_v2f16( |
| ; GFX9-LABEL: v_max3_maxnum_ternary_2level_v2f16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 |
| ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 |
| ; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 |
| ; GFX9-NEXT: v_pk_max_f16 v1, v2, v2 |
| ; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 |
| ; GFX9-NEXT: v_pk_max_f16 v1, v4, v4 |
| ; GFX9-NEXT: v_pk_max_f16 v2, v3, v3 |
| ; GFX9-NEXT: v_pk_max_f16 v1, v2, v1 |
| ; GFX9-NEXT: v_pk_max_f16 v2, v5, v5 |
| ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 |
| ; GFX9-NEXT: v_pk_max_f16 v2, v7, v7 |
| ; GFX9-NEXT: v_pk_max_f16 v3, v6, v6 |
| ; GFX9-NEXT: v_pk_max_f16 v2, v3, v2 |
| ; GFX9-NEXT: v_pk_max_f16 v3, v8, v8 |
| ; GFX9-NEXT: v_pk_max_f16 v2, v2, v3 |
| ; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 |
| ; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1250-LABEL: v_max3_maxnum_ternary_2level_v2f16: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: v_pk_max3_num_f16 v0, v0, v1, v2 |
| ; GFX1250-NEXT: v_pk_max3_num_f16 v1, v3, v4, v5 |
| ; GFX1250-NEXT: v_pk_max3_num_f16 v2, v6, v7, v8 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_pk_max3_num_f16 v0, v0, v1, v2 |
| ; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
| <2 x half> %a, <2 x half> %b, <2 x half> %c, |
| <2 x half> %d, <2 x half> %e, <2 x half> %f, |
| <2 x half> %g, <2 x half> %h, <2 x half> %i) { |
| %ab = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) |
| %A = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %ab, <2 x half> %c) |
| %de = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %d, <2 x half> %e) |
| %B = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %de, <2 x half> %f) |
| %gh = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %g, <2 x half> %h) |
| %C = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %gh, <2 x half> %i) |
| %AB = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %A, <2 x half> %B) |
| %R = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %AB, <2 x half> %C) |
| ret <2 x half> %R |
| } |
| |
| declare float @llvm.maxnum.f32(float, float) |
| declare float @llvm.minnum.f32(float, float) |
| declare float @llvm.maximum.f32(float, float) |
| declare float @llvm.minimum.f32(float, float) |
| declare half @llvm.maxnum.f16(half, half) |
| declare double @llvm.maxnum.f64(double, double) |
| declare bfloat @llvm.maxnum.bf16(bfloat, bfloat) |
| declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) |
| declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) |