; (source-viewer residue, not part of the test) blob: a8f5d1d1bc4b40cebf09480d63038a8a86850a3f [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s
; Test that tree-structured min/max reductions form min3/max3 efficiently.
; The key pattern is op(op(a,b), op(c,d)) which should become
; op(op3(a,b,c), d) to enable further combining at higher tree levels.
; Basic 4-value tree: maxnum f32
; Computes maxnum(maxnum(a, b), maxnum(c, d)). The checks for both gfx900 and
; gfx1250 expect one three-operand max over v0, v1, v2 followed by a single
; two-operand max that brings in the remaining value.
; NOTE(review): the max of v3 with itself ahead of the final max looks like the
; usual operand canonicalization (NaN quieting) for maxnum -- confirm.
define float @v_max3_maxnum_tree4_f32(float %a, float %b, float %c, float %d) {
; GFX9-LABEL: v_max3_maxnum_tree4_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v3, v3
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_max3_maxnum_tree4_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2
; GFX1250-NEXT: v_max_num_f32_e32 v1, v3, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
; Leaf level: two independent pairwise maxima.
%max.ab = call float @llvm.maxnum.f32(float %a, float %b)
%max.cd = call float @llvm.maxnum.f32(float %c, float %d)
; Root: both operands are themselves maxnum calls, i.e. op(op(a,b), op(c,d)).
%result = call float @llvm.maxnum.f32(float %max.ab, float %max.cd)
ret float %result
}
; 8-value tree: maxnum f32
; Balanced binary reduction of eight values (three levels of llvm.maxnum).
; The checks expect three three-operand max instructions: one per 4-value half
; (each leaving one value out) and one that joins both halves together with a
; left-over value, followed by a final two-operand max for the last value.
; Note: the autogenerated check lines sit between the two lines of the
; function signature; they are IR comments, so the signature still parses.
define float @v_max3_maxnum_tree8_f32(float %a, float %b, float %c, float %d,
; GFX9-LABEL: v_max3_maxnum_tree8_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_max3_f32 v1, v4, v5, v6
; GFX9-NEXT: v_max3_f32 v0, v0, v3, v1
; GFX9-NEXT: v_max_f32_e32 v1, v7, v7
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_max3_maxnum_tree8_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2
; GFX1250-NEXT: v_max3_num_f32 v1, v4, v5, v6
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v3, v1
; GFX1250-NEXT: v_max_num_f32_e32 v1, v7, v7
; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
float %e, float %f, float %g, float %h) {
; Level 1: pairwise maxima of the eight leaves.
%ab = call float @llvm.maxnum.f32(float %a, float %b)
%cd = call float @llvm.maxnum.f32(float %c, float %d)
%ef = call float @llvm.maxnum.f32(float %e, float %f)
%gh = call float @llvm.maxnum.f32(float %g, float %h)
; Level 2: one result per 4-value half.
%abcd = call float @llvm.maxnum.f32(float %ab, float %cd)
%efgh = call float @llvm.maxnum.f32(float %ef, float %gh)
; Level 3: root of the tree.
%result = call float @llvm.maxnum.f32(float %abcd, float %efgh)
ret float %result
}
; Basic 4-value tree: maximum f32 (IEEE 2019)
; Computes maximum(maximum(a, b), maximum(c, d)).
; gfx1250 checks: one v_maximum3_f32 over three values plus one v_maximum_f32
; for the fourth.
; gfx900 checks: no three-operand form is used; each llvm.maximum call becomes
; a v_max_f32 / v_cmp_o_f32 / v_cndmask_b32 triple that selects the constant
; 0x7fc00000 (an f32 quiet-NaN bit pattern) when the ordered compare fails.
define float @v_maximum3_tree4_f32(float %a, float %b, float %c, float %d) {
; GFX9-LABEL: v_maximum3_tree4_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v4, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_maximum3_tree4_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_maximum_f32 v0, v0, v3
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
; Leaf level: two independent pairwise maxima.
%max.ab = call float @llvm.maximum.f32(float %a, float %b)
%max.cd = call float @llvm.maximum.f32(float %c, float %d)
; Root: op(op(a,b), op(c,d)).
%result = call float @llvm.maximum.f32(float %max.ab, float %max.cd)
ret float %result
}
; 8-value tree: maximum f32 (IEEE 2019)
; Balanced binary reduction of eight values (three levels of llvm.maximum).
; gfx1250 checks: three v_maximum3_f32 plus one trailing v_maximum_f32.
; gfx900 checks: one v_max_f32 / v_cmp_o_f32 / v_cndmask_b32 triple per
; llvm.maximum call (seven in total), all sharing the 0x7fc00000 constant
; held in v9.
; Note: the autogenerated check lines sit between the two lines of the
; function signature; they are IR comments, so the signature still parses.
define float @v_maximum3_tree8_f32(float %a, float %b, float %c, float %d,
; GFX9-LABEL: v_maximum3_tree8_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v8, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v4, v5
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5
; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
; GFX9-NEXT: v_max_f32_e32 v3, v6, v7
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7
; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; GFX9-NEXT: v_max_f32_e32 v4, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v4, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_maximum3_tree8_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2
; GFX1250-NEXT: v_maximum3_f32 v1, v4, v5, v6
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v3, v1
; GFX1250-NEXT: v_maximum_f32 v0, v0, v7
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
float %e, float %f, float %g, float %h) {
; Level 1: pairwise maxima of the eight leaves.
%ab = call float @llvm.maximum.f32(float %a, float %b)
%cd = call float @llvm.maximum.f32(float %c, float %d)
%ef = call float @llvm.maximum.f32(float %e, float %f)
%gh = call float @llvm.maximum.f32(float %g, float %h)
; Level 2: one result per 4-value half.
%abcd = call float @llvm.maximum.f32(float %ab, float %cd)
%efgh = call float @llvm.maximum.f32(float %ef, float %gh)
; Level 3: root of the tree.
%result = call float @llvm.maximum.f32(float %abcd, float %efgh)
ret float %result
}
; Basic 4-value tree: minimum f32 (IEEE 2019)
; Min counterpart of the 4-value llvm.maximum tree:
; minimum(minimum(a, b), minimum(c, d)).
; gfx1250 checks: one v_minimum3_f32 plus one v_minimum_f32.
; gfx900 checks: each llvm.minimum call becomes a v_min_f32 / v_cmp_o_f32 /
; v_cndmask_b32 triple selecting against the constant 0x7fc00000.
define float @v_minimum3_tree4_f32(float %a, float %b, float %c, float %d) {
; GFX9-LABEL: v_minimum3_tree4_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_min_f32_e32 v4, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GFX9-NEXT: v_min_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_minimum3_tree4_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_minimum3_f32 v0, v0, v1, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_minimum_f32 v0, v0, v3
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
; Leaf level: two independent pairwise minima.
%min.ab = call float @llvm.minimum.f32(float %a, float %b)
%min.cd = call float @llvm.minimum.f32(float %c, float %d)
; Root: op(op(a,b), op(c,d)).
%result = call float @llvm.minimum.f32(float %min.ab, float %min.cd)
ret float %result
}
; Basic 4-value tree: minnum f32
; Min counterpart of the 4-value llvm.maxnum tree:
; minnum(minnum(a, b), minnum(c, d)).
; The checks expect one three-operand min followed by a two-operand min.
; NOTE(review): the v_max of v3 with itself inside this min test is emitted by
; the compiler, not a slip in the checks; it looks like the usual operand
; canonicalization (NaN quieting) before the final min -- confirm.
define float @v_min3_minnum_tree4_f32(float %a, float %b, float %c, float %d) {
; GFX9-LABEL: v_min3_minnum_tree4_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_min3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v3, v3
; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_min3_minnum_tree4_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_min3_num_f32 v0, v0, v1, v2
; GFX1250-NEXT: v_max_num_f32_e32 v1, v3, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_min_num_f32_e32 v0, v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
; Leaf level: two independent pairwise minima.
%min.ab = call float @llvm.minnum.f32(float %a, float %b)
%min.cd = call float @llvm.minnum.f32(float %c, float %d)
; Root: op(op(a,b), op(c,d)).
%result = call float @llvm.minnum.f32(float %min.ab, float %min.cd)
ret float %result
}
; 16-value tree: maximum f32, tests 3 levels of deferral
; Balanced binary reduction of sixteen values (four levels, fifteen
; llvm.maximum calls).
; gfx1250 checks: seven v_maximum3_f32 (four, then two, then one) and a single
; trailing v_maximum_f32, i.e. exactly one value is left for a two-operand op.
; gfx900 checks: one v_max_f32 / v_cmp_o_f32 / v_cndmask_b32 triple per
; llvm.maximum call, all sharing the 0x7fc00000 constant held in v17.
; Note: the autogenerated check lines sit between the first and the remaining
; lines of the function signature; they are IR comments, so it still parses.
define float @v_maximum3_tree16_f32(float %a, float %b, float %c, float %d,
; GFX9-LABEL: v_maximum3_tree16_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v16, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v17, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v4, v5
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5
; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc
; GFX9-NEXT: v_max_f32_e32 v3, v6, v7
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7
; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc
; GFX9-NEXT: v_max_f32_e32 v4, v8, v9
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v9
; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v4, vcc
; GFX9-NEXT: v_max_f32_e32 v5, v10, v11
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v11
; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
; GFX9-NEXT: v_max_f32_e32 v6, v12, v13
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v13
; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc
; GFX9-NEXT: v_max_f32_e32 v7, v14, v15
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v15
; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc
; GFX9-NEXT: v_max_f32_e32 v8, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v8, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v4, v5
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5
; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc
; GFX9-NEXT: v_max_f32_e32 v3, v6, v7
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7
; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc
; GFX9-NEXT: v_max_f32_e32 v4, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v4, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_maximum3_tree16_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2
; GFX1250-NEXT: v_maximum3_f32 v1, v4, v5, v6
; GFX1250-NEXT: v_maximum3_f32 v2, v8, v9, v10
; GFX1250-NEXT: v_maximum3_f32 v4, v12, v13, v14
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v3, v1
; GFX1250-NEXT: v_maximum3_f32 v1, v2, v11, v4
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v7, v1
; GFX1250-NEXT: v_maximum_f32 v0, v0, v15
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
float %e, float %f, float %g, float %h,
float %i, float %j, float %k, float %l,
float %m, float %n, float %o, float %p) {
; Level 1: pairwise maxima of the sixteen leaves.
%ab = call float @llvm.maximum.f32(float %a, float %b)
%cd = call float @llvm.maximum.f32(float %c, float %d)
%ef = call float @llvm.maximum.f32(float %e, float %f)
%gh = call float @llvm.maximum.f32(float %g, float %h)
%ij = call float @llvm.maximum.f32(float %i, float %j)
%kl = call float @llvm.maximum.f32(float %k, float %l)
%mn = call float @llvm.maximum.f32(float %m, float %n)
%op = call float @llvm.maximum.f32(float %o, float %p)
; Level 2: one result per group of four leaves.
%abcd = call float @llvm.maximum.f32(float %ab, float %cd)
%efgh = call float @llvm.maximum.f32(float %ef, float %gh)
%ijkl = call float @llvm.maximum.f32(float %ij, float %kl)
%mnop = call float @llvm.maximum.f32(float %mn, float %op)
; Level 3: one result per group of eight leaves.
%abcdefgh = call float @llvm.maximum.f32(float %abcd, float %efgh)
%ijklmnop = call float @llvm.maximum.f32(float %ijkl, float %mnop)
; Level 4: root of the tree.
%result = call float @llvm.maximum.f32(float %abcdefgh, float %ijklmnop)
ret float %result
}
; Unbalanced tree: left side is tree, right side is leaf
; The root is maximum(%abcd, %e), where %abcd is a 4-value binary tree and %e
; is a plain argument.
; gfx1250 checks: the two leaf-level maxima stay as v_maximum_f32 and the two
; upper levels collapse into one v_maximum3_f32 whose third operand is v4.
; gfx900 checks: four v_max_f32 / v_cmp_o_f32 / v_cndmask_b32 triples, one per
; llvm.maximum call.
define float @v_maximum3_tree_unbalanced_f32(float %a, float %b, float %c, float %d, float %e) {
; GFX9-LABEL: v_maximum3_tree_unbalanced_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v5, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v6, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v0, v4
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_maximum3_tree_unbalanced_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_maximum_f32 v0, v0, v1
; GFX1250-NEXT: v_maximum_f32 v1, v2, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v4
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
; Left operand of the root: a balanced 4-value tree.
%ab = call float @llvm.maximum.f32(float %a, float %b)
%cd = call float @llvm.maximum.f32(float %c, float %d)
%abcd = call float @llvm.maximum.f32(float %ab, float %cd)
; Root: the right operand is a leaf, not another maximum call.
%result = call float @llvm.maximum.f32(float %abcd, float %e)
ret float %result
}
; Multi-use: one side has multiple uses, should NOT trigger tree combine
; %max.cd feeds both the root maxnum and the store to %out. In the checks it
; is computed with a two-operand max into v2, and that same v2 is then used
; both as the third operand of the three-operand max and as the stored value,
; so the shape is max3(a, b, max(c, d)) rather than max(max3(a, b, c), d).
define float @v_max3_maxnum_tree4_multi_use(float %a, float %b, float %c, float %d, ptr addrspace(1) %out) {
; GFX9-LABEL: v_max3_maxnum_tree4_multi_use:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: v_max_f32_e32 v2, v2, v3
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
; GFX9-NEXT: global_store_dword v[4:5], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_max3_maxnum_tree4_multi_use:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v2, v2, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_max_num_f32_e32 v2, v2, v3
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2
; GFX1250-NEXT: global_store_b32 v[4:5], v2, off
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%max.ab = call float @llvm.maxnum.f32(float %a, float %b)
%max.cd = call float @llvm.maxnum.f32(float %c, float %d)
%result = call float @llvm.maxnum.f32(float %max.ab, float %max.cd)
; Second use of %max.cd: this is what makes the right side multi-use.
store float %max.cd, ptr addrspace(1) %out
ret float %result
}
; 8-value tree: left subtree (%abcd) single-use, right subtree (%efgh) multi-use.
; %efgh is used by the root maximum and by the store to %out, so the root is
; not rewritten through it; instead the root absorbs %ab, %cd and %efgh with
; the existing three-operand combine. Tests asymmetric deferral behavior.
; gfx1250 checks: the stored value v2 is built in tree form (v_maximum3_f32 of
; v4, v5, v6 followed by v_maximum_f32 with v7); the other subtree is two
; v_maximum_f32 results that feed the final v_maximum3_f32 together with v2.
; gfx900 checks: one v_max_f32 / v_cmp_o_f32 / v_cndmask_b32 triple per
; llvm.maximum call; v1 is the value that gets stored.
; Note: the autogenerated check lines sit between the first and the remaining
; lines of the function signature; they are IR comments, so it still parses.
define float @v_maximum3_tree8_asymmetric_use(float %a, float %b, float %c, float %d,
; GFX9-LABEL: v_maximum3_tree8_asymmetric_use:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v10, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v11, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v11, v10, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v4, v5
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5
; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
; GFX9-NEXT: v_max_f32_e32 v3, v6, v7
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7
; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
; GFX9-NEXT: v_max_f32_e32 v4, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v11, v4, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v11, v2, vcc
; GFX9-NEXT: global_store_dword v[8:9], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_maximum3_tree8_asymmetric_use:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_maximum3_f32 v4, v4, v5, v6
; GFX1250-NEXT: v_maximum_f32 v0, v0, v1
; GFX1250-NEXT: v_maximum_f32 v1, v2, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_maximum_f32 v2, v4, v7
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2
; GFX1250-NEXT: global_store_b32 v[8:9], v2, off
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
float %e, float %f, float %g, float %h,
ptr addrspace(1) %out) {
; Level 1: pairwise maxima of the eight leaves.
%ab = call float @llvm.maximum.f32(float %a, float %b)
%cd = call float @llvm.maximum.f32(float %c, float %d)
%ef = call float @llvm.maximum.f32(float %e, float %f)
%gh = call float @llvm.maximum.f32(float %g, float %h)
; Level 2: %abcd has one use (the root); %efgh has two (root and store).
%abcd = call float @llvm.maximum.f32(float %ab, float %cd)
%efgh = call float @llvm.maximum.f32(float %ef, float %gh)
%result = call float @llvm.maximum.f32(float %abcd, float %efgh)
; Second use of %efgh.
store float %efgh, ptr addrspace(1) %out
ret float %result
}
; Basic 4-value tree: maxnum f16
; Half-precision version of the 4-value maxnum tree. All configurations are
; expected to produce one three-operand f16 max followed by a two-operand max.
; The two gfx1250 configurations (-real-true16 / +real-true16 on the llc
; command lines at the top of the file) get separate check blocks here because
; their output differs in register naming: the real-true16 block addresses
; 16-bit halves such as v0.l and v0.h.
; NOTE(review): the max of the fourth value with itself looks like the usual
; operand canonicalization (NaN quieting) for maxnum -- confirm.
define half @v_max3_maxnum_tree4_f16(half %a, half %b, half %c, half %d) {
; GFX9-LABEL: v_max3_maxnum_tree4_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max3_f16 v0, v0, v1, v2
; GFX9-NEXT: v_max_f16_e32 v1, v3, v3
; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_max3_maxnum_tree4_f16:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v0, v1, v2
; GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v1, v3, v3
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v1
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-REAL16-LABEL: v_max3_maxnum_tree4_f16:
; GFX1250-REAL16: ; %bb.0:
; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0
; GFX1250-REAL16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l
; GFX1250-REAL16-NEXT: v_max_num_f16_e32 v0.h, v3.l, v3.l
; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-REAL16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.h
; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31]
; Leaf level: two independent pairwise maxima.
%max.ab = call half @llvm.maxnum.f16(half %a, half %b)
%max.cd = call half @llvm.maxnum.f16(half %c, half %d)
; Root: op(op(a,b), op(c,d)).
%result = call half @llvm.maxnum.f16(half %max.ab, half %max.cd)
ret half %result
}
; Negative test: f64 has no max3/min3 on any target yet, tree combine must not fire
; Same IR shape as the f32 4-value tree, but on double. The checks for both
; targets contain only two-operand f64 max instructions: four that take the
; same register pair for both sources, then the three maxima of the tree in
; its original binary shape.
; NOTE(review): the four same-source maxima look like per-operand
; canonicalization (NaN quieting) -- confirm.
define double @v_no_max3_maxnum_tree4_f64(double %a, double %b, double %c, double %d) {
; GFX9-LABEL: v_no_max3_maxnum_tree4_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[6:7]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_no_max3_maxnum_tree4_f64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX1250-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[6:7]
; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[6:7]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
; Leaf level: two independent pairwise maxima.
%max.ab = call double @llvm.maxnum.f64(double %a, double %b)
%max.cd = call double @llvm.maxnum.f64(double %c, double %d)
; Root: op(op(a,b), op(c,d)); expected to stay a two-operand max.
%result = call double @llvm.maxnum.f64(double %max.ab, double %max.cd)
ret double %result
}
; Negative test: bf16 is promoted to f32 with conversions, tree combine cannot apply
; Same IR shape as the f32 4-value tree, but on bfloat. In the checks every
; llvm.maxnum call is a separate two-operand f32 max whose inputs are first
; moved into the upper 16 bits of a 32-bit register (shift left by 16, or a
; 0xffff0000 mask), and whose result is converted back before the next level:
; gfx900 uses a v_bfe_u32 / v_add3_u32 / v_or_b32 / v_cmp_u_f32 /
; v_cndmask_b32 sequence, gfx1250 uses v_cvt_pk_bf16_f32. No three-operand max
; appears on either target.
define bfloat @v_no_max3_maxnum_tree4_bf16(bfloat %a, bfloat %b, bfloat %c, bfloat %d) {
; GFX9-LABEL: v_no_max3_maxnum_tree4_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_max_f32_e32 v1, v2, v1
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_no_max3_maxnum_tree4_bf16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v3, 16, v3
; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_lshlrev_b32 v0, 16, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v3 :: v_dual_max_num_f32 v0, v0, v1
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v1
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
; Leaf level: two independent pairwise maxima.
%max.ab = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
%max.cd = call bfloat @llvm.maxnum.bf16(bfloat %c, bfloat %d)
; Root: op(op(a,b), op(c,d)); expected to stay a two-operand max.
%result = call bfloat @llvm.maxnum.bf16(bfloat %max.ab, bfloat %max.cd)
ret bfloat %result
}
; Two-level ternary tree
; Nine leaves. The IR builds three 3-value groups (%A, %B, %C), each written
; as max(max(x, y), z), and then joins them as max(max(%A, %B), %C).
; The checks expect exactly four three-operand max instructions on both
; targets: one per group and one for the join, with no two-operand max left.
; Note: the autogenerated check lines sit between the first and the remaining
; lines of the function signature; they are IR comments, so it still parses.
define float @v_max3_maxnum_ternary_2level_f32(
; GFX9-LABEL: v_max3_maxnum_ternary_2level_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_max3_f32 v1, v3, v4, v5
; GFX9-NEXT: v_max3_f32 v2, v6, v7, v8
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_max3_maxnum_ternary_2level_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2
; GFX1250-NEXT: v_max3_num_f32 v1, v3, v4, v5
; GFX1250-NEXT: v_max3_num_f32 v2, v6, v7, v8
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
float %a, float %b, float %c,
float %d, float %e, float %f,
float %g, float %h, float %i) {
; First level: three groups of three leaves each.
%ab = call float @llvm.maxnum.f32(float %a, float %b)
%A = call float @llvm.maxnum.f32(float %ab, float %c)
%de = call float @llvm.maxnum.f32(float %d, float %e)
%B = call float @llvm.maxnum.f32(float %de, float %f)
%gh = call float @llvm.maxnum.f32(float %g, float %h)
%C = call float @llvm.maxnum.f32(float %gh, float %i)
; Second level: join the three group results.
%AB = call float @llvm.maxnum.f32(float %A, float %B)
%R = call float @llvm.maxnum.f32(float %AB, float %C)
ret float %R
}
; Mixed ternary + binary: one operand is a 3-leaf ternary tree, the other is
; a 4-leaf binary tree.
; %A = max(max(a, b), c) and %B = max(max(d, e), max(f, g)); the root is
; max(%A, %B). The checks expect three three-operand max instructions and
; nothing else: two over three registers each, and a last one that joins
; both results with the remaining register v6.
; Note: the autogenerated check lines sit between the first and the remaining
; lines of the function signature; they are IR comments, so it still parses.
define float @v_max3_maxnum_mixed_ternary_binary_f32(
; GFX9-LABEL: v_max3_maxnum_mixed_ternary_binary_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_max3_f32 v1, v3, v4, v5
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v6
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_max3_maxnum_mixed_ternary_binary_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2
; GFX1250-NEXT: v_max3_num_f32 v1, v3, v4, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v6
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
float %a, float %b, float %c,
float %d, float %e, float %f, float %g) {
; Ternary side: three leaves.
%ab = call float @llvm.maxnum.f32(float %a, float %b)
%A = call float @llvm.maxnum.f32(float %ab, float %c)
; Binary side: four leaves in a balanced tree.
%de = call float @llvm.maxnum.f32(float %d, float %e)
%fg = call float @llvm.maxnum.f32(float %f, float %g)
%B = call float @llvm.maxnum.f32(float %de, float %fg)
; Root joining both sides.
%R = call float @llvm.maxnum.f32(float %A, float %B)
ret float %R
}
; 4-value balanced binary tree on <2 x float>
; Vector version of the 4-value maxnum tree. The checks contain only scalar
; f32 instructions, and each of the two result registers (v0 and v1) gets the
; same pattern as the scalar test: one three-operand max followed by a
; two-operand max. On gfx1250 two of the two-operand maxima are paired into a
; single v_dual_max_num_f32.
define <2 x float> @v_max3_maxnum_tree4_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) {
; GFX9-LABEL: v_max3_maxnum_tree4_v2f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max3_f32 v0, v0, v2, v4
; GFX9-NEXT: v_max_f32_e32 v2, v6, v6
; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
; GFX9-NEXT: v_max3_f32 v1, v1, v3, v5
; GFX9-NEXT: v_max_f32_e32 v2, v7, v7
; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_max3_maxnum_tree4_v2f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v2, v4
; GFX1250-NEXT: v_max_num_f32_e32 v2, v6, v6
; GFX1250-NEXT: v_max3_num_f32 v1, v1, v3, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_max_num_f32 v3, v7, v7 :: v_dual_max_num_f32 v0, v0, v2
; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v3
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
; Leaf level: two independent pairwise vector maxima.
%max.ab = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b)
%max.cd = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %c, <2 x float> %d)
; Root: op(op(a,b), op(c,d)).
%result = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %max.ab, <2 x float> %max.cd)
ret <2 x float> %result
}
; 2-level ternary tree on <2 x float>
; Vector version of the nine-leaf, two-level ternary maxnum tree. The checks
; contain only scalar f32 instructions: eight three-operand max instructions
; in total (three group maxima plus one join for each of the two result
; registers) and no two-operand max.
; Note: the autogenerated check lines sit between the first and the remaining
; lines of the function signature; they are IR comments, so it still parses.
define <2 x float> @v_max3_maxnum_ternary_2level_v2f32(
; GFX9-LABEL: v_max3_maxnum_ternary_2level_v2f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max3_f32 v1, v1, v3, v5
; GFX9-NEXT: v_max3_f32 v0, v0, v2, v4
; GFX9-NEXT: v_max3_f32 v2, v7, v9, v11
; GFX9-NEXT: v_max3_f32 v3, v6, v8, v10
; GFX9-NEXT: v_max3_f32 v4, v13, v15, v17
; GFX9-NEXT: v_max3_f32 v5, v12, v14, v16
; GFX9-NEXT: v_max3_f32 v0, v0, v3, v5
; GFX9-NEXT: v_max3_f32 v1, v1, v2, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_max3_maxnum_ternary_2level_v2f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_max3_num_f32 v1, v1, v3, v5
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v2, v4
; GFX1250-NEXT: v_max3_num_f32 v2, v7, v9, v11
; GFX1250-NEXT: v_max3_num_f32 v3, v6, v8, v10
; GFX1250-NEXT: v_max3_num_f32 v4, v12, v14, v16
; GFX1250-NEXT: v_max3_num_f32 v5, v13, v15, v17
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v3, v4
; GFX1250-NEXT: v_max3_num_f32 v1, v1, v2, v5
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
<2 x float> %a, <2 x float> %b, <2 x float> %c,
<2 x float> %d, <2 x float> %e, <2 x float> %f,
<2 x float> %g, <2 x float> %h, <2 x float> %i) {
; First level: three groups of three vector leaves each.
%ab = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b)
%A = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %ab, <2 x float> %c)
%de = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %d, <2 x float> %e)
%B = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %de, <2 x float> %f)
%gh = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %g, <2 x float> %h)
%C = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %gh, <2 x float> %i)
; Second level: join the three group results.
%AB = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %A, <2 x float> %B)
%R = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %AB, <2 x float> %C)
ret <2 x float> %R
}
; 4-value balanced binary tree on <2 x half>
; Packed-half version of the 4-value maxnum tree.
; gfx1250 checks: one v_pk_max3_num_f16 followed by v_pk_max_num_f16, i.e. the
; same max3-then-max shape as the scalar tests, using packed instructions.
; gfx900 checks: only two-operand v_pk_max_f16 is used, so the tree keeps its
; binary shape there (four same-source maxima plus the three tree maxima).
define <2 x half> @v_max3_maxnum_tree4_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
; GFX9-LABEL: v_max3_maxnum_tree4_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
; GFX9-NEXT: v_pk_max_f16 v1, v3, v3
; GFX9-NEXT: v_pk_max_f16 v2, v2, v2
; GFX9-NEXT: v_pk_max_f16 v1, v2, v1
; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_max3_maxnum_tree4_v2f16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_pk_max3_num_f16 v0, v0, v1, v2
; GFX1250-NEXT: v_pk_max_num_f16 v1, v3, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_pk_max_num_f16 v0, v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
; Leaf level: two independent pairwise vector maxima.
%max.ab = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
%max.cd = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %d)
; Root: op(op(a,b), op(c,d)).
%result = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %max.ab, <2 x half> %max.cd)
ret <2 x half> %result
}
; 2-level ternary tree on <2 x half>
; Packed-half version of the nine-leaf, two-level ternary maxnum tree.
; gfx1250 checks: four v_pk_max3_num_f16 (three groups plus the join) and
; nothing else.
; gfx900 checks: only two-operand v_pk_max_f16 is used, so every llvm.maxnum
; call stays a separate instruction, alongside same-source maxima on the
; inputs.
; Note: the autogenerated check lines sit between the first and the remaining
; lines of the function signature; they are IR comments, so it still parses.
define <2 x half> @v_max3_maxnum_ternary_2level_v2f16(
; GFX9-LABEL: v_max3_maxnum_ternary_2level_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
; GFX9-NEXT: v_pk_max_f16 v1, v2, v2
; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
; GFX9-NEXT: v_pk_max_f16 v1, v4, v4
; GFX9-NEXT: v_pk_max_f16 v2, v3, v3
; GFX9-NEXT: v_pk_max_f16 v1, v2, v1
; GFX9-NEXT: v_pk_max_f16 v2, v5, v5
; GFX9-NEXT: v_pk_max_f16 v1, v1, v2
; GFX9-NEXT: v_pk_max_f16 v2, v7, v7
; GFX9-NEXT: v_pk_max_f16 v3, v6, v6
; GFX9-NEXT: v_pk_max_f16 v2, v3, v2
; GFX9-NEXT: v_pk_max_f16 v3, v8, v8
; GFX9-NEXT: v_pk_max_f16 v2, v2, v3
; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_max3_maxnum_ternary_2level_v2f16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_pk_max3_num_f16 v0, v0, v1, v2
; GFX1250-NEXT: v_pk_max3_num_f16 v1, v3, v4, v5
; GFX1250-NEXT: v_pk_max3_num_f16 v2, v6, v7, v8
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_pk_max3_num_f16 v0, v0, v1, v2
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
<2 x half> %a, <2 x half> %b, <2 x half> %c,
<2 x half> %d, <2 x half> %e, <2 x half> %f,
<2 x half> %g, <2 x half> %h, <2 x half> %i) {
; First level: three groups of three vector leaves each.
%ab = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
%A = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %ab, <2 x half> %c)
%de = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %d, <2 x half> %e)
%B = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %de, <2 x half> %f)
%gh = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %g, <2 x half> %h)
%C = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %gh, <2 x half> %i)
; Second level: join the three group results.
%AB = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %A, <2 x half> %B)
%R = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %AB, <2 x half> %C)
ret <2 x half> %R
}
; Declarations of the min/max intrinsics called by the test functions in this
; file, grouped by element type: scalar f32 (maxnum, minnum, maximum,
; minimum), then maxnum for f16, f64 and bf16, then maxnum for the
; <2 x float> and <2 x half> vector types.
declare float @llvm.maxnum.f32(float, float)
declare float @llvm.minnum.f32(float, float)
declare float @llvm.maximum.f32(float, float)
declare float @llvm.minimum.f32(float, float)
declare half @llvm.maxnum.f16(half, half)
declare double @llvm.maxnum.f64(double, double)
declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>)
declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)