; blob: 88c38ccbb1508b634101e488bf31cbc93f04e64f [file]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s
; Test that tree-structured min/max reductions form min3/max3 efficiently.
; The key pattern is op(op(a,b), op(c,d)) which should become
; op(op3(a,b,c), d) to enable further combining at higher tree levels.
; Basic 4-value tree: maxnum f32
; The IR computes maxnum(maxnum(a, b), maxnum(c, d)).
; On both targets the checks expect a, b and c to be folded into a single
; three-operand max (v_max3_f32 / v_max3_num_f32), with the fourth value merged
; by one trailing two-operand max.
; NOTE(review): the self-max of v3 before the final max is presumably the
; canonicalization of %d that maxnum needs; confirm against the combine.
define float @v_max3_maxnum_tree4_f32(float %a, float %b, float %c, float %d) {
; GFX9-LABEL: v_max3_maxnum_tree4_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v3, v3
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_max3_maxnum_tree4_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2
; GFX1250-NEXT: v_max_num_f32_e32 v1, v3, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%max.ab = call float @llvm.maxnum.f32(float %a, float %b)
%max.cd = call float @llvm.maxnum.f32(float %c, float %d)
%result = call float @llvm.maxnum.f32(float %max.ab, float %max.cd)
ret float %result
}
; 8-value tree: maxnum f32
; The IR is a balanced binary tree of seven maxnum calls over eight inputs
; (four leaf pairs, two mid-level nodes, one root).
; On both targets the checks expect three three-operand maxes and one
; two-operand max (plus a self-max of v3), i.e. the 4-value pattern on the
; left half feeding a root max3 that also absorbs the right half.
define float @v_max3_maxnum_tree8_f32(float %a, float %b, float %c, float %d,
; GFX9-LABEL: v_max3_maxnum_tree8_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v3, v3
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
; GFX9-NEXT: v_max3_f32 v1, v4, v5, v6
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_max3_maxnum_tree8_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2
; GFX1250-NEXT: v_max_num_f32_e32 v1, v3, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v1
; GFX1250-NEXT: v_max3_num_f32 v1, v4, v5, v6
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v7
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
float %e, float %f, float %g, float %h) {
%ab = call float @llvm.maxnum.f32(float %a, float %b)
%cd = call float @llvm.maxnum.f32(float %c, float %d)
%ef = call float @llvm.maxnum.f32(float %e, float %f)
%gh = call float @llvm.maxnum.f32(float %g, float %h)
%abcd = call float @llvm.maxnum.f32(float %ab, float %cd)
%efgh = call float @llvm.maxnum.f32(float %ef, float %gh)
%result = call float @llvm.maxnum.f32(float %abcd, float %efgh)
ret float %result
}
; Basic 4-value tree: maximum f32 (IEEE 2019)
; The IR computes maximum(maximum(a, b), maximum(c, d)).
; On gfx900 the checks show no three-operand form: each of the three maximum
; calls is expanded to v_max + v_cmp_o + v_cndmask, selecting the constant
; 0x7fc00000 (an f32 quiet-NaN bit pattern) when the operands are unordered.
; On gfx1250 the checks expect v_maximum3_f32 on a, b, c followed by a single
; v_maximum_f32 with the fourth value.
define float @v_maximum3_tree4_f32(float %a, float %b, float %c, float %d) {
; GFX9-LABEL: v_maximum3_tree4_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v4, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_maximum3_tree4_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_maximum_f32 v0, v0, v3
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%max.ab = call float @llvm.maximum.f32(float %a, float %b)
%max.cd = call float @llvm.maximum.f32(float %c, float %d)
%result = call float @llvm.maximum.f32(float %max.ab, float %max.cd)
ret float %result
}
; 8-value tree: maximum f32 (IEEE 2019)
; The IR is a balanced binary tree of seven maximum calls over eight inputs.
; On gfx900 the checks expect seven max/cmp_o/cndmask triples, one per call,
; with no three-operand instruction.
; On gfx1250 the checks expect three v_maximum3_f32 and one v_maximum_f32:
; (a, b, c) and (e, f, g) are each folded first, d is merged with a
; two-operand maximum, and the root maximum3 absorbs the remaining value h.
define float @v_maximum3_tree8_f32(float %a, float %b, float %c, float %d,
; GFX9-LABEL: v_maximum3_tree8_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v8, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v4, v5
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5
; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
; GFX9-NEXT: v_max_f32_e32 v3, v6, v7
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7
; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; GFX9-NEXT: v_max_f32_e32 v4, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v4, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_maximum3_tree8_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2
; GFX1250-NEXT: v_maximum3_f32 v1, v4, v5, v6
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_maximum_f32 v0, v0, v3
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v7
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
float %e, float %f, float %g, float %h) {
%ab = call float @llvm.maximum.f32(float %a, float %b)
%cd = call float @llvm.maximum.f32(float %c, float %d)
%ef = call float @llvm.maximum.f32(float %e, float %f)
%gh = call float @llvm.maximum.f32(float %g, float %h)
%abcd = call float @llvm.maximum.f32(float %ab, float %cd)
%efgh = call float @llvm.maximum.f32(float %ef, float %gh)
%result = call float @llvm.maximum.f32(float %abcd, float %efgh)
ret float %result
}
; Basic 4-value tree: minimum f32 (IEEE 2019)
; Mirror of the 4-value maximum test using llvm.minimum: the IR computes
; minimum(minimum(a, b), minimum(c, d)).
; On gfx900 the checks expect three v_min + v_cmp_o + v_cndmask expansions
; (selecting 0x7fc00000 when the operands are unordered) and no three-operand
; instruction. On gfx1250 the checks expect v_minimum3_f32 on a, b, c followed
; by one v_minimum_f32 with the fourth value.
define float @v_minimum3_tree4_f32(float %a, float %b, float %c, float %d) {
; GFX9-LABEL: v_minimum3_tree4_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_min_f32_e32 v4, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GFX9-NEXT: v_min_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_minimum3_tree4_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_minimum3_f32 v0, v0, v1, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_minimum_f32 v0, v0, v3
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%min.ab = call float @llvm.minimum.f32(float %a, float %b)
%min.cd = call float @llvm.minimum.f32(float %c, float %d)
%result = call float @llvm.minimum.f32(float %min.ab, float %min.cd)
ret float %result
}
; Basic 4-value tree: minnum f32
; The IR computes minnum(minnum(a, b), minnum(c, d)).
; On both targets the checks expect a, b and c to be folded into a single
; three-operand min (v_min3_f32 / v_min3_num_f32), with the fourth value merged
; by one trailing two-operand min.
; NOTE(review): the self-*max* of v3 in this min test is presumably the usual
; canonicalization idiom for %d rather than part of the reduction; confirm.
define float @v_min3_minnum_tree4_f32(float %a, float %b, float %c, float %d) {
; GFX9-LABEL: v_min3_minnum_tree4_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_min3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v3, v3
; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_min3_minnum_tree4_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_min3_num_f32 v0, v0, v1, v2
; GFX1250-NEXT: v_max_num_f32_e32 v1, v3, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_min_num_f32_e32 v0, v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%min.ab = call float @llvm.minnum.f32(float %a, float %b)
%min.cd = call float @llvm.minnum.f32(float %c, float %d)
%result = call float @llvm.minnum.f32(float %min.ab, float %min.cd)
ret float %result
}
; 16-value tree: maximum f32, tests 3 levels of deferral
; The IR is a balanced binary tree of fifteen maximum calls over sixteen inputs
; (eight leaf pairs, then four, two and one node per level).
; On gfx900 the checks expect fifteen max/cmp_o/cndmask triples, one per call,
; with no three-operand instruction.
; On gfx1250 the checks expect six v_maximum3_f32 and three v_maximum_f32,
; which together cover the same fifteen pairwise reductions.
define float @v_maximum3_tree16_f32(float %a, float %b, float %c, float %d,
; GFX9-LABEL: v_maximum3_tree16_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v16, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v17, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v4, v5
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5
; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc
; GFX9-NEXT: v_max_f32_e32 v3, v6, v7
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7
; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc
; GFX9-NEXT: v_max_f32_e32 v4, v8, v9
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v9
; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v4, vcc
; GFX9-NEXT: v_max_f32_e32 v5, v10, v11
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v11
; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
; GFX9-NEXT: v_max_f32_e32 v6, v12, v13
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v13
; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc
; GFX9-NEXT: v_max_f32_e32 v7, v14, v15
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v15
; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc
; GFX9-NEXT: v_max_f32_e32 v8, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v8, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v4, v5
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5
; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc
; GFX9-NEXT: v_max_f32_e32 v3, v6, v7
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7
; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc
; GFX9-NEXT: v_max_f32_e32 v4, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v4, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_maximum3_tree16_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2
; GFX1250-NEXT: v_maximum3_f32 v1, v8, v9, v10
; GFX1250-NEXT: v_maximum3_f32 v2, v4, v5, v6
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_maximum_f32 v0, v0, v3
; GFX1250-NEXT: v_maximum_f32 v1, v1, v11
; GFX1250-NEXT: v_maximum3_f32 v3, v12, v13, v14
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v2, v7
; GFX1250-NEXT: v_maximum3_f32 v1, v1, v3, v15
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_maximum_f32 v0, v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
float %e, float %f, float %g, float %h,
float %i, float %j, float %k, float %l,
float %m, float %n, float %o, float %p) {
%ab = call float @llvm.maximum.f32(float %a, float %b)
%cd = call float @llvm.maximum.f32(float %c, float %d)
%ef = call float @llvm.maximum.f32(float %e, float %f)
%gh = call float @llvm.maximum.f32(float %g, float %h)
%ij = call float @llvm.maximum.f32(float %i, float %j)
%kl = call float @llvm.maximum.f32(float %k, float %l)
%mn = call float @llvm.maximum.f32(float %m, float %n)
%op = call float @llvm.maximum.f32(float %o, float %p)
%abcd = call float @llvm.maximum.f32(float %ab, float %cd)
%efgh = call float @llvm.maximum.f32(float %ef, float %gh)
%ijkl = call float @llvm.maximum.f32(float %ij, float %kl)
%mnop = call float @llvm.maximum.f32(float %mn, float %op)
%abcdefgh = call float @llvm.maximum.f32(float %abcd, float %efgh)
%ijklmnop = call float @llvm.maximum.f32(float %ijkl, float %mnop)
%result = call float @llvm.maximum.f32(float %abcdefgh, float %ijklmnop)
ret float %result
}
; Unbalanced tree: left side is tree, right side is leaf
; The IR computes maximum(maximum(maximum(a, b), maximum(c, d)), e): a 4-value
; tree whose result is combined with the single extra input e.
; On gfx900 the checks expect four max/cmp_o/cndmask triples, one per call.
; On gfx1250 the checks expect just two v_maximum3_f32: the first folds a, b, c
; and the second merges that result with both d and e, so no two-operand
; maximum remains.
define float @v_maximum3_tree_unbalanced_f32(float %a, float %b, float %c, float %d, float %e) {
; GFX9-LABEL: v_maximum3_tree_unbalanced_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v5, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v6, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v0, v4
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_maximum3_tree_unbalanced_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v3, v4
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ab = call float @llvm.maximum.f32(float %a, float %b)
%cd = call float @llvm.maximum.f32(float %c, float %d)
%abcd = call float @llvm.maximum.f32(float %ab, float %cd)
%result = call float @llvm.maximum.f32(float %abcd, float %e)
ret float %result
}
; Multi-use: one side has multiple uses, should NOT trigger tree combine
; Same 4-value maxnum tree as the basic test, except that %max.cd is also
; stored to %out and therefore has two uses.
; On both targets the checks expect maxnum(c, d) to be materialized on its own
; in v2 (after self-maxes of v2 and v3), then reused both as the third operand
; of the three-operand max with a and b and as the value written by the
; global store.
define float @v_max3_maxnum_tree4_multi_use(float %a, float %b, float %c, float %d, ptr addrspace(1) %out) {
; GFX9-LABEL: v_max3_maxnum_tree4_multi_use:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: v_max_f32_e32 v2, v2, v3
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
; GFX9-NEXT: global_store_dword v[4:5], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_max3_maxnum_tree4_multi_use:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v2, v2, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_max_num_f32_e32 v2, v2, v3
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2
; GFX1250-NEXT: global_store_b32 v[4:5], v2, off
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%max.ab = call float @llvm.maxnum.f32(float %a, float %b)
%max.cd = call float @llvm.maxnum.f32(float %c, float %d)
%result = call float @llvm.maxnum.f32(float %max.ab, float %max.cd)
store float %max.cd, ptr addrspace(1) %out
ret float %result
}
; 8-value tree: left subtree single-use, right subtree multi-use.
; Left subtree should be tree-combined. Right subtree can't (multi-use),
; so existing combine absorbs it. Tests asymmetric deferral behavior.
; In the IR, %efgh feeds both the root maximum and the store to %out, while
; %abcd is used only by the root.
; On gfx900 the checks expect seven max/cmp_o/cndmask triples followed by the
; store of v1. On gfx1250 the checks expect the right-hand value to be built in
; v1 (a v_maximum3_f32 over v4, v5, v6 then a v_maximum_f32 with v7), stored
; from v1, and also consumed as the last operand of the root v_maximum3_f32
; alongside the folded left-hand triple and v3.
define float @v_maximum3_tree8_asymmetric_use(float %a, float %b, float %c, float %d,
; GFX9-LABEL: v_maximum3_tree8_asymmetric_use:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v10, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v11, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v11, v10, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v4, v5
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5
; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
; GFX9-NEXT: v_max_f32_e32 v3, v6, v7
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7
; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
; GFX9-NEXT: v_max_f32_e32 v4, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v11, v4, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v11, v2, vcc
; GFX9-NEXT: global_store_dword v[8:9], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_maximum3_tree8_asymmetric_use:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_maximum3_f32 v4, v4, v5, v6
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_maximum_f32 v1, v4, v7
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v3, v1
; GFX1250-NEXT: global_store_b32 v[8:9], v1, off
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
float %e, float %f, float %g, float %h,
ptr addrspace(1) %out) {
%ab = call float @llvm.maximum.f32(float %a, float %b)
%cd = call float @llvm.maximum.f32(float %c, float %d)
%ef = call float @llvm.maximum.f32(float %e, float %f)
%gh = call float @llvm.maximum.f32(float %g, float %h)
%abcd = call float @llvm.maximum.f32(float %ab, float %cd)
%efgh = call float @llvm.maximum.f32(float %ef, float %gh)
%result = call float @llvm.maximum.f32(float %abcd, float %efgh)
store float %efgh, ptr addrspace(1) %out
ret float %result
}
; Basic 4-value tree: maxnum f16
; Half-precision version of the 4-value maxnum tree:
; maxnum(maxnum(a, b), maxnum(c, d)) on half operands.
; All three configurations are expected to fold a, b and c into one
; three-operand f16 max and merge the fourth value with a two-operand max
; after a self-max of that value.
; The two gfx1250 runs have separate check blocks because the register operands
; differ: the -real-true16 (fake16) run prints full VGPR names, while the
; +real-true16 run prints 16-bit halves (v0.l, v0.h, ...).
define half @v_max3_maxnum_tree4_f16(half %a, half %b, half %c, half %d) {
; GFX9-LABEL: v_max3_maxnum_tree4_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max3_f16 v0, v0, v1, v2
; GFX9-NEXT: v_max_f16_e32 v1, v3, v3
; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_max3_maxnum_tree4_f16:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v0, v1, v2
; GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v1, v3, v3
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v1
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-REAL16-LABEL: v_max3_maxnum_tree4_f16:
; GFX1250-REAL16: ; %bb.0:
; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0
; GFX1250-REAL16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l
; GFX1250-REAL16-NEXT: v_max_num_f16_e32 v0.h, v3.l, v3.l
; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-REAL16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.h
; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31]
%max.ab = call half @llvm.maxnum.f16(half %a, half %b)
%max.cd = call half @llvm.maxnum.f16(half %c, half %d)
%result = call half @llvm.maxnum.f16(half %max.ab, half %max.cd)
ret half %result
}
; Negative test: f64 has no max3/min3 on any target yet, tree combine must not fire
; The IR is the same 4-value maxnum tree as the f32 test, on double operands.
; On both targets the checks expect only two-operand 64-bit maxes: four
; self-maxes (one per input register pair) followed by three pairwise maxes
; that mirror the IR tree one-to-one. No three-operand instruction appears.
define double @v_no_max3_maxnum_tree4_f64(double %a, double %b, double %c, double %d) {
; GFX9-LABEL: v_no_max3_maxnum_tree4_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[6:7]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_no_max3_maxnum_tree4_f64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX1250-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[6:7]
; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[6:7]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%max.ab = call double @llvm.maxnum.f64(double %a, double %b)
%max.cd = call double @llvm.maxnum.f64(double %c, double %d)
%result = call double @llvm.maxnum.f64(double %max.ab, double %max.cd)
ret double %result
}
; Negative test: bf16 is promoted to f32 with conversions, tree combine cannot apply
; The IR is the 4-value maxnum tree on bfloat operands.
; On both targets the checks expect each of the three maxnum calls to become a
; separate two-operand f32 max: inputs are moved into the upper 16 bits with a
; left shift by 16, and every intermediate result is converted back to bf16
; before the next level consumes it. On gfx900 that conversion is an explicit
; bfe/add3/or/cmp_u/cndmask sequence; on gfx1250 it is v_cvt_pk_bf16_f32.
; No three-operand max appears in either output.
define bfloat @v_no_max3_maxnum_tree4_bf16(bfloat %a, bfloat %b, bfloat %c, bfloat %d) {
; GFX9-LABEL: v_no_max3_maxnum_tree4_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_max_f32_e32 v1, v2, v1
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_no_max3_maxnum_tree4_bf16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v3, 16, v3
; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_lshlrev_b32 v0, 16, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v3 :: v_dual_max_num_f32 v0, v0, v1
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v1
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%max.ab = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
%max.cd = call bfloat @llvm.maxnum.bf16(bfloat %c, bfloat %d)
%result = call bfloat @llvm.maxnum.bf16(bfloat %max.ab, bfloat %max.cd)
ret bfloat %result
}
; Declarations of the min/max intrinsics called by the test functions in this
; file: the f32 maxnum/minnum/maximum/minimum variants, plus the f16, f64 and
; bf16 maxnum variants used by the type-specific tests.
declare float @llvm.maxnum.f32(float, float)
declare float @llvm.minnum.f32(float, float)
declare float @llvm.maximum.f32(float, float)
declare float @llvm.minimum.f32(float, float)
declare half @llvm.maxnum.f16(half, half)
declare double @llvm.maxnum.f64(double, double)
declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)