; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX1250-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s
/* TODO: Support safe bf16 fdiv lowering.
define bfloat @v_fdiv_bf16(bfloat %x, bfloat %y) {
%fdiv = fdiv bfloat %x, %y
ret bfloat %fdiv
}
*/
; Scalar bf16 reciprocal spelled as `fdiv 1.0, %x` with no fast-math flags.
; Both RUN configurations expect a single v_rcp_bf16.
define bfloat @v_rcp_bf16(bfloat %x) {
; GFX1250-TRUE16-LABEL: v_rcp_bf16:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_rcp_bf16:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
  %rcp = fdiv bfloat 1.0, %x
  ret bfloat %rcp
}
; Reciprocal of llvm.fabs(%x). The checks expect the absolute value to show
; up as the |v0| operand of an e64-encoded v_rcp_bf16 rather than as a
; separate instruction.
define bfloat @v_rcp_bf16_abs(bfloat %x) {
; GFX1250-TRUE16-LABEL: v_rcp_bf16_abs:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, |v0.l|
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_rcp_bf16_abs:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, |v0|
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
  %abs.x = call bfloat @llvm.fabs.bf16(bfloat %x)
  %rcp = fdiv bfloat 1.0, %abs.x
  ret bfloat %rcp
}
; Same `1.0 / %x` shape as the unflagged reciprocal test, but the fdiv
; carries the `afn` fast-math flag. The expected output is still a single
; v_rcp_bf16.
define bfloat @v_rcp_bf16_afn(bfloat %x) {
; GFX1250-TRUE16-LABEL: v_rcp_bf16_afn:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_rcp_bf16_afn:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
  %rcp.afn = fdiv afn bfloat 1.0, %x
  ret bfloat %rcp.afn
}
; Negated reciprocal written as `fdiv -1.0, %x`. The checks expect the sign
; to be carried by a negated operand (-v0) on an e64-encoded v_rcp_bf16.
define bfloat @v_rcp_bf16_neg(bfloat %x) {
; GFX1250-TRUE16-LABEL: v_rcp_bf16_neg:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_rcp_bf16_neg:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
  %neg.rcp = fdiv bfloat -1.0, %x
  ret bfloat %neg.rcp
}
; TODO: Support lowering to v_rsq_bf16.
; 1.0 / sqrt(%x) with `contract` on both the sqrt call and the fdiv.
; The current expected output is v_sqrt_bf16 followed by v_rcp_bf16.
define bfloat @v_rsq_bf16(bfloat %x) {
; GFX1250-TRUE16-LABEL: v_rsq_bf16:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_rsq_bf16:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
  %root = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
  %rsq = fdiv contract bfloat 1.0, %root
  ret bfloat %rsq
}
; TODO: Support lowering to v_rsq_bf16.
; -1.0 / sqrt(%x) with `contract` on both operations. The current expected
; output is v_sqrt_bf16 followed by a v_rcp_bf16 whose operand is negated.
define bfloat @v_rsq_bf16_neg(bfloat %x) {
; GFX1250-TRUE16-LABEL: v_rsq_bf16_neg:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_rsq_bf16_neg:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
  %root = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
  %neg.rsq = fdiv contract bfloat -1.0, %root
  ret bfloat %neg.rsq
}
; TODO: Support lowering to v_rsq_bf16.
; Returns <%x, 1.0 / sqrt(%x)>: the input is placed in element 0 and the
; contract-flagged reciprocal square root in element 1. Note that the value
; with two uses here is %x; the sqrt result itself only feeds the fdiv.
define <2 x bfloat> @v_rsq_bf16_multi_use(bfloat %x) {
; GFX1250-TRUE16-LABEL: v_rsq_bf16_multi_use:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v1.l, v1.l
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v1.h, v1.l
; GFX1250-TRUE16-NEXT: v_nop
; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_rsq_bf16_multi_use:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1
; GFX1250-FAKE16-NEXT: v_nop
; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
  %root = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
  %rsq = fdiv contract bfloat 1.0, %root
  %vec.lo = insertelement <2 x bfloat> zeroinitializer, bfloat %x, i32 0
  %vec.full = insertelement <2 x bfloat> %vec.lo, bfloat %rsq, i32 1
  ret <2 x bfloat> %vec.full
}
; TODO: Support lowering to v_rsq_bf16.
; 1.0 / sqrt(%x) where only the fdiv carries `contract`; the sqrt call has
; no fast-math flags. Expected output is v_sqrt_bf16 followed by v_rcp_bf16.
; NOTE(review): presumably a negative case that should keep this two-
; instruction form even once an rsq fold exists - confirm against the
; backend's requirements for forming rsq.
define bfloat @v_rsq_bf16_missing_contract0(bfloat %x) {
; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract0:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_rsq_bf16_missing_contract0:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
  %root = call bfloat @llvm.sqrt.bf16(bfloat %x)
  %rcp = fdiv contract bfloat 1.0, %root
  ret bfloat %rcp
}
; TODO: Support lowering to v_rsq_bf16.
; 1.0 / sqrt(%x) where only the sqrt call carries `contract`; the fdiv has
; no fast-math flags. Expected output is v_sqrt_bf16 followed by v_rcp_bf16.
; NOTE(review): presumably a negative case that should keep this two-
; instruction form even once an rsq fold exists - confirm against the
; backend's requirements for forming rsq.
define bfloat @v_rsq_bf16_missing_contract1(bfloat %x) {
; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract1:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_rsq_bf16_missing_contract1:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
  %root = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
  %rcp = fdiv bfloat 1.0, %root
  ret bfloat %rcp
}
; TODO: Support lowering to v_rsq_bf16.
; -1.0 / sqrt(%x) where only the sqrt call carries `contract`; the fdiv has
; no fast-math flags. Expected output is v_sqrt_bf16 followed by a
; v_rcp_bf16 with a negated operand.
; NOTE(review): presumably a negative case that should keep this two-
; instruction form even once an rsq fold exists - confirm against the
; backend's requirements for forming rsq.
define bfloat @v_neg_rsq_bf16_missing_contract1(bfloat %x) {
; GFX1250-TRUE16-LABEL: v_neg_rsq_bf16_missing_contract1:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_neg_rsq_bf16_missing_contract1:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
  %root = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
  %neg.rcp = fdiv bfloat -1.0, %root
  ret bfloat %neg.rcp
}
; Two-element vector form of 1.0 / sqrt(%a), with `contract` on both
; operations. The checks expect one v_sqrt_bf16 / v_rcp_bf16 pair per
; element; the FAKE16 run additionally shifts out the high element first
; and recombines the two results with v_perm_b32.
define <2 x bfloat> @v_rsq_v2bf16(<2 x bfloat> %a) {
; GFX1250-TRUE16-LABEL: v_rsq_v2bf16:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2)
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.h, v0.h
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_rsq_v2bf16:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1
; GFX1250-FAKE16-NEXT: v_nop
; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
  %roots = call contract <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a)
  %rsq = fdiv contract <2 x bfloat> <bfloat 1.0, bfloat 1.0>, %roots
  ret <2 x bfloat> %rsq
}
; Two-element vector form of -1.0 / sqrt(%a), with `contract` on both
; operations. The checks expect a v_sqrt_bf16 per element followed by a
; v_rcp_bf16 with a negated operand; the FAKE16 run additionally shifts out
; the high element first and recombines the two results with v_perm_b32.
define <2 x bfloat> @v_neg_rsq_v2bf16(<2 x bfloat> %a) {
; GFX1250-TRUE16-LABEL: v_neg_rsq_v2bf16:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2)
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.h, -v0.h
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_neg_rsq_v2bf16:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v1, -v1
; GFX1250-FAKE16-NEXT: v_nop
; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
  %roots = call contract <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a)
  %neg.rsq = fdiv contract <2 x bfloat> <bfloat -1.0, bfloat -1.0>, %roots
  ret <2 x bfloat> %neg.rsq
}